From 5767035276d55aeca7fdb50e7ceaf09b72d0d62e Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 13 Aug 2024 15:02:26 -0400 Subject: [PATCH 001/161] Initial dagster integration --- pyproject.toml | 3 + src/mozilla_sec_eia/assets.py | 50 ++++++ src/mozilla_sec_eia/ex_21/extractor.py | 3 - src/mozilla_sec_eia/extract.py | 235 ++++++++++++++++--------- src/mozilla_sec_eia/utils/cloud.py | 90 +++++----- tests/conftest.py | 22 --- tests/unit/extract_test.py | 64 +++++-- tests/unit/utils_test.py | 20 +-- 8 files changed, 310 insertions(+), 177 deletions(-) create mode 100644 src/mozilla_sec_eia/assets.py diff --git a/pyproject.toml b/pyproject.toml index ca30b8a..985f73e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -20,6 +20,9 @@ license = {file = "LICENSE.txt"} dependencies = [ "accelerate>=0.21.0,<1.0", # Hugging Face dependency for PyTorch models "cloud-sql-python-connector[pg8000]", + "dagster>=1.7.15", # 1.7.13 & 1.7.14 were both breaking things + "dagster-mlflow", + "dagster-webserver", "datasets>=2.1,<3", # Access Hugging Face datasets "seqeval>=1.2,<2", # Sequence labeling evaluation "google-cloud-secret-manager>=2,<3", diff --git a/src/mozilla_sec_eia/assets.py b/src/mozilla_sec_eia/assets.py new file mode 100644 index 0000000..e5656ac --- /dev/null +++ b/src/mozilla_sec_eia/assets.py @@ -0,0 +1,50 @@ +"""Define asset jobs and configuration.""" + +import logging + +import coloredlogs +from dagster import Definitions, EnvVar, define_asset_job + +from mozilla_sec_eia.extract import ExtractConfig, basic_10k_extract, basic_10k_validate +from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface + +logger = logging.getLogger("catalystcoop") +log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" +coloredlogs.install(fmt=log_format, logger=logger) + +extract_job = define_asset_job( + name="extract_job", + selection=[basic_10k_extract], +) +validate_job = define_asset_job( + name="validate_job", + selection=[basic_10k_validate], +) + 
+cloud_interface = GCSArchive( + filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), + labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), + metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"), + user=EnvVar("GCS_IAM_USER"), + metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), + project=EnvVar("GCS_PROJECT"), +) + +defs = Definitions( + assets=[basic_10k_validate, basic_10k_extract], + jobs=[extract_job, validate_job], + resources={ + "cloud_interface": cloud_interface, + "basic_10k_extract_config": ExtractConfig(), + "basic_10k_extract_mlflow": MlflowInterface( + experiment_name="basic_10k_extraction", + tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), + cloud_interface=cloud_interface, + ), + "basic_10k_extract_validate_mlflow": MlflowInterface( + experiment_name="basic_10k_extraction_validation", + tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), + cloud_interface=cloud_interface, + ), + }, +) diff --git a/src/mozilla_sec_eia/ex_21/extractor.py b/src/mozilla_sec_eia/ex_21/extractor.py index faf2dcb..8a3b617 100644 --- a/src/mozilla_sec_eia/ex_21/extractor.py +++ b/src/mozilla_sec_eia/ex_21/extractor.py @@ -27,7 +27,6 @@ from transformers.data.data_collator import default_data_collator from mozilla_sec_eia.ex_21.create_labeled_dataset import format_as_ner_annotations -from mozilla_sec_eia.utils.cloud import initialize_mlflow LABELS = ["O", "B-Subsidiary", "I-Subsidiary", "B-Loc", "I-Loc", "B-Own_Per"] @@ -140,7 +139,6 @@ def log_model(finetuned_model: Trainer): def load_model(): """Load fine-tuned model from mlflow artifacts.""" - initialize_mlflow() return mlflow.transformers.load_model( "models:/layoutlm_extractor/1", return_type="components" ) @@ -160,7 +158,6 @@ def train_model( test_size: Proportion of labeled dataset to use for test set. 
""" # Prepare mlflow for tracking/logging model - initialize_mlflow() mlflow.set_experiment("/finetune-layoutlmv3") # Prepare model diff --git a/src/mozilla_sec_eia/extract.py b/src/mozilla_sec_eia/extract.py index d21c3f5..f4bf1bc 100644 --- a/src/mozilla_sec_eia/extract.py +++ b/src/mozilla_sec_eia/extract.py @@ -2,15 +2,18 @@ import io import logging +import tempfile from importlib import resources +from pathlib import Path import mlflow import pandas as pd import pandera as pa +from dagster import ConfigurableResource, asset from mlflow.entities import Run from mozilla_sec_eia import basic_10k -from mozilla_sec_eia.utils.cloud import GCSArchive, initialize_mlflow +from mozilla_sec_eia.utils.cloud import GCSArchive logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -40,6 +43,22 @@ def _log_artifact_as_csv( return mlflow.log_text(artifact.to_csv(index=index), artifact_name) +def _load_artifact_as_parquet(run: Run, artifact_name: str) -> pd.DataFrame: + """Download a CSV and parse to DataFrame from mlflow tracking server.""" + df = pd.read_parquet(run.info.artifact_uri + artifact_name) + return df + + +def _log_artifact_as_parquet( + artifact: pd.DataFrame, artifact_name: str, index: bool = True +): + """Upload a DataFrame as a CSV to mlflow tracking server.""" + with tempfile.TemporaryDirectory() as tmp_dir: + parquet_path = Path(tmp_dir) / artifact_name + artifact.to_parquet(parquet_path, index=index) + return mlflow.log_artifact(parquet_path, artifact_name) + + def _get_most_recent_run(experiment_name: str): """Search mlflow for most recent extraction run with specified experiment name.""" run_metadata = mlflow.search_runs(experiment_names=[experiment_name]) @@ -76,7 +95,7 @@ def _get_filings_to_extract( most_recent_run, "/extraction_metadata.csv" ).set_index("filename") ) - extracted = _load_artifact_as_csv(most_recent_run, "/extracted.csv") + extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet") run_id = 
most_recent_run.info.run_id filings_to_extract = metadata[~metadata["filename"].isin(extraction_metadata.index)] @@ -137,85 +156,17 @@ def compute_validation_metrics( } -def validate_extraction(dataset: str): - """Run extraction on validation set and compare results to labeled data.""" - validation_set = pd.read_csv( - resources.files("mozilla_sec_eia.package_data") / f"{dataset}_labels.csv" - ) - - # Get metadata for labelled filings - archive = GCSArchive() - to_extract = archive.get_metadata(filenames=list(validation_set["filename"])) - - # Extract data from filings - extracted = extract_filings( - dataset=dataset, metadata=to_extract, experiment_suffix="validation" - ) - # Set index for validation set based on returned extracted DF - validation_set = validation_set.set_index(extracted.index.names) - - # Get extraction run from mlflow and start again to log validation metrics - experiment_name = _get_experiment_name(dataset, experiment_suffix="validation") - run = _get_most_recent_run(experiment_name) - with mlflow.start_run(run_id=run.info.run_id): - # Compute metrics and log - if dataset == "basic_10k": - mlflow.log_metrics( - compute_validation_metrics(extracted, validation_set, "value") - ) - # Log validation set used to compute metrics - _log_artifact_as_csv(validation_set, "labels.csv") - - def extract_filings( dataset: str, - continue_run: bool = False, - num_filings: int = -1, - metadata: pd.DataFrame | None = None, - experiment_suffix: str | None = None, + filings_to_extract: pd.DataFrame, + extraction_metadata: pd.DataFrame, + extracted: pd.DataFrame, + num_filings: int, + experiment_name: str, + cloud_interface: GCSArchive, + run_id: str | None = None, ) -> pd.DataFrame: - """Extra data from SEC 10k and exhibit 21 filings. - - This function takes several parameters to decide which filings to extract data - from. 
If `continue_run` is set, it will search the mlflow tracking server for - the most recent extraction run for the specified dataset, and download corresponding - metadata and extraction results. It will then filter out any filings that were - already extracted in the run. This is useful for testing to be able perform extraction - on subsets of the data and continue where it left off. If `metadata` is passed - in, this is expected to be a selection of filing metadata that specifies exactly - which filings to extract. This is used for validation to only extract filings in - the validation set. - - Args: - dataset: Data to extract, should be 'basic_10k' or 'ex21'. - continue_run: Whether to continue a previous extraction run. - num_filings: Number of filings to extract in run. - metadata: Specific selection of filing metadata to extract. - experiment_suffix: Add to mlflow run to differentiate run from basic extraction. - """ - if dataset not in ["ex21", "basic_10k"]: - raise RuntimeError( - f"{dataset} is not a valid dataset. Must be 'ex21' or 'basic_10k'." 
- ) - - initialize_mlflow() - - # Get filing metadata if not passed in explicitly - archive = GCSArchive() - if metadata is None: - metadata = archive.get_metadata() - - experiment_name = _get_experiment_name(dataset, experiment_suffix=experiment_suffix) - - # Get filings to extract as well as any existing metadata for run - filings_to_extract, extraction_metadata, extracted, run_id = ( - _get_filings_to_extract( - experiment_name, - metadata, - continue_run=continue_run, - num_filings=num_filings, - ) - ) + """Extract filings in `filings_to_extract`.""" mlflow.set_experiment(experiment_name) with mlflow.start_run(run_id=run_id): # Extract data for desired filings @@ -224,7 +175,7 @@ def extract_filings( filings_to_extract, extraction_metadata, extracted, - archive, + cloud_interface, ) else: logger.warning("Exhibit 21 extraction is not yet implemented.") @@ -234,14 +185,136 @@ def extract_filings( mlflow.log_metrics( { "num_failed": (~extraction_metadata["success"]).sum(), - "ratio_extracted": len(extraction_metadata) / len(metadata), + "ratio_extracted": len(extraction_metadata) / num_filings, } ) # Log the extraction results + metadata for future reference/analysis _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv") - _log_artifact_as_csv(extracted, "extracted.csv") + _log_artifact_as_parquet(extracted, "extracted.parquet") logger.info( f"Finished extracting {len(extraction_metadata)} filings from {dataset}." 
) return extracted + + +class ExtractConfig(ConfigurableResource): + """Basic configuration for an extraction run.""" + + num_filings: int = -1 + + +def extract_asset_factory(dataset: str) -> asset: + """Produce asset to extract `dataset`.""" + + @asset( + name=f"{dataset}_extract", + required_resource_keys={ + f"{dataset}_extract_config", + f"{dataset}_extract_mlflow", + "cloud_interface", + }, + ) + def extract(context) -> pd.DataFrame: + config = context.resources.original_resource_dict[f"{dataset}_extract_config"] + cloud_interface: GCSArchive = context.resources.cloud_interface + mlflow_interface = context.resources.original_resource_dict[ + f"{dataset}_extract_mlflow" + ] + experiment_name = mlflow_interface.experiment_name + metadata = cloud_interface.get_metadata() + + # Get filings to extract as well as any existing metadata for run + filings_to_extract, extraction_metadata, extracted, run_id = ( + _get_filings_to_extract( + experiment_name, + metadata, + continue_run=mlflow_interface.continue_run, + num_filings=config.num_filings, + ) + ) + + return extract_filings( + dataset=dataset, + filings_to_extract=filings_to_extract, + extraction_metadata=extraction_metadata, + extracted=extracted, + num_filings=len(metadata), + experiment_name=experiment_name, + cloud_interface=cloud_interface, + run_id=run_id, + ) + + return extract + + +def validate_extraction( + dataset: str, experiment_name: str, cloud_interface: GCSArchive +): + """Run extraction on validation set and compare results to labeled data.""" + validation_set = pd.read_csv( + resources.files("mozilla_sec_eia.package_data") / f"{dataset}_labels.csv" + ) + + # Get metadata for labelled filings + to_extract = cloud_interface.get_metadata( + filenames=list(validation_set["filename"]) + ) + + # Get filings to extract as well as any existing metadata for run + filings_to_extract, extraction_metadata, extracted, run_id = ( + _get_filings_to_extract( + experiment_name, + to_extract, + ) + ) + + # Extract 
data from filings + extracted = extract_filings( + dataset=dataset, + filings_to_extract=filings_to_extract, + extraction_metadata=extraction_metadata, + extracted=extracted, + num_filings=len(to_extract), + experiment_name=experiment_name, + cloud_interface=cloud_interface, + run_id=run_id, + ) + + # Set index for validation set based on returned extracted DF + validation_set = validation_set.set_index(extracted.index.names) + + # Get extraction run from mlflow and start again to log validation metrics + run = _get_most_recent_run(experiment_name) + with mlflow.start_run(run_id=run.info.run_id): + # Compute metrics and log + if dataset == "basic_10k": + mlflow.log_metrics( + compute_validation_metrics(extracted, validation_set, "value") + ) + # Log validation set used to compute metrics + _log_artifact_as_csv(validation_set, "labels.csv") + + +def validate_extraction_asset_factory(dataset: str): + """Create asset that extracts validation filings and compute validation metrics.""" + + @asset( + name=f"{dataset}_extract_validate", + required_resource_keys={ + f"{dataset}_extract_validate_mlflow", + "cloud_interface", + }, + ) + def validate(context): + cloud_interface: GCSArchive = context.resources.cloud_interface + experiment_name = context.resources.original_resource_dict[ + f"{dataset}_extract_validate_mlflow" + ].experiment_name + return validate_extraction(dataset, experiment_name, cloud_interface) + + return validate + + +basic_10k_extract = extract_asset_factory("basic_10k") +basic_10k_validate = validate_extraction_asset_factory("basic_10k") diff --git a/src/mozilla_sec_eia/utils/cloud.py b/src/mozilla_sec_eia/utils/cloud.py index 1d1866d..6753baa 100644 --- a/src/mozilla_sec_eia/utils/cloud.py +++ b/src/mozilla_sec_eia/utils/cloud.py @@ -11,13 +11,14 @@ from typing import BinaryIO, TextIO import fitz +import mlflow import pandas as pd import pg8000 +from dagster import ConfigurableResource from google.cloud import secretmanager, storage from 
google.cloud.sql.connector import Connector from PIL import Image -from pydantic import BaseModel, Field, PrivateAttr -from pydantic_settings import BaseSettings, SettingsConfigDict +from pydantic import BaseModel, PrivateAttr from sqlalchemy import Engine, create_engine, select from sqlalchemy.orm import Session from xhtml2pdf import pisa @@ -142,8 +143,8 @@ def from_file( ) -class GoogleCloudSettings(BaseSettings): - """Load environment variables to manage access to cloud resources. +class GCSArchive(ConfigurableResource): + """Provides an interface for archived filings on GCS. This class looks for several environment variables to configure access to cloud resources. These can be set directly, or be in a @@ -161,35 +162,23 @@ class GoogleCloudSettings(BaseSettings): MLFLOW_TRACKING_URI: URI of mlflow tracking server. """ - model_config = SettingsConfigDict(env_file=".env") - - filings_bucket_name: str = Field(validation_alias="GCS_FILINGS_BUCKET_NAME") - labels_bucket_name: str = Field(validation_alias="GCS_LABELS_BUCKET_NAME") - metadata_db_instance_connection: str = Field( - validation_alias="GCS_METADATA_DB_INSTANCE_CONNECTION" - ) - user: str = Field(validation_alias="GCS_IAM_USER") - metadata_db_name: str = Field(validation_alias="GCS_METADATA_DB_NAME") - project: str = Field(validation_alias="GCS_PROJECT") - tracking_uri: str = Field(validation_alias="MLFLOW_TRACKING_URI") - - -class GCSArchive(BaseModel): - """Provides an interface for archived filings on GCS.""" - - settings: GoogleCloudSettings = Field(default_factory=lambda: GoogleCloudSettings()) + filings_bucket_name: str + labels_bucket_name: str + metadata_db_instance_connection: str + user: str + metadata_db_name: str + project: str _filings_bucket = PrivateAttr() _labels_bucket = PrivateAttr() _engine = PrivateAttr() _metadata_df = PrivateAttr(default=None) - def __init__(self, **kwargs): + def setup_for_execution(self, context): """Initialize interface to filings archive on GCS.""" - 
super().__init__(**kwargs) self._engine = self._get_engine() - self._filings_bucket = self._get_bucket(self.settings.filings_bucket_name) - self._labels_bucket = self._get_bucket(self.settings.labels_bucket_name) + self._filings_bucket = self._get_bucket(self.filings_bucket_name) + self._labels_bucket = self._get_bucket(self.labels_bucket_name) Base.metadata.create_all(self._engine) @@ -208,10 +197,10 @@ def _get_engine(self) -> Engine: def getconn() -> pg8000.dbapi.Connection: conn: pg8000.dbapi.Connection = connector.connect( - self.settings.metadata_db_instance_connection, + self.metadata_db_instance_connection, "pg8000", - user=self.settings.user, - db=self.settings.metadata_db_name, + user=self.user, + db=self.metadata_db_name, enable_iam_auth=True, ) return conn @@ -414,17 +403,34 @@ def _access_secret_version(secret_id: str, project_id: str, version_id="latest") return response.payload.data.decode("UTF-8") -def initialize_mlflow(settings: GoogleCloudSettings | None = None): - """Set appropriate environment variables to prepare connection to tracking server.""" - if settings is None: - settings = GoogleCloudSettings() - - os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" - os.environ["MLFLOW_TRACKING_PASSWORD"] = _access_secret_version( - "mlflow_admin_password", settings.project - ) - os.environ["MLFLOW_TRACKING_URI"] = settings.tracking_uri - os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520" - os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520" - os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900" - logger.info(f"Initialized tracking with mlflow server: {settings.tracking_uri}") +class MlflowInterface(ConfigurableResource): + """Initialize interface to mlflow for desired experiment.""" + + experiment_name: str + continue_run: bool = False + tracking_uri: str + cloud_interface: GCSArchive + artifact_location: str | None = None + + def setup_for_execution(self, context): + """Do runtime configuration of mlflow.""" + 
os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" + os.environ["MLFLOW_TRACKING_PASSWORD"] = _access_secret_version( + "mlflow_admin_password", self.cloud_interface.project + ) + os.environ["MLFLOW_TRACKING_URI"] = self.tracking_uri + os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520" + os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520" + os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900" + logger.info(f"Initialized tracking with mlflow server: {self.tracking_uri}") + + self.create_experiment() + + def create_experiment(self): + """Create experiment if it doesn't already exist.""" + logger.info(f"Creating experiment: {self.experiment_name}") + if not mlflow.get_experiment_by_name(self.experiment_name): + mlflow.create_experiment( + name=self.experiment_name, + artifact_location=self.artifact_location, + ) diff --git a/tests/conftest.py b/tests/conftest.py index 62c0058..6db667b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,11 +1,9 @@ """PyTest configuration module. Defines useful fixtures, command line args.""" import logging -import unittest from pathlib import Path import pytest -from mozilla_sec_eia.utils.cloud import GoogleCloudSettings, initialize_mlflow logger = logging.getLogger(__name__) @@ -35,23 +33,3 @@ def test_dir() -> Path: Mostly this is meant as an example of a fixture. 
""" return Path(__file__).parent - - -@pytest.fixture -def test_settings(test_dir): - """Return test GoogleCloudSettings object.""" - return GoogleCloudSettings(_env_file=test_dir / "test.env") - - -@pytest.fixture -def test_mlflow_init_func(test_settings): - """Return a function that can replace ``initialize_mlflow`` with no external calls.""" - - def _test_init(): - with unittest.mock.patch( - "mozilla_sec_eia.utils.cloud._access_secret_version", - new=lambda *args: "password", - ): - return initialize_mlflow(test_settings) - - return _test_init diff --git a/tests/unit/extract_test.py b/tests/unit/extract_test.py index 63b59cb..4441c2b 100644 --- a/tests/unit/extract_test.py +++ b/tests/unit/extract_test.py @@ -1,15 +1,20 @@ """Test extraction tools/methods.""" +import logging import unittest -import mlflow import pandas as pd import pytest +from dagster import build_asset_context from mozilla_sec_eia.extract import ( + ExtractConfig, _get_most_recent_run, + basic_10k_extract, compute_validation_metrics, - extract_filings, ) +from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface + +logger = logging.getLogger(f"catalystcoop.{__name__}") @pytest.fixture @@ -58,7 +63,6 @@ def second_run_results(): def test_extract_basic_10k( - test_mlflow_init_func, filings_metadata, first_run_results, second_run_results, @@ -66,30 +70,54 @@ def test_extract_basic_10k( ): """Test high level extraction workflow.""" - class FakeArchive: + class FakeArchive(GCSArchive): + filings_bucket_name: str = "" + labels_bucket_name: str = "" + metadata_db_instance_connection: str = "" + user: str = "" + metadata_db_name: str = "" + project: str = "" + + def setup_for_execution(self, context): + pass + def get_metadata(self): return filings_metadata - with ( - unittest.mock.patch("mozilla_sec_eia.extract.initialize_mlflow"), - unittest.mock.patch("mozilla_sec_eia.extract.GCSArchive", new=FakeArchive), - ): - # Initialize mlflow with test settings - test_mlflow_init_func() - 
mlflow.create_experiment( - "basic_10k_extraction", artifact_location=str(tmp_path) - ) + # Initialize mlflow with test settings + experiment_name = "basic_10k_extract_unit_test" + with unittest.mock.patch( + "mozilla_sec_eia.utils.cloud._access_secret_version", new=lambda *args: "" + ): for i, results in enumerate([first_run_results, second_run_results]): - kwargs = {"num_filings": 3} if i == 0 else {"continue_run": True} - with unittest.mock.patch( - "mozilla_sec_eia.extract.basic_10k.extract", new=lambda *args: results + logger.info(f"Run {i} of basic 10k extraction.") + with ( + build_asset_context( + resources={ + "basic_10k_extract_config": ExtractConfig( + num_filings=3 if i == 0 else -1 + ), + "basic_10k_extract_mlflow": MlflowInterface( + experiment_name=experiment_name, + continue_run=i > 0, + tracking_uri="sqlite:///:memory:", + cloud_interface=FakeArchive(), + artifact_location=str(tmp_path), + ), + "cloud_interface": FakeArchive(), + } + ) as context, + unittest.mock.patch( + "mozilla_sec_eia.extract.basic_10k.extract", + new=lambda *args: results, + ), ): metadata = results[0] # Run extract method - extract_filings("basic_10k", **kwargs) - run = _get_most_recent_run("basic_10k_extraction") + basic_10k_extract(context) + run = _get_most_recent_run(experiment_name) assert run.data.metrics["num_failed"] == (~metadata["success"]).sum() assert run.data.metrics["ratio_extracted"] == len(metadata) / len( filings_metadata diff --git a/tests/unit/utils_test.py b/tests/unit/utils_test.py index 1907de4..b935c8f 100644 --- a/tests/unit/utils_test.py +++ b/tests/unit/utils_test.py @@ -9,7 +9,6 @@ from mozilla_sec_eia.utils.cloud import ( Exhibit21, GCSArchive, - GoogleCloudSettings, Sec10K, ) @@ -21,17 +20,16 @@ def test_archive(): unittest.mock.patch("mozilla_sec_eia.utils.cloud.GCSArchive._get_engine"), unittest.mock.patch("mozilla_sec_eia.utils.cloud.GCSArchive._get_bucket"), ): - return GCSArchive( - settings=GoogleCloudSettings( - 
GCS_FILINGS_BUCKET_NAME="filings_bucket_name", - GCS_LABELS_BUCKET_NAME="labels_bucket_name", - GCS_METADATA_DB_INSTANCE_CONNECTION="metadata_db_instance_connection", - GCS_IAM_USER="user", - GCS_METADATA_DB_NAME="metadata_db_name", - GCS_PROJECT="project_name", - MLFLOW_TRACKING_URI="http://tracking.server", - ) + archive = GCSArchive( + filings_bucket_name="filings_bucket_name", + labels_bucket_name="labels_bucket_name", + metadata_db_instance_connection="metadata_db_instance_connection", + user="user", + metadata_db_name="metadata_db_name", + project="project_name", ) + archive.setup_for_execution("fake_context") + return archive @dataclass From 9d9fbfd409057bc3c9fe9c8d730d1b3e40d060d0 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 14 Aug 2024 10:02:34 -0400 Subject: [PATCH 002/161] Update validate integration test to dagster infra --- tests/integration/extract_test.py | 38 +++++++++++++++++++++++-------- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/tests/integration/extract_test.py b/tests/integration/extract_test.py index 7414459..495488c 100644 --- a/tests/integration/extract_test.py +++ b/tests/integration/extract_test.py @@ -1,22 +1,40 @@ """Validate basic 10k and exhibit 21 extraction.""" -import unittest - +import dotenv +from dagster import EnvVar, build_asset_context from mozilla_sec_eia.extract import ( - _get_experiment_name, _get_most_recent_run, - validate_extraction, + basic_10k_validate, ) +from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface -def test_basic_10k_extraction(test_mlflow_init_func): +def test_basic_10k_extraction(): """Run full 10k extraction on validation set and verify desired metrics are met.""" - with unittest.mock.patch("mozilla_sec_eia.extract.initialize_mlflow"): - test_mlflow_init_func() - validate_extraction("basic_10k") - run = _get_most_recent_run( - _get_experiment_name("basic_10k", experiment_suffix="validation") + dotenv.load_dotenv() + experiment_name = "basic_10k_validate_test" + 
cloud_interface = GCSArchive( + filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), + labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), + metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"), + user=EnvVar("GCS_IAM_USER"), + metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), + project=EnvVar("GCS_PROJECT"), ) + with build_asset_context( + resources={ + "basic_10k_extract_validate_mlflow": MlflowInterface( + experiment_name=experiment_name, + continue_run=False, + tracking_uri="sqlite:///:memory:", + cloud_interface=cloud_interface, + ), + "cloud_interface": cloud_interface, + } + ) as context: + basic_10k_validate(context) + run = _get_most_recent_run(experiment_name) + assert run.data.metrics["precision"] == 1 assert run.data.metrics["recall"] == 1 From ee77e7a975da0030867538b9db685bae54f07863 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 27 Aug 2024 13:56:05 -0400 Subject: [PATCH 003/161] Generalize mltools --- src/mozilla_sec_eia/assets.py | 74 +-- src/mozilla_sec_eia/basic_10k.py | 30 +- src/mozilla_sec_eia/cli.py | 90 ---- src/mozilla_sec_eia/extract.py | 461 ++++-------------- src/mozilla_sec_eia/utils/cloud.py | 52 +- .../utils/ml_tools/__init__.py | 13 + .../utils/ml_tools/experiment_tracking.py | 199 ++++++++ src/mozilla_sec_eia/utils/ml_tools/models.py | 162 ++++++ 8 files changed, 479 insertions(+), 602 deletions(-) create mode 100644 src/mozilla_sec_eia/utils/ml_tools/__init__.py create mode 100644 src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py create mode 100644 src/mozilla_sec_eia/utils/ml_tools/models.py diff --git a/src/mozilla_sec_eia/assets.py b/src/mozilla_sec_eia/assets.py index 8efd199..748690d 100644 --- a/src/mozilla_sec_eia/assets.py +++ b/src/mozilla_sec_eia/assets.py @@ -3,81 +3,15 @@ import logging import coloredlogs -from dagster import Definitions, EnvVar, define_asset_job +from dagster import Definitions -from mozilla_sec_eia.ex_21.train_extractor import train_model -from 
mozilla_sec_eia.extract import ( - ExtractConfig, - basic_10k_extract, - basic_10k_validate, - ex21_extract, - ex21_validate, -) -from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface +from mozilla_sec_eia.utils import ml_tools logger = logging.getLogger("catalystcoop") log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" coloredlogs.install(fmt=log_format, logger=logger) -extract_job = define_asset_job( - name="extract_job", - selection=[basic_10k_extract, ex21_extract], -) -validate_job = define_asset_job( - name="validate_job", - selection=[basic_10k_validate, ex21_validate], -) -finetune_model_job = define_asset_job( - name="finetune_model_job", - selection=[train_model], -) - -cloud_interface = GCSArchive( - filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), - labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), - metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"), - user=EnvVar("GCS_IAM_USER"), - metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), - project=EnvVar("GCS_PROJECT"), -) - defs = Definitions( - assets=[ - basic_10k_validate, - basic_10k_extract, - ex21_validate, - ex21_extract, - train_model, - ], - jobs=[extract_job, validate_job, finetune_model_job], - resources={ - "cloud_interface": cloud_interface, - "basic_10k_extract_config": ExtractConfig(), - "ex21_extract_config": ExtractConfig(), - "basic_10k_extract_mlflow": MlflowInterface( - experiment_name="basic_10k_extraction", - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - cloud_interface=cloud_interface, - ), - "ex21_extract_mlflow": MlflowInterface( - experiment_name="basic_10k_extraction", - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - cloud_interface=cloud_interface, - ), - "basic_10k_extract_validate_mlflow": MlflowInterface( - experiment_name="basic_10k_extraction_validation", - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - cloud_interface=cloud_interface, - ), - "ex21_extract_validate_mlflow": MlflowInterface( - 
experiment_name="basic_10k_extraction_validation", - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - cloud_interface=cloud_interface, - ), - "layoutlm_mlflow_interface": MlflowInterface( - experiment_name="/finetune-layoutlmv3", - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - cloud_interface=cloud_interface, - ), - }, + jobs=ml_tools.get_ml_model_jobs(), + resources=ml_tools.get_ml_model_resources(), ) diff --git a/src/mozilla_sec_eia/basic_10k.py b/src/mozilla_sec_eia/basic_10k.py index 9e0786e..cf04ada 100644 --- a/src/mozilla_sec_eia/basic_10k.py +++ b/src/mozilla_sec_eia/basic_10k.py @@ -1,9 +1,9 @@ """Implement functions for handling data from basic 10k filings (not exhibit 21).""" import logging -from concurrent.futures import ProcessPoolExecutor import pandas as pd +from dagster import Out, op from mozilla_sec_eia.utils.cloud import GCSArchive, Sec10K @@ -67,11 +67,10 @@ def _extract_10k(filing: Sec10K): return pd.DataFrame(values), filing.filename, unmatched_keys +@op(out={"extraction_metadata": Out(), "extracted": Out()}) def extract( + cloud_interface: GCSArchive, filings_to_extract: pd.DataFrame, - extraction_metadata: pd.DataFrame, - extracted: pd.DataFrame, - archive: GCSArchive, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Extract basic 10K data and write to postgres table. 
@@ -81,15 +80,20 @@ def extract( """ logger.info("Starting basic 10K extraction.") logger.info(f"Extracting {len(filings_to_extract)} filings.") - with ProcessPoolExecutor() as executor: - for ext, filename, unmatched_keys in executor.map( - _extract_10k, archive.iterate_filings(filings_to_extract) - ): - extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [ - len(ext) > 0, - ",".join(unmatched_keys), - ] - extracted = pd.concat([extracted, ext]) + + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + extracted = pd.DataFrame() + + for filing in cloud_interface.iterate_filings(filings_to_extract): + ext, filename, unmatched_keys = _extract_10k(filing) + extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [ + len(ext) > 0, + ",".join(unmatched_keys), + ] + extracted = pd.concat([extracted, ext]) + return ( extraction_metadata, extracted.set_index(["filename", "filer_count", "block", "block_count", "key"]), diff --git a/src/mozilla_sec_eia/cli.py b/src/mozilla_sec_eia/cli.py index 5b38a0a..2d4a84a 100755 --- a/src/mozilla_sec_eia/cli.py +++ b/src/mozilla_sec_eia/cli.py @@ -4,93 +4,3 @@ to add new scripts which can be accessed through one top-level interface. """ - -import argparse -import logging -import sys -from pathlib import Path - -import coloredlogs - -from mozilla_sec_eia.ex_21.create_labeled_dataset import ( - create_inputs_for_label_studio, -) -from mozilla_sec_eia.ex_21.rename_labeled_filings import rename_filings -from mozilla_sec_eia.ex_21.train_extractor import train_model -from mozilla_sec_eia.extract import extract_filings, validate_extraction -from mozilla_sec_eia.utils import GCSArchive - -# This is the module-level logger, for any logs -logger = logging.getLogger(__name__) -ROOT_DIR = Path(__file__).parent.parent.parent.resolve() - - -def parse_command_line(argv: list[str]) -> argparse.Namespace: - """Parse command line arguments. 
See the -h option for details. - - Args: - argv (str): Command line arguments, including caller filename. - - Returns: - dict: Dictionary of command line arguments and their parsed values. - - """ - - def formatter(prog) -> argparse.HelpFormatter: - """This is a hack to create HelpFormatter with a particular width.""" - return argparse.HelpFormatter(prog, width=88) - - # Use the module-level docstring as the script's description in the help message. - parser = argparse.ArgumentParser(description=__doc__, formatter_class=formatter) - subparsers = parser.add_subparsers(required=True) - - # Add command to validate filing archive contents - validate_parser = subparsers.add_parser("validate_archive") - validate_parser.set_defaults(func=lambda: GCSArchive().validate_archive()) - - # Add command to fine-tune ex21 extractor - validate_parser = subparsers.add_parser("finetune_ex21") - validate_parser.add_argument("--labeled-json-path") - validate_parser.add_argument("--gcs-training-data-dir", default="labeled/") - validate_parser.add_argument("--model-output-dir", default="layoutlm_trainer") - validate_parser.add_argument("--test-size", default=0.2) - validate_parser.set_defaults(func=train_model) - - # Add command to rename labeled filings on GCS - validate_parser = subparsers.add_parser("rename_filings") - validate_parser.set_defaults(func=rename_filings) - - # Add command to extract basic 10k data - extract_parser = subparsers.add_parser("extract") - extract_parser.add_argument("--dataset", nargs=1, default="basic_10k") - extract_parser.add_argument("--continue-run", action="store_true", default=False) - extract_parser.add_argument("--num-filings", default=-1, nargs="?", type=int) - extract_parser.set_defaults(func=extract_filings) - - validate_extract_parser = subparsers.add_parser("validate") - validate_extract_parser.add_argument("--dataset", nargs=1, default="basic_10k") - validate_extract_parser.set_defaults(func=validate_extraction) - - # Add command to create 
Label Studio inputs from cached Ex. 21 images and PDFs - validate_parser = subparsers.add_parser("create_ls_inputs") - validate_parser.add_argument("--pdfs-dir", default=ROOT_DIR / "sec10k_filings/pdfs") - validate_parser.add_argument("--cache-dir", default=ROOT_DIR / "sec10k_filings") - validate_parser.set_defaults(func=create_inputs_for_label_studio) - - arguments = parser.parse_args(argv[1:]) - - return arguments - - -def main() -> int: - """Demonstrate a really basic command line interface (CLI) that takes arguments.""" - logger = logging.getLogger("catalystcoop") - log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" - coloredlogs.install(fmt=log_format, logger=logger) - - args = parse_command_line(sys.argv) - return args.func(**{key: val for key, val in vars(args).items() if key != "func"}) - - -if __name__ == "__main__": - sys.exit(main()) diff --git a/src/mozilla_sec_eia/extract.py b/src/mozilla_sec_eia/extract.py index 44d598f..90d87ba 100644 --- a/src/mozilla_sec_eia/extract.py +++ b/src/mozilla_sec_eia/extract.py @@ -2,25 +2,31 @@ import io import logging +import math import tempfile -from importlib import resources from pathlib import Path import mlflow +import numpy as np import pandas as pd import pandera as pa -from dagster import ConfigurableResource, asset +from dagster import ( + Config, + DynamicOut, + DynamicOutput, + GraphDefinition, + OpDefinition, + graph, + op, +) from mlflow.entities import Run from mozilla_sec_eia import basic_10k -from mozilla_sec_eia.ex_21.inference import clean_extracted_df, perform_inference -from mozilla_sec_eia.utils.cloud import ( - GCSArchive, - get_metadata_filename, -) -from mozilla_sec_eia.utils.layoutlm import ( - load_model, +from mozilla_sec_eia.utils.cloud import GCSArchive +from mozilla_sec_eia.utils.ml_tools.experiment_tracking import ( + get_tracking_resource_name, ) +from mozilla_sec_eia.utils.ml_tools.models import pudl_model logger = logging.getLogger(f"catalystcoop.{__name__}") 
@@ -68,61 +74,81 @@ def _log_artifact_as_parquet( return mlflow.log_artifact(parquet_path, artifact_name) -def _get_most_recent_run(experiment_name: str): - """Search mlflow for most recent extraction run with specified experiment name.""" - run_metadata = mlflow.search_runs(experiment_names=[experiment_name]) +@op +def get_filings_to_extract( + cloud_interface: GCSArchive, +) -> pd.DataFrame: + """Return filing metadata.""" + return cloud_interface.get_metadata() + + +class ChunkFilingsConfig(Config): + """Configure how many filings are extracted and the chunk size for extraction.""" - # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run - # This assert will ensure this doesn't silently break if the ordering changes - assert run_metadata.loc[0, "end_time"] == run_metadata["end_time"].max() - return mlflow.get_run(run_metadata.loc[0, "run_id"]) + chunk_size: int = 1000 + num_filings: int = -1 -def _get_filings_to_extract( - experiment_name: str, +@op(out=DynamicOut()) +def chunk_filings( + config: ChunkFilingsConfig, metadata: pd.DataFrame, - continue_run: bool = False, - num_filings: int = -1, -): - """Get filings that should be extracted by run. +) -> pd.DataFrame: + """Split filings into chunks for parallel extraction.""" + filings_to_extract = metadata + if config.num_filings > 0: + filings_to_extract = filings_to_extract.sample(config.num_filings) + + for i, chunk in enumerate( + np.array_split( + filings_to_extract, math.ceil(len(filings_to_extract) / config.chunk_size) + ) + ): + yield DynamicOutput(chunk, mapping_key=str(i)) - Args: - experiment_name: Name of mlflow experiment. - metadata: Metadata for full set of filings to potentially extract. - continue_run: Whether to continue a previous extraction run. - num_filings: Number of filings to extract.
- """ - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - extracted = pd.DataFrame() - run_id = None - if continue_run: - most_recent_run = _get_most_recent_run(experiment_name) - extraction_metadata = ExtractionMetadataSchema.validate( - _load_artifact_as_csv( - most_recent_run, "/extraction_metadata.csv" - ).set_index("filename") + +def extract_model_factory( + dataset_name: str, extract_op: OpDefinition | GraphDefinition +): + """Produce a `pudl_model` to extract data from sec10k filings.""" + experiment_name = f"{dataset_name}_extraction" + experiment_tracker_resource = get_tracking_resource_name(experiment_name) + + @op(required_resource_keys=[experiment_tracker_resource]) + def log_extraction_data( + metadata: pd.DataFrame, + extraction_metadata: list[pd.DataFrame], + extracted: list[pd.DataFrame], + ): + extraction_metadata = pd.concat(extraction_metadata) + extracted = pd.concat(extracted) + # Use metadata to log generic metrics + extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) + mlflow.log_metrics( + { + "num_failed": (~extraction_metadata["success"]).sum(), + "ratio_extracted": len(extraction_metadata) / len(metadata), + } ) - extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet") - run_id = most_recent_run.info.run_id - - filings_to_extract = metadata[~metadata["filename"].isin(extraction_metadata.index)] - if num_filings > 0: - filings_to_extract = filings_to_extract.sample(num_filings) - return ( - filings_to_extract, - extraction_metadata, - extracted, - run_id, - ) + # Log the extraction results + metadata for future reference/analysis + _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv") + _log_artifact_as_parquet(extracted, "extracted.parquet") + + @pudl_model(experiment_name=experiment_name) + @graph(name=experiment_name) + def extract_filings(): + filings_to_extract = get_filings_to_extract() + 
filing_chunks = chunk_filings(filings_to_extract) + extraction_metadata, extracted = filing_chunks.map(extract_op) + + return log_extraction_data( + filings_to_extract, + extraction_metadata.collect(), + extracted.collect(), + ) -def _get_experiment_name(dataset: str, experiment_suffix: str | None = None) -> str: - experiment_name = f"{dataset}_extraction" - if experiment_suffix is not None: - experiment_name += f"_{experiment_suffix}" - return experiment_name + return extract_filings def compute_validation_metrics( @@ -165,325 +191,4 @@ def compute_validation_metrics( } -def extract_filings( - dataset: str, - filings_to_extract: pd.DataFrame, - extraction_metadata: pd.DataFrame, - extracted: pd.DataFrame, - num_filings: int, - experiment_name: str, - cloud_interface: GCSArchive, - run_id: str | None = None, -) -> pd.DataFrame: - """Extract filings in `filings_to_extract`.""" - mlflow.set_experiment(experiment_name) - with mlflow.start_run(run_id=run_id): - # Extract data for desired filings - if dataset == "basic_10k": - extraction_metadata, extracted = basic_10k.extract( - filings_to_extract, - extraction_metadata, - extracted, - cloud_interface, - ) - else: - model_checkpoint = load_model() - model = model_checkpoint["model"] - processor = model_checkpoint["tokenizer"] - # populate extraction metadata with filenames - # TODO: does extraction md already have filenames in it? check this - extraction_metadata = pd.concat( - [ - extraction_metadata, - pd.DataFrame( - { - "filename": filings_to_extract["filename"].unique(), - "success": False, - } - ).set_index("filename"), - ] - ) - # TODO: there's probably a faster way to do this with less caching - with tempfile.TemporaryDirectory() as temp_dir: - # get Sec10K objects - # TODO: does it save time if we don't cache them? 
- temp_dir = Path(temp_dir) - cloud_interface.get_filings( - filings_to_extract, cache_directory=temp_dir, cache_pdf=True - ) - _, _, extracted, extraction_metadata = perform_inference( - pdfs_dir=temp_dir, - model=model, - processor=processor, - extraction_metadata=extraction_metadata, - ) - extracted["filename"] = extracted["id"].apply(get_metadata_filename) - extracted = extracted.set_index("filename") - - # Use metadata to log generic metrics - extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) - mlflow.log_metrics( - { - "num_failed": (~extraction_metadata["success"]).sum(), - "ratio_extracted": len(extraction_metadata) / num_filings, - } - ) - - # Log the extraction results + metadata for future reference/analysis - _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv") - _log_artifact_as_parquet(extracted, "extracted.parquet") - logger.info( - f"Finished extracting {len(extraction_metadata)} filings from {dataset}." - ) - return extracted - - -class ExtractConfig(ConfigurableResource): - """Basic configuration for an extraction run.""" - - num_filings: int = -1 - - -def extract_asset_factory(dataset: str) -> asset: - """Produce asset to extract `dataset`.""" - - @asset( - name=f"{dataset}_extract", - required_resource_keys={ - f"{dataset}_extract_config", - f"{dataset}_extract_mlflow", - "cloud_interface", - }, - ) - def extract(context) -> pd.DataFrame: - config = context.resources.original_resource_dict[f"{dataset}_extract_config"] - cloud_interface: GCSArchive = context.resources.cloud_interface - mlflow_interface = context.resources.original_resource_dict[ - f"{dataset}_extract_mlflow" - ] - experiment_name = mlflow_interface.experiment_name - metadata = cloud_interface.get_metadata() - - # Get filings to extract as well as any existing metadata for run - filings_to_extract, extraction_metadata, extracted, run_id = ( - _get_filings_to_extract( - experiment_name, - metadata, - 
continue_run=mlflow_interface.continue_run, - num_filings=config.num_filings, - ) - ) - - return extract_filings( - dataset=dataset, - filings_to_extract=filings_to_extract, - extraction_metadata=extraction_metadata, - extracted=extracted, - num_filings=len(metadata), - experiment_name=experiment_name, - cloud_interface=cloud_interface, - run_id=run_id, - ) - - return extract - - -def jaccard_similarity( - computed_df: pd.DataFrame, validation_df: pd.DataFrame, value_col: str -) -> float: - """Get the Jaccard similarity between two Series. - - Calculated as the intersection of the set divided - by the union of the set. - - Args: - computed_df: Extracted data. - validation_df: Expected extraction results. - value_col: Column to calculate Jaccard similarity on. - Must be present in both dataframes. - """ - # fill nans to make similarity comparison more accurate - if (computed_df[value_col].dtype == float) and ( - validation_df[value_col].dtype == float - ): - computed_df[value_col] = computed_df[value_col].fillna(999) - validation_df[value_col] = validation_df[value_col].fillna(999) - else: - computed_df[value_col] = computed_df[value_col].fillna("zzz") - validation_df[value_col] = validation_df[value_col].fillna("zzz") - intersection = set(computed_df[value_col]).intersection( - set(validation_df[value_col]) - ) - union = set(computed_df[value_col]).union(set(validation_df[value_col])) - return float(len(intersection)) / float(len(union)) - - -def compute_ex21_validation_metrics( - computed_df: pd.DataFrame, validation_df: pd.DataFrame -): - """Compute validation metrics for Ex. 
21 extraction.""" - shared_cols = validation_df.columns.intersection(computed_df.columns) - validation_df = validation_df.astype(computed_df[shared_cols].dtypes) - n_equal = 0 - validation_filenames = validation_df["id"].unique() - n_files = len(validation_filenames) - table_metrics_dict = {} - jaccard_dict = {} - incorrect_files = [] - # iterate through each file and check each extracted table - for filename in validation_filenames: - extracted_table_df = computed_df[computed_df["id"] == filename].reset_index( - drop=True - ) - validation_table_df = validation_df[ - validation_df["id"] == filename - ].reset_index(drop=True) - # check if the tables are exactly equal - if extracted_table_df.equals(validation_table_df): - # TODO: strip llc and other company strings before comparison - n_equal += 1 - else: - incorrect_files.append(filename) - # compute precision and recall for each column - table_metrics_dict[filename] = {} - jaccard_dict[filename] = {} - for col in ["subsidiary", "loc", "own_per"]: - table_prec_recall = compute_validation_metrics( - extracted_table_df, validation_table_df, value_col=col - ) - table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[ - "precision" - ] - table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"] - # get the jaccard similarity between columns - jaccard_dict[filename][col] = jaccard_similarity( - computed_df=extracted_table_df, - validation_df=validation_table_df, - value_col=col, - ) - - jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index() - prec_recall_df = pd.DataFrame.from_dict( - table_metrics_dict, orient="index" - ).reset_index() - _log_artifact_as_csv( - jaccard_df, - artifact_name="jaccard_per_table.csv", - ) - _log_artifact_as_csv( - prec_recall_df, - artifact_name="precision_recall_per_table.csv", - ) - _log_artifact_as_csv( - pd.DataFrame({"filename": incorrect_files}), - artifact_name="incorrect_filenames.csv", - ) - return { - "table_accuracy": n_equal 
/ n_files, - "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files, - "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files, - "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files, - "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum() - / n_files, - "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files, - "avg_own_per_precision": prec_recall_df["own_per_precision"].sum() / n_files, - "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum() / n_files, - "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files, - "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files, - } - - -def clean_ex21_validation_set(validation_df: pd.DataFrame): - """Clean Ex. 21 validation data to match extracted format.""" - validation_df = validation_df.rename( - columns={ - "Filename": "id", - "Subsidiary": "subsidiary", - "Location of Incorporation": "loc", - "Ownership Percentage": "own_per", - } - ) - validation_df["own_per"] = validation_df["own_per"].astype(str) - validation_df["filename"] = validation_df["id"].apply(get_metadata_filename) - validation_df = clean_extracted_df(validation_df) - return validation_df - - -def validate_extraction( - dataset: str, experiment_name: str, cloud_interface: GCSArchive -): - """Run extraction on validation set and compare results to labeled data.""" - validation_set = pd.read_csv( - resources.files("mozilla_sec_eia.package_data") / f"{dataset}_labels.csv" - ) - if dataset == "ex21": - validation_set = clean_ex21_validation_set(validation_set) - - # Get metadata for labelled filings - to_extract = cloud_interface.get_metadata( - filenames=list(validation_set["filename"]) - ) - - # Get filings to extract as well as any existing metadata for run - filings_to_extract, extraction_metadata, extracted, run_id = ( - _get_filings_to_extract( - experiment_name, - to_extract, - ) - ) - - # Extract data from filings - extracted = 
extract_filings( - dataset=dataset, - filings_to_extract=filings_to_extract, - extraction_metadata=extraction_metadata, - extracted=extracted, - num_filings=len(to_extract), - experiment_name=experiment_name, - cloud_interface=cloud_interface, - run_id=run_id, - ) - - # Set index for validation set based on returned extracted DF - validation_set = validation_set.set_index(extracted.index.names) - - # Get extraction run from mlflow and start again to log validation metrics - run = _get_most_recent_run(experiment_name) - with mlflow.start_run(run_id=run.info.run_id): - # Compute metrics and log - if dataset == "basic_10k": - mlflow.log_metrics( - compute_validation_metrics(extracted, validation_set, "value") - ) - else: - mlflow.log_metrics( - compute_ex21_validation_metrics(extracted, validation_set) - ) - # Log validation set used to compute metrics - _log_artifact_as_csv(validation_set, "labels.csv") - - -def validate_extraction_asset_factory(dataset: str): - """Create asset that extracts validation filings and compute validation metrics.""" - - @asset( - name=f"{dataset}_extract_validate", - required_resource_keys={ - f"{dataset}_extract_validate_mlflow", - "cloud_interface", - }, - ) - def validate(context): - cloud_interface: GCSArchive = context.resources.cloud_interface - experiment_name = context.resources.original_resource_dict[ - f"{dataset}_extract_validate_mlflow" - ].experiment_name - return validate_extraction(dataset, experiment_name, cloud_interface) - - return validate - - -basic_10k_extract = extract_asset_factory("basic_10k") -basic_10k_validate = validate_extraction_asset_factory("basic_10k") -ex21_extract = extract_asset_factory("ex21") -ex21_validate = validate_extraction_asset_factory("ex21") +basic_10k_extract = extract_model_factory("basic_10k", basic_10k.extract) diff --git a/src/mozilla_sec_eia/utils/cloud.py b/src/mozilla_sec_eia/utils/cloud.py index 9248140..076a5de 100644 --- a/src/mozilla_sec_eia/utils/cloud.py +++ 
b/src/mozilla_sec_eia/utils/cloud.py @@ -3,7 +3,6 @@ import base64 import io import logging -import os import re from contextlib import contextmanager from hashlib import md5 @@ -11,11 +10,10 @@ from typing import BinaryIO, TextIO import fitz -import mlflow import pandas as pd import pg8000 from dagster import ConfigurableResource -from google.cloud import secretmanager, storage +from google.cloud import storage from google.cloud.sql.connector import Connector from PIL import Image from pydantic import BaseModel, PrivateAttr @@ -400,51 +398,3 @@ def validate_archive(self) -> bool: def get_metadata_filename(local_filename: str): """Transform a local filename into the filename in GCSArchiver metadata.""" return "edgar/data/" + local_filename.replace("-", "/", 1) + ".txt" - - -def _access_secret_version(secret_id: str, project_id: str, version_id="latest"): - # Create the Secret Manager client. - client = secretmanager.SecretManagerServiceClient() - - # Build the resource name of the secret version. - name = f"projects/{project_id}/secrets/{secret_id}/versions/{version_id}" - - # Access the secret version. - response = client.access_secret_version(name=name) - - # Return the decoded payload. 
- return response.payload.data.decode("UTF-8") - - -class MlflowInterface(ConfigurableResource): - """Initialize interface to mlflow for desired experiment.""" - - experiment_name: str - continue_run: bool = False - tracking_uri: str - cloud_interface: GCSArchive - artifact_location: str | None = None - - def setup_for_execution(self, context): - """Do runtime configuration of mlflow.""" - os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" - os.environ["MLFLOW_TRACKING_PASSWORD"] = _access_secret_version( - "mlflow_admin_password", self.cloud_interface.project - ) - os.environ["MLFLOW_TRACKING_URI"] = self.tracking_uri - os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520" - os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520" - os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900" - os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true" - logger.info(f"Initialized tracking with mlflow server: {self.tracking_uri}") - - self.create_experiment() - - def create_experiment(self): - """Create experiment if it doesn't already exist.""" - logger.info(f"Creating experiment: {self.experiment_name}") - if not mlflow.get_experiment_by_name(self.experiment_name): - mlflow.create_experiment( - name=self.experiment_name, - artifact_location=self.artifact_location, - ) diff --git a/src/mozilla_sec_eia/utils/ml_tools/__init__.py b/src/mozilla_sec_eia/utils/ml_tools/__init__.py new file mode 100644 index 0000000..f3448ee --- /dev/null +++ b/src/mozilla_sec_eia/utils/ml_tools/__init__.py @@ -0,0 +1,13 @@ +"""Implements shared tooling for machine learning models in PUDL.""" + +from . 
import models + + +def get_ml_model_resources(): + """Return default configuration for all PUDL models.""" + return models.MODEL_RESOURCES + + +def get_ml_model_jobs() -> list[str]: + """Return all jobs created through `pudl_model` decorator.""" + return list(models.PUDL_MODELS.values()) diff --git a/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py b/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py new file mode 100644 index 0000000..394b8b1 --- /dev/null +++ b/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py @@ -0,0 +1,199 @@ +"""This module implements experiment tracking tooling using mlflow as a backend. + +:class:`ExperimentTracker`'s are created using an op factory :func:`experiment_tracker_factory` +and can be passed around to op's which make up a PUDL model. This class will maintain +state between ops, ensuring that all parameters and metrics are logged to the appropriate +mlflow run. The following command will launch the mlflow UI to view model results: +`mlflow ui --backend-store-uri {tracking_uri}`. `tracking_uri` by default will point +to a file named 'experiments.sqlite' in the base directory of your PUDL repo, but +this is a configurable value, which can be found in the dagster UI. +""" + +import atexit +import logging +import os +from contextlib import contextmanager + +import mlflow +from dagster import ConfigurableResource, InitResourceContext, op +from google.cloud import secretmanager + +logger = logging.getLogger(f"catalystcoop.{__name__}") + + +def _flatten_model_config(model_config: dict) -> dict: + """Take nested dictionary defining model config and flatten for logging purposes. + + This is essentially a translation layer between Dagster configuration and mlflow, + which does not support displaying nested parameters in the UI. + + Examples: + >>> _flatten_model_config( + ... { + ... 'ferc_to_ferc': { + ... 'link_ids_cross_year': { + ... 'compute_distance_matrix': { + ... 'distance_threshold': .5, + ... 
'metric': 'euclidean', + ... }, + ... 'match_orphaned_records': {'distance_threshold': 0.5}, + ... } + ... } + ... } + ... ) == { + ... 'ferc_to_ferc.link_ids_cross_year.compute_distance_matrix.distance_threshold': 0.5, + ... 'ferc_to_ferc.link_ids_cross_year.compute_distance_matrix.metric': 'euclidean', + ... 'ferc_to_ferc.link_ids_cross_year.match_orphaned_records.distance_threshold': 0.5 + ... } + True + """ + + def _flatten_level(config_level: dict, param_name: str): + flattened_dict = {} + for key, val in config_level.items(): + flattened_param = f"{param_name}.{key}" + if isinstance(val, dict): + flattened_dict |= _flatten_level(val, param_name=flattened_param) + else: + flattened_dict[flattened_param[1:]] = val + return flattened_dict + + return _flatten_level(model_config, "") + + +class ExperimentTracker(ConfigurableResource): + """Class to manage tracking a machine learning model using MLflow. + + The following command will launch the mlflow UI to view model results: + `mlflow ui --backend-store-uri {tracking_uri}`. From here, you can compare metrics + from multiple runs, and track performance. + + This class is designed to be created using the `op` :func:`create_experiment_tracker`. + This allows the `ExperimentTracker` to be passed around within a Dagster `graph`, + and be used for mlflow logging in any of the `op`'s that make up the `graph`. This + is useful because Dagster executes `op`'s in separate processes, while mlflow does + not maintain state between processes. This design also allows configuration of + the ExperimentTracker to be set from the Dagster UI. + + Currently, we are only doing experiment tracking in a local context, but if we were + to setup a tracking server, we could point the `tracking_uri` at this remote server + without having to modify the models. Experiment tracking can also be done outside + of the PUDL context. 
If doing exploratory work in a notebook, you can use mlflow + directly in a notebook with the same experiment name used here, and mlflow will + seamlessly integrate the results with those from PUDL runs. + """ + + tracking_uri: str + tracking_enabled: bool = True + artifact_location: str | None = None + experiment_name: str + tags: dict = {} + project: str + + @contextmanager + def yield_for_execution( + self, + context: InitResourceContext, + ) -> "ExperimentTracker": + """Create experiment tracker for specified experiment.""" + if self.tracking_enabled: + self._configure_mlflow() + + # Get run_id associated with current dagster run + experiment_id = self.get_or_create_experiment( + experiment_name=self.experiment_name, + artifact_location=self.artifact_location, + ) + mlflow_run_id = self._get_mlflow_run_id(context.run_id, experiment_id) + + # Hack to stop mlflow from ending run at process barrier + # This is borrowed from the official dagster mlflow resource found here: + # https://github.com/dagster-io/dagster/blob/master/python_modules/libraries/dagster-mlflow/dagster_mlflow/resources.py + atexit.unregister(mlflow.end_run) + + # Create new run under specified experiment + with mlflow.start_run( + run_id=mlflow_run_id, + experiment_id=experiment_id, + tags=self.tags | {"dagster_run_id": context.run_id}, + ): + yield self + + def _get_tracking_password(self, version_id: str = "latest"): + """Get tracking server password from gcloud secrets.""" + # Create the Secret Manager client. + client = secretmanager.SecretManagerServiceClient() + + # Build the resource name of the secret version. + name = f"projects/{self.project}/secrets/mlflow_admin_password/versions/{version_id}" + + # Access the secret version. + response = client.access_secret_version(name=name) + + # Return the decoded payload. 
+ return response.payload.data.decode("UTF-8") + + def _configure_mlflow(self): + """Do runtime configuration of mlflow.""" + os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" + os.environ["MLFLOW_TRACKING_PASSWORD"] = self._get_tracking_password() + os.environ["MLFLOW_TRACKING_URI"] = self.tracking_uri + os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520" + os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520" + os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900" + os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true" + + def _get_mlflow_run_id(self, dagster_run_id: str, experiment_id: str): + """Search for existing run tagged with dagster run id or start new run.""" + run_df = mlflow.search_runs( + experiment_ids=[experiment_id], + filter_string=f"tags.dagster_run_id='{dagster_run_id}'", + ) + + run_id = None + if not run_df.empty: + run_id = run_df.loc[0, "run_id"] + return run_id + + @staticmethod + def get_or_create_experiment( + experiment_name: str, artifact_location: str = "" + ) -> str: + """Retrieve the ID of an existing MLflow experiment or create a new one if it doesn't exist. + + This function checks if an experiment with the given name exists within MLflow. + If it does, the function returns its ID. If not, it creates a new experiment + with the provided name and returns its ID. + + Returns: + ID of the existing or newly created MLflow experiment. 
+ """ + if experiment := mlflow.get_experiment_by_name(experiment_name): + experiment_id = experiment.experiment_id + else: + experiment_id = mlflow.create_experiment( + experiment_name, artifact_location=artifact_location + ) + + return experiment_id + + +def get_tracking_resource_name(experiment_name: str): + """Return expected name of experiment tracking resource given experiment name.""" + return f"{experiment_name}_tracker" + + +def experiment_tracker_teardown_factory( + experiment_name: str, +) -> ExperimentTracker: + """Use config to create an experiment tracker.""" + atexit.unregister(mlflow.end_run) + + @op( + name=f"{experiment_name}_tracker_teardown", + required_resource_keys=[f"{experiment_name}_tracker"], + ) + def teardown_experiment_tracker(_results): + mlflow.end_run() + + return teardown_experiment_tracker diff --git a/src/mozilla_sec_eia/utils/ml_tools/models.py b/src/mozilla_sec_eia/utils/ml_tools/models.py new file mode 100644 index 0000000..606bdc7 --- /dev/null +++ b/src/mozilla_sec_eia/utils/ml_tools/models.py @@ -0,0 +1,162 @@ +"""Provides tooling for developing/tracking ml models within PUDL. + +The main interface from this module is the :func:`pudl_model` decorator, which +is meant to be applied to a dagster `graph`. This decorator will handle finding all +configuration for a model/passing configuration to dagster, creating an +:class:`ExperimentTracker` for the model, and ultimately will return a `job` +from the model. + +There are a few different ways to provide configuration for a PUDL model. First, configuration will come from default values for any dagster `Config`'s which are associated +with `op`'s which make up the model `graph`. For more info on dagster configuration, +see https://docs.dagster.io/concepts/configuration/config-schema. The next way to +provide configuration is through the yaml file: `pudl.package_data.settings.pudl_models.yml`. 
+Any configuration in this file should be follow dagster's config-schema formatting, +see the `ferc_to_ferc` entry as an example. Configuration provided this way will +override any default values. The final way to provide configuration is through the +dagster UI. To provide configuration this way, click `Open Launchpad` in the UI, and +values can be edited here. This configuration will override both default values and +yaml configuration, but will only be used for a single run. +""" + +import importlib +import logging + +import mlflow +import yaml +from dagster import ( + EnvVar, + GraphDefinition, + HookContext, + JobDefinition, + OpDefinition, + RunConfig, + job, + op, + success_hook, +) + +from mozilla_sec_eia.utils import GCSArchive + +from .experiment_tracking import ( + ExperimentTracker, + experiment_tracker_teardown_factory, + get_tracking_resource_name, +) + +logger = logging.getLogger(f"catalystcoop.{__name__}") +MODEL_RESOURCES = {} +PUDL_MODELS = {} + + +def get_yml_config(experiment_name: str) -> dict: + """Load model configuration from yaml file.""" + config_file = ( + importlib.resources.files("pudl.package_data.settings") / "pudl_models.yml" + ) + config = yaml.safe_load(config_file.open("r")) + + if not (model_config := config.get(experiment_name)): + raise RuntimeError(f"No {experiment_name} entry in {config_file}") + + return {experiment_name: model_config} + + +def get_default_config(model_graph: GraphDefinition) -> dict: + """Get default config values for model.""" + + def _get_default_from_ops(node: OpDefinition | GraphDefinition): + config = {} + if isinstance(node, GraphDefinition): + config = { + "ops": { + child_node.name: _get_default_from_ops(child_node) + for child_node in node.node_defs + } + } + else: + if node.config_schema.default_provided: + config = {"config": node.config_schema.default_value} + else: + config = {"config": None} + + return config + + config = {model_graph.name: _get_default_from_ops(model_graph)} + return config + 
+ +def get_pudl_model_job_name(experiment_name: str) -> str: + """Return expected pudl model job name based on experiment_name.""" + return f"{experiment_name}_job" + + +def pudl_model(experiment_name: str, config_from_yaml: bool = False) -> JobDefinition: + """Decorator for an ML model that will handle providing configuration to dagster.""" + + def _decorator(model_graph: GraphDefinition): + model_config = get_default_config(model_graph) + if config_from_yaml: + model_config |= get_yml_config(model_graph.name) + + # Add resources to resource dict + cloud_interface = GCSArchive( + filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), + labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), + metadata_db_instance_connection=EnvVar( + "GCS_METADATA_DB_INSTANCE_CONNECTION" + ), + user=EnvVar("GCS_IAM_USER"), + metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), + project=EnvVar("GCS_PROJECT"), + ) + MODEL_RESOURCES.update( + { + get_tracking_resource_name(experiment_name): ExperimentTracker( + experiment_name=experiment_name, + tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), + project=EnvVar("GCS_PROJECT"), + ), + "cloud_interface": cloud_interface, + } + ) + + default_config = RunConfig( + ops=model_config, + ) + + @op + def _collect_results(model_graph_output, _implicit_dependencies: list): + return model_graph_output + + @success_hook( + required_resource_keys={get_tracking_resource_name(experiment_name)} + ) + def _log_config_hook(context: HookContext): + if (config := context.op_config) is not None: + mlflow.log_params( + { + f"{context.op.name}.{param}": value + for param, value in config.items() + } + ) + + @job( + name=get_pudl_model_job_name(experiment_name), + config=default_config, + hooks={_log_config_hook}, + ) + def model_asset(**kwargs): + tracker_teardown = experiment_tracker_teardown_factory( + experiment_name=model_graph.name, + ) + graph_output = model_graph(**kwargs) + + # Pass output to teardown to create a dependency + teardown = 
tracker_teardown(graph_output) + + _collect_results(graph_output, [teardown]) + + PUDL_MODELS[get_pudl_model_job_name(experiment_name)] + return model_asset + + return _decorator From 53d33545dc041463fe90926d03a5dc61e8e45e50 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 09:44:57 -0400 Subject: [PATCH 004/161] Reorg repo to move towards generalized modelling repo --- src/mozilla_sec_eia/__init__.py | 5 +- .../{utils => library}/ml_tools/__init__.py | 0 .../ml_tools/experiment_tracking.py | 15 ++++ .../{utils => library}/ml_tools/models.py | 23 ++--- .../{assets.py => model_jobs.py} | 2 +- src/mozilla_sec_eia/models/__init__.py | 3 + src/mozilla_sec_eia/models/sec10k/__init__.py | 1 + .../{ => models/sec10k}/basic_10k.py | 2 +- .../{ => models/sec10k}/ex_21/__init__.py | 0 .../sec10k}/ex_21/create_labeled_dataset.py | 6 +- .../{ => models/sec10k}/ex_21/inference.py | 18 ++-- .../sec10k}/ex_21/rename_labeled_filings.py | 2 +- .../sec10k}/ex_21/train_extractor.py | 10 +-- .../{ => models/sec10k}/extract.py | 72 ++++++++++++++-- .../{ => models/sec10k}/utils/__init__.py | 0 .../{ => models/sec10k}/utils/cloud.py | 14 ++- .../{ => models/sec10k}/utils/db_metadata.py | 0 .../{ => models/sec10k}/utils/layoutlm.py | 0 .../{ => models/sec10k}/utils/pdf.py | 0 tests/conftest.py | 25 ++++++ .../{ => models/sec10k}/extract_test.py | 0 .../{ => models/sec10k}/ex21_model_test.py | 6 +- .../unit/{ => models/sec10k}/extract_test.py | 85 +++++++++---------- tests/unit/{ => models/sec10k}/utils_test.py | 17 ++-- 24 files changed, 201 insertions(+), 105 deletions(-) rename src/mozilla_sec_eia/{utils => library}/ml_tools/__init__.py (100%) rename src/mozilla_sec_eia/{utils => library}/ml_tools/experiment_tracking.py (92%) rename src/mozilla_sec_eia/{utils => library}/ml_tools/models.py (87%) rename src/mozilla_sec_eia/{assets.py => model_jobs.py} (89%) create mode 100644 src/mozilla_sec_eia/models/__init__.py create mode 100644 
src/mozilla_sec_eia/models/sec10k/__init__.py rename src/mozilla_sec_eia/{ => models/sec10k}/basic_10k.py (98%) rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/__init__.py (100%) rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/create_labeled_dataset.py (98%) rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/inference.py (98%) rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/rename_labeled_filings.py (98%) rename src/mozilla_sec_eia/{ => models/sec10k}/ex_21/train_extractor.py (94%) rename src/mozilla_sec_eia/{ => models/sec10k}/extract.py (69%) rename src/mozilla_sec_eia/{ => models/sec10k}/utils/__init__.py (100%) rename src/mozilla_sec_eia/{ => models/sec10k}/utils/cloud.py (96%) rename src/mozilla_sec_eia/{ => models/sec10k}/utils/db_metadata.py (100%) rename src/mozilla_sec_eia/{ => models/sec10k}/utils/layoutlm.py (100%) rename src/mozilla_sec_eia/{ => models/sec10k}/utils/pdf.py (100%) rename tests/integration/{ => models/sec10k}/extract_test.py (100%) rename tests/unit/{ => models/sec10k}/ex21_model_test.py (85%) rename tests/unit/{ => models/sec10k}/extract_test.py (68%) rename tests/unit/{ => models/sec10k}/utils_test.py (89%) diff --git a/src/mozilla_sec_eia/__init__.py b/src/mozilla_sec_eia/__init__.py index 40ae3f8..74617af 100644 --- a/src/mozilla_sec_eia/__init__.py +++ b/src/mozilla_sec_eia/__init__.py @@ -1,14 +1,13 @@ """A template repository for a Python package created by Catalyst Cooperative.""" import logging -from pathlib import Path import pkg_resources # In order for the package modules to be available when you import the package, # they need to be imported here somehow. Not sure if this is best practice though. 
-import mozilla_sec_eia.cli -import mozilla_sec_eia.utils # noqa: F401 +import mozilla_sec_eia.library +import mozilla_sec_eia.models __author__ = "Catalyst Cooperative" __contact__ = "pudl@catalyst.coop" diff --git a/src/mozilla_sec_eia/utils/ml_tools/__init__.py b/src/mozilla_sec_eia/library/ml_tools/__init__.py similarity index 100% rename from src/mozilla_sec_eia/utils/ml_tools/__init__.py rename to src/mozilla_sec_eia/library/ml_tools/__init__.py diff --git a/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py b/src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py similarity index 92% rename from src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py rename to src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py index 394b8b1..adb3740 100644 --- a/src/mozilla_sec_eia/utils/ml_tools/experiment_tracking.py +++ b/src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py @@ -197,3 +197,18 @@ def teardown_experiment_tracker(_results): mlflow.end_run() return teardown_experiment_tracker + + +def get_most_recent_run( + experiment_name: str, dagster_run_id: str +) -> mlflow.entities.Run: + """Search mlflow for most recent extraction run with specified experiment name.""" + run_metadata = mlflow.search_runs( + experiment_names=[experiment_name], + filter_string=f"tags.dagster_run_id!='{dagster_run_id}'", + ) + + # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run + # This assert will ensure this doesn't silently break if the ordering changes + assert run_metadata.loc[0, "end_time"] == run_metadata["end_time"].max() + return mlflow.get_run(run_metadata.loc[0, "run_id"]) diff --git a/src/mozilla_sec_eia/utils/ml_tools/models.py b/src/mozilla_sec_eia/library/ml_tools/models.py similarity index 87% rename from src/mozilla_sec_eia/utils/ml_tools/models.py rename to src/mozilla_sec_eia/library/ml_tools/models.py index 606bdc7..139066f 100644 --- a/src/mozilla_sec_eia/utils/ml_tools/models.py +++ 
b/src/mozilla_sec_eia/library/ml_tools/models.py @@ -29,14 +29,13 @@ HookContext, JobDefinition, OpDefinition, + ResourceDefinition, RunConfig, job, op, success_hook, ) -from mozilla_sec_eia.utils import GCSArchive - from .experiment_tracking import ( ExperimentTracker, experiment_tracker_teardown_factory, @@ -90,7 +89,11 @@ def get_pudl_model_job_name(experiment_name: str) -> str: return f"{experiment_name}_job" -def pudl_model(experiment_name: str, config_from_yaml: bool = False) -> JobDefinition: +def pudl_model( + experiment_name: str, + resources: dict[str, ResourceDefinition] = {}, + config_from_yaml: bool = False, +) -> JobDefinition: """Decorator for an ML model that will handle providing configuration to dagster.""" def _decorator(model_graph: GraphDefinition): @@ -99,16 +102,6 @@ def _decorator(model_graph: GraphDefinition): model_config |= get_yml_config(model_graph.name) # Add resources to resource dict - cloud_interface = GCSArchive( - filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), - labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), - metadata_db_instance_connection=EnvVar( - "GCS_METADATA_DB_INSTANCE_CONNECTION" - ), - user=EnvVar("GCS_IAM_USER"), - metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), - project=EnvVar("GCS_PROJECT"), - ) MODEL_RESOURCES.update( { get_tracking_resource_name(experiment_name): ExperimentTracker( @@ -116,8 +109,8 @@ def _decorator(model_graph: GraphDefinition): tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), project=EnvVar("GCS_PROJECT"), ), - "cloud_interface": cloud_interface, } + | resources ) default_config = RunConfig( @@ -156,7 +149,7 @@ def model_asset(**kwargs): _collect_results(graph_output, [teardown]) - PUDL_MODELS[get_pudl_model_job_name(experiment_name)] + PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_asset return model_asset return _decorator diff --git a/src/mozilla_sec_eia/assets.py b/src/mozilla_sec_eia/model_jobs.py similarity index 89% rename from src/mozilla_sec_eia/assets.py rename 
to src/mozilla_sec_eia/model_jobs.py index 748690d..9e5a5b3 100644 --- a/src/mozilla_sec_eia/assets.py +++ b/src/mozilla_sec_eia/model_jobs.py @@ -5,7 +5,7 @@ import coloredlogs from dagster import Definitions -from mozilla_sec_eia.utils import ml_tools +from mozilla_sec_eia.library import ml_tools logger = logging.getLogger("catalystcoop") log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" diff --git a/src/mozilla_sec_eia/models/__init__.py b/src/mozilla_sec_eia/models/__init__.py new file mode 100644 index 0000000..ba3ac6a --- /dev/null +++ b/src/mozilla_sec_eia/models/__init__.py @@ -0,0 +1,3 @@ +"""Implement specific PUDL models in this module.""" + +from .sec10k import extract diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py new file mode 100644 index 0000000..001c6ad --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -0,0 +1 @@ +"""Implement models to extract data from SEC10k filings.""" diff --git a/src/mozilla_sec_eia/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py similarity index 98% rename from src/mozilla_sec_eia/basic_10k.py rename to src/mozilla_sec_eia/models/sec10k/basic_10k.py index cf04ada..e5b5f72 100644 --- a/src/mozilla_sec_eia/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -5,7 +5,7 @@ import pandas as pd from dagster import Out, op -from mozilla_sec_eia.utils.cloud import GCSArchive, Sec10K +from .utils.cloud import GCSArchive, Sec10K logger = logging.getLogger(f"catalystcoop.{__name__}") EXPERIMENT_NAME = "basic_10k_extraction" diff --git a/src/mozilla_sec_eia/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py similarity index 100% rename from src/mozilla_sec_eia/ex_21/__init__.py rename to src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py diff --git a/src/mozilla_sec_eia/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py similarity 
index 98% rename from src/mozilla_sec_eia/ex_21/create_labeled_dataset.py rename to src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py index 8353dca..55e1d5a 100644 --- a/src/mozilla_sec_eia/ex_21/create_labeled_dataset.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py @@ -7,9 +7,9 @@ import pandas as pd -from mozilla_sec_eia.utils.cloud import GCSArchive -from mozilla_sec_eia.utils.layoutlm import normalize_bboxes -from mozilla_sec_eia.utils.pdf import ( +from ..utils.cloud import GCSArchive +from ..utils.layoutlm import normalize_bboxes +from ..utils.pdf import ( get_pdf_data_from_path, pil_to_cv2, render_page, diff --git a/src/mozilla_sec_eia/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py similarity index 98% rename from src/mozilla_sec_eia/ex_21/inference.py rename to src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index ce1926f..2016630 100644 --- a/src/mozilla_sec_eia/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -16,21 +16,21 @@ ) from transformers.tokenization_utils_base import BatchEncoding -from mozilla_sec_eia.ex_21.create_labeled_dataset import ( - BBOX_COLS_PDF, - format_label_studio_output, - get_image_dict, -) -from mozilla_sec_eia.ex_21.train_extractor import BBOX_COLS, LABELS -from mozilla_sec_eia.utils.cloud import get_metadata_filename -from mozilla_sec_eia.utils.layoutlm import ( +from ..utils.cloud import get_metadata_filename +from ..utils.layoutlm import ( get_id_label_conversions, iob_to_label, normalize_bboxes, ) -from mozilla_sec_eia.utils.pdf import ( +from ..utils.pdf import ( get_pdf_data_from_path, ) +from .create_labeled_dataset import ( + BBOX_COLS_PDF, + format_label_studio_output, + get_image_dict, +) +from .train_extractor import BBOX_COLS, LABELS # When handling multi page documents LayoutLM uses a sliding 'frame' # with some overlap between frames. 
The overlap creates multiple diff --git a/src/mozilla_sec_eia/ex_21/rename_labeled_filings.py b/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py similarity index 98% rename from src/mozilla_sec_eia/ex_21/rename_labeled_filings.py rename to src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py index 9c7e4e5..182dd04 100644 --- a/src/mozilla_sec_eia/ex_21/rename_labeled_filings.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py @@ -5,7 +5,7 @@ import pandas as pd -from mozilla_sec_eia.utils.cloud import GCSArchive +from ..utils.cloud import GCSArchive logger = logging.getLogger(f"catalystcoop.{__name__}") diff --git a/src/mozilla_sec_eia/ex_21/train_extractor.py b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py similarity index 94% rename from src/mozilla_sec_eia/ex_21/train_extractor.py rename to src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py index e771fbb..53ed85e 100644 --- a/src/mozilla_sec_eia/ex_21/train_extractor.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py @@ -9,7 +9,7 @@ import mlflow import numpy as np -from dagster import Config, asset +from dagster import Config from datasets import ( Array2D, Array3D, @@ -27,9 +27,8 @@ ) from transformers.data.data_collator import default_data_collator -from mozilla_sec_eia.ex_21.create_labeled_dataset import format_as_ner_annotations -from mozilla_sec_eia.utils.cloud import MlflowInterface -from mozilla_sec_eia.utils.layoutlm import get_id_label_conversions, log_model +from ..utils.layoutlm import get_id_label_conversions, log_model +from .create_labeled_dataset import format_as_ner_annotations LABELS = [ "O", @@ -144,10 +143,9 @@ class FineTuneConfig(Config): test_size: float = 0.2 -@asset def train_model( config: FineTuneConfig, - layoutlm_mlflow_interface: MlflowInterface, + layoutlm_mlflow_interface, ): """Train LayoutLM model with labeled data.""" # Prepare model diff --git a/src/mozilla_sec_eia/extract.py 
b/src/mozilla_sec_eia/models/sec10k/extract.py similarity index 69% rename from src/mozilla_sec_eia/extract.py rename to src/mozilla_sec_eia/models/sec10k/extract.py index 90d87ba..602e57a 100644 --- a/src/mozilla_sec_eia/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -16,17 +16,21 @@ DynamicOutput, GraphDefinition, OpDefinition, + OpExecutionContext, + Out, graph, op, ) from mlflow.entities import Run -from mozilla_sec_eia import basic_10k -from mozilla_sec_eia.utils.cloud import GCSArchive -from mozilla_sec_eia.utils.ml_tools.experiment_tracking import ( +from mozilla_sec_eia.library.ml_tools.experiment_tracking import ( + get_most_recent_run, get_tracking_resource_name, ) -from mozilla_sec_eia.utils.ml_tools.models import pudl_model +from mozilla_sec_eia.library.ml_tools.models import pudl_model + +from . import basic_10k +from .utils.cloud import GCSArchive, cloud_interface_resource logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -107,6 +111,12 @@ def chunk_filings( yield DynamicOutput(chunk, mapping_key=str(i)) +class GetMostRecentRunResultsConfig(Config): + """Configuration specifying whether to get run results and continue.""" + + continue_run: bool = False + + def extract_model_factory( dataset_name: str, extract_op: OpDefinition | GraphDefinition ): @@ -114,14 +124,50 @@ def extract_model_factory( experiment_name = f"{dataset_name}_extraction" experiment_tracker_resource = get_tracking_resource_name(experiment_name) + @op( + required_resource_keys=[experiment_tracker_resource], + out={ + "extraction_metadata": Out(), + "extracted": Out(), + "filings_to_extract": Out(), + }, + ) + def get_most_recent_run_results( + context: OpExecutionContext, + config: GetMostRecentRunResultsConfig, + filings_to_extract: pd.DataFrame, + ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + extracted = 
pd.DataFrame() + + if config.continue_run: + most_recent_run = get_most_recent_run(experiment_name, context.run_id) + extraction_metadata = ExtractionMetadataSchema.validate( + _load_artifact_as_csv( + most_recent_run, "/extraction_metadata.csv" + ).set_index("filename") + ) + extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet") + filings_to_extract = filings_to_extract[ + ~filings_to_extract["filename"].isin(extraction_metadata.index) + ] + + return extraction_metadata, extracted, filings_to_extract + @op(required_resource_keys=[experiment_tracker_resource]) def log_extraction_data( metadata: pd.DataFrame, extraction_metadata: list[pd.DataFrame], extracted: list[pd.DataFrame], + previous_run_extraction_metadata: pd.DataFrame, + previous_run_extracted_data: pd.DataFrame, ): - extraction_metadata = pd.concat(extraction_metadata) - extracted = pd.concat(extracted) + extraction_metadata = pd.concat( + extraction_metadata + [previous_run_extraction_metadata] + ) + extracted = pd.concat(extracted + [previous_run_extracted_data]) # Use metadata to log generic metrics extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) mlflow.log_metrics( @@ -135,17 +181,25 @@ def log_extraction_data( _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv") _log_artifact_as_parquet(extracted, "extracted.parquet") - @pudl_model(experiment_name=experiment_name) + @pudl_model( + experiment_name=experiment_name, + resources={"cloud_interface": cloud_interface_resource}, + ) @graph(name=experiment_name) def extract_filings(): - filings_to_extract = get_filings_to_extract() + metadata = get_filings_to_extract() + previous_extraction_metadata, previous_extracted, filings_to_extract = ( + get_most_recent_run_results(metadata) + ) filing_chunks = chunk_filings(filings_to_extract) extraction_metadata, extracted = filing_chunks.map(extract_op) return log_extraction_data( - filings_to_extract, + metadata, extraction_metadata.collect(), 
extracted.collect(), + previous_extraction_metadata, + previous_extracted, ) return extract_filings diff --git a/src/mozilla_sec_eia/utils/__init__.py b/src/mozilla_sec_eia/models/sec10k/utils/__init__.py similarity index 100% rename from src/mozilla_sec_eia/utils/__init__.py rename to src/mozilla_sec_eia/models/sec10k/utils/__init__.py diff --git a/src/mozilla_sec_eia/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py similarity index 96% rename from src/mozilla_sec_eia/utils/cloud.py rename to src/mozilla_sec_eia/models/sec10k/utils/cloud.py index 076a5de..1232f45 100644 --- a/src/mozilla_sec_eia/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -12,7 +12,7 @@ import fitz import pandas as pd import pg8000 -from dagster import ConfigurableResource +from dagster import ConfigurableResource, EnvVar from google.cloud import storage from google.cloud.sql.connector import Connector from PIL import Image @@ -21,7 +21,7 @@ from sqlalchemy.orm import Session from xhtml2pdf import pisa -from mozilla_sec_eia.utils.db_metadata import Base, Sec10kMetadata +from .db_metadata import Base, Sec10kMetadata logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -398,3 +398,13 @@ def validate_archive(self) -> bool: def get_metadata_filename(local_filename: str): """Transform a local filename into the filename in GCSArchiver metadata.""" return "edgar/data/" + local_filename.replace("-", "/", 1) + ".txt" + + +cloud_interface_resource = GCSArchive( + filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), + labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), + metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"), + user=EnvVar("GCS_IAM_USER"), + metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), + project=EnvVar("GCS_PROJECT"), +) diff --git a/src/mozilla_sec_eia/utils/db_metadata.py b/src/mozilla_sec_eia/models/sec10k/utils/db_metadata.py similarity index 100% rename from src/mozilla_sec_eia/utils/db_metadata.py rename 
to src/mozilla_sec_eia/models/sec10k/utils/db_metadata.py diff --git a/src/mozilla_sec_eia/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py similarity index 100% rename from src/mozilla_sec_eia/utils/layoutlm.py rename to src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py diff --git a/src/mozilla_sec_eia/utils/pdf.py b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py similarity index 100% rename from src/mozilla_sec_eia/utils/pdf.py rename to src/mozilla_sec_eia/models/sec10k/utils/pdf.py diff --git a/tests/conftest.py b/tests/conftest.py index 6db667b..7fafe27 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,6 +4,7 @@ from pathlib import Path import pytest +from mozilla_sec_eia.library.ml_tools.experiment_tracking import ExperimentTracker logger = logging.getLogger(__name__) @@ -33,3 +34,27 @@ def test_dir() -> Path: Mostly this is meant as an example of a fixture. """ return Path(__file__).parent + + +class TestTracker(ExperimentTracker): + """Create sub-class of `ExperimentTracker` to use in testing context. + + Test class creates an in-memory sqlite db for tracking, and a temporary directory + for artifact storage. 
+ """ + + def _get_tracking_password(self): + return "password" + + +@pytest.fixture +def test_tracker_factory(tmp_path): + def factory(experiment_name: str) -> TestTracker: + return TestTracker( + artifact_location=str(tmp_path), + tracking_uri="sqlite:///:memory:", + experiment_name=experiment_name, + project="", + ) + + return factory diff --git a/tests/integration/extract_test.py b/tests/integration/models/sec10k/extract_test.py similarity index 100% rename from tests/integration/extract_test.py rename to tests/integration/models/sec10k/extract_test.py diff --git a/tests/unit/ex21_model_test.py b/tests/unit/models/sec10k/ex21_model_test.py similarity index 85% rename from tests/unit/ex21_model_test.py rename to tests/unit/models/sec10k/ex21_model_test.py index 30f10a5..2953ff8 100644 --- a/tests/unit/ex21_model_test.py +++ b/tests/unit/models/sec10k/ex21_model_test.py @@ -1,9 +1,9 @@ """Unit tests for the LayoutLM model and table extractor.""" import torch -from mozilla_sec_eia.ex_21.inference import get_flattened_mode_predictions -from mozilla_sec_eia.ex_21.train_extractor import LABELS -from mozilla_sec_eia.utils.layoutlm import get_id_label_conversions +from mozilla_sec_eia.models.sec10k.ex_21.inference import get_flattened_mode_predictions +from mozilla_sec_eia.models.sec10k.ex_21.train_extractor import LABELS +from mozilla_sec_eia.models.sec10k.utils.layoutlm import get_id_label_conversions def test_bbox_overlap_prediction_tie_break(): diff --git a/tests/unit/extract_test.py b/tests/unit/models/sec10k/extract_test.py similarity index 68% rename from tests/unit/extract_test.py rename to tests/unit/models/sec10k/extract_test.py index 4441c2b..5ee7086 100644 --- a/tests/unit/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -1,18 +1,20 @@ """Test extraction tools/methods.""" import logging -import unittest import pandas as pd import pytest -from dagster import build_asset_context -from mozilla_sec_eia.extract import ( - ExtractConfig, - 
_get_most_recent_run, - basic_10k_extract, +from dagster import Out, op +from mozilla_sec_eia.library.ml_tools.experiment_tracking import ( + get_most_recent_run, + get_tracking_resource_name, +) +from mozilla_sec_eia.models.sec10k.extract import ( + ChunkFilingsConfig, compute_validation_metrics, + extract_model_factory, ) -from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface +from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -62,11 +64,11 @@ def second_run_results(): ) -def test_extract_basic_10k( +def test_sec10k_extract_pipeline( filings_metadata, first_run_results, second_run_results, - tmp_path, + test_tracker_factory, ): """Test high level extraction workflow.""" @@ -84,44 +86,33 @@ def setup_for_execution(self, context): def get_metadata(self): return filings_metadata - # Initialize mlflow with test settings - experiment_name = "basic_10k_extract_unit_test" - - with unittest.mock.patch( - "mozilla_sec_eia.utils.cloud._access_secret_version", new=lambda *args: "" - ): - for i, results in enumerate([first_run_results, second_run_results]): - logger.info(f"Run {i} of basic 10k extraction.") - with ( - build_asset_context( - resources={ - "basic_10k_extract_config": ExtractConfig( - num_filings=3 if i == 0 else -1 - ), - "basic_10k_extract_mlflow": MlflowInterface( - experiment_name=experiment_name, - continue_run=i > 0, - tracking_uri="sqlite:///:memory:", - cloud_interface=FakeArchive(), - artifact_location=str(tmp_path), - ), - "cloud_interface": FakeArchive(), - } - ) as context, - unittest.mock.patch( - "mozilla_sec_eia.extract.basic_10k.extract", - new=lambda *args: results, - ), - ): - metadata = results[0] - - # Run extract method - basic_10k_extract(context) - run = _get_most_recent_run(experiment_name) - assert run.data.metrics["num_failed"] == (~metadata["success"]).sum() - assert run.data.metrics["ratio_extracted"] == len(metadata) / len( - filings_metadata - ) + 
dataset_name = "test_pipeline" + experiment_name = f"{dataset_name}_extraction" + test_tracker = test_tracker_factory(experiment_name) + + for i, results in enumerate([first_run_results, second_run_results]): + + @op(out={"extraction_metadata": Out(), "extracted": Out()}) + def _fake_extract(_filings_to_extract): + return results[0], results[1] + + test_job = extract_model_factory(dataset_name, _fake_extract) + resources = { + "basic_10k_extract_config": ChunkFilingsConfig( + num_filings=3 if i == 0 else -1 + ), + get_tracking_resource_name(experiment_name): test_tracker, + "cloud_interface": FakeArchive(), + } + metadata = results[0] + + # Run extract method + test_job.execute_in_process(resources=resources) + run = get_most_recent_run(experiment_name, dagster_run_id="") + assert run.data.metrics["num_failed"] == (~metadata["success"]).sum() + assert run.data.metrics["ratio_extracted"] == len(metadata) / len( + filings_metadata + ) @pytest.mark.parametrize( diff --git a/tests/unit/utils_test.py b/tests/unit/models/sec10k/utils_test.py similarity index 89% rename from tests/unit/utils_test.py rename to tests/unit/models/sec10k/utils_test.py index b935c8f..de9fbd7 100644 --- a/tests/unit/utils_test.py +++ b/tests/unit/models/sec10k/utils_test.py @@ -6,7 +6,7 @@ import pandas as pd import pytest -from mozilla_sec_eia.utils.cloud import ( +from mozilla_sec_eia.models.sec10k.utils.cloud import ( Exhibit21, GCSArchive, Sec10K, @@ -17,8 +17,12 @@ def test_archive(): """Return test GCSArchive class.""" with ( - unittest.mock.patch("mozilla_sec_eia.utils.cloud.GCSArchive._get_engine"), - unittest.mock.patch("mozilla_sec_eia.utils.cloud.GCSArchive._get_bucket"), + unittest.mock.patch( + "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive._get_engine" + ), + unittest.mock.patch( + "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive._get_bucket" + ), ): archive = GCSArchive( filings_bucket_name="filings_bucket_name", @@ -91,7 +95,8 @@ def 
test_validate_archive(test_archive, archive_files, metadata_files, valid, mo return_value=pd.DataFrame({"filename": metadata_files}) ) mocker.patch( - "mozilla_sec_eia.utils.cloud.GCSArchive.get_metadata", new=metadata_mock + "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive.get_metadata", + new=metadata_mock, ) assert test_archive.validate_archive() == valid @@ -174,7 +179,9 @@ def test_validate_archive(test_archive, archive_files, metadata_files, valid, mo ) def test_10k(filing_text, ex_21_version, actually_has_ex_21): """Test that SEC10k's are properly parsed.""" - with unittest.mock.patch("mozilla_sec_eia.utils.cloud.logger") as mock_logger: + with unittest.mock.patch( + "mozilla_sec_eia.models.sec10k.utils.cloud.logger" + ) as mock_logger: filing = Sec10K.from_file( file=io.StringIO(filing_text), filename="sec10k.html", From 014bcb1048d8940a18c9f1df2e2af971790c3de0 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 10:06:03 -0400 Subject: [PATCH 005/161] Change library module structure --- src/mozilla_sec_eia/library/{ml_tools => }/__init__.py | 0 .../library/{ml_tools => }/experiment_tracking.py | 0 src/mozilla_sec_eia/library/{ml_tools => }/models.py | 0 src/mozilla_sec_eia/model_jobs.py | 6 +++--- src/mozilla_sec_eia/models/sec10k/extract.py | 4 ++-- tests/conftest.py | 2 +- tests/unit/models/sec10k/extract_test.py | 2 +- 7 files changed, 7 insertions(+), 7 deletions(-) rename src/mozilla_sec_eia/library/{ml_tools => }/__init__.py (100%) rename src/mozilla_sec_eia/library/{ml_tools => }/experiment_tracking.py (100%) rename src/mozilla_sec_eia/library/{ml_tools => }/models.py (100%) diff --git a/src/mozilla_sec_eia/library/ml_tools/__init__.py b/src/mozilla_sec_eia/library/__init__.py similarity index 100% rename from src/mozilla_sec_eia/library/ml_tools/__init__.py rename to src/mozilla_sec_eia/library/__init__.py diff --git a/src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py b/src/mozilla_sec_eia/library/experiment_tracking.py 
similarity index 100% rename from src/mozilla_sec_eia/library/ml_tools/experiment_tracking.py rename to src/mozilla_sec_eia/library/experiment_tracking.py diff --git a/src/mozilla_sec_eia/library/ml_tools/models.py b/src/mozilla_sec_eia/library/models.py similarity index 100% rename from src/mozilla_sec_eia/library/ml_tools/models.py rename to src/mozilla_sec_eia/library/models.py diff --git a/src/mozilla_sec_eia/model_jobs.py b/src/mozilla_sec_eia/model_jobs.py index 9e5a5b3..e6438ab 100644 --- a/src/mozilla_sec_eia/model_jobs.py +++ b/src/mozilla_sec_eia/model_jobs.py @@ -5,13 +5,13 @@ import coloredlogs from dagster import Definitions -from mozilla_sec_eia.library import ml_tools +from mozilla_sec_eia.library import get_ml_model_jobs, get_ml_model_resources logger = logging.getLogger("catalystcoop") log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" coloredlogs.install(fmt=log_format, logger=logger) defs = Definitions( - jobs=ml_tools.get_ml_model_jobs(), - resources=ml_tools.get_ml_model_resources(), + jobs=get_ml_model_jobs(), + resources=get_ml_model_resources(), ) diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 602e57a..a6fcf1e 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -23,11 +23,11 @@ ) from mlflow.entities import Run -from mozilla_sec_eia.library.ml_tools.experiment_tracking import ( +from mozilla_sec_eia.library.experiment_tracking import ( get_most_recent_run, get_tracking_resource_name, ) -from mozilla_sec_eia.library.ml_tools.models import pudl_model +from mozilla_sec_eia.library.models import pudl_model from . 
import basic_10k from .utils.cloud import GCSArchive, cloud_interface_resource diff --git a/tests/conftest.py b/tests/conftest.py index 7fafe27..c10e825 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -4,7 +4,7 @@ from pathlib import Path import pytest -from mozilla_sec_eia.library.ml_tools.experiment_tracking import ExperimentTracker +from mozilla_sec_eia.library.experiment_tracking import ExperimentTracker logger = logging.getLogger(__name__) diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index 5ee7086..c5db638 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -5,7 +5,7 @@ import pandas as pd import pytest from dagster import Out, op -from mozilla_sec_eia.library.ml_tools.experiment_tracking import ( +from mozilla_sec_eia.library.experiment_tracking import ( get_most_recent_run, get_tracking_resource_name, ) From 54041487d81f0ad1e080854b307ce237b3c4b3ab Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 10:17:15 -0400 Subject: [PATCH 006/161] Create turn experiment_tracking into sub-package --- src/mozilla_sec_eia/cli.py | 6 ------ .../library/experiment_tracking/__init__.py | 8 ++++++++ .../mlflow_resource.py} | 0 3 files changed, 8 insertions(+), 6 deletions(-) delete mode 100755 src/mozilla_sec_eia/cli.py create mode 100644 src/mozilla_sec_eia/library/experiment_tracking/__init__.py rename src/mozilla_sec_eia/library/{experiment_tracking.py => experiment_tracking/mlflow_resource.py} (100%) diff --git a/src/mozilla_sec_eia/cli.py b/src/mozilla_sec_eia/cli.py deleted file mode 100755 index 2d4a84a..0000000 --- a/src/mozilla_sec_eia/cli.py +++ /dev/null @@ -1,6 +0,0 @@ -"""Implements CLI for SEC to EIA linkage development. - -CLI is structured with nested sub-commands to make it easy -to add new scripts which can be accessed through one top-level -interface. 
-""" diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py new file mode 100644 index 0000000..7f9e7f9 --- /dev/null +++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py @@ -0,0 +1,8 @@ +"""Implement tooling to interface with mlflow experiment tracking.""" + +from .mlflow_resource import ( + ExperimentTracker, + experiment_tracker_teardown_factory, + get_most_recent_run, + get_tracking_resource_name, +) diff --git a/src/mozilla_sec_eia/library/experiment_tracking.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py similarity index 100% rename from src/mozilla_sec_eia/library/experiment_tracking.py rename to src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py From 886614fdadc24a33415fd493cee2600a2c8424be Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 10:28:54 -0400 Subject: [PATCH 007/161] Remove unused function --- .../experiment_tracking/mlflow_resource.py | 40 ------------------- 1 file changed, 40 deletions(-) diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py index adb3740..b033d4c 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py @@ -21,46 +21,6 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -def _flatten_model_config(model_config: dict) -> dict: - """Take nested dictionary defining model config and flatten for logging purposes. - - This is essentially a translation layer between Dagster configuration and mlflow, - which does not support displaying nested parameters in the UI. - - Examples: - >>> _flatten_model_config( - ... { - ... 'ferc_to_ferc': { - ... 'link_ids_cross_year': { - ... 'compute_distance_matrix': { - ... 'distance_threshold': .5, - ... 'metric': 'euclidean', - ... }, - ... 
'match_orphaned_records': {'distance_threshold': 0.5}, - ... } - ... } - ... } - ... ) == { - ... 'ferc_to_ferc.link_ids_cross_year.compute_distance_matrix.distance_threshold': 0.5, - ... 'ferc_to_ferc.link_ids_cross_year.compute_distance_matrix.metric': 'euclidean', - ... 'ferc_to_ferc.link_ids_cross_year.match_orphaned_records.distance_threshold': 0.5 - ... } - True - """ - - def _flatten_level(config_level: dict, param_name: str): - flattened_dict = {} - for key, val in config_level.items(): - flattened_param = f"{param_name}.{key}" - if isinstance(val, dict): - flattened_dict |= _flatten_level(val, param_name=flattened_param) - else: - flattened_dict[flattened_param[1:]] = val - return flattened_dict - - return _flatten_level(model_config, "") - - class ExperimentTracker(ConfigurableResource): """Class to manage tracking a machine learning model using MLflow. From dec80b81cce5113834f2826a31dd906757ac5a04 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 10:40:06 -0400 Subject: [PATCH 008/161] Gracefully handle mlflow run on failure --- src/mozilla_sec_eia/library/models.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py index 139066f..1094f9f 100644 --- a/src/mozilla_sec_eia/library/models.py +++ b/src/mozilla_sec_eia/library/models.py @@ -31,10 +31,12 @@ OpDefinition, ResourceDefinition, RunConfig, + failure_hook, job, op, success_hook, ) +from mlflow.entities.run_status import RunStatus from .experiment_tracking import ( ExperimentTracker, @@ -133,10 +135,21 @@ def _log_config_hook(context: HookContext): } ) + @failure_hook( + required_resource_keys={get_tracking_resource_name(experiment_name)} + ) + def _end_mlflow_run_with_failure(context: HookContext): + exception = context.op_exception + + if isinstance(exception, KeyboardInterrupt): + mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED)) + else: + 
mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED)) + @job( name=get_pudl_model_job_name(experiment_name), config=default_config, - hooks={_log_config_hook}, + hooks={_log_config_hook, _end_mlflow_run_with_failure}, ) def model_asset(**kwargs): tracker_teardown = experiment_tracker_teardown_factory( From e725f3dedbbd3ee82dc4750cbc0c382525aa874b Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 11:01:27 -0400 Subject: [PATCH 009/161] Fix variable name --- src/mozilla_sec_eia/library/models.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py index 1094f9f..1d8b38d 100644 --- a/src/mozilla_sec_eia/library/models.py +++ b/src/mozilla_sec_eia/library/models.py @@ -151,7 +151,7 @@ def _end_mlflow_run_with_failure(context: HookContext): config=default_config, hooks={_log_config_hook, _end_mlflow_run_with_failure}, ) - def model_asset(**kwargs): + def model_job(**kwargs): tracker_teardown = experiment_tracker_teardown_factory( experiment_name=model_graph.name, ) @@ -162,7 +162,7 @@ def model_asset(**kwargs): _collect_results(graph_output, [teardown]) - PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_asset - return model_asset + PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_job + return model_job return _decorator From df44ed5eee5a7d8e75f2e177a037088a597c08ad Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 12:35:36 -0400 Subject: [PATCH 010/161] Change experiment tracker resource names --- .../library/experiment_tracking/__init__.py | 1 - .../experiment_tracking/mlflow_resource.py | 7 +- src/mozilla_sec_eia/library/models.py | 27 ++-- src/mozilla_sec_eia/model_jobs.py | 3 +- src/mozilla_sec_eia/models/sec10k/extract.py | 130 ++++++++++-------- tests/unit/models/sec10k/extract_test.py | 9 +- 6 files changed, 86 insertions(+), 91 deletions(-) diff --git 
a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py index 7f9e7f9..65cbf7a 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py @@ -4,5 +4,4 @@ ExperimentTracker, experiment_tracker_teardown_factory, get_most_recent_run, - get_tracking_resource_name, ) diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py index b033d4c..f2c113f 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py @@ -138,11 +138,6 @@ def get_or_create_experiment( return experiment_id -def get_tracking_resource_name(experiment_name: str): - """Return expected name of experiment tracking resource given experiment name.""" - return f"{experiment_name}_tracker" - - def experiment_tracker_teardown_factory( experiment_name: str, ) -> ExperimentTracker: @@ -151,7 +146,7 @@ def experiment_tracker_teardown_factory( @op( name=f"{experiment_name}_tracker_teardown", - required_resource_keys=[f"{experiment_name}_tracker"], + required_resource_keys=["experiment_tracker"], ) def teardown_experiment_tracker(_results): mlflow.end_run() diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py index 1d8b38d..135df39 100644 --- a/src/mozilla_sec_eia/library/models.py +++ b/src/mozilla_sec_eia/library/models.py @@ -41,7 +41,6 @@ from .experiment_tracking import ( ExperimentTracker, experiment_tracker_teardown_factory, - get_tracking_resource_name, ) logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -104,16 +103,13 @@ def _decorator(model_graph: GraphDefinition): model_config |= get_yml_config(model_graph.name) # Add resources to resource dict - MODEL_RESOURCES.update( - { - get_tracking_resource_name(experiment_name): 
ExperimentTracker( - experiment_name=experiment_name, - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - project=EnvVar("GCS_PROJECT"), - ), - } - | resources - ) + model_resources = { + "experiment_tracker": ExperimentTracker( + experiment_name=experiment_name, + tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), + project=EnvVar("GCS_PROJECT"), + ), + } | resources default_config = RunConfig( ops=model_config, @@ -123,9 +119,7 @@ def _decorator(model_graph: GraphDefinition): def _collect_results(model_graph_output, _implicit_dependencies: list): return model_graph_output - @success_hook( - required_resource_keys={get_tracking_resource_name(experiment_name)} - ) + @success_hook(required_resource_keys={"experiment_tracker"}) def _log_config_hook(context: HookContext): if (config := context.op_config) is not None: mlflow.log_params( @@ -135,9 +129,7 @@ def _log_config_hook(context: HookContext): } ) - @failure_hook( - required_resource_keys={get_tracking_resource_name(experiment_name)} - ) + @failure_hook(required_resource_keys={"experiment_tracker"}) def _end_mlflow_run_with_failure(context: HookContext): exception = context.op_exception @@ -150,6 +142,7 @@ def _end_mlflow_run_with_failure(context: HookContext): name=get_pudl_model_job_name(experiment_name), config=default_config, hooks={_log_config_hook, _end_mlflow_run_with_failure}, + resource_defs=model_resources, ) def model_job(**kwargs): tracker_teardown = experiment_tracker_teardown_factory( diff --git a/src/mozilla_sec_eia/model_jobs.py b/src/mozilla_sec_eia/model_jobs.py index e6438ab..2a8918b 100644 --- a/src/mozilla_sec_eia/model_jobs.py +++ b/src/mozilla_sec_eia/model_jobs.py @@ -5,7 +5,7 @@ import coloredlogs from dagster import Definitions -from mozilla_sec_eia.library import get_ml_model_jobs, get_ml_model_resources +from mozilla_sec_eia.library import get_ml_model_jobs logger = logging.getLogger("catalystcoop") log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" @@ -13,5 +13,4 @@ defs 
= Definitions( jobs=get_ml_model_jobs(), - resources=get_ml_model_resources(), ) diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index a6fcf1e..795f988 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -18,14 +18,15 @@ OpDefinition, OpExecutionContext, Out, + ResourceDefinition, graph, op, ) from mlflow.entities import Run from mozilla_sec_eia.library.experiment_tracking import ( + ExperimentTracker, get_most_recent_run, - get_tracking_resource_name, ) from mozilla_sec_eia.library.models import pudl_model @@ -117,73 +118,80 @@ class GetMostRecentRunResultsConfig(Config): continue_run: bool = False +@op( + out={ + "extraction_metadata": Out(), + "extracted": Out(), + "filings_to_extract": Out(), + }, +) +def get_most_recent_run_results( + context: OpExecutionContext, + config: GetMostRecentRunResultsConfig, + experiment_tracker: ExperimentTracker, + filings_to_extract: pd.DataFrame, +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: + """Get results from previous run to continue extraction.""" + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + extracted = pd.DataFrame() + + if config.continue_run: + most_recent_run = get_most_recent_run( + experiment_tracker.experiment_name, context.run_id + ) + extraction_metadata = ExtractionMetadataSchema.validate( + _load_artifact_as_csv( + most_recent_run, "/extraction_metadata.csv" + ).set_index("filename") + ) + extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet") + filings_to_extract = filings_to_extract[ + ~filings_to_extract["filename"].isin(extraction_metadata.index) + ] + + return extraction_metadata, extracted, filings_to_extract + + +@op(required_resource_keys=["experiment_tracker"]) +def log_extraction_data( + metadata: pd.DataFrame, + extraction_metadata: list[pd.DataFrame], + extracted: 
list[pd.DataFrame], + previous_run_extraction_metadata: pd.DataFrame, + previous_run_extracted_data: pd.DataFrame, +): + """Log results from extraction run.""" + extraction_metadata = pd.concat( + extraction_metadata + [previous_run_extraction_metadata] + ) + extracted = pd.concat(extracted + [previous_run_extracted_data]) + # Use metadata to log generic metrics + extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) + mlflow.log_metrics( + { + "num_failed": (~extraction_metadata["success"]).sum(), + "ratio_extracted": len(extraction_metadata) / len(metadata), + } + ) + + # Log the extraction results + metadata for future reference/analysis + _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv") + _log_artifact_as_parquet(extracted, "extracted.parquet") + + def extract_model_factory( - dataset_name: str, extract_op: OpDefinition | GraphDefinition + dataset_name: str, + extract_op: OpDefinition | GraphDefinition, + resources: dict[str, ResourceDefinition] = {}, ): """Produce a `pudl_model` to extract data from sec10k filings.""" experiment_name = f"{dataset_name}_extraction" - experiment_tracker_resource = get_tracking_resource_name(experiment_name) - - @op( - required_resource_keys=[experiment_tracker_resource], - out={ - "extraction_metadata": Out(), - "extracted": Out(), - "filings_to_extract": Out(), - }, - ) - def get_most_recent_run_results( - context: OpExecutionContext, - config: GetMostRecentRunResultsConfig, - filings_to_extract: pd.DataFrame, - ) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - extracted = pd.DataFrame() - - if config.continue_run: - most_recent_run = get_most_recent_run(experiment_name, context.run_id) - extraction_metadata = ExtractionMetadataSchema.validate( - _load_artifact_as_csv( - most_recent_run, "/extraction_metadata.csv" - ).set_index("filename") - ) - 
extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet") - filings_to_extract = filings_to_extract[ - ~filings_to_extract["filename"].isin(extraction_metadata.index) - ] - - return extraction_metadata, extracted, filings_to_extract - - @op(required_resource_keys=[experiment_tracker_resource]) - def log_extraction_data( - metadata: pd.DataFrame, - extraction_metadata: list[pd.DataFrame], - extracted: list[pd.DataFrame], - previous_run_extraction_metadata: pd.DataFrame, - previous_run_extracted_data: pd.DataFrame, - ): - extraction_metadata = pd.concat( - extraction_metadata + [previous_run_extraction_metadata] - ) - extracted = pd.concat(extracted + [previous_run_extracted_data]) - # Use metadata to log generic metrics - extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) - mlflow.log_metrics( - { - "num_failed": (~extraction_metadata["success"]).sum(), - "ratio_extracted": len(extraction_metadata) / len(metadata), - } - ) - - # Log the extraction results + metadata for future reference/analysis - _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv") - _log_artifact_as_parquet(extracted, "extracted.parquet") @pudl_model( experiment_name=experiment_name, - resources={"cloud_interface": cloud_interface_resource}, + resources={"cloud_interface": cloud_interface_resource} | resources, ) @graph(name=experiment_name) def extract_filings(): diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index c5db638..925b5c1 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -7,7 +7,6 @@ from dagster import Out, op from mozilla_sec_eia.library.experiment_tracking import ( get_most_recent_run, - get_tracking_resource_name, ) from mozilla_sec_eia.models.sec10k.extract import ( ChunkFilingsConfig, @@ -96,18 +95,20 @@ def get_metadata(self): def _fake_extract(_filings_to_extract): return results[0], results[1] - test_job = 
extract_model_factory(dataset_name, _fake_extract) resources = { "basic_10k_extract_config": ChunkFilingsConfig( num_filings=3 if i == 0 else -1 ), - get_tracking_resource_name(experiment_name): test_tracker, + "experiment_tracker": test_tracker, "cloud_interface": FakeArchive(), } + test_job = extract_model_factory( + dataset_name, _fake_extract, resources=resources + ) metadata = results[0] # Run extract method - test_job.execute_in_process(resources=resources) + test_job.execute_in_process() run = get_most_recent_run(experiment_name, dagster_run_id="") assert run.data.metrics["num_failed"] == (~metadata["success"]).sum() assert run.data.metrics["ratio_extracted"] == len(metadata) / len( From 93da0522ce593f3ff30a94775421f90082ff5ff8 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 17:10:41 -0400 Subject: [PATCH 011/161] Add mlflow artifact io-manager --- .../library/experiment_tracking/__init__.py | 1 + .../experiment_tracking/mlflow_io_managers.py | 100 ++++++++ .../experiment_tracking/mlflow_resource.py | 41 ++-- src/mozilla_sec_eia/library/models.py | 24 +- src/mozilla_sec_eia/models/sec10k/extract.py | 217 ++++++++++-------- tests/conftest.py | 17 ++ tests/unit/models/sec10k/extract_test.py | 124 +++++++--- 7 files changed, 377 insertions(+), 147 deletions(-) create mode 100644 src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py index 65cbf7a..5a468d7 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py @@ -1,5 +1,6 @@ """Implement tooling to interface with mlflow experiment tracking.""" +from .mlflow_io_managers import MlflowPandasArtifactIOManager from .mlflow_resource import ( ExperimentTracker, experiment_tracker_teardown_factory, diff --git 
a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py new file mode 100644 index 0000000..a1a6850 --- /dev/null +++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py @@ -0,0 +1,100 @@ +"""Implement IO managers for loading models/artifacts from tracking server.""" + +import io +import logging +import tempfile +from pathlib import Path +from typing import Literal + +import mlflow +import pandas as pd +from dagster import ConfigurableIOManager, InputContext, OutputContext +from mlflow.entities import Run + +from .mlflow_resource import ExperimentTracker + +logger = logging.getLogger(f"catalystcoop.{__name__}") + + +class MlflowPandasArtifactIOManager(ConfigurableIOManager): + """Implement IO manager for logging/loading parquet files as mlflow artifacts.""" + + experiment_tracker: ExperimentTracker + #: By default handles artifacts from current run, but can be used with previous run. 
+ use_previous_mlflow_run: bool = False + file_type: Literal["parquet", "csv"] = "parquet" + + def _load_artifact_as_csv(self, run: Run, artifact_name: str) -> pd.DataFrame: + """Download a CSV and parse to DataFrame from mlflow tracking server.""" + df = pd.read_csv( + io.StringIO( + mlflow.artifacts.load_text(run.info.artifact_uri + f"/{artifact_name}") + ) + ) + return df + + def _log_artifact_as_csv( + self, artifact: pd.DataFrame, artifact_name: str, index: bool = True + ): + """Upload a DataFrame as a CSV to mlflow tracking server.""" + return mlflow.log_text(artifact.to_csv(index=index), artifact_name) + + def _load_artifact_as_parquet(self, run: Run, artifact_name: str) -> pd.DataFrame: + """Download a CSV and parse to DataFrame from mlflow tracking server.""" + df = pd.read_parquet(run.info.artifact_uri + f"/{artifact_name}") + return df + + def _log_artifact_as_parquet( + self, artifact: pd.DataFrame, artifact_name: str, index: bool = True + ): + """Upload a DataFrame as a CSV to mlflow tracking server.""" + with tempfile.TemporaryDirectory() as tmp_dir: + parquet_path = Path(tmp_dir) / artifact_name + artifact.to_parquet(parquet_path, index=index) + return mlflow.log_artifact(parquet_path, artifact_name) + + def _get_dagster_run_id(self, context: InputContext | OutputContext) -> str: + """Return dagster run id of current dagster run.""" + return context.get_identifier()[0] + + def handle_output(self, context: OutputContext, df: pd.DataFrame): + """Attach dataframe to run as artifact.""" + if self.use_previous_mlflow_run: + raise NotImplementedError( + "MlflowPandasArtifactIOManager can not be used to add artifacts to completed run." 
+ ) + + if self.file_type == "csv": + self._log_artifact_as_csv(df, artifact_name=f"{context.name}.csv") + else: + self._log_artifact_as_parquet(df, artifact_name=f"{context.name}.parquet") + + def _get_run_info(self) -> Run: + """Use `dagster_run_id` and `use_previous_mlflow_run` to get run info from appropriate mlflow run.""" + dagster_run_id = self.experiment_tracker.get_run_id() + filter_string = f"tags.dagster_run_id='{dagster_run_id}'" + if self.use_previous_mlflow_run: + filter_string = f"tags.dagster_run_id!='{dagster_run_id}'" + + run_metadata = mlflow.search_runs( + experiment_names=[self.experiment_tracker.experiment_name], + filter_string=filter_string, + ) + + # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run + return mlflow.get_run(run_metadata.loc[0, "run_id"]) + + def load_input(self, context: InputContext) -> pd.DataFrame: + """Handle loading dataframes from mlflow run artifacts.""" + mlflow_run = self._get_run_info() + + if self.file_type == "csv": + df = self._load_artifact_as_csv( + mlflow_run, artifact_name=f"{context.name}.csv" + ) + else: + df = self._load_artifact_as_parquet( + mlflow_run, artifact_name=f"{context.name}.parquet" + ) + + return df diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py index f2c113f..744c013 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py @@ -15,8 +15,9 @@ from contextlib import contextmanager import mlflow -from dagster import ConfigurableResource, InitResourceContext, op +from dagster import ConfigurableResource, In, InitResourceContext, Nothing, op from google.cloud import secretmanager +from pydantic import PrivateAttr logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -50,15 +51,24 @@ class ExperimentTracker(ConfigurableResource): tags: dict = {} project: str + _run_id: 
str = PrivateAttr() + @contextmanager def yield_for_execution( self, context: InitResourceContext, ) -> "ExperimentTracker": """Create experiment tracker for specified experiment.""" + self._run_id = context.run_id + if self.tracking_enabled: self._configure_mlflow() + # Hack to stop mlflow from ending run at process barrier + # This is borrowed from the official dagster mlflow resource found here: + # https://github.com/dagster-io/dagster/blob/master/python_modules/libraries/dagster-mlflow/dagster_mlflow/resources.py + atexit.unregister(mlflow.end_run) + # Get run_id associated with current dagster run experiment_id = self.get_or_create_experiment( experiment_name=self.experiment_name, @@ -66,18 +76,18 @@ def yield_for_execution( ) mlflow_run_id = self._get_mlflow_run_id(context.run_id, experiment_id) - # Hack to stop mlflow from ending run at process barrier - # This is borrowed from the official dagster mlflow resource found here: - # https://github.com/dagster-io/dagster/blob/master/python_modules/libraries/dagster-mlflow/dagster_mlflow/resources.py - atexit.unregister(mlflow.end_run) - - # Create new run under specified experiment - with mlflow.start_run( - run_id=mlflow_run_id, - experiment_id=experiment_id, - tags=self.tags | {"dagster_run_id": context.run_id}, - ): + if (active_run := mlflow.active_run()) is not None: + if active_run.info.run_id != mlflow_run_id: + raise RuntimeError("Found conflicting active mlflow run!") yield self + else: + # Create new run under specified experiment + with mlflow.start_run( + run_id=mlflow_run_id, + experiment_id=experiment_id, + tags=self.tags | {"dagster_run_id": context.run_id}, + ): + yield self def _get_tracking_password(self, version_id: str = "latest"): """Get tracking server password from gcloud secrets.""" @@ -115,6 +125,10 @@ def _get_mlflow_run_id(self, dagster_run_id: str, experiment_id: str): run_id = run_df.loc[0, "run_id"] return run_id + def get_run_id(self): + """Return current dagster run_id.""" + 
return self._run_id + @staticmethod def get_or_create_experiment( experiment_name: str, artifact_location: str = "" @@ -147,8 +161,9 @@ def experiment_tracker_teardown_factory( @op( name=f"{experiment_name}_tracker_teardown", required_resource_keys=["experiment_tracker"], + ins={"model_done": In(Nothing)}, ) - def teardown_experiment_tracker(_results): + def teardown_experiment_tracker(): mlflow.end_run() return teardown_experiment_tracker diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py index 135df39..6cb7bd8 100644 --- a/src/mozilla_sec_eia/library/models.py +++ b/src/mozilla_sec_eia/library/models.py @@ -40,6 +40,7 @@ from .experiment_tracking import ( ExperimentTracker, + MlflowPandasArtifactIOManager, experiment_tracker_teardown_factory, ) @@ -92,6 +93,7 @@ def get_pudl_model_job_name(experiment_name: str) -> str: def pudl_model( experiment_name: str, + mlflow_pandas_io_manager_file_type: str = "parquet", resources: dict[str, ResourceDefinition] = {}, config_from_yaml: bool = False, ) -> JobDefinition: @@ -103,11 +105,21 @@ def _decorator(model_graph: GraphDefinition): model_config |= get_yml_config(model_graph.name) # Add resources to resource dict + experiment_tracker = ExperimentTracker( + experiment_name=experiment_name, + tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), + project=EnvVar("GCS_PROJECT"), + ) model_resources = { - "experiment_tracker": ExperimentTracker( - experiment_name=experiment_name, - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - project=EnvVar("GCS_PROJECT"), + "experiment_tracker": experiment_tracker, + "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager( + file_type=mlflow_pandas_io_manager_file_type, + experiment_tracker=experiment_tracker, + ), + "previous_run_mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager( + use_previous_mlflow_run=True, + file_type=mlflow_pandas_io_manager_file_type, + experiment_tracker=experiment_tracker, ), } | resources @@ -151,9 +163,7 
@@ def model_job(**kwargs): graph_output = model_graph(**kwargs) # Pass output to teardown to create a dependency - teardown = tracker_teardown(graph_output) - - _collect_results(graph_output, [teardown]) + tracker_teardown(graph_output) PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_job return model_job diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 795f988..d7b9661 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -1,10 +1,7 @@ """Implement top level extraction methods and tooling.""" -import io import logging import math -import tempfile -from pathlib import Path import mlflow import numpy as np @@ -15,19 +12,15 @@ DynamicOut, DynamicOutput, GraphDefinition, + GraphOut, + In, OpDefinition, - OpExecutionContext, Out, - ResourceDefinition, + Output, graph, op, ) -from mlflow.entities import Run -from mozilla_sec_eia.library.experiment_tracking import ( - ExperimentTracker, - get_most_recent_run, -) from mozilla_sec_eia.library.models import pudl_model from . 
import basic_10k @@ -48,39 +41,8 @@ class ExtractionMetadataSchema(pa.DataFrameModel): success: bool = pa.Field(coerce=True) -def _load_artifact_as_csv(run: Run, artifact_name: str) -> pd.DataFrame: - """Download a CSV and parse to DataFrame from mlflow tracking server.""" - df = pd.read_csv( - io.StringIO(mlflow.artifacts.load_text(run.info.artifact_uri + artifact_name)) - ) - return df - - -def _log_artifact_as_csv( - artifact: pd.DataFrame, artifact_name: str, index: bool = True -): - """Upload a DataFrame as a CSV to mlflow tracking server.""" - return mlflow.log_text(artifact.to_csv(index=index), artifact_name) - - -def _load_artifact_as_parquet(run: Run, artifact_name: str) -> pd.DataFrame: - """Download a CSV and parse to DataFrame from mlflow tracking server.""" - df = pd.read_parquet(run.info.artifact_uri + artifact_name) - return df - - -def _log_artifact_as_parquet( - artifact: pd.DataFrame, artifact_name: str, index: bool = True -): - """Upload a DataFrame as a CSV to mlflow tracking server.""" - with tempfile.TemporaryDirectory() as tmp_dir: - parquet_path = Path(tmp_dir) / artifact_name - artifact.to_parquet(parquet_path, index=index) - return mlflow.log_artifact(parquet_path, artifact_name) - - @op -def get_filings_to_extract( +def get_filing_metadata( cloud_interface: GCSArchive, ) -> pd.DataFrame: """Return filing metadata.""" @@ -91,19 +53,14 @@ class ChunkFilingsConfig(Config): """Config how many filings are extracted and chunk_size for extraction.""" chunk_size: int = 1000 - num_filings: int = -1 @op(out=DynamicOut()) def chunk_filings( config: ChunkFilingsConfig, - metadata: pd.DataFrame, + filings_to_extract: pd.DataFrame, ) -> pd.DataFrame: """Split filings into chunks for parallel extraction.""" - filings_to_extract = metadata - if config.num_filings > 0: - filings_to_extract = filings_to_extract.sample(config.num_filings) - for i, chunk in enumerate( np.array_split( filings_to_extract, math.ceil(len(filings_to_extract) / config.chunk_size) 
@@ -119,42 +76,12 @@ class GetMostRecentRunResultsConfig(Config): @op( + required_resource_keys=["experiment_tracker"], out={ - "extraction_metadata": Out(), - "extracted": Out(), - "filings_to_extract": Out(), + "extraction_metadata": Out(io_manager_key="mlflow_pandas_artifact_io_manager"), + "extracted": Out(io_manager_key="mlflow_pandas_artifact_io_manager"), }, ) -def get_most_recent_run_results( - context: OpExecutionContext, - config: GetMostRecentRunResultsConfig, - experiment_tracker: ExperimentTracker, - filings_to_extract: pd.DataFrame, -) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: - """Get results from previous run to continue extraction.""" - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - extracted = pd.DataFrame() - - if config.continue_run: - most_recent_run = get_most_recent_run( - experiment_tracker.experiment_name, context.run_id - ) - extraction_metadata = ExtractionMetadataSchema.validate( - _load_artifact_as_csv( - most_recent_run, "/extraction_metadata.csv" - ).set_index("filename") - ) - extracted = _load_artifact_as_parquet(most_recent_run, "/extracted.parquet") - filings_to_extract = filings_to_extract[ - ~filings_to_extract["filename"].isin(extraction_metadata.index) - ] - - return extraction_metadata, extracted, filings_to_extract - - -@op(required_resource_keys=["experiment_tracker"]) def log_extraction_data( metadata: pd.DataFrame, extraction_metadata: list[pd.DataFrame], @@ -176,29 +103,50 @@ def log_extraction_data( } ) - # Log the extraction results + metadata for future reference/analysis - _log_artifact_as_csv(extraction_metadata, "extraction_metadata.csv") - _log_artifact_as_parquet(extracted, "extracted.parquet") + # Return metadata and extracted data (they'll be logged as artifacts by io-manager) + return extraction_metadata, extracted + + +class FilingsToExtractConfig(Config): + """Define configuration for filtering filings to 
extract.""" + + num_filings: int = -1 + + +@op +def get_filings_to_extract( + config: FilingsToExtractConfig, + filing_metadata: pd.DataFrame, + previous_extraction_metadata: pd.DataFrame, + previous_extracted: pd.DataFrame, +): + """Filter out any previously extracted filings and sub-sample to `num_filings`.""" + filings_to_extract = filing_metadata + if config.num_filings > 0: + filings_to_extract = filings_to_extract.sample(config.num_filings) + + filings_to_extract = filings_to_extract[ + ~filings_to_extract["filename"].isin(previous_extraction_metadata.index) + ] + return filings_to_extract -def extract_model_factory( +def extract_graph_factory( dataset_name: str, extract_op: OpDefinition | GraphDefinition, - resources: dict[str, ResourceDefinition] = {}, ): """Produce a `pudl_model` to extract data from sec10k filings.""" experiment_name = f"{dataset_name}_extraction" - @pudl_model( - experiment_name=experiment_name, - resources={"cloud_interface": cloud_interface_resource} | resources, - ) @graph(name=experiment_name) - def extract_filings(): - metadata = get_filings_to_extract() - previous_extraction_metadata, previous_extracted, filings_to_extract = ( - get_most_recent_run_results(metadata) + def extract_filings(previous_extraction_metadata, previous_extracted): + metadata = get_filing_metadata() + filings_to_extract = get_filings_to_extract( + metadata, + previous_extraction_metadata, + previous_extracted, ) + filing_chunks = chunk_filings(filings_to_extract) extraction_metadata, extracted = filing_chunks.map(extract_op) @@ -213,6 +161,84 @@ def extract_filings(): return extract_filings +@op( + ins={ + "extraction_metadata": In( + input_manager_key="previous_run_mlflow_pandas_artifact_io_manager" + ), + "extracted": In( + input_manager_key="previous_run_mlflow_pandas_artifact_io_manager" + ), + } +) +def get_previous_run_data( + continue_previous_run, extraction_metadata: pd.DataFrame, extracted: pd.DataFrame +) -> tuple[pd.DataFrame, pd.DataFrame]: + 
"""Return previous run data loaded by io-manager.""" + extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) + + return extraction_metadata, extracted + + +@op +def get_empty_run_data(start_new_run): + """Return empty dataframes representing run metadata and extracted data.""" + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + extracted = pd.DataFrame() + + return extraction_metadata, extracted + + +class ContinuePreviousRunConfig(Config): + """Configure whether to continue a previous extraction run or not.""" + + continue_run: bool = False + + +@op(out={"continue_run": Out(is_required=False), "new_run": Out(is_required=False)}) +def continue_previous_run(config: ContinuePreviousRunConfig): + """Create branch dictating whether a previous extraction run is continued or not.""" + if config.continue_run: + yield Output(True, "continue_run") + else: + yield Output(True, "new_run") + + +@op(out={"previous_run_extraction_metadata": Out(), "previous_extracted": Out()}) +def merge_branches(dfs: list[tuple[pd.DataFrame, pd.DataFrame]]): + """Merge branches created by `continue_previous_run` and return.""" + dfs = dfs[0] + return dfs[0], dfs[1] + + +@graph( + out={ + "previous_run_extraction_metadata": GraphOut(), + "previous_extracted": GraphOut(), + } +) +def get_starting_data(): + """Get previous run data if configured to do so.""" + continue_run, new_run = continue_previous_run() + previous_data = get_previous_run_data(continue_run) + new_data = get_empty_run_data(new_run) + return merge_branches([previous_data, new_data]) + + +@pudl_model( + "basic_10k_extraction", resources={"cloud_interface": cloud_interface_resource} +) +@graph +def basic_10k_extraction_model(): + """Implement basic 10k extraction pudl_model.""" + previous_extraction_metadata, previous_extracted = get_starting_data() + return extract_graph_factory("basic_10k", basic_10k.extract)( + 
previous_extraction_metadata, previous_extracted + ) + + def compute_validation_metrics( computed_set: pd.DataFrame, validation_set: pd.DataFrame, @@ -251,6 +277,3 @@ def compute_validation_metrics( "precision": true_positives / computed_len, "recall": true_positives / validation_len, } - - -basic_10k_extract = extract_model_factory("basic_10k", basic_10k.extract) diff --git a/tests/conftest.py b/tests/conftest.py index c10e825..4e9b0f6 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,6 +3,7 @@ import logging from pathlib import Path +import mlflow import pytest from mozilla_sec_eia.library.experiment_tracking import ExperimentTracker @@ -58,3 +59,19 @@ def factory(experiment_name: str) -> TestTracker: ) return factory + + +@pytest.fixture +def get_most_recent_mlflow_run_factory(): + def _get_run(experiment_name: str): + """Search mlflow for most recent run with specified experiment name.""" + run_metadata = mlflow.search_runs( + experiment_names=[experiment_name], + ) + + # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run + # This assert will ensure this doesn't silently break if the ordering changes + assert run_metadata.loc[0, "end_time"] == run_metadata["end_time"].max() + return mlflow.get_run(run_metadata.loc[0, "run_id"]) + + return _get_run diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index 925b5c1..d4421fb 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -4,14 +4,14 @@ import pandas as pd import pytest -from dagster import Out, op -from mozilla_sec_eia.library.experiment_tracking import ( - get_most_recent_run, +from dagster import Out, RunConfig, op +from mozilla_sec_eia.library.experiment_tracking.mlflow_io_managers import ( + MlflowPandasArtifactIOManager, ) from mozilla_sec_eia.models.sec10k.extract import ( - ChunkFilingsConfig, + FilingsToExtractConfig, compute_validation_metrics, - 
extract_model_factory, + extract_graph_factory, ) from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive @@ -63,11 +63,58 @@ def second_run_results(): ) +@pytest.mark.parametrize( + "filings_metadata,previous_extraction_metadata,num_filings,num_failed", + [ + ( + pd.DataFrame( + {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]} + ), + pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename"), + -1, + 0, + ), + ( + pd.DataFrame( + {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]} + ), + pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename"), + -1, + 3, + ), + ( + pd.DataFrame( + {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]} + ), + pd.DataFrame( + {"filename": ["filing1", "filing2"], "success": [True, True]} + ).set_index("filename"), + -1, + 0, + ), + ( + pd.DataFrame( + {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]} + ), + pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename"), + 2, + 1, + ), + ], +) def test_sec10k_extract_pipeline( filings_metadata, - first_run_results, - second_run_results, + previous_extraction_metadata, + num_filings, + num_failed, test_tracker_factory, + get_most_recent_mlflow_run_factory, ): """Test high level extraction workflow.""" @@ -85,35 +132,52 @@ def setup_for_execution(self, context): def get_metadata(self): return filings_metadata + @op(out={"extraction_metadata": Out(), "extracted": Out()}) + def test_extract( + cloud_interface: GCSArchive, + filings_to_extract: pd.DataFrame, + ) -> tuple[pd.DataFrame, pd.DataFrame]: + md = filings_to_extract + md["success"] = True + md.iloc[:num_failed, 1] = False + return md.set_index("filename"), pd.DataFrame() + dataset_name = "test_pipeline" experiment_name = f"{dataset_name}_extraction" test_tracker = 
test_tracker_factory(experiment_name) - for i, results in enumerate([first_run_results, second_run_results]): - - @op(out={"extraction_metadata": Out(), "extracted": Out()}) - def _fake_extract(_filings_to_extract): - return results[0], results[1] - - resources = { - "basic_10k_extract_config": ChunkFilingsConfig( - num_filings=3 if i == 0 else -1 + test_graph = extract_graph_factory("test_extract", test_extract) + resources = { + "experiment_tracker": test_tracker, + "cloud_interface": FakeArchive(), + "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager( + experiment_tracker=test_tracker + ), + } + extraction_metadata = ( + test_graph.to_job() + .execute_in_process( + resources=resources, + run_config=RunConfig( + { + "get_filings_to_extract": FilingsToExtractConfig( + num_filings=num_filings + ) + } ), - "experiment_tracker": test_tracker, - "cloud_interface": FakeArchive(), - } - test_job = extract_model_factory( - dataset_name, _fake_extract, resources=resources - ) - metadata = results[0] - - # Run extract method - test_job.execute_in_process() - run = get_most_recent_run(experiment_name, dagster_run_id="") - assert run.data.metrics["num_failed"] == (~metadata["success"]).sum() - assert run.data.metrics["ratio_extracted"] == len(metadata) / len( - filings_metadata + input_values={ + "previous_extraction_metadata": previous_extraction_metadata, + "previous_extracted": pd.DataFrame(), + }, ) + .output_value() + ) + + run = get_most_recent_mlflow_run_factory(experiment_name) + assert run.data.metrics["num_failed"] == num_failed + assert run.data.metrics["ratio_extracted"] == len(extraction_metadata) / len( + filings_metadata + ) @pytest.mark.parametrize( From 07713e910441175d9bb4e86d37bf0cfdf62f75f7 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 28 Aug 2024 22:38:47 -0400 Subject: [PATCH 012/161] Simplify pudl_models decorator --- src/mozilla_sec_eia/library/models.py | 42 ++++++++++++--------------- 1 file changed, 18 insertions(+), 24 
deletions(-) diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py index 6cb7bd8..427d21d 100644 --- a/src/mozilla_sec_eia/library/models.py +++ b/src/mozilla_sec_eia/library/models.py @@ -33,7 +33,6 @@ RunConfig, failure_hook, job, - op, success_hook, ) from mlflow.entities.run_status import RunStatus @@ -91,6 +90,24 @@ def get_pudl_model_job_name(experiment_name: str) -> str: return f"{experiment_name}_job" +@success_hook(required_resource_keys={"experiment_tracker"}) +def _log_config_hook(context: HookContext): + if (config := context.op_config) is not None: + mlflow.log_params( + {f"{context.op.name}.{param}": value for param, value in config.items()} + ) + + +@failure_hook(required_resource_keys={"experiment_tracker"}) +def _end_mlflow_run_with_failure(context: HookContext): + exception = context.op_exception + + if isinstance(exception, KeyboardInterrupt): + mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED)) + else: + mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED)) + + def pudl_model( experiment_name: str, mlflow_pandas_io_manager_file_type: str = "parquet", @@ -127,29 +144,6 @@ def _decorator(model_graph: GraphDefinition): ops=model_config, ) - @op - def _collect_results(model_graph_output, _implicit_dependencies: list): - return model_graph_output - - @success_hook(required_resource_keys={"experiment_tracker"}) - def _log_config_hook(context: HookContext): - if (config := context.op_config) is not None: - mlflow.log_params( - { - f"{context.op.name}.{param}": value - for param, value in config.items() - } - ) - - @failure_hook(required_resource_keys={"experiment_tracker"}) - def _end_mlflow_run_with_failure(context: HookContext): - exception = context.op_exception - - if isinstance(exception, KeyboardInterrupt): - mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED)) - else: - mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED)) - @job( name=get_pudl_model_job_name(experiment_name), 
config=default_config, From 5d89ec6e7bf7a7246581c084bdeae842659bb45c Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 29 Aug 2024 09:37:09 -0400 Subject: [PATCH 013/161] Split extraction logging into two funcs --- src/mozilla_sec_eia/models/sec10k/extract.py | 46 ++++++++++++------- tests/unit/models/sec10k/extract_test.py | 47 +------------------- 2 files changed, 31 insertions(+), 62 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index d7b9661..f759ddf 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -75,6 +75,23 @@ class GetMostRecentRunResultsConfig(Config): continue_run: bool = False +@op(required_resource_keys=["experiment_tracker"]) +def log_extraction_data( + metadata: pd.DataFrame, + extraction_metadata: pd.DataFrame, + extracted: pd.DataFrame, +): + """Log results from extraction run.""" + mlflow.log_metrics( + { + "num_failed": (~extraction_metadata["success"]).sum(), + "ratio_extracted": len(extraction_metadata) / len(metadata), + } + ) + + return extraction_metadata, extracted + + @op( required_resource_keys=["experiment_tracker"], out={ @@ -82,28 +99,20 @@ class GetMostRecentRunResultsConfig(Config): "extracted": Out(io_manager_key="mlflow_pandas_artifact_io_manager"), }, ) -def log_extraction_data( - metadata: pd.DataFrame, +def merge_extracted_data( extraction_metadata: list[pd.DataFrame], extracted: list[pd.DataFrame], previous_run_extraction_metadata: pd.DataFrame, previous_run_extracted_data: pd.DataFrame, ): - """Log results from extraction run.""" + """Data is extracted in parallel ops, merge these plus any data from previous run.""" extraction_metadata = pd.concat( extraction_metadata + [previous_run_extraction_metadata] ) extracted = pd.concat(extracted + [previous_run_extracted_data]) # Use metadata to log generic metrics extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) - 
mlflow.log_metrics( - { - "num_failed": (~extraction_metadata["success"]).sum(), - "ratio_extracted": len(extraction_metadata) / len(metadata), - } - ) - # Return metadata and extracted data (they'll be logged as artifacts by io-manager) return extraction_metadata, extracted @@ -149,15 +158,19 @@ def extract_filings(previous_extraction_metadata, previous_extracted): filing_chunks = chunk_filings(filings_to_extract) extraction_metadata, extracted = filing_chunks.map(extract_op) - - return log_extraction_data( - metadata, + extraction_metadata, extracted = merge_extracted_data( extraction_metadata.collect(), extracted.collect(), previous_extraction_metadata, previous_extracted, ) + return log_extraction_data( + metadata, + extraction_metadata, + extracted, + ) + return extract_filings @@ -227,6 +240,9 @@ def get_starting_data(): return merge_branches([previous_data, new_data]) +basic_10k_extract_graph = extract_graph_factory("basic_10k", basic_10k.extract) + + @pudl_model( "basic_10k_extraction", resources={"cloud_interface": cloud_interface_resource} ) @@ -234,9 +250,7 @@ def get_starting_data(): def basic_10k_extraction_model(): """Implement basic 10k extraction pudl_model.""" previous_extraction_metadata, previous_extracted = get_starting_data() - return extract_graph_factory("basic_10k", basic_10k.extract)( - previous_extraction_metadata, previous_extracted - ) + return basic_10k_extract_graph(previous_extraction_metadata, previous_extracted) def compute_validation_metrics( diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index d4421fb..23ce092 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -18,51 +18,6 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -@pytest.fixture -def filings_metadata() -> pd.DataFrame: - """Return fake filing metadata.""" - return pd.DataFrame( - { - "filename": [ - "filing1", - "filing2", - "filing3", - "filing3", - "filing4", 
- "filing5", - ], - } - ) - - -@pytest.fixture -def first_run_results(): - """Metadata and extracted table from first run of extractor.""" - return ( - pd.DataFrame( - { - "filename": ["filing1", "filing2", "filing3"], - "success": [True, True, False], - } - ).set_index("filename"), - pd.DataFrame({"column": ["extracted table (not needed for test)"]}), - ) - - -@pytest.fixture -def second_run_results(): - """Metadata and extracted table from first run of extractor.""" - return ( - pd.DataFrame( - { - "filename": ["filing1", "filing2", "filing3", "filing4", "filing5"], - "success": [True, True, False, True, True], - }, - ).set_index("filename"), - pd.DataFrame({"column": ["extracted table (not needed for test)"]}), - ) - - @pytest.mark.parametrize( "filings_metadata,previous_extraction_metadata,num_filings,num_failed", [ @@ -154,7 +109,7 @@ def test_extract( experiment_tracker=test_tracker ), } - extraction_metadata = ( + extraction_metadata, extracted = ( test_graph.to_job() .execute_in_process( resources=resources, From c57818a3f416cc4e2e4ec0e46e25e3fb0ef2231b Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 29 Aug 2024 10:06:34 -0400 Subject: [PATCH 014/161] Add mlflow metrics io-manager --- .../library/experiment_tracking/__init__.py | 2 +- .../experiment_tracking/mlflow_io_managers.py | 54 +++++++++++++------ src/mozilla_sec_eia/library/models.py | 4 ++ src/mozilla_sec_eia/models/sec10k/extract.py | 27 +++++----- tests/unit/models/sec10k/extract_test.py | 35 ++++++------ 5 files changed, 75 insertions(+), 47 deletions(-) diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py index 5a468d7..edf7817 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py @@ -1,6 +1,6 @@ """Implement tooling to interface with mlflow experiment tracking.""" -from .mlflow_io_managers import MlflowPandasArtifactIOManager 
+from .mlflow_io_managers import MlflowMetricsIOManager, MlflowPandasArtifactIOManager from .mlflow_resource import ( ExperimentTracker, experiment_tracker_teardown_factory, diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py index a1a6850..bc42138 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py @@ -16,12 +16,32 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -class MlflowPandasArtifactIOManager(ConfigurableIOManager): - """Implement IO manager for logging/loading parquet files as mlflow artifacts.""" +class MlflowBaseIOManager(ConfigurableIOManager): + """Specify base config and implement helper functions for mlflow io-managers.""" experiment_tracker: ExperimentTracker #: By default handles artifacts from current run, but can be used with previous run. use_previous_mlflow_run: bool = False + + def _get_run_info(self) -> Run: + """Use `dagster_run_id` and `use_previous_mlflow_run` to get run info from appropriate mlflow run.""" + dagster_run_id = self.experiment_tracker.get_run_id() + filter_string = f"tags.dagster_run_id='{dagster_run_id}'" + if self.use_previous_mlflow_run: + filter_string = f"tags.dagster_run_id!='{dagster_run_id}'" + + run_metadata = mlflow.search_runs( + experiment_names=[self.experiment_tracker.experiment_name], + filter_string=filter_string, + ) + + # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run + return mlflow.get_run(run_metadata.loc[0, "run_id"]) + + +class MlflowPandasArtifactIOManager(MlflowBaseIOManager): + """Implement IO manager for logging/loading parquet files as mlflow artifacts.""" + file_type: Literal["parquet", "csv"] = "parquet" def _load_artifact_as_csv(self, run: Run, artifact_name: str) -> pd.DataFrame: @@ -69,21 +89,6 @@ def handle_output(self, context: 
OutputContext, df: pd.DataFrame): else: self._log_artifact_as_parquet(df, artifact_name=f"{context.name}.parquet") - def _get_run_info(self) -> Run: - """Use `dagster_run_id` and `use_previous_mlflow_run` to get run info from appropriate mlflow run.""" - dagster_run_id = self.experiment_tracker.get_run_id() - filter_string = f"tags.dagster_run_id='{dagster_run_id}'" - if self.use_previous_mlflow_run: - filter_string = f"tags.dagster_run_id!='{dagster_run_id}'" - - run_metadata = mlflow.search_runs( - experiment_names=[self.experiment_tracker.experiment_name], - filter_string=filter_string, - ) - - # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run - return mlflow.get_run(run_metadata.loc[0, "run_id"]) - def load_input(self, context: InputContext) -> pd.DataFrame: """Handle loading dataframes from mlflow run artifacts.""" mlflow_run = self._get_run_info() @@ -98,3 +103,18 @@ def load_input(self, context: InputContext) -> pd.DataFrame: ) return df + + +class MlflowMetricsIOManager(MlflowBaseIOManager): + """Log/load models from mlflow tracking server.""" + + experiment_tracker: ExperimentTracker + + def handle_output(self, context: OutputContext, obj: dict[str, float]): + """Log metrics to mlflow run/experiment from `experiment_tracker`.""" + mlflow.log_metrics(obj) + + def load_input(self, context: OutputContext) -> dict[str, float]: + """Log metrics to mlflow run/experiment from `experiment_tracker`.""" + run = self._get_run_info() + return run.data.metrics diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models.py index 427d21d..f506d76 100644 --- a/src/mozilla_sec_eia/library/models.py +++ b/src/mozilla_sec_eia/library/models.py @@ -39,6 +39,7 @@ from .experiment_tracking import ( ExperimentTracker, + MlflowMetricsIOManager, MlflowPandasArtifactIOManager, experiment_tracker_teardown_factory, ) @@ -138,6 +139,9 @@ def _decorator(model_graph: GraphDefinition): 
file_type=mlflow_pandas_io_manager_file_type, experiment_tracker=experiment_tracker, ), + "mlflow_metrics_io_manager": MlflowMetricsIOManager( + experiment_tracker=experiment_tracker, + ), } | resources default_config = RunConfig( diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index f759ddf..53e3b41 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -3,7 +3,6 @@ import logging import math -import mlflow import numpy as np import pandas as pd import pandera as pa @@ -75,21 +74,17 @@ class GetMostRecentRunResultsConfig(Config): continue_run: bool = False -@op(required_resource_keys=["experiment_tracker"]) +@op(out={"basic_extraction_metrics": Out(io_manager_key="mlflow_metrics_io_manager")}) def log_extraction_data( metadata: pd.DataFrame, extraction_metadata: pd.DataFrame, extracted: pd.DataFrame, ): """Log results from extraction run.""" - mlflow.log_metrics( - { - "num_failed": (~extraction_metadata["success"]).sum(), - "ratio_extracted": len(extraction_metadata) / len(metadata), - } - ) - - return extraction_metadata, extracted + return { + "num_failed": (~extraction_metadata["success"]).sum(), + "ratio_extracted": len(extraction_metadata) / len(metadata), + } @op( @@ -147,7 +142,14 @@ def extract_graph_factory( """Produce a `pudl_model` to extract data from sec10k filings.""" experiment_name = f"{dataset_name}_extraction" - @graph(name=experiment_name) + @graph( + name=experiment_name, + out={ + "extraction_metadata": GraphOut(), + "extracted": GraphOut(), + "extraction_metrics": GraphOut(), + }, + ) def extract_filings(previous_extraction_metadata, previous_extracted): metadata = get_filing_metadata() filings_to_extract = get_filings_to_extract( @@ -165,11 +167,12 @@ def extract_filings(previous_extraction_metadata, previous_extracted): previous_extracted, ) - return log_extraction_data( + extraction_metrics = log_extraction_data( metadata, 
extraction_metadata, extracted, ) + return extraction_metadata, extracted, extraction_metrics return extract_filings diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index 23ce092..2b41936 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -6,6 +6,7 @@ import pytest from dagster import Out, RunConfig, op from mozilla_sec_eia.library.experiment_tracking.mlflow_io_managers import ( + MlflowMetricsIOManager, MlflowPandasArtifactIOManager, ) from mozilla_sec_eia.models.sec10k.extract import ( @@ -108,24 +109,23 @@ def test_extract( "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager( experiment_tracker=test_tracker ), + "mlflow_metrics_io_manager": MlflowMetricsIOManager( + experiment_tracker=test_tracker, + ), } - extraction_metadata, extracted = ( - test_graph.to_job() - .execute_in_process( - resources=resources, - run_config=RunConfig( - { - "get_filings_to_extract": FilingsToExtractConfig( - num_filings=num_filings - ) - } - ), - input_values={ - "previous_extraction_metadata": previous_extraction_metadata, - "previous_extracted": pd.DataFrame(), - }, - ) - .output_value() + graph_result = test_graph.to_job().execute_in_process( + resources=resources, + run_config=RunConfig( + {"get_filings_to_extract": FilingsToExtractConfig(num_filings=num_filings)} + ), + input_values={ + "previous_extraction_metadata": previous_extraction_metadata, + "previous_extracted": pd.DataFrame(), + }, + ) + extraction_metadata, metrics = ( + graph_result.output_value("extraction_metadata"), + graph_result.output_value("extraction_metrics"), ) run = get_most_recent_mlflow_run_factory(experiment_name) @@ -133,6 +133,7 @@ def test_extract( assert run.data.metrics["ratio_extracted"] == len(extraction_metadata) / len( filings_metadata ) + assert run.data.metrics == metrics @pytest.mark.parametrize( From 625783b018ec8005658c818f1ea58e39810509f2 Mon Sep 17 00:00:00 2001 From: 
zschira Date: Thu, 29 Aug 2024 11:52:42 -0400 Subject: [PATCH 015/161] Change pudl_model to pudl_pipeline --- src/mozilla_sec_eia/library/__init__.py | 9 +-- .../library/experiment_tracking/__init__.py | 31 ++++++- .../library/experiment_tracking/validation.py | 58 ++++++++++++++ .../library/models/__init__.py | 3 + .../{models.py => models/pipelines.py} | 80 ++++++++----------- src/mozilla_sec_eia/models/sec10k/extract.py | 51 ++---------- .../{model_jobs.py => pudl_pipelines.py} | 4 +- tests/unit/models/sec10k/extract_test.py | 74 ----------------- 8 files changed, 137 insertions(+), 173 deletions(-) create mode 100644 src/mozilla_sec_eia/library/experiment_tracking/validation.py create mode 100644 src/mozilla_sec_eia/library/models/__init__.py rename src/mozilla_sec_eia/library/{models.py => models/pipelines.py} (69%) rename src/mozilla_sec_eia/{model_jobs.py => pudl_pipelines.py} (77%) diff --git a/src/mozilla_sec_eia/library/__init__.py b/src/mozilla_sec_eia/library/__init__.py index f3448ee..3bd8694 100644 --- a/src/mozilla_sec_eia/library/__init__.py +++ b/src/mozilla_sec_eia/library/__init__.py @@ -3,11 +3,6 @@ from . 
import models -def get_ml_model_resources(): - """Return default configuration for all PUDL models.""" - return models.MODEL_RESOURCES - - -def get_ml_model_jobs() -> list[str]: +def get_ml_pipeline_jobs() -> list[str]: """Return all jobs created through `pudl_model` decorator.""" - return list(models.PUDL_MODELS.values()) + return list(models.PUDL_PIPELINES.values()) diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py index edf7817..fe6a070 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/__init__.py @@ -1,8 +1,37 @@ """Implement tooling to interface with mlflow experiment tracking.""" -from .mlflow_io_managers import MlflowMetricsIOManager, MlflowPandasArtifactIOManager +from .mlflow_io_managers import ( + MlflowBaseIOManager, + MlflowMetricsIOManager, + MlflowPandasArtifactIOManager, +) from .mlflow_resource import ( ExperimentTracker, experiment_tracker_teardown_factory, get_most_recent_run, ) + + +def get_mlflow_io_manager( + key: str, experiment_tracker: ExperimentTracker, pandas_file_type: str = "parquet" +) -> MlflowBaseIOManager: + """Construct IO-manager based on key.""" + if key == "mlflow_pandas_artifact_io_manager": + io_manager = MlflowPandasArtifactIOManager( + file_type=pandas_file_type, + experiment_tracker=experiment_tracker, + ) + elif key == "previous_run_mlflow_pandas_artifact_io_manager": + io_manager = MlflowPandasArtifactIOManager( + file_type=pandas_file_type, + experiment_tracker=experiment_tracker, + use_previous_mlflow_run=True, + ) + elif key == "mlflow_metrics_io_manager": + io_manager = MlflowMetricsIOManager( + experiment_tracker=experiment_tracker, + ) + else: + raise RuntimeError(f"MlFlow IO-manager, {key}, does not exist.") + + return io_manager diff --git a/src/mozilla_sec_eia/library/experiment_tracking/validation.py 
b/src/mozilla_sec_eia/library/experiment_tracking/validation.py new file mode 100644 index 0000000..c8aebe1 --- /dev/null +++ b/src/mozilla_sec_eia/library/experiment_tracking/validation.py @@ -0,0 +1,58 @@ +"""Implement common utilities/functions for validating models.""" + +import pandas as pd +from dagster import OpDefinition, Out, op + + +def _pandas_compute_precision_recall( + computed_set: pd.DataFrame, + validation_set: pd.DataFrame, + value_col: str, +) -> dict: + """Compute precision and recall for extraction compared to validation set. + + Arg: + computed_set: Extracted data. + validation_set: Expected extraction results. + value_col: Column to compare when computing metrics. + """ + # Get initial length of both sets + computed_len = len(computed_set) + validation_len = len(validation_set) + + # Get index of rows only in one set and make Null in other set + idx_validation_only = validation_set.index.difference(computed_set.index) + padded_compute_set = pd.concat( + [ + computed_set[value_col], + pd.Series([None] * len(idx_validation_only), index=idx_validation_only), + ] + ).sort_index() + idx_compute_only = computed_set.index.difference(validation_set.index) + padded_validation_set = pd.concat( + [ + validation_set[value_col], + pd.Series([None] * len(idx_compute_only), index=idx_compute_only), + ] + ).sort_index() + + true_positives = (padded_compute_set == padded_validation_set).sum() + + return { + "precision": true_positives / computed_len, + "recall": true_positives / validation_len, + } + + +def pandas_precision_recall_op_factory(value_col: str) -> OpDefinition: + """Return an op that will compute precision/recall on `value_col` of dataframe.""" + + @op( + out={ + "precision_recall_metrics": Out(io_manager_key="mlflow_metrics_io_manager") + } + ) + def _precision_recall_op(computed_set: pd.DataFrame, validation_set: pd.DataFrame): + return _pandas_compute_precision_recall(computed_set, validation_set, value_col) + + return _precision_recall_op diff 
--git a/src/mozilla_sec_eia/library/models/__init__.py b/src/mozilla_sec_eia/library/models/__init__.py new file mode 100644 index 0000000..3deb3bb --- /dev/null +++ b/src/mozilla_sec_eia/library/models/__init__.py @@ -0,0 +1,3 @@ +"""Implement top level framework and utilities for defining pudl models/pipelines.""" + +from .pipelines import PUDL_PIPELINES, PudlPipelineConfig, pudl_pipeline diff --git a/src/mozilla_sec_eia/library/models.py b/src/mozilla_sec_eia/library/models/pipelines.py similarity index 69% rename from src/mozilla_sec_eia/library/models.py rename to src/mozilla_sec_eia/library/models/pipelines.py index f506d76..48a0a8f 100644 --- a/src/mozilla_sec_eia/library/models.py +++ b/src/mozilla_sec_eia/library/models/pipelines.py @@ -18,11 +18,9 @@ yaml configuration, but will only be used for a single run. """ -import importlib import logging import mlflow -import yaml from dagster import ( EnvVar, GraphDefinition, @@ -36,30 +34,16 @@ success_hook, ) from mlflow.entities.run_status import RunStatus +from pydantic import BaseModel -from .experiment_tracking import ( +from ..experiment_tracking import ( ExperimentTracker, - MlflowMetricsIOManager, - MlflowPandasArtifactIOManager, experiment_tracker_teardown_factory, + get_mlflow_io_manager, ) logger = logging.getLogger(f"catalystcoop.{__name__}") -MODEL_RESOURCES = {} -PUDL_MODELS = {} - - -def get_yml_config(experiment_name: str) -> dict: - """Load model configuration from yaml file.""" - config_file = ( - importlib.resources.files("pudl.package_data.settings") / "pudl_models.yml" - ) - config = yaml.safe_load(config_file.open("r")) - - if not (model_config := config.get(experiment_name)): - raise RuntimeError(f"No {experiment_name} entry in {config_file}") - - return {experiment_name: model_config} +PUDL_PIPELINES = {} def get_default_config(model_graph: GraphDefinition) -> dict: @@ -86,7 +70,7 @@ def _get_default_from_ops(node: OpDefinition | GraphDefinition): return config -def 
get_pudl_model_job_name(experiment_name: str) -> str: +def get_pudl_pipeline_job_name(experiment_name: str) -> str: """Return expected pudl model job name based on experiment_name.""" return f"{experiment_name}_job" @@ -109,47 +93,51 @@ def _end_mlflow_run_with_failure(context: HookContext): mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED)) -def pudl_model( - experiment_name: str, - mlflow_pandas_io_manager_file_type: str = "parquet", +class PudlPipelineConfig(BaseModel): + """Define a config format for `pudl_pipeline`'s.""" + + experiment_name: str + op_config: dict = {} + required_mlflow_io_managers: list[str] = [ + "mlflow_pandas_artifact_io_manager", + "previous_run_mlflow_pandas_artifact_io_manager", + "mlflow_metrics_io_manager", + ] + pandas_io_file_type: str = "parquet" + + +def pudl_pipeline( + config: PudlPipelineConfig, resources: dict[str, ResourceDefinition] = {}, - config_from_yaml: bool = False, ) -> JobDefinition: """Decorator for an ML model that will handle providing configuration to dagster.""" def _decorator(model_graph: GraphDefinition): - model_config = get_default_config(model_graph) - if config_from_yaml: - model_config |= get_yml_config(model_graph.name) + model_config = get_default_config(model_graph) | config.op_config # Add resources to resource dict experiment_tracker = ExperimentTracker( - experiment_name=experiment_name, + experiment_name=config.experiment_name, tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), project=EnvVar("GCS_PROJECT"), ) - model_resources = { - "experiment_tracker": experiment_tracker, - "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager( - file_type=mlflow_pandas_io_manager_file_type, - experiment_tracker=experiment_tracker, - ), - "previous_run_mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager( - use_previous_mlflow_run=True, - file_type=mlflow_pandas_io_manager_file_type, - experiment_tracker=experiment_tracker, - ), - "mlflow_metrics_io_manager": MlflowMetricsIOManager( - 
experiment_tracker=experiment_tracker, - ), - } | resources + model_resources = ( + {"experiment_tracker": experiment_tracker} + | { + key: get_mlflow_io_manager( + key, experiment_tracker, config.pandas_io_file_type + ) + for key in config.required_mlflow_io_managers + } + | resources + ) default_config = RunConfig( ops=model_config, ) @job( - name=get_pudl_model_job_name(experiment_name), + name=get_pudl_pipeline_job_name(config.experiment_name), config=default_config, hooks={_log_config_hook, _end_mlflow_run_with_failure}, resource_defs=model_resources, @@ -163,7 +151,7 @@ def model_job(**kwargs): # Pass output to teardown to create a dependency tracker_teardown(graph_output) - PUDL_MODELS[get_pudl_model_job_name(experiment_name)] = model_job + PUDL_PIPELINES[get_pudl_pipeline_job_name(config.experiment_name)] = model_job return model_job return _decorator diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 53e3b41..676ec38 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -20,7 +20,7 @@ op, ) -from mozilla_sec_eia.library.models import pudl_model +from mozilla_sec_eia.library.models import PudlPipelineConfig, pudl_pipeline from . 
import basic_10k from .utils.cloud import GCSArchive, cloud_interface_resource @@ -246,51 +246,16 @@ def get_starting_data(): basic_10k_extract_graph = extract_graph_factory("basic_10k", basic_10k.extract) -@pudl_model( - "basic_10k_extraction", resources={"cloud_interface": cloud_interface_resource} +basic_10k_extract_config = PudlPipelineConfig( + experiment_name="basic_10k_extraction", +) + + +@pudl_pipeline( + basic_10k_extract_config, resources={"cloud_interface": cloud_interface_resource} ) @graph def basic_10k_extraction_model(): """Implement basic 10k extraction pudl_model.""" previous_extraction_metadata, previous_extracted = get_starting_data() return basic_10k_extract_graph(previous_extraction_metadata, previous_extracted) - - -def compute_validation_metrics( - computed_set: pd.DataFrame, - validation_set: pd.DataFrame, - value_col: str, -) -> dict: - """Compute precision and recall for extraction compared to validation set. - - Arg: - computed_set: Extracted data. - validation_set: Expected extraction results. - value_col: Column to compare when computing metrics. 
- """ - # Get initial length of both sets - computed_len = len(computed_set) - validation_len = len(validation_set) - - # Get index of rows only in one set and make Null in other set - idx_validation_only = validation_set.index.difference(computed_set.index) - padded_compute_set = pd.concat( - [ - computed_set[value_col], - pd.Series([None] * len(idx_validation_only), index=idx_validation_only), - ] - ).sort_index() - idx_compute_only = computed_set.index.difference(validation_set.index) - padded_validation_set = pd.concat( - [ - validation_set[value_col], - pd.Series([None] * len(idx_compute_only), index=idx_compute_only), - ] - ).sort_index() - - true_positives = (padded_compute_set == padded_validation_set).sum() - - return { - "precision": true_positives / computed_len, - "recall": true_positives / validation_len, - } diff --git a/src/mozilla_sec_eia/model_jobs.py b/src/mozilla_sec_eia/pudl_pipelines.py similarity index 77% rename from src/mozilla_sec_eia/model_jobs.py rename to src/mozilla_sec_eia/pudl_pipelines.py index 2a8918b..42cf090 100644 --- a/src/mozilla_sec_eia/model_jobs.py +++ b/src/mozilla_sec_eia/pudl_pipelines.py @@ -5,12 +5,12 @@ import coloredlogs from dagster import Definitions -from mozilla_sec_eia.library import get_ml_model_jobs +from mozilla_sec_eia.library import get_ml_pipeline_jobs logger = logging.getLogger("catalystcoop") log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" coloredlogs.install(fmt=log_format, logger=logger) defs = Definitions( - jobs=get_ml_model_jobs(), + jobs=get_ml_pipeline_jobs(), ) diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index 2b41936..7d7e716 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -11,7 +11,6 @@ ) from mozilla_sec_eia.models.sec10k.extract import ( FilingsToExtractConfig, - compute_validation_metrics, extract_graph_factory, ) from 
mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive @@ -134,76 +133,3 @@ def test_extract( filings_metadata ) assert run.data.metrics == metrics - - -@pytest.mark.parametrize( - "computed_set,validation_set,expected_precision,expected_recall", - [ - ( - pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]), - pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]), - 1, - 1, - ), - ( - pd.DataFrame({"value": ["a", "b", "c", "d"]}, index=[0, 1, 2, 3]), - pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]), - 3 / 4, - 1, - ), - ( - pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]), - pd.DataFrame({"value": ["a", "b", "c", "d"]}, index=[0, 1, 2, 3]), - 1, - 3 / 4, - ), - ( - pd.DataFrame({"value": ["a", "b", "d"]}, index=[0, 1, 2]), - pd.DataFrame({"value": ["a", "b", "c"]}, index=[0, 1, 2]), - 2 / 3, - 2 / 3, - ), - ( - pd.DataFrame( - {"value": ["a", "b", "d"], "idx0": ["1", "2", "3"], "idx1": [4, 2, 1]} - ).set_index(["idx0", "idx1"]), - pd.DataFrame( - {"value": ["a", "b", "c"], "idx0": ["1", "2", "3"], "idx1": [4, 2, 1]} - ).set_index(["idx0", "idx1"]), - 2 / 3, - 2 / 3, - ), - ( - pd.DataFrame( - { - "value": ["a", "b", "c", "d"], - "idx0": ["1", "2", "3", "4"], - "idx1": [4, 2, 1, 5], - } - ).set_index(["idx0", "idx1"]), - pd.DataFrame( - {"value": ["a", "b", "c"], "idx0": ["1", "2", "3"], "idx1": [4, 2, 1]} - ).set_index(["idx0", "idx1"]), - 3 / 4, - 1, - ), - ( - pd.DataFrame( - {"value": ["c", "b", "a"], "idx0": ["3", "2", "1"], "idx1": [1, 2, 4]} - ).set_index(["idx0", "idx1"]), - pd.DataFrame( - {"value": ["a", "b", "c"], "idx0": ["1", "2", "3"], "idx1": [4, 2, 1]} - ).set_index(["idx0", "idx1"]), - 1, - 1, - ), - ], -) -def test_compute_validation_metrics( - computed_set, validation_set, expected_precision, expected_recall -): - """Test validation metrics with test sets.""" - metrics = compute_validation_metrics(computed_set, validation_set, "value") - - assert metrics["precision"] == expected_precision - assert 
metrics["recall"] == expected_recall From 4f50a7ba1f2dc69ebecfa416fb720a86607966ed Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 30 Aug 2024 10:02:15 -0400 Subject: [PATCH 016/161] Add validation pipeline --- .../experiment_tracking/mlflow_io_managers.py | 4 +- .../experiment_tracking/mlflow_resource.py | 7 ++- .../library/experiment_tracking/validation.py | 57 ++++++++++++------- .../library/models/pipelines.py | 7 ++- src/mozilla_sec_eia/models/sec10k/extract.py | 53 ++++++++++++++--- .../basic_10k_labels.csv | 0 .../{ => validation_data}/ex21_labels.csv | 0 tests/unit/models/sec10k/extract_test.py | 18 +----- 8 files changed, 96 insertions(+), 50 deletions(-) rename src/mozilla_sec_eia/package_data/{ => validation_data}/basic_10k_labels.csv (100%) rename src/mozilla_sec_eia/package_data/{ => validation_data}/ex21_labels.csv (100%) diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py index bc42138..3ad69ab 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py @@ -95,11 +95,11 @@ def load_input(self, context: InputContext) -> pd.DataFrame: if self.file_type == "csv": df = self._load_artifact_as_csv( - mlflow_run, artifact_name=f"{context.name}.csv" + mlflow_run, artifact_name=f"{context.upstream_output.name}.csv" ) else: df = self._load_artifact_as_parquet( - mlflow_run, artifact_name=f"{context.name}.parquet" + mlflow_run, artifact_name=f"{context.upstream_output.name}.parquet" ) return df diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py index 744c013..38e750f 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py @@ -78,7 +78,9 @@ def yield_for_execution( if 
(active_run := mlflow.active_run()) is not None: if active_run.info.run_id != mlflow_run_id: - raise RuntimeError("Found conflicting active mlflow run!") + raise RuntimeError( + f"Found conflicting active mlflow run! - {active_run.info.run_id} != {mlflow_run_id}" + ) yield self else: # Create new run under specified experiment @@ -154,9 +156,8 @@ def get_or_create_experiment( def experiment_tracker_teardown_factory( experiment_name: str, -) -> ExperimentTracker: +): """Use config to create an experiment tracker.""" - atexit.unregister(mlflow.end_run) @op( name=f"{experiment_name}_tracker_teardown", diff --git a/src/mozilla_sec_eia/library/experiment_tracking/validation.py b/src/mozilla_sec_eia/library/experiment_tracking/validation.py index c8aebe1..bed3a17 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/validation.py +++ b/src/mozilla_sec_eia/library/experiment_tracking/validation.py @@ -1,13 +1,46 @@ """Implement common utilities/functions for validating models.""" +from importlib import resources + import pandas as pd -from dagster import OpDefinition, Out, op +from dagster import Config, Out, op + + +class LoadValidationConfig(Config): + """Configuration for loading validation data.""" + + filename: str + + +@op( + required_resource_keys=["experiment_tracker"], + out={"validation_set": Out(io_manager_key="mlflow_pandas_artifact_io_manager")}, +) +def load_validation_data(config: LoadValidationConfig) -> pd.DataFrame: + """Load csv with validation data from `package_data` directory.""" + return pd.read_csv( + resources.files("mozilla_sec_eia.package_data.validation_data") + / config.filename + ) + +class PandasPrecisionRecallConfig(Config): + """Configuration for computing precision/recall from pandas dataframe.""" -def _pandas_compute_precision_recall( + value_col: str + + +@op( + out={ + "pandas_precision_recall_metrics": Out( + io_manager_key="mlflow_metrics_io_manager" + ) + } +) +def pandas_compute_precision_recall( + config: 
PandasPrecisionRecallConfig, computed_set: pd.DataFrame, validation_set: pd.DataFrame, - value_col: str, ) -> dict: """Compute precision and recall for extraction compared to validation set. @@ -24,14 +57,14 @@ def _pandas_compute_precision_recall( idx_validation_only = validation_set.index.difference(computed_set.index) padded_compute_set = pd.concat( [ - computed_set[value_col], + computed_set[config.value_col], pd.Series([None] * len(idx_validation_only), index=idx_validation_only), ] ).sort_index() idx_compute_only = computed_set.index.difference(validation_set.index) padded_validation_set = pd.concat( [ - validation_set[value_col], + validation_set[config.value_col], pd.Series([None] * len(idx_compute_only), index=idx_compute_only), ] ).sort_index() @@ -42,17 +75,3 @@ def _pandas_compute_precision_recall( "precision": true_positives / computed_len, "recall": true_positives / validation_len, } - - -def pandas_precision_recall_op_factory(value_col: str) -> OpDefinition: - """Return an op that will compute precision/recall on `value_col` of dataframe.""" - - @op( - out={ - "precision_recall_metrics": Out(io_manager_key="mlflow_metrics_io_manager") - } - ) - def _precision_recall_op(computed_set: pd.DataFrame, validation_set: pd.DataFrame): - return _pandas_compute_precision_recall(computed_set, validation_set, value_col) - - return _precision_recall_op diff --git a/src/mozilla_sec_eia/library/models/pipelines.py b/src/mozilla_sec_eia/library/models/pipelines.py index 48a0a8f..cfecc0d 100644 --- a/src/mozilla_sec_eia/library/models/pipelines.py +++ b/src/mozilla_sec_eia/library/models/pipelines.py @@ -30,6 +30,7 @@ ResourceDefinition, RunConfig, failure_hook, + graph, job, success_hook, ) @@ -112,8 +113,10 @@ def pudl_pipeline( ) -> JobDefinition: """Decorator for an ML model that will handle providing configuration to dagster.""" - def _decorator(model_graph: GraphDefinition): - model_config = get_default_config(model_graph) | config.op_config + def 
_decorator(pipeline_func): + model_graph = graph(pipeline_func) + model_config = get_default_config(model_graph) + model_config[model_graph.name]["ops"] |= config.op_config # Add resources to resource dict experiment_tracker = ExperimentTracker( diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 676ec38..21438c9 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -20,6 +20,7 @@ op, ) +from mozilla_sec_eia.library.experiment_tracking import validation from mozilla_sec_eia.library.models import PudlPipelineConfig, pudl_pipeline from . import basic_10k @@ -42,10 +43,10 @@ class ExtractionMetadataSchema(pa.DataFrameModel): @op def get_filing_metadata( - cloud_interface: GCSArchive, + cloud_interface: GCSArchive, filenames: list[str] | None = None ) -> pd.DataFrame: """Return filing metadata.""" - return cloud_interface.get_metadata() + return cloud_interface.get_metadata(filenames=filenames) class ChunkFilingsConfig(Config): @@ -150,8 +151,7 @@ def extract_graph_factory( "extraction_metrics": GraphOut(), }, ) - def extract_filings(previous_extraction_metadata, previous_extracted): - metadata = get_filing_metadata() + def extract_filings(metadata, previous_extraction_metadata, previous_extracted): filings_to_extract = get_filings_to_extract( metadata, previous_extraction_metadata, @@ -254,8 +254,47 @@ def get_starting_data(): @pudl_pipeline( basic_10k_extract_config, resources={"cloud_interface": cloud_interface_resource} ) -@graph -def basic_10k_extraction_model(): +def basic_10k_extraction_pipeline(): """Implement basic 10k extraction pudl_model.""" + filing_metadata = get_filing_metadata() previous_extraction_metadata, previous_extracted = get_starting_data() - return basic_10k_extract_graph(previous_extraction_metadata, previous_extracted) + return basic_10k_extract_graph( + filing_metadata, previous_extraction_metadata, previous_extracted + ) + + 
+@op +def get_validation_filenames(validation_set: pd.DataFrame) -> list[str]: + """Return filenames in validation set.""" + return list(validation_set["filename"]) + + +basic_10k_extract_validation_config = PudlPipelineConfig( + experiment_name="basic_10k_extraction_validation", + pandas_io_file_type="csv", + op_config={ + "load_validation_data": validation.LoadValidationConfig( + filename="basic_10k_labels.csv" + ), + "pandas_compute_precision_recall": validation.PandasPrecisionRecallConfig( + value_col="value" + ), + }, +) + + +@pudl_pipeline( + basic_10k_extract_validation_config, + resources={"cloud_interface": cloud_interface_resource}, +) +def basic_10k_extraction_validation_pipeline(): + """Job to validate basic 10k extraction.""" + validation_set = validation.load_validation_data() + filing_metadata = get_filing_metadata( + filenames=get_validation_filenames(validation_set) + ) + empty_metadata, empty_extracted = get_starting_data() + _, extracted, _ = basic_10k_extract_graph( + filing_metadata, empty_metadata, empty_extracted + ) + return validation.pandas_compute_precision_recall(extracted, validation_set) diff --git a/src/mozilla_sec_eia/package_data/basic_10k_labels.csv b/src/mozilla_sec_eia/package_data/validation_data/basic_10k_labels.csv similarity index 100% rename from src/mozilla_sec_eia/package_data/basic_10k_labels.csv rename to src/mozilla_sec_eia/package_data/validation_data/basic_10k_labels.csv diff --git a/src/mozilla_sec_eia/package_data/ex21_labels.csv b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv similarity index 100% rename from src/mozilla_sec_eia/package_data/ex21_labels.csv rename to src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index 7d7e716..0a12a17 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -13,7 +13,6 @@ FilingsToExtractConfig, 
extract_graph_factory, ) -from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -73,23 +72,8 @@ def test_sec10k_extract_pipeline( ): """Test high level extraction workflow.""" - class FakeArchive(GCSArchive): - filings_bucket_name: str = "" - labels_bucket_name: str = "" - metadata_db_instance_connection: str = "" - user: str = "" - metadata_db_name: str = "" - project: str = "" - - def setup_for_execution(self, context): - pass - - def get_metadata(self): - return filings_metadata - @op(out={"extraction_metadata": Out(), "extracted": Out()}) def test_extract( - cloud_interface: GCSArchive, filings_to_extract: pd.DataFrame, ) -> tuple[pd.DataFrame, pd.DataFrame]: md = filings_to_extract @@ -104,7 +88,6 @@ def test_extract( test_graph = extract_graph_factory("test_extract", test_extract) resources = { "experiment_tracker": test_tracker, - "cloud_interface": FakeArchive(), "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager( experiment_tracker=test_tracker ), @@ -118,6 +101,7 @@ def test_extract( {"get_filings_to_extract": FilingsToExtractConfig(num_filings=num_filings)} ), input_values={ + "metadata": filings_metadata, "previous_extraction_metadata": previous_extraction_metadata, "previous_extracted": pd.DataFrame(), }, From f6ab22cfe2884e4899fd74a2ed8d8c138cb4211f Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 2 Sep 2024 13:00:45 -0400 Subject: [PATCH 017/161] Streamline construction of dagster jobs for running/testing pudl models --- src/mozilla_sec_eia/library/__init__.py | 7 - .../library/experiment_tracking/validation.py | 77 ----- .../__init__.py | 17 +- .../mlflow_io_managers.py | 35 +- .../mlflow_resource.py | 110 ++----- .../library/mlflow/validation.py | 88 +++++ .../library/models/__init__.py | 3 - .../library/models/pipelines.py | 160 --------- src/mozilla_sec_eia/library/pipeline.py | 93 ++++++ .../models/sec10k/basic_10k.py | 167 +++++----- 
src/mozilla_sec_eia/models/sec10k/extract.py | 304 +----------------- src/mozilla_sec_eia/models/sec10k/pipeline.py | 168 ++++++++++ .../models/sec10k/utils/cloud.py | 14 +- src/mozilla_sec_eia/pudl_pipelines.py | 26 +- .../pudl_validation_pipelines.py | 32 ++ tests/conftest.py | 6 +- tests/unit/models/sec10k/extract_test.py | 168 ++++------ 17 files changed, 615 insertions(+), 860 deletions(-) delete mode 100644 src/mozilla_sec_eia/library/experiment_tracking/validation.py rename src/mozilla_sec_eia/library/{experiment_tracking => mlflow}/__init__.py (58%) rename src/mozilla_sec_eia/library/{experiment_tracking => mlflow}/mlflow_io_managers.py (71%) rename src/mozilla_sec_eia/library/{experiment_tracking => mlflow}/mlflow_resource.py (53%) create mode 100644 src/mozilla_sec_eia/library/mlflow/validation.py delete mode 100644 src/mozilla_sec_eia/library/models/__init__.py delete mode 100644 src/mozilla_sec_eia/library/models/pipelines.py create mode 100644 src/mozilla_sec_eia/library/pipeline.py create mode 100644 src/mozilla_sec_eia/models/sec10k/pipeline.py create mode 100644 src/mozilla_sec_eia/pudl_validation_pipelines.py diff --git a/src/mozilla_sec_eia/library/__init__.py b/src/mozilla_sec_eia/library/__init__.py index 3bd8694..3fef8c0 100644 --- a/src/mozilla_sec_eia/library/__init__.py +++ b/src/mozilla_sec_eia/library/__init__.py @@ -1,8 +1 @@ """Implements shared tooling for machine learning models in PUDL.""" - -from . 
import models - - -def get_ml_pipeline_jobs() -> list[str]: - """Return all jobs created through `pudl_model` decorator.""" - return list(models.PUDL_PIPELINES.values()) diff --git a/src/mozilla_sec_eia/library/experiment_tracking/validation.py b/src/mozilla_sec_eia/library/experiment_tracking/validation.py deleted file mode 100644 index bed3a17..0000000 --- a/src/mozilla_sec_eia/library/experiment_tracking/validation.py +++ /dev/null @@ -1,77 +0,0 @@ -"""Implement common utilities/functions for validating models.""" - -from importlib import resources - -import pandas as pd -from dagster import Config, Out, op - - -class LoadValidationConfig(Config): - """Configuration for loading validation data.""" - - filename: str - - -@op( - required_resource_keys=["experiment_tracker"], - out={"validation_set": Out(io_manager_key="mlflow_pandas_artifact_io_manager")}, -) -def load_validation_data(config: LoadValidationConfig) -> pd.DataFrame: - """Load csv with validation data from `package_data` directory.""" - return pd.read_csv( - resources.files("mozilla_sec_eia.package_data.validation_data") - / config.filename - ) - - -class PandasPrecisionRecallConfig(Config): - """Configuration for computing precision/recall from pandas dataframe.""" - - value_col: str - - -@op( - out={ - "pandas_precision_recall_metrics": Out( - io_manager_key="mlflow_metrics_io_manager" - ) - } -) -def pandas_compute_precision_recall( - config: PandasPrecisionRecallConfig, - computed_set: pd.DataFrame, - validation_set: pd.DataFrame, -) -> dict: - """Compute precision and recall for extraction compared to validation set. - - Arg: - computed_set: Extracted data. - validation_set: Expected extraction results. - value_col: Column to compare when computing metrics. 
- """ - # Get initial length of both sets - computed_len = len(computed_set) - validation_len = len(validation_set) - - # Get index of rows only in one set and make Null in other set - idx_validation_only = validation_set.index.difference(computed_set.index) - padded_compute_set = pd.concat( - [ - computed_set[config.value_col], - pd.Series([None] * len(idx_validation_only), index=idx_validation_only), - ] - ).sort_index() - idx_compute_only = computed_set.index.difference(validation_set.index) - padded_validation_set = pd.concat( - [ - validation_set[config.value_col], - pd.Series([None] * len(idx_compute_only), index=idx_compute_only), - ] - ).sort_index() - - true_positives = (padded_compute_set == padded_validation_set).sum() - - return { - "precision": true_positives / computed_len, - "recall": true_positives / validation_len, - } diff --git a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py similarity index 58% rename from src/mozilla_sec_eia/library/experiment_tracking/__init__.py rename to src/mozilla_sec_eia/library/mlflow/__init__.py index fe6a070..380a63c 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/__init__.py +++ b/src/mozilla_sec_eia/library/mlflow/__init__.py @@ -6,30 +6,25 @@ MlflowPandasArtifactIOManager, ) from .mlflow_resource import ( - ExperimentTracker, - experiment_tracker_teardown_factory, + MlflowInterface, get_most_recent_run, ) def get_mlflow_io_manager( - key: str, experiment_tracker: ExperimentTracker, pandas_file_type: str = "parquet" + key: str, + mlflow_interface: MlflowInterface | None = None, + pandas_file_type: str = "parquet", ) -> MlflowBaseIOManager: """Construct IO-manager based on key.""" if key == "mlflow_pandas_artifact_io_manager": io_manager = MlflowPandasArtifactIOManager( file_type=pandas_file_type, - experiment_tracker=experiment_tracker, - ) - elif key == "previous_run_mlflow_pandas_artifact_io_manager": - io_manager = 
MlflowPandasArtifactIOManager( - file_type=pandas_file_type, - experiment_tracker=experiment_tracker, - use_previous_mlflow_run=True, + mlflow_interface=mlflow_interface, ) elif key == "mlflow_metrics_io_manager": io_manager = MlflowMetricsIOManager( - experiment_tracker=experiment_tracker, + mlflow_interface=mlflow_interface, ) else: raise RuntimeError(f"MlFlow IO-manager, {key}, does not exist.") diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py similarity index 71% rename from src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py rename to src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py index 3ad69ab..e78f627 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py @@ -11,7 +11,7 @@ from dagster import ConfigurableIOManager, InputContext, OutputContext from mlflow.entities import Run -from .mlflow_resource import ExperimentTracker +from .mlflow_resource import MlflowInterface logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -19,28 +19,16 @@ class MlflowBaseIOManager(ConfigurableIOManager): """Specify base config and implement helper functions for mlflow io-managers.""" - experiment_tracker: ExperimentTracker + mlflow_interface: MlflowInterface #: By default handles artifacts from current run, but can be used with previous run. 
- use_previous_mlflow_run: bool = False def _get_run_info(self) -> Run: - """Use `dagster_run_id` and `use_previous_mlflow_run` to get run info from appropriate mlflow run.""" - dagster_run_id = self.experiment_tracker.get_run_id() - filter_string = f"tags.dagster_run_id='{dagster_run_id}'" - if self.use_previous_mlflow_run: - filter_string = f"tags.dagster_run_id!='{dagster_run_id}'" - - run_metadata = mlflow.search_runs( - experiment_names=[self.experiment_tracker.experiment_name], - filter_string=filter_string, - ) - - # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run - return mlflow.get_run(run_metadata.loc[0, "run_id"]) + """Get mlflow `Run` object using current run id.""" + return mlflow.get_run(self.mlflow_interface.mlflow_run_id) class MlflowPandasArtifactIOManager(MlflowBaseIOManager): - """Implement IO manager for logging/loading parquet files as mlflow artifacts.""" + """Implement IO manager for logging/loading dataframes as mlflow artifacts.""" file_type: Literal["parquet", "csv"] = "parquet" @@ -79,11 +67,6 @@ def _get_dagster_run_id(self, context: InputContext | OutputContext) -> str: def handle_output(self, context: OutputContext, df: pd.DataFrame): """Attach dataframe to run as artifact.""" - if self.use_previous_mlflow_run: - raise NotImplementedError( - "MlflowPandasArtifactIOManager can not be used to add artifacts to completed run." 
- ) - if self.file_type == "csv": self._log_artifact_as_csv(df, artifact_name=f"{context.name}.csv") else: @@ -106,15 +89,13 @@ def load_input(self, context: InputContext) -> pd.DataFrame: class MlflowMetricsIOManager(MlflowBaseIOManager): - """Log/load models from mlflow tracking server.""" - - experiment_tracker: ExperimentTracker + """Log/load metrics from mlflow tracking server.""" def handle_output(self, context: OutputContext, obj: dict[str, float]): - """Log metrics to mlflow run/experiment from `experiment_tracker`.""" + """Load metrics to mlflow run/experiment created by `MlflowInterface`.""" mlflow.log_metrics(obj) def load_input(self, context: OutputContext) -> dict[str, float]: - """Log metrics to mlflow run/experiment from `experiment_tracker`.""" + """Log metrics to mlflow run/experiment created by `MlflowInterface`.""" run = self._get_run_info() return run.data.metrics diff --git a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py similarity index 53% rename from src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py rename to src/mozilla_sec_eia/library/mlflow/mlflow_resource.py index 38e750f..35dbf26 100644 --- a/src/mozilla_sec_eia/library/experiment_tracking/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py @@ -9,87 +9,71 @@ this is a configurable value, which can be found in the dagster UI. """ -import atexit import logging import os from contextlib import contextmanager import mlflow -from dagster import ConfigurableResource, In, InitResourceContext, Nothing, op +from dagster import ConfigurableResource, EnvVar, InitResourceContext from google.cloud import secretmanager from pydantic import PrivateAttr logger = logging.getLogger(f"catalystcoop.{__name__}") -class ExperimentTracker(ConfigurableResource): - """Class to manage tracking a machine learning model using MLflow. 
+class MlflowInterface(ConfigurableResource): + """Dagster resource to interface with mlflow tracking server. - The following command will launch the mlflow UI to view model results: - `mlflow ui --backend-store-uri {tracking_uri}`. From here, you can compare metrics - from multiple runs, and track performance. + This resource handles configuring mlflow to interface with a remote tracking server. + When `tracking_enabled` is set to True, this resource will also start an mlflow + run that can be used to log metrics/paramaters/artifcats which will be associated + with a validation or training run. In most cases this resource does not need to + be referenced directly, and instead the io-mangers defined in + :mod:`.mlflow_io_managers` should be used. - This class is designed to be created using the `op` :func:`create_experiment_tracker`. - This allows the `ExperimentTracker` to be passed around within a Dagster `graph`, - and be used for mlflow logging in any of the `op`'s that make up the `graph`. This - is useful because Dagster executes `op`'s in separate processes, while mlflow does - not maintain state between processes. This design also allows configuration of - the ExperimentTracker to be set from the Dagster UI. - - Currently, we are only doing experiment tracking in a local context, but if we were - to setup a tracking server, we could point the `tracking_uri` at this remote server - without having to modify the models. Experiment tracking can also be done outside - of the PUDL context. If doing exploratory work in a notebook, you can use mlflow - directly in a notebook with the same experiment name used here, and mlflow will - seamlessly integrate the results with those from PUDL runs. + Note: `tracking_enabled` SHOULD NOT be set when using a dagster multi-process + executor. mlflow will create a new run for every process, which gets very messy. 
""" - tracking_uri: str + tracking_uri: str = EnvVar("MLFLOW_TRACKING_URI") tracking_enabled: bool = True artifact_location: str | None = None experiment_name: str tags: dict = {} - project: str + project: str = EnvVar("GCS_PROJECT") - _run_id: str = PrivateAttr() + _mlflow_run_id: str = PrivateAttr() @contextmanager def yield_for_execution( self, context: InitResourceContext, - ) -> "ExperimentTracker": + ) -> "MlflowInterface": """Create experiment tracker for specified experiment.""" - self._run_id = context.run_id + dagster_run_id = context.run_id + self._mlflow_run_id = None + self._configure_mlflow() if self.tracking_enabled: - self._configure_mlflow() - - # Hack to stop mlflow from ending run at process barrier - # This is borrowed from the official dagster mlflow resource found here: - # https://github.com/dagster-io/dagster/blob/master/python_modules/libraries/dagster-mlflow/dagster_mlflow/resources.py - atexit.unregister(mlflow.end_run) - # Get run_id associated with current dagster run experiment_id = self.get_or_create_experiment( experiment_name=self.experiment_name, artifact_location=self.artifact_location, ) - mlflow_run_id = self._get_mlflow_run_id(context.run_id, experiment_id) - - if (active_run := mlflow.active_run()) is not None: - if active_run.info.run_id != mlflow_run_id: - raise RuntimeError( - f"Found conflicting active mlflow run! 
- {active_run.info.run_id} != {mlflow_run_id}" - ) + # Create new run under specified experiment + with mlflow.start_run( + experiment_id=experiment_id, + tags=self.tags | {"dagster_run_id": dagster_run_id}, + ) as run: + self._mlflow_run_id = run.info.run_id yield self - else: - # Create new run under specified experiment - with mlflow.start_run( - run_id=mlflow_run_id, - experiment_id=experiment_id, - tags=self.tags | {"dagster_run_id": context.run_id}, - ): - yield self + else: + yield self + + @property + def mlflow_run_id(self) -> str | None: + """Return run id of current run.""" + return self._mlflow_run_id def _get_tracking_password(self, version_id: str = "latest"): """Get tracking server password from gcloud secrets.""" @@ -115,22 +99,6 @@ def _configure_mlflow(self): os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900" os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true" - def _get_mlflow_run_id(self, dagster_run_id: str, experiment_id: str): - """Search for existing run tagged with dagster run id or start new run.""" - run_df = mlflow.search_runs( - experiment_ids=[experiment_id], - filter_string=f"tags.dagster_run_id='{dagster_run_id}'", - ) - - run_id = None - if not run_df.empty: - run_id = run_df.loc[0, "run_id"] - return run_id - - def get_run_id(self): - """Return current dagster run_id.""" - return self._run_id - @staticmethod def get_or_create_experiment( experiment_name: str, artifact_location: str = "" @@ -154,22 +122,6 @@ def get_or_create_experiment( return experiment_id -def experiment_tracker_teardown_factory( - experiment_name: str, -): - """Use config to create an experiment tracker.""" - - @op( - name=f"{experiment_name}_tracker_teardown", - required_resource_keys=["experiment_tracker"], - ins={"model_done": In(Nothing)}, - ) - def teardown_experiment_tracker(): - mlflow.end_run() - - return teardown_experiment_tracker - - def get_most_recent_run( experiment_name: str, dagster_run_id: str ) -> mlflow.entities.Run: diff --git 
a/src/mozilla_sec_eia/library/mlflow/validation.py b/src/mozilla_sec_eia/library/mlflow/validation.py new file mode 100644 index 0000000..999cbdd --- /dev/null +++ b/src/mozilla_sec_eia/library/mlflow/validation.py @@ -0,0 +1,88 @@ +"""Implement common utilities/functions for validating models.""" + +from importlib import resources + +import pandas as pd +from dagster import AssetIn, AssetsDefinition, asset + + +def load_validation_data_asset_factory( + asset_name: str, + filename: str, + index_cols: str | list[str] | None = None, +) -> AssetsDefinition: + """Construct asset for loading validation data from CSV in `package_data`.""" + + @asset( + name=asset_name, + io_manager_key="mlflow_pandas_artifact_io_manager", + ) + def load_validation_data() -> pd.DataFrame: + """Load csv with validation data from `package_data` directory.""" + df = pd.read_csv( + resources.files("mozilla_sec_eia.package_data.validation_data") / filename + ) + if index_cols is not None: + df = df.set_index(index_cols) + return df + + return load_validation_data + + +def pandas_precision_recall_asset_factory( + validation_asset: str, + computed_asset: str, + value_col: str, +) -> AssetsDefinition: + """Produce asset to compute precision and recall on pandas dataframe. + + The returned asset will take upstream computed/validation assets and compute + precision/recall on `value_col`. + + Arg: + validation_asset: Upstream asset containing dataframe of validation set. + computed_asset: Upstream asset containing dataframe of computed data. + value_col: Column to compare when computing metrics. 
+ """ + + @asset( + ins={ + "computed_set": AssetIn(computed_asset), + "validation_set": AssetIn(validation_asset), + }, + io_manager_key="mlflow_metrics_io_manager", + ) + def pandas_compute_precision_recall( + computed_set: pd.DataFrame, + validation_set: pd.DataFrame, + ) -> dict: + """Asset which will return computed metrics from dataframes.""" + # Get initial length of both sets + computed_len = len(computed_set) + validation_len = len(validation_set) + + # Get index of rows only in one set and make Null in other set + idx_validation_only = validation_set.index.difference(computed_set.index) + padded_compute_set = pd.concat( + [ + computed_set[value_col], + pd.Series([None] * len(idx_validation_only), index=idx_validation_only), + ] + ).sort_index() + idx_compute_only = computed_set.index.difference(validation_set.index) + padded_validation_set = pd.concat( + [ + validation_set[value_col], + pd.Series([None] * len(idx_compute_only), index=idx_compute_only), + ] + ).sort_index() + + true_positives = (padded_compute_set == padded_validation_set).sum() + + return { + "precision": true_positives / computed_len, + "recall": true_positives / validation_len, + } + + # Return new asset + return pandas_compute_precision_recall diff --git a/src/mozilla_sec_eia/library/models/__init__.py b/src/mozilla_sec_eia/library/models/__init__.py deleted file mode 100644 index 3deb3bb..0000000 --- a/src/mozilla_sec_eia/library/models/__init__.py +++ /dev/null @@ -1,3 +0,0 @@ -"""Implement top level framework and utilities for defining pudl models/pipelines.""" - -from .pipelines import PUDL_PIPELINES, PudlPipelineConfig, pudl_pipeline diff --git a/src/mozilla_sec_eia/library/models/pipelines.py b/src/mozilla_sec_eia/library/models/pipelines.py deleted file mode 100644 index cfecc0d..0000000 --- a/src/mozilla_sec_eia/library/models/pipelines.py +++ /dev/null @@ -1,160 +0,0 @@ -"""Provides tooling for developing/tracking ml models within PUDL. 
- -The main interface from this module is the :func:`pudl_model` decorator, which -is meant to be applied to a dagster `graph`. This decorator will handle finding all -configuration for a model/passing configuration to dagster, creating an -:class:`ExperimentTracker` for the model, and ultimately will return a `job` -from the model. - -There are a few different ways to provide configuration for a PUDL model. First, configuration will come from default values for any dagster `Config`'s which are associated -with `op`'s which make up the model `graph`. For more info on dagster configuration, -see https://docs.dagster.io/concepts/configuration/config-schema. The next way to -provide configuration is through the yaml file: `pudl.package_data.settings.pudl_models.yml`. -Any configuration in this file should be follow dagster's config-schema formatting, -see the `ferc_to_ferc` entry as an example. Configuration provided this way will -override any default values. The final way to provide configuration is through the -dagster UI. To provide configuration this way, click `Open Launchpad` in the UI, and -values can be edited here. This configuration will override both default values and -yaml configuration, but will only be used for a single run. 
-""" - -import logging - -import mlflow -from dagster import ( - EnvVar, - GraphDefinition, - HookContext, - JobDefinition, - OpDefinition, - ResourceDefinition, - RunConfig, - failure_hook, - graph, - job, - success_hook, -) -from mlflow.entities.run_status import RunStatus -from pydantic import BaseModel - -from ..experiment_tracking import ( - ExperimentTracker, - experiment_tracker_teardown_factory, - get_mlflow_io_manager, -) - -logger = logging.getLogger(f"catalystcoop.{__name__}") -PUDL_PIPELINES = {} - - -def get_default_config(model_graph: GraphDefinition) -> dict: - """Get default config values for model.""" - - def _get_default_from_ops(node: OpDefinition | GraphDefinition): - config = {} - if isinstance(node, GraphDefinition): - config = { - "ops": { - child_node.name: _get_default_from_ops(child_node) - for child_node in node.node_defs - } - } - else: - if node.config_schema.default_provided: - config = {"config": node.config_schema.default_value} - else: - config = {"config": None} - - return config - - config = {model_graph.name: _get_default_from_ops(model_graph)} - return config - - -def get_pudl_pipeline_job_name(experiment_name: str) -> str: - """Return expected pudl model job name based on experiment_name.""" - return f"{experiment_name}_job" - - -@success_hook(required_resource_keys={"experiment_tracker"}) -def _log_config_hook(context: HookContext): - if (config := context.op_config) is not None: - mlflow.log_params( - {f"{context.op.name}.{param}": value for param, value in config.items()} - ) - - -@failure_hook(required_resource_keys={"experiment_tracker"}) -def _end_mlflow_run_with_failure(context: HookContext): - exception = context.op_exception - - if isinstance(exception, KeyboardInterrupt): - mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED)) - else: - mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED)) - - -class PudlPipelineConfig(BaseModel): - """Define a config format for `pudl_pipeline`'s.""" - - experiment_name: 
str - op_config: dict = {} - required_mlflow_io_managers: list[str] = [ - "mlflow_pandas_artifact_io_manager", - "previous_run_mlflow_pandas_artifact_io_manager", - "mlflow_metrics_io_manager", - ] - pandas_io_file_type: str = "parquet" - - -def pudl_pipeline( - config: PudlPipelineConfig, - resources: dict[str, ResourceDefinition] = {}, -) -> JobDefinition: - """Decorator for an ML model that will handle providing configuration to dagster.""" - - def _decorator(pipeline_func): - model_graph = graph(pipeline_func) - model_config = get_default_config(model_graph) - model_config[model_graph.name]["ops"] |= config.op_config - - # Add resources to resource dict - experiment_tracker = ExperimentTracker( - experiment_name=config.experiment_name, - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - project=EnvVar("GCS_PROJECT"), - ) - model_resources = ( - {"experiment_tracker": experiment_tracker} - | { - key: get_mlflow_io_manager( - key, experiment_tracker, config.pandas_io_file_type - ) - for key in config.required_mlflow_io_managers - } - | resources - ) - - default_config = RunConfig( - ops=model_config, - ) - - @job( - name=get_pudl_pipeline_job_name(config.experiment_name), - config=default_config, - hooks={_log_config_hook, _end_mlflow_run_with_failure}, - resource_defs=model_resources, - ) - def model_job(**kwargs): - tracker_teardown = experiment_tracker_teardown_factory( - experiment_name=model_graph.name, - ) - graph_output = model_graph(**kwargs) - - # Pass output to teardown to create a dependency - tracker_teardown(graph_output) - - PUDL_PIPELINES[get_pudl_pipeline_job_name(config.experiment_name)] = model_job - return model_job - - return _decorator diff --git a/src/mozilla_sec_eia/library/pipeline.py b/src/mozilla_sec_eia/library/pipeline.py new file mode 100644 index 0000000..e35f1a9 --- /dev/null +++ b/src/mozilla_sec_eia/library/pipeline.py @@ -0,0 +1,93 @@ +"""Implement helper methods for constructing dagster jobs. 
+ +Methods defined here are the main interface for constructing PUDL model jobs. +`create_production_pipeline` will produce a dagster job that will use the default +multi-process executor to run a PUDL model. `create_validation_pipeline` is meant for +testing/validating models with an mlflow run backing the dagster run for logging. +""" + +import mlflow +from dagster import ( + AssetsDefinition, + HookContext, + ResourceDefinition, + define_asset_job, + failure_hook, + in_process_executor, + success_hook, +) +from mlflow.entities import RunStatus + +PUDL_PIPELINE_PRODUCTION_JOBS = [] +PUDL_PIPELINE_PRODUCTION_ASSETS = [] +PUDL_PIPELINE_PRODUCTION_RESOURCES = {} + +PUDL_PIPELINE_VALIDATION_JOBS = [] +PUDL_PIPELINE_VALIDATION_ASSETS = [] +PUDL_PIPELINE_VALIDATION_RESOURCES = {} + + +def create_production_pipeline( + pipeline_name: str, + assets: list[AssetsDefinition], + resources: dict[str, ResourceDefinition], + **kwargs, +): + """Construct a dagster job and supply Definitions with assets and resources.""" + PUDL_PIPELINE_PRODUCTION_JOBS.append( + define_asset_job( + pipeline_name, + selection=assets, + **kwargs, + ) + ) + PUDL_PIPELINE_PRODUCTION_ASSETS.extend(assets) + PUDL_PIPELINE_PRODUCTION_RESOURCES.update(resources) + + +@success_hook(required_resource_keys={"mlflow_interface"}) +def log_op_config(context: HookContext): + """Log any config supplied to ops/assets in validation job to mlflow tracking server.""" + if context.op_config is not None: + mlflow.log_params(context.op_config) + + +@failure_hook(required_resource_keys={"mlflow_interface"}) +def end_run_on_failure(context: HookContext): + """Inform mlflow about job failure.""" + if isinstance(context.op_exception, KeyboardInterrupt): + mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED)) + else: + mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED)) + + +def create_validation_pipeline( + pipeline_name: str, + assets: list[AssetsDefinition], + resources: dict[str, ResourceDefinition], + 
**kwargs, +): + """Construct a dagster job and supply Definitions with assets and resources.""" + PUDL_PIPELINE_VALIDATION_JOBS.append( + define_asset_job( + pipeline_name, + selection=assets, + executor_def=in_process_executor, + hooks={log_op_config, end_run_on_failure}, + # Configure mlflow_interface for job with appropriate experiment name + config={ + "ops": {}, + "resources": { + "mlflow_interface": { + "config": { + "experiment_name": pipeline_name, + "tracking_enabled": True, + } + } + }, + }, + **kwargs, + ) + ) + PUDL_PIPELINE_VALIDATION_ASSETS.extend(assets) + PUDL_PIPELINE_VALIDATION_RESOURCES.update(resources) diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index e5b5f72..dfc2fc8 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -3,98 +3,97 @@ import logging import pandas as pd -from dagster import Out, op -from .utils.cloud import GCSArchive, Sec10K +from .extract import Sec10kExtractor +from .utils.cloud import Sec10K logger = logging.getLogger(f"catalystcoop.{__name__}") -EXPERIMENT_NAME = "basic_10k_extraction" -def _extract_10k(filing: Sec10K): - """Extract basic company data from filing.""" - logger.info(f"Extracting 10K company data from filing: {filing.filename}") - header = True - current_block = None - values = [] - filer_count = 0 - block_counts = { - "company data": 0, - "filing values": 0, - "business address": 0, - "mail address": 0, - "former company": 0, - } - unmatched_keys = [] - for line in filing.filing_text.splitlines(): - match line.replace("\t", "").lower().split(":"): - case ["filer", ""]: - filer_count += 1 - header = False - case [ - ( - "company data" - | "filing values" - | "business address" - | "mail address" - | "former company" - ) as block, - "", - ] if not header: - current_block = block - block_counts[current_block] += 1 - case [key, ""] if current_block is not None: - key = 
f"{block}_{key}".replace(" ", "_") - logger.warning(f"No value found for {key} for filing {filing.filename}") - unmatched_keys.append(key) - case [key, value] if current_block is not None: - key = key.replace(" ", "_") - values.append( - { - "filename": filing.filename, - "filer_count": filer_count - 1, - "block": current_block.replace(" ", "_"), - "block_count": block_counts[current_block] - 1, - "key": key.replace(" ", "_"), - "value": value, - } - ) - case ["" | ""]: - break - case _ if header: - continue +class Basic10kExtractor(Sec10kExtractor): + """Implement Sec10kExtractor for basic 10k company info data.""" - return pd.DataFrame(values), filing.filename, unmatched_keys + def _extract_10k(self, filing: Sec10K): + """Extract basic company data from filing.""" + logger.info(f"Extracting 10K company data from filing: {filing.filename}") + header = True + current_block = None + values = [] + filer_count = 0 + block_counts = { + "company data": 0, + "filing values": 0, + "business address": 0, + "mail address": 0, + "former company": 0, + } + unmatched_keys = [] + for line in filing.filing_text.splitlines(): + match line.replace("\t", "").lower().split(":"): + case ["filer", ""]: + filer_count += 1 + header = False + case [ + ( + "company data" + | "filing values" + | "business address" + | "mail address" + | "former company" + ) as block, + "", + ] if not header: + current_block = block + block_counts[current_block] += 1 + case [key, ""] if current_block is not None: + key = f"{block}_{key}".replace(" ", "_") + logger.warning( + f"No value found for {key} for filing {filing.filename}" + ) + unmatched_keys.append(key) + case [key, value] if current_block is not None: + key = key.replace(" ", "_") + values.append( + { + "filename": filing.filename, + "filer_count": filer_count - 1, + "block": current_block.replace(" ", "_"), + "block_count": block_counts[current_block] - 1, + "key": key.replace(" ", "_"), + "value": value, + } + ) + case ["" | ""]: + break + case 
_ if header: + continue + return pd.DataFrame(values), filing.filename, unmatched_keys -@op(out={"extraction_metadata": Out(), "extracted": Out()}) -def extract( - cloud_interface: GCSArchive, - filings_to_extract: pd.DataFrame, -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Extract basic 10K data and write to postgres table. + def extract_filings( + self, + filings_to_extract: pd.DataFrame, + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Extract basic 10K data and return extracted data/metadata.""" + logger.info("Starting basic 10K extraction.") + logger.info(f"Extracting {len(filings_to_extract)} filings.") - Args: - continue_run: If true, only extract filings not in DB, otherwise clobber - basic_10k table. - """ - logger.info("Starting basic 10K extraction.") - logger.info(f"Extracting {len(filings_to_extract)} filings.") + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + extracted = pd.DataFrame() - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - extracted = pd.DataFrame() + for filing in self.cloud_interface.iterate_filings(filings_to_extract): + ext, filename, unmatched_keys = self._extract_10k(filing) + extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [ + len(ext) > 0, + ",".join(unmatched_keys), + ] + extracted = pd.concat([extracted, ext]) - for filing in cloud_interface.iterate_filings(filings_to_extract): - ext, filename, unmatched_keys = _extract_10k(filing) - extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [ - len(ext) > 0, - ",".join(unmatched_keys), - ] - extracted = pd.concat([extracted, ext]) - - return ( - extraction_metadata, - extracted.set_index(["filename", "filer_count", "block", "block_count", "key"]), - ) + return ( + extraction_metadata, + extracted.set_index( + ["filename", "filer_count", "block", "block_count", "key"] + ), + ) diff 
--git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 21438c9..ad3760d 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -1,300 +1,20 @@ -"""Implement top level extraction methods and tooling.""" +"""Implement base class for an SEC10k extractor.""" -import logging -import math - -import numpy as np import pandas as pd -import pandera as pa -from dagster import ( - Config, - DynamicOut, - DynamicOutput, - GraphDefinition, - GraphOut, - In, - OpDefinition, - Out, - Output, - graph, - op, -) - -from mozilla_sec_eia.library.experiment_tracking import validation -from mozilla_sec_eia.library.models import PudlPipelineConfig, pudl_pipeline - -from . import basic_10k -from .utils.cloud import GCSArchive, cloud_interface_resource - -logger = logging.getLogger(f"catalystcoop.{__name__}") - -DATASETS = ["ex21", "basic_10k"] - - -class ExtractionMetadataSchema(pa.DataFrameModel): - """Define the required schema for extraction metadata. - - Extra columns are permitted, but these are required for computing extraction metrics. 
- """ +from dagster import ConfigurableResource - filename: pa.typing.Index[str] = pa.Field(check_name=True) - success: bool = pa.Field(coerce=True) - - -@op -def get_filing_metadata( - cloud_interface: GCSArchive, filenames: list[str] | None = None -) -> pd.DataFrame: - """Return filing metadata.""" - return cloud_interface.get_metadata(filenames=filenames) - - -class ChunkFilingsConfig(Config): - """Config how many filings are extracted and chunk_size for extraction.""" - - chunk_size: int = 1000 - - -@op(out=DynamicOut()) -def chunk_filings( - config: ChunkFilingsConfig, - filings_to_extract: pd.DataFrame, -) -> pd.DataFrame: - """Split filings into chunks for parallel extraction.""" - for i, chunk in enumerate( - np.array_split( - filings_to_extract, math.ceil(len(filings_to_extract) / config.chunk_size) - ) - ): - yield DynamicOutput(chunk, mapping_key=str(i)) +from .utils.cloud import GCSArchive -class GetMostRecentRunResultsConfig(Config): - """Configuration specifying whether to get run results and continue.""" +class Sec10kExtractor(ConfigurableResource): + """Base class for extracting SEC 10k data.""" - continue_run: bool = False - - -@op(out={"basic_extraction_metrics": Out(io_manager_key="mlflow_metrics_io_manager")}) -def log_extraction_data( - metadata: pd.DataFrame, - extraction_metadata: pd.DataFrame, - extracted: pd.DataFrame, -): - """Log results from extraction run.""" - return { - "num_failed": (~extraction_metadata["success"]).sum(), - "ratio_extracted": len(extraction_metadata) / len(metadata), - } - - -@op( - required_resource_keys=["experiment_tracker"], - out={ - "extraction_metadata": Out(io_manager_key="mlflow_pandas_artifact_io_manager"), - "extracted": Out(io_manager_key="mlflow_pandas_artifact_io_manager"), - }, -) -def merge_extracted_data( - extraction_metadata: list[pd.DataFrame], - extracted: list[pd.DataFrame], - previous_run_extraction_metadata: pd.DataFrame, - previous_run_extracted_data: pd.DataFrame, -): - """Data is extracted 
in parallel ops, merge these plus any data from previous run.""" - extraction_metadata = pd.concat( - extraction_metadata + [previous_run_extraction_metadata] - ) - extracted = pd.concat(extracted + [previous_run_extracted_data]) - # Use metadata to log generic metrics - extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) - - return extraction_metadata, extracted - - -class FilingsToExtractConfig(Config): - """Define configuration for filtering filings to extract.""" - - num_filings: int = -1 - - -@op -def get_filings_to_extract( - config: FilingsToExtractConfig, - filing_metadata: pd.DataFrame, - previous_extraction_metadata: pd.DataFrame, - previous_extracted: pd.DataFrame, -): - """Filter out any previously extracted filings and sub-sample to `num_filings`.""" - filings_to_extract = filing_metadata - if config.num_filings > 0: - filings_to_extract = filings_to_extract.sample(config.num_filings) - - filings_to_extract = filings_to_extract[ - ~filings_to_extract["filename"].isin(previous_extraction_metadata.index) - ] - return filings_to_extract - - -def extract_graph_factory( - dataset_name: str, - extract_op: OpDefinition | GraphDefinition, -): - """Produce a `pudl_model` to extract data from sec10k filings.""" - experiment_name = f"{dataset_name}_extraction" - - @graph( - name=experiment_name, - out={ - "extraction_metadata": GraphOut(), - "extracted": GraphOut(), - "extraction_metrics": GraphOut(), - }, - ) - def extract_filings(metadata, previous_extraction_metadata, previous_extracted): - filings_to_extract = get_filings_to_extract( - metadata, - previous_extraction_metadata, - previous_extracted, - ) + cloud_interface: GCSArchive - filing_chunks = chunk_filings(filings_to_extract) - extraction_metadata, extracted = filing_chunks.map(extract_op) - extraction_metadata, extracted = merge_extracted_data( - extraction_metadata.collect(), - extracted.collect(), - previous_extraction_metadata, - previous_extracted, + def extract_filings( + 
self, filing_metadata: pd.DataFrame + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Method must be implemented by subclasses to extract SEC10k filings.""" + raise NotImplementedError( + "extract_filings must be implemented by any subclass!" ) - - extraction_metrics = log_extraction_data( - metadata, - extraction_metadata, - extracted, - ) - return extraction_metadata, extracted, extraction_metrics - - return extract_filings - - -@op( - ins={ - "extraction_metadata": In( - input_manager_key="previous_run_mlflow_pandas_artifact_io_manager" - ), - "extracted": In( - input_manager_key="previous_run_mlflow_pandas_artifact_io_manager" - ), - } -) -def get_previous_run_data( - continue_previous_run, extraction_metadata: pd.DataFrame, extracted: pd.DataFrame -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Return previous run data loaded by io-manager.""" - extraction_metadata = ExtractionMetadataSchema.validate(extraction_metadata) - - return extraction_metadata, extracted - - -@op -def get_empty_run_data(start_new_run): - """Return empty dataframes representing run metadata and extracted data.""" - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - extracted = pd.DataFrame() - - return extraction_metadata, extracted - - -class ContinuePreviousRunConfig(Config): - """Configure whether to continue a previous extraction run or not.""" - - continue_run: bool = False - - -@op(out={"continue_run": Out(is_required=False), "new_run": Out(is_required=False)}) -def continue_previous_run(config: ContinuePreviousRunConfig): - """Create branch dictating whether a previous extraction run is continued or not.""" - if config.continue_run: - yield Output(True, "continue_run") - else: - yield Output(True, "new_run") - - -@op(out={"previous_run_extraction_metadata": Out(), "previous_extracted": Out()}) -def merge_branches(dfs: list[tuple[pd.DataFrame, pd.DataFrame]]): - """Merge branches created by 
`continue_previous_run` and return.""" - dfs = dfs[0] - return dfs[0], dfs[1] - - -@graph( - out={ - "previous_run_extraction_metadata": GraphOut(), - "previous_extracted": GraphOut(), - } -) -def get_starting_data(): - """Get previous run data if configured to do so.""" - continue_run, new_run = continue_previous_run() - previous_data = get_previous_run_data(continue_run) - new_data = get_empty_run_data(new_run) - return merge_branches([previous_data, new_data]) - - -basic_10k_extract_graph = extract_graph_factory("basic_10k", basic_10k.extract) - - -basic_10k_extract_config = PudlPipelineConfig( - experiment_name="basic_10k_extraction", -) - - -@pudl_pipeline( - basic_10k_extract_config, resources={"cloud_interface": cloud_interface_resource} -) -def basic_10k_extraction_pipeline(): - """Implement basic 10k extraction pudl_model.""" - filing_metadata = get_filing_metadata() - previous_extraction_metadata, previous_extracted = get_starting_data() - return basic_10k_extract_graph( - filing_metadata, previous_extraction_metadata, previous_extracted - ) - - -@op -def get_validation_filenames(validation_set: pd.DataFrame) -> list[str]: - """Return filenames in validation set.""" - return list(validation_set["filename"]) - - -basic_10k_extract_validation_config = PudlPipelineConfig( - experiment_name="basic_10k_extraction_validation", - pandas_io_file_type="csv", - op_config={ - "load_validation_data": validation.LoadValidationConfig( - filename="basic_10k_labels.csv" - ), - "pandas_compute_precision_recall": validation.PandasPrecisionRecallConfig( - value_col="value" - ), - }, -) - - -@pudl_pipeline( - basic_10k_extract_validation_config, - resources={"cloud_interface": cloud_interface_resource}, -) -def basic_10k_extraction_validation_pipeline(): - """Job to validate basic 10k extraction.""" - validation_set = validation.load_validation_data() - filing_metadata = get_filing_metadata( - filenames=get_validation_filenames(validation_set) - ) - empty_metadata, 
empty_extracted = get_starting_data() - _, extracted, _ = basic_10k_extract_graph( - filing_metadata, empty_metadata, empty_extracted - ) - return validation.pandas_compute_precision_recall(extracted, validation_set) diff --git a/src/mozilla_sec_eia/models/sec10k/pipeline.py b/src/mozilla_sec_eia/models/sec10k/pipeline.py new file mode 100644 index 0000000..e4b7d0d --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/pipeline.py @@ -0,0 +1,168 @@ +"""Implement top level extraction methods and tooling.""" + +import logging + +import pandas as pd +import pandera as pa +from dagster import ( + AssetExecutionContext, + AssetIn, + AssetOut, + StaticPartitionsDefinition, + asset, + multi_asset, + with_resources, +) + +from mozilla_sec_eia.library.mlflow import validation +from mozilla_sec_eia.library.pipeline import ( + create_production_pipeline, + create_validation_pipeline, +) + +from .basic_10k import Basic10kExtractor +from .extract import Sec10kExtractor +from .utils.cloud import GCSArchive, cloud_interface_resource + +logger = logging.getLogger(f"catalystcoop.{__name__}") + +DATASETS = ["ex21", "basic_10k"] + + +class ExtractionMetadataSchema(pa.DataFrameModel): + """Define the required schema for extraction metadata. + + Extra columns are permitted, but these are required for computing extraction metrics. + """ + + filename: pa.typing.Index[str] = pa.Field(check_name=True) + success: bool = pa.Field(coerce=True) + + +# Create year_quarter partitions +partitions_def = StaticPartitionsDefinition( + [f"{year}q{quarter}" for year in range(1994, 2024) for quarter in range(1, 5)] +) + + +def sec10k_extraction_asset_factory( + name: str, + sec10k_extractor: Sec10kExtractor, + partitions_def=None, + filing_metadata_asset_name: str = "sec10k_filing_metadata", + extraction_metadata_asset_name: str = "extraction_metadata", + extracted_asset_name: str = "extraction_metadata", +): + """Create asset to extract data from sec10k data. + + Args: + name: Name of extraction asset. 
+ sec10k_extractor: Subclass of Sec10kExtractor used to extract data. + partitions_def: Partitions for asset (production uses year_quarter parts, + validation is not partitioned. + filing_metadata_asset_name: Name of input asset with metadata of filings to + extract. + extraction_metadata_asset_name: Name of output asset containing metadata + from extraction run. + extracted_asset_name: Name of output asset containing extracted data. + """ + + @multi_asset( + name=name, + outs={ + extraction_metadata_asset_name: AssetOut(), + extracted_asset_name: AssetOut(), + }, + ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)}, + partitions_def=partitions_def, + ) + def extract_filings( + sec10k_extractor: Sec10kExtractor, sec10k_filing_metadata: pd.DataFrame + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Run Sec10kExtractor on selected partition and return.""" + extraction_metadata, extracted = sec10k_extractor.extract_filings( + sec10k_filing_metadata + ) + return extraction_metadata, extracted + + return with_resources([extract_filings], {"sec10k_extractor": sec10k_extractor})[0] + + +@asset(partitions_def=partitions_def) +def sec10k_filing_metadata( + context: AssetExecutionContext, + cloud_interface: GCSArchive, +) -> pd.DataFrame: + """Return filing metadata for year_quarter partition.""" + year_quarter = context.partition_key + df = cloud_interface.get_metadata(year_quarter=year_quarter) + return df + + +# Create asset to load basic 10k validation data +basic_10k_validation_set = validation.load_validation_data_asset_factory( + "basic_10k_validation_set", + "basic_10k_labels.csv", + index_cols=["filename", "filer_count", "block", "block_count", "key"], +) + + +# Create asset to compute precision/recall on basic 10k extraction of validation set +basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation" +basic_10k_extraction_validation_metrics = ( + validation.pandas_precision_recall_asset_factory( + 
validation_asset="basic_10k_validation_set", + computed_asset=basic_10k_extracted_validation_asset_name, + value_col="value", + ) +) + + +@asset(name="sec10k_filing_metadata_validation") +def basic_10k_validation_filing_metadata( + cloud_interface: GCSArchive, + basic_10k_validation_set: pd.DataFrame, +) -> pd.DataFrame: + """Get sec 10k filing metadata from validation set.""" + filing_metadata = cloud_interface.get_metadata() + return filing_metadata[ + filing_metadata["filename"].isin( + basic_10k_validation_set.index.get_level_values("filename").unique() + ) + ] + + +# Register basic 10k extraction pipeline +create_production_pipeline( + "basic_10k_extraction", + [ + sec10k_filing_metadata, + sec10k_extraction_asset_factory( + "basic_10k", + Basic10kExtractor(cloud_interface=cloud_interface_resource), + partitions_def=partitions_def, + extraction_metadata_asset_name="basic_10k_extraction_metadata", + extracted_asset_name="basic_10k_company_info", + ), + ], + resources={"cloud_interface": cloud_interface_resource}, +) + + +# Register basic 10k extraction validation pipeline +create_validation_pipeline( + "basic_10k_extraction", + [ + basic_10k_validation_filing_metadata, + sec10k_extraction_asset_factory( + "basic_10k", + Basic10kExtractor(cloud_interface=cloud_interface_resource), + filing_metadata_asset_name="sec10k_filing_metadata_validation", + extraction_metadata_asset_name="basic_10k_extraction_validation_metadata", + extracted_asset_name=basic_10k_extracted_validation_asset_name, + ), + basic_10k_validation_set, + basic_10k_extraction_validation_metrics, + ], + resources={"cloud_interface": cloud_interface_resource}, +) diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py index 1232f45..81c7a35 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -170,7 +170,6 @@ class GCSArchive(ConfigurableResource): _filings_bucket = 
PrivateAttr() _labels_bucket = PrivateAttr() _engine = PrivateAttr() - _metadata_df = PrivateAttr(default=None) def setup_for_execution(self, context): """Initialize interface to filings archive on GCS.""" @@ -214,16 +213,13 @@ def create_session(self) -> Session: with Session(self._engine) as session: yield session - def get_metadata(self, filenames: list[str] | None = None) -> pd: + def get_metadata(self, year_quarter: str | None = None) -> pd: """Return dataframe of filing metadata.""" - if self._metadata_df is None: - selection = select(Sec10kMetadata) - if filenames is not None: - selection = selection.where(Sec10kMetadata.filename.in_(filenames)) + selection = select(Sec10kMetadata) + if year_quarter is not None: + selection = selection.where(Sec10kMetadata.year_quarter == year_quarter) - self._metadata_df = pd.read_sql(selection, self._engine) - - return self._metadata_df + return pd.read_sql(selection, self._engine) def get_filing_blob(self, year_quarter: str, path: str) -> storage.Blob: """Return Blob pointing to file in GCS bucket.""" diff --git a/src/mozilla_sec_eia/pudl_pipelines.py b/src/mozilla_sec_eia/pudl_pipelines.py index 42cf090..3a427c4 100644 --- a/src/mozilla_sec_eia/pudl_pipelines.py +++ b/src/mozilla_sec_eia/pudl_pipelines.py @@ -1,16 +1,32 @@ -"""Define asset jobs and configuration.""" +"""Define production pipelines for running PUDL models.""" import logging import coloredlogs -from dagster import Definitions +from dagster import Definitions, EnvVar -from mozilla_sec_eia.library import get_ml_pipeline_jobs +from mozilla_sec_eia.library.mlflow import MlflowInterface +from mozilla_sec_eia.library.pipeline import ( + PUDL_PIPELINE_PRODUCTION_ASSETS, + PUDL_PIPELINE_PRODUCTION_JOBS, + PUDL_PIPELINE_PRODUCTION_RESOURCES, +) logger = logging.getLogger("catalystcoop") log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" coloredlogs.install(fmt=log_format, logger=logger) -defs = Definitions( - jobs=get_ml_pipeline_jobs(), + 
+mlflow_interface = MlflowInterface( + experiment_name="", + tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), + project=EnvVar("GCS_PROJECT"), +) + +production_io_resources = {} | PUDL_PIPELINE_PRODUCTION_RESOURCES + +production_pipelines = Definitions( + assets=PUDL_PIPELINE_PRODUCTION_ASSETS, + jobs=PUDL_PIPELINE_PRODUCTION_JOBS, + resources=production_io_resources | {"mlflow_interface": mlflow_interface}, ) diff --git a/src/mozilla_sec_eia/pudl_validation_pipelines.py b/src/mozilla_sec_eia/pudl_validation_pipelines.py new file mode 100644 index 0000000..44e7a55 --- /dev/null +++ b/src/mozilla_sec_eia/pudl_validation_pipelines.py @@ -0,0 +1,32 @@ +"""Define jobs to test/validate PUDL models.""" + +import logging + +import coloredlogs +from dagster import Definitions + +from mozilla_sec_eia.library.mlflow import MlflowInterface, get_mlflow_io_manager +from mozilla_sec_eia.library.pipeline import ( + PUDL_PIPELINE_VALIDATION_ASSETS, + PUDL_PIPELINE_VALIDATION_JOBS, + PUDL_PIPELINE_VALIDATION_RESOURCES, +) + +logger = logging.getLogger("catalystcoop") +log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" +coloredlogs.install(fmt=log_format, logger=logger) + + +# Configure at launch so experiment name can be supplied by config +mlflow_interface = MlflowInterface.configure_at_launch() + +validation_io_resources = { + key: get_mlflow_io_manager(key, mlflow_interface=mlflow_interface) + for key in ["mlflow_pandas_artifact_io_manager", "mlflow_metrics_io_manager"] +} | PUDL_PIPELINE_VALIDATION_RESOURCES + +validation_pipelines = Definitions( + assets=PUDL_PIPELINE_VALIDATION_ASSETS, + jobs=PUDL_PIPELINE_VALIDATION_JOBS, + resources=validation_io_resources | {"mlflow_interface": mlflow_interface}, +) diff --git a/tests/conftest.py b/tests/conftest.py index 4e9b0f6..bcdedee 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -5,7 +5,7 @@ import mlflow import pytest -from mozilla_sec_eia.library.experiment_tracking import ExperimentTracker +from 
mozilla_sec_eia.library.mlflow import MlflowInterface logger = logging.getLogger(__name__) @@ -37,8 +37,8 @@ def test_dir() -> Path: return Path(__file__).parent -class TestTracker(ExperimentTracker): - """Create sub-class of `ExperimentTracker` to use in testing context. +class TestTracker(MlflowInterface): + """Create sub-class of `MlflowInterface` to use in testing context. Test class creates an in-memory sqlite db for tracking, and a temporary directory for artifact storage. diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index 0a12a17..9a25fdb 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -1,119 +1,81 @@ """Test extraction tools/methods.""" import logging +from unittest.mock import Mock import pandas as pd -import pytest -from dagster import Out, RunConfig, op -from mozilla_sec_eia.library.experiment_tracking.mlflow_io_managers import ( - MlflowMetricsIOManager, - MlflowPandasArtifactIOManager, -) -from mozilla_sec_eia.models.sec10k.extract import ( - FilingsToExtractConfig, - extract_graph_factory, +from dagster import asset, build_asset_context, materialize +from mozilla_sec_eia.models.sec10k.extract import Sec10kExtractor +from mozilla_sec_eia.models.sec10k.pipeline import ( + sec10k_extraction_asset_factory, + sec10k_filing_metadata, ) +from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive logger = logging.getLogger(f"catalystcoop.{__name__}") -@pytest.mark.parametrize( - "filings_metadata,previous_extraction_metadata,num_filings,num_failed", - [ - ( - pd.DataFrame( - {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]} - ), - pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename"), - -1, - 0, - ), - ( - pd.DataFrame( - {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]} - ), - pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": 
pd.Series(dtype=bool)} - ).set_index("filename"), - -1, - 3, - ), - ( - pd.DataFrame( - {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]} - ), - pd.DataFrame( - {"filename": ["filing1", "filing2"], "success": [True, True]} - ).set_index("filename"), - -1, - 0, - ), - ( - pd.DataFrame( - {"filename": ["filing1", "filing2", "filing3", "filing4", "filing5"]} - ), - pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename"), - 2, - 1, - ), - ], -) -def test_sec10k_extract_pipeline( - filings_metadata, - previous_extraction_metadata, - num_filings, - num_failed, - test_tracker_factory, - get_most_recent_mlflow_run_factory, -): - """Test high level extraction workflow.""" +def test_sec10k_filing_metadata(): + """Test loading sec10k filing metadata.""" + # Prepare inputs to sec10k_filing_metadata + context = build_asset_context(partition_key="2024q1") + cloud_interface = Mock() + output_df = pd.DataFrame({"col": ["fake_col"]}) + cloud_interface.get_metadata.return_value = output_df + + returned_df = sec10k_filing_metadata( + context=context, + cloud_interface=cloud_interface, + ) - @op(out={"extraction_metadata": Out(), "extracted": Out()}) - def test_extract( - filings_to_extract: pd.DataFrame, - ) -> tuple[pd.DataFrame, pd.DataFrame]: - md = filings_to_extract - md["success"] = True - md.iloc[:num_failed, 1] = False - return md.set_index("filename"), pd.DataFrame() + # Check that GCSArchive.get_metadata was called correctly + cloud_interface.get_metadata.assert_called_once_with(year_quarter="2024q1") + pd.testing.assert_frame_equal(returned_df, output_df) - dataset_name = "test_pipeline" - experiment_name = f"{dataset_name}_extraction" - test_tracker = test_tracker_factory(experiment_name) - test_graph = extract_graph_factory("test_extract", test_extract) - resources = { - "experiment_tracker": test_tracker, - "mlflow_pandas_artifact_io_manager": MlflowPandasArtifactIOManager( - 
experiment_tracker=test_tracker - ), - "mlflow_metrics_io_manager": MlflowMetricsIOManager( - experiment_tracker=test_tracker, - ), - } - graph_result = test_graph.to_job().execute_in_process( - resources=resources, - run_config=RunConfig( - {"get_filings_to_extract": FilingsToExtractConfig(num_filings=num_filings)} - ), - input_values={ - "metadata": filings_metadata, - "previous_extraction_metadata": previous_extraction_metadata, - "previous_extracted": pd.DataFrame(), - }, - ) - extraction_metadata, metrics = ( - graph_result.output_value("extraction_metadata"), - graph_result.output_value("extraction_metrics"), +def test_sec10k_extraction(): + """Test loading sec10k filing metadata.""" + fake_extraction_metadata = pd.DataFrame({"extraction_metadata": ["fake_col"]}) + fake_extracted = pd.DataFrame({"extracted": ["fake_col"]}) + fake_filing_metadata = pd.DataFrame({"filing_metadata": ["fake_col"]}) + + # Create fake Sec10kExtractor + class TestSec10kExtractor(Sec10kExtractor): + def extract_filings(self, filing_metadata): + pd.testing.assert_frame_equal(filing_metadata, fake_filing_metadata) + return fake_extraction_metadata, fake_extracted + + # Create fake GCSArchive + class FakeArchive(GCSArchive): + filings_bucket_name: str = "" + labels_bucket_name: str = "" + metadata_db_instance_connection: str = "" + user: str = "" + metadata_db_name: str = "" + project: str = "" + + def setup_for_execution(self, context): + pass + + # Asset to return fake filing metadata + @asset + def fake_filing_metadata_asset(): + return fake_filing_metadata + + # Create fake extraction asset with configured inputs + extraction_multi_asset = sec10k_extraction_asset_factory( + name="test_sec10k_extraction", + sec10k_extractor=TestSec10kExtractor(cloud_interface=FakeArchive()), + filing_metadata_asset_name="fake_filing_metadata_asset", + extracted_asset_name="test_sec10k_extraction", + extraction_metadata_asset_name="test_sec10k_extraction_metadata", ) - run = 
get_most_recent_mlflow_run_factory(experiment_name) - assert run.data.metrics["num_failed"] == num_failed - assert run.data.metrics["ratio_extracted"] == len(extraction_metadata) / len( - filings_metadata + # Run assets and review results + result = materialize([fake_filing_metadata_asset, extraction_multi_asset]) + pd.testing.assert_frame_equal( + result.asset_value("test_sec10k_extraction_metadata"), fake_extraction_metadata + ) + pd.testing.assert_frame_equal( + result.asset_value("test_sec10k_extraction"), fake_extracted ) - assert run.data.metrics == metrics From f20fb7dcb7a11bd89ede8538b8a950a412a6b22d Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 2 Sep 2024 15:39:30 -0400 Subject: [PATCH 018/161] Remove old comment --- src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py index e78f627..75a67f7 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py @@ -20,7 +20,6 @@ class MlflowBaseIOManager(ConfigurableIOManager): """Specify base config and implement helper functions for mlflow io-managers.""" mlflow_interface: MlflowInterface - #: By default handles artifacts from current run, but can be used with previous run. 
def _get_run_info(self) -> Run: """Get mlflow `Run` object using current run id.""" From 92e2e009e395eefc4de960c29e4d03dd04c9b359 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 3 Sep 2024 09:45:57 -0400 Subject: [PATCH 019/161] Add ex21 to dagster jobs --- .../library/mlflow/__init__.py | 10 + .../library/mlflow/mlflow_io_managers.py | 2 +- .../library/mlflow/validation.py | 88 ------ .../library/validation_helpers.py | 81 ++++++ .../models/sec10k/basic_10k.py | 2 + .../models/sec10k/ex_21/inference.py | 183 +++++++------ .../models/sec10k/ex_21/train_extractor.py | 4 +- src/mozilla_sec_eia/models/sec10k/extract.py | 1 + src/mozilla_sec_eia/models/sec10k/pipeline.py | 252 +++++++++++++++--- .../models/sec10k/utils/layoutlm.py | 54 ++-- src/mozilla_sec_eia/pudl_pipelines.py | 14 +- .../pudl_validation_pipelines.py | 16 +- tests/unit/models/sec10k/extract_test.py | 2 + 13 files changed, 477 insertions(+), 232 deletions(-) delete mode 100644 src/mozilla_sec_eia/library/mlflow/validation.py create mode 100644 src/mozilla_sec_eia/library/validation_helpers.py diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py index 380a63c..c5e6642 100644 --- a/src/mozilla_sec_eia/library/mlflow/__init__.py +++ b/src/mozilla_sec_eia/library/mlflow/__init__.py @@ -1,5 +1,7 @@ """Implement tooling to interface with mlflow experiment tracking.""" +from dagster import EnvVar + from .mlflow_io_managers import ( MlflowBaseIOManager, MlflowMetricsIOManager, @@ -10,6 +12,14 @@ get_most_recent_run, ) +mlflow_production_interface = MlflowInterface( + experiment_name="", + tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), + project=EnvVar("GCS_PROJECT"), + tracking_enabled=False, +) +mlflow_train_test_interface = MlflowInterface.configure_at_launch() + def get_mlflow_io_manager( key: str, diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py index 75a67f7..7aa05d7 
100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py @@ -94,7 +94,7 @@ def handle_output(self, context: OutputContext, obj: dict[str, float]): """Load metrics to mlflow run/experiment created by `MlflowInterface`.""" mlflow.log_metrics(obj) - def load_input(self, context: OutputContext) -> dict[str, float]: + def load_input(self, context: InputContext) -> dict[str, float]: """Log metrics to mlflow run/experiment created by `MlflowInterface`.""" run = self._get_run_info() return run.data.metrics diff --git a/src/mozilla_sec_eia/library/mlflow/validation.py b/src/mozilla_sec_eia/library/mlflow/validation.py deleted file mode 100644 index 999cbdd..0000000 --- a/src/mozilla_sec_eia/library/mlflow/validation.py +++ /dev/null @@ -1,88 +0,0 @@ -"""Implement common utilities/functions for validating models.""" - -from importlib import resources - -import pandas as pd -from dagster import AssetIn, AssetsDefinition, asset - - -def load_validation_data_asset_factory( - asset_name: str, - filename: str, - index_cols: str | list[str] | None = None, -) -> AssetsDefinition: - """Construct asset for loading validation data from CSV in `package_data`.""" - - @asset( - name=asset_name, - io_manager_key="mlflow_pandas_artifact_io_manager", - ) - def load_validation_data() -> pd.DataFrame: - """Load csv with validation data from `package_data` directory.""" - df = pd.read_csv( - resources.files("mozilla_sec_eia.package_data.validation_data") / filename - ) - if index_cols is not None: - df = df.set_index(index_cols) - return df - - return load_validation_data - - -def pandas_precision_recall_asset_factory( - validation_asset: str, - computed_asset: str, - value_col: str, -) -> AssetsDefinition: - """Produce asset to compute precision and recall on pandas dataframe. - - The returned asset will take upstream computed/validation assets and compute - precision/recall on `value_col`. 
- - Arg: - validation_asset: Upstream asset containing dataframe of validation set. - computed_asset: Upstream asset containing dataframe of computed data. - value_col: Column to compare when computing metrics. - """ - - @asset( - ins={ - "computed_set": AssetIn(computed_asset), - "validation_set": AssetIn(validation_asset), - }, - io_manager_key="mlflow_metrics_io_manager", - ) - def pandas_compute_precision_recall( - computed_set: pd.DataFrame, - validation_set: pd.DataFrame, - ) -> dict: - """Asset which will return computed metrics from dataframes.""" - # Get initial length of both sets - computed_len = len(computed_set) - validation_len = len(validation_set) - - # Get index of rows only in one set and make Null in other set - idx_validation_only = validation_set.index.difference(computed_set.index) - padded_compute_set = pd.concat( - [ - computed_set[value_col], - pd.Series([None] * len(idx_validation_only), index=idx_validation_only), - ] - ).sort_index() - idx_compute_only = computed_set.index.difference(validation_set.index) - padded_validation_set = pd.concat( - [ - validation_set[value_col], - pd.Series([None] * len(idx_compute_only), index=idx_compute_only), - ] - ).sort_index() - - true_positives = (padded_compute_set == padded_validation_set).sum() - - return { - "precision": true_positives / computed_len, - "recall": true_positives / validation_len, - } - - # Return new asset - return pandas_compute_precision_recall diff --git a/src/mozilla_sec_eia/library/validation_helpers.py b/src/mozilla_sec_eia/library/validation_helpers.py new file mode 100644 index 0000000..62c1825 --- /dev/null +++ b/src/mozilla_sec_eia/library/validation_helpers.py @@ -0,0 +1,81 @@ +"""Implement common utilities/functions for validating models.""" + +from importlib import resources + +import pandas as pd + + +def load_validation_data( + filename: str, index_cols: list[str] | None = None +) -> pd.DataFrame: + """Load csv with validation data from `package_data` directory.""" + 
df = pd.read_csv( + resources.files("mozilla_sec_eia.package_data.validation_data") / filename + ) + if index_cols is not None: + df = df.set_index(index_cols) + return df + + +def pandas_compute_precision_recall( + computed_set: pd.DataFrame, + validation_set: pd.DataFrame, + value_col: str, +) -> dict: + """Asset which will return computed metrics from dataframes.""" + # Get initial length of both sets + computed_len = len(computed_set) + validation_len = len(validation_set) + + # Get index of rows only in one set and make Null in other set + idx_validation_only = validation_set.index.difference(computed_set.index) + padded_compute_set = pd.concat( + [ + computed_set[value_col], + pd.Series([None] * len(idx_validation_only), index=idx_validation_only), + ] + ).sort_index() + idx_compute_only = computed_set.index.difference(validation_set.index) + padded_validation_set = pd.concat( + [ + validation_set[value_col], + pd.Series([None] * len(idx_compute_only), index=idx_compute_only), + ] + ).sort_index() + + true_positives = (padded_compute_set == padded_validation_set).sum() + + return { + "precision": true_positives / computed_len, + "recall": true_positives / validation_len, + } + + +def jaccard_similarity( + computed_df: pd.DataFrame, validation_df: pd.DataFrame, value_col: str +) -> float: + """Get the Jaccard similarity between two Series. + + Calculated as the intersection of the set divided + by the union of the set. + + Args: + computed_df: Extracted data. + validation_df: Expected extraction results. + value_col: Column to calculate Jaccard similarity on. + Must be present in both dataframes. 
+ """ + # fill nans to make similarity comparison more accurate + if (computed_df[value_col].dtype == float) and ( + validation_df[value_col].dtype == float + ): + computed_df[value_col] = computed_df[value_col].fillna(999) + validation_df[value_col] = validation_df[value_col].fillna(999) + else: + computed_df[value_col] = computed_df[value_col].fillna("zzz") + validation_df[value_col] = validation_df[value_col].fillna("zzz") + intersection = set(computed_df[value_col]).intersection( + set(validation_df[value_col]) + ) + union = set(computed_df[value_col]).union(set(validation_df[value_col])) + return float(len(intersection)) / float(len(union)) diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index dfc2fc8..df57ac5 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -13,6 +13,8 @@ class Basic10kExtractor(Sec10kExtractor): """Implement Sec10kExtractor for basic 10k company info data.""" + name: str = "basic_10k_extractor" + def _extract_10k(self, filing: Sec10K): """Extract basic company data from filing.""" logger.info(f"Extracting 10K company data from filing: {filing.filename}") diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 2016630..56648eb 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -2,22 +2,25 @@ import logging import os +import tempfile +from contextlib import contextmanager from pathlib import Path import numpy as np import pandas as pd import torch from datasets import Dataset +from pydantic import PrivateAttr from transformers import ( - AutoProcessor, - LayoutLMv3ForTokenClassification, Pipeline, pipeline, ) from transformers.tokenization_utils_base import BatchEncoding +from ..extract import Sec10kExtractor from ..utils.cloud import get_metadata_filename from 
..utils.layoutlm import ( + LayoutlmResource, get_id_label_conversions, iob_to_label, normalize_bboxes, @@ -184,84 +187,106 @@ def _get_data(dataset): yield from dataset -def perform_inference( - pdfs_dir: Path, - model: LayoutLMv3ForTokenClassification, - processor: AutoProcessor, - extraction_metadata: pd.DataFrame, - dataset_ind: list = None, - labeled_json_dir: Path = None, - has_labels: bool = False, - device="cpu", -): - """Predict entities with a fine-tuned model and extract Ex. 21 tables. - - This function starts by creating a HuggingFace dataset from PDFs in `pdfs_dir` - that the model can then perform inference on (`create_inference_dataset`). - Then it creates an instance of the custom LayoutLM inference pipeline and - runs the dataset through the pipeline. The pipeline outputs logits, predictions, - and an output dataframe with extracted Ex. 21 table. - - Arguments: - pdfs_dir: Path to the directory with PDFs that are being used for inference. - model: A fine-tuned LayoutLM model. - processor: The tokenizer and encoder for model inputs. - extraction_metadata: A dataframe to track extraction success metrics. Should - have columns 'filename' and 'success'. - dataset_ind: A list of index numbers of dataset records to be used for inference - Default is None, in which the entire dataset created from the PDF directory - is used. - labeled_json_dir: Path to the directory with labeled JSONs from Label Studio. Cannot - be None if has_labels is True. - has_labels: Boolean, true if the data has associated labels that can be used in - visualizing and validating results. - device: String or int, specify what computation device to use for inference - i.e. "mps", "cpu", "cuda" - - Returns: - logits: A list of logits. The list is the length of the number of documents in the - dataset (number of PDFs in pdfs_dir). Each logit object in the list is of - shape (batch_size, seq_len, num_labels). Seq_len is - the same as token length (512 in this case). 
- predictions: A list of predictions. The list is the length of the number of documents - in the dataset (number of PDFs in pdfs_dir). - From the logits, we take the highest score for each token, using argmax. - This serves as the predicted label for each token. It is shape (seq_len) or token - length. - output_dfs: The extracted Ex. 21 tables. This is one big dataframe with an ID column - that is the filename of the extracted Ex. 21. Dataframe contains columns id, - subsidiary, loc, own_per. - """ - dataset = create_inference_dataset( - pdfs_dir=pdfs_dir, labeled_json_dir=labeled_json_dir, has_labels=has_labels - ) - if dataset_ind: - dataset = dataset.select(dataset_ind) - - # TODO: figure out device argument - pipe = pipeline( - "token-classification", - model=model, - tokenizer=processor, - pipeline_class=LayoutLMInferencePipeline, - device=device, - ) +class Exhibit21Extractor(Sec10kExtractor): + """Implement `Sec10kExtractor` interface for exhibit 21 data.""" + + layoutlm: LayoutlmResource + name: str = "exhibit21_extractor" + device: str = "cpu" + has_labels: bool = False + dataset_ind: list | None = None + _pdf_dir: Path = PrivateAttr() + _labeled_json_dir: Path | None = PrivateAttr(default=None) + + @contextmanager + def yield_for_execution(self, context): + """Setup temp path working directories.""" + with ( + tempfile.TemporaryDirectory() as pdf_dir, + tempfile.TemporaryDirectory() as labeled_json_dir, + ): + self._pdf_dir = pdf_dir + if self.has_labels: + self._labeled_json_dir = labeled_json_dir + yield self + + def extract_filings( + self, filing_metadata: pd.DataFrame + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Predict entities with a fine-tuned model and extract Ex. 21 tables. + + This function starts by creating a HuggingFace dataset from PDFs in `pdfs_dir` + that the model can then perform inference on (`create_inference_dataset`). 
+ Then it creates an instance of the custom LayoutLM inference pipeline and + runs the dataset through the pipeline. The pipeline outputs logits, predictions, + and an output dataframe with extracted Ex. 21 table. + + Arguments: + pdfs_dir: Path to the directory with PDFs that are being used for inference. + model: A fine-tuned LayoutLM model. + processor: The tokenizer and encoder for model inputs. + extraction_metadata: A dataframe to track extraction success metrics. Should + have columns 'filename' and 'success'. + dataset_ind: A list of index numbers of dataset records to be used for inference + Default is None, in which the entire dataset created from the PDF directory + is used. + labeled_json_dir: Path to the directory with labeled JSONs from Label Studio. Cannot + be None if has_labels is True. + has_labels: Boolean, true if the data has associated labels that can be used in + visualizing and validating results. + device: String or int, specify what computation device to use for inference + i.e. "mps", "cpu", "cuda" + + Returns: + logits: A list of logits. The list is the length of the number of documents in the + dataset (number of PDFs in pdfs_dir). Each logit object in the list is of + shape (batch_size, seq_len, num_labels). Seq_len is + the same as token length (512 in this case). + predictions: A list of predictions. The list is the length of the number of documents + in the dataset (number of PDFs in pdfs_dir). + From the logits, we take the highest score for each token, using argmax. + This serves as the predicted label for each token. It is shape (seq_len) or token + length. + output_dfs: The extracted Ex. 21 tables. This is one big dataframe with an ID column + that is the filename of the extracted Ex. 21. Dataframe contains columns id, + subsidiary, loc, own_per. 
+ """ + dataset = create_inference_dataset( + pdfs_dir=self._pdf_dir, + labeled_json_dir=self._labeled_json_dir, + has_labels=self.has_labels, + ) + if self.dataset_ind: + dataset = dataset.select(self.dataset_ind) + + # TODO: figure out device argument + model, processor = self.layoutlm.get_model_components() + pipe = pipeline( + "token-classification", + model=model, + tokenizer=processor, + pipeline_class=LayoutLMInferencePipeline, + device=self.device, + ) - logits = [] - predictions = [] - all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"]) - for logit, pred, output_df in pipe(_get_data(dataset)): - logits.append(logit) - predictions.append(pred) - if not output_df.empty: - filename = get_metadata_filename(output_df["id"].iloc[0]) - extraction_metadata.loc[filename, ["success"]] = True - all_output_df = pd.concat([all_output_df, output_df]) - all_output_df.columns.name = None - all_output_df = clean_extracted_df(all_output_df) - all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]] - all_output_df = all_output_df.reset_index(drop=True) - return logits, predictions, all_output_df, extraction_metadata + logits = [] + predictions = [] + all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"]) + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + for logit, pred, output_df in pipe(_get_data(dataset)): + logits.append(logit) + predictions.append(pred) + if not output_df.empty: + filename = get_metadata_filename(output_df["id"].iloc[0]) + extraction_metadata.loc[filename, ["success"]] = True + all_output_df = pd.concat([all_output_df, output_df]) + all_output_df.columns.name = None + all_output_df = clean_extracted_df(all_output_df) + all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]] + all_output_df = all_output_df.reset_index(drop=True) + return logits, predictions, all_output_df, extraction_metadata 
class LayoutLMInferencePipeline(Pipeline): diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py index 53ed85e..00c80f3 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py @@ -27,7 +27,7 @@ ) from transformers.data.data_collator import default_data_collator -from ..utils.layoutlm import get_id_label_conversions, log_model +from ..utils.layoutlm import get_id_label_conversions from .create_labeled_dataset import format_as_ner_annotations LABELS = [ @@ -191,4 +191,4 @@ def train_model( # Train inside mlflow run. Mlflow will automatically handle logging training metrcis with mlflow.start_run(): trainer.train() - log_model(trainer) + # log_model(trainer) diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index ad3760d..4461b49 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -10,6 +10,7 @@ class Sec10kExtractor(ConfigurableResource): """Base class for extracting SEC 10k data.""" cloud_interface: GCSArchive + name: str def extract_filings( self, filing_metadata: pd.DataFrame diff --git a/src/mozilla_sec_eia/models/sec10k/pipeline.py b/src/mozilla_sec_eia/models/sec10k/pipeline.py index e4b7d0d..5604d6a 100644 --- a/src/mozilla_sec_eia/models/sec10k/pipeline.py +++ b/src/mozilla_sec_eia/models/sec10k/pipeline.py @@ -14,15 +14,21 @@ with_resources, ) -from mozilla_sec_eia.library.mlflow import validation +from mozilla_sec_eia.library import validation_helpers +from mozilla_sec_eia.library.mlflow import ( + mlflow_production_interface, + mlflow_train_test_interface, +) from mozilla_sec_eia.library.pipeline import ( create_production_pipeline, create_validation_pipeline, ) from .basic_10k import Basic10kExtractor +from .ex_21.inference import Exhibit21Extractor, clean_extracted_df from .extract 
import Sec10kExtractor -from .utils.cloud import GCSArchive, cloud_interface_resource +from .utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename +from .utils.layoutlm import LayoutlmResource logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -45,6 +51,17 @@ class ExtractionMetadataSchema(pa.DataFrameModel): ) +@asset(partitions_def=partitions_def) +def sec10k_filing_metadata( + context: AssetExecutionContext, + cloud_interface: GCSArchive, +) -> pd.DataFrame: + """Return filing metadata for year_quarter partition.""" + year_quarter = context.partition_key + df = cloud_interface.get_metadata(year_quarter=year_quarter) + return df + + def sec10k_extraction_asset_factory( name: str, sec10k_extractor: Sec10kExtractor, @@ -75,50 +92,55 @@ def sec10k_extraction_asset_factory( }, ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)}, partitions_def=partitions_def, + required_resource_keys={sec10k_extractor.name}, ) def extract_filings( - sec10k_extractor: Sec10kExtractor, sec10k_filing_metadata: pd.DataFrame + context: AssetExecutionContext, sec10k_filing_metadata: pd.DataFrame ) -> tuple[pd.DataFrame, pd.DataFrame]: """Run Sec10kExtractor on selected partition and return.""" - extraction_metadata, extracted = sec10k_extractor.extract_filings( + extractor = context.resources.original_resource_dict[sec10k_extractor.name] + extraction_metadata, extracted = extractor.extract_filings( sec10k_filing_metadata ) return extraction_metadata, extracted - return with_resources([extract_filings], {"sec10k_extractor": sec10k_extractor})[0] + return with_resources([extract_filings], {sec10k_extractor.name: sec10k_extractor})[ + 0 + ] -@asset(partitions_def=partitions_def) -def sec10k_filing_metadata( - context: AssetExecutionContext, - cloud_interface: GCSArchive, -) -> pd.DataFrame: - """Return filing metadata for year_quarter partition.""" - year_quarter = context.partition_key - df = 
cloud_interface.get_metadata(year_quarter=year_quarter) - return df +@asset +def basic_10k_validation_set() -> pd.DataFrame: + """Return dataframe containing basic 10k validation data.""" + return validation_helpers.load_validation_data( + "basic_10k_labels.csv", + index_cols=["filename", "filer_count", "block", "block_count", "key"], + ) -# Create asset to load basic 10k validation data -basic_10k_validation_set = validation.load_validation_data_asset_factory( - "basic_10k_validation_set", - "basic_10k_labels.csv", - index_cols=["filename", "filer_count", "block", "block_count", "key"], -) +basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation" -# Create asset to compute precision/recall on basic 10k extraction of validation set -basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation" -basic_10k_extraction_validation_metrics = ( - validation.pandas_precision_recall_asset_factory( - validation_asset="basic_10k_validation_set", - computed_asset=basic_10k_extracted_validation_asset_name, - value_col="value", - ) +@asset( + ins={ + basic_10k_extracted_validation_asset_name: AssetIn( + basic_10k_extracted_validation_asset_name + ), + "basic_10k_validation_set": AssetIn(), + }, + io_manager_key="mlflow_metrics_io_manager", ) +def basic_10k_extraction_validation_metrics(**kwargs): + """Compute basic 10k extraction validation metrics.""" + computed = kwargs[basic_10k_extracted_validation_asset_name] + validation = kwargs["basic_10k_validation_set"] + + return validation_helpers.pandas_compute_precision_recall( + computed, validation, value_col="value" + ) -@asset(name="sec10k_filing_metadata_validation") +@asset def basic_10k_validation_filing_metadata( cloud_interface: GCSArchive, basic_10k_validation_set: pd.DataFrame, @@ -157,7 +179,7 @@ def basic_10k_validation_filing_metadata( sec10k_extraction_asset_factory( "basic_10k", Basic10kExtractor(cloud_interface=cloud_interface_resource), - 
filing_metadata_asset_name="sec10k_filing_metadata_validation", + filing_metadata_asset_name="basic_10k_validation_filing_metadata", extraction_metadata_asset_name="basic_10k_extraction_validation_metadata", extracted_asset_name=basic_10k_extracted_validation_asset_name, ), @@ -166,3 +188,173 @@ def basic_10k_validation_filing_metadata( ], resources={"cloud_interface": cloud_interface_resource}, ) + + +@asset +def ex21_validation_set() -> pd.DataFrame: + """Return dataframe containing basic 10k validation data.""" + return clean_ex21_validation_set( + validation_helpers.load_validation_data("ex21_labels.csv") + ) + + +@asset +def ex21_validation_filing_metadata( + cloud_interface: GCSArchive, + ex21_validation_set: pd.DataFrame, +) -> pd.DataFrame: + """Get sec 10k filing metadata from validation set.""" + filing_metadata = cloud_interface.get_metadata() + return filing_metadata[ + filing_metadata["filename"].isin( + ex21_validation_set.index.get_level_values("filename").unique() + ) + ] + + +ex21_extracted_validation_asset_name = "ex21_validation" + + +@multi_asset( + ins={ + "computed_df": AssetIn(ex21_extracted_validation_asset_name), + "validation_df": AssetIn("ex21_validation_set"), + }, + outs={ + "ex21_jaccard_per_table": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + "ex21_precision_recall_per_table": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + "ex21_incorrect_filenames": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + "ex21_extraction_metrics": AssetOut(io_manager_key="mlflow_metrics_io_manager"), + }, +) +def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame): + """Compute validation metrics for Ex. 
21 extraction.""" + shared_cols = validation_df.columns.intersection(computed_df.columns) + validation_df = validation_df.astype(computed_df[shared_cols].dtypes) + n_equal = 0 + validation_filenames = validation_df["id"].unique() + n_files = len(validation_filenames) + table_metrics_dict = {} + jaccard_dict = {} + incorrect_files = [] + # iterate through each file and check each extracted table + for filename in validation_filenames: + extracted_table_df = computed_df[computed_df["id"] == filename].reset_index( + drop=True + ) + validation_table_df = validation_df[ + validation_df["id"] == filename + ].reset_index(drop=True) + # check if the tables are exactly equal + if extracted_table_df.equals(validation_table_df): + # TODO: strip llc and other company strings before comparison + n_equal += 1 + else: + incorrect_files.append(filename) + # compute precision and recall for each column + table_metrics_dict[filename] = {} + jaccard_dict[filename] = {} + for col in ["subsidiary", "loc", "own_per"]: + table_prec_recall = validation_helpers.pandas_compute_precision_recall( + extracted_table_df, validation_table_df, value_col=col + ) + table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[ + "precision" + ] + table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"] + # get the jaccard similarity between columns + jaccard_dict[filename][col] = validation_helpers.jaccard_similarity( + computed_df=extracted_table_df, + validation_df=validation_table_df, + value_col=col, + ) + + jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index() + prec_recall_df = pd.DataFrame.from_dict( + table_metrics_dict, orient="index" + ).reset_index() + + return ( + jaccard_df, + prec_recall_df, + pd.DataFrame({"filename": incorrect_files}), + { + "table_accuracy": n_equal / n_files, + "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files, + "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files, + 
"avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files, + "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum() + / n_files, + "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files, + "avg_own_per_precision": prec_recall_df["own_per_precision"].sum() + / n_files, + "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum() + / n_files, + "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files, + "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files, + }, + ) + + +def clean_ex21_validation_set(validation_df: pd.DataFrame): + """Clean Ex. 21 validation data to match extracted format.""" + validation_df = validation_df.rename( + columns={ + "Filename": "id", + "Subsidiary": "subsidiary", + "Location of Incorporation": "loc", + "Ownership Percentage": "own_per", + } + ) + validation_df["own_per"] = validation_df["own_per"].astype(str) + validation_df["filename"] = validation_df["id"].apply(get_metadata_filename) + validation_df = clean_extracted_df(validation_df) + return validation_df + + +# Register ex21 extraction pipeline +create_production_pipeline( + "ex21_extraction", + [ + sec10k_filing_metadata, + sec10k_extraction_asset_factory( + "ex21", + Exhibit21Extractor( + cloud_interface=cloud_interface_resource, + layoutlm=LayoutlmResource(mlflow_interface=mlflow_production_interface), + ), + partitions_def=partitions_def, + extraction_metadata_asset_name="ex21_extraction_metadata", + extracted_asset_name="ex21_company_info", + ), + ], + resources={"cloud_interface": cloud_interface_resource}, +) + + +# Register ex21 extraction validation pipeline +create_validation_pipeline( + "ex21_extraction", + [ + ex21_validation_filing_metadata, + sec10k_extraction_asset_factory( + "ex21", + Exhibit21Extractor( + cloud_interface=cloud_interface_resource, + layoutlm=LayoutlmResource(mlflow_interface=mlflow_train_test_interface), + ), + 
filing_metadata_asset_name="ex21_validation_filing_metadata", + extraction_metadata_asset_name="ex21_extraction_validation_metadata", + extracted_asset_name=ex21_extracted_validation_asset_name, + ), + ex21_validation_set, + ex21_validation_metrics, + ], + resources={"cloud_interface": cloud_interface_resource}, +) diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py index ba31fb5..1e88052 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py @@ -1,30 +1,54 @@ """Util functions for training and predicting with LayoutLM on Ex. 21 tables.""" import mlflow +from dagster import ConfigurableResource, InputContext, OutputContext from PIL import ImageDraw, ImageFont +from pydantic import PrivateAttr from transformers import ( Trainer, ) +from mozilla_sec_eia.library.mlflow import MlflowBaseIOManager, MlflowInterface -def log_model(finetuned_model: Trainer): - """Log fine-tuned model to mlflow artifacts.""" - model = {"model": finetuned_model.model, "tokenizer": finetuned_model.tokenizer} - mlflow.transformers.log_model( - model, artifact_path="layoutlm_extractor", task="token-classification" - ) +def _load_pretrained_layoutlm(version: str = "latest") -> dict: + """Function to load layoutlm from mlflow.""" + path = f"models:/layoutlm_extractor/{version}" -def load_model(version=1): - """Load fine-tuned model checkpoint from mlflow artifacts. + return mlflow.transformers.load_model(path, return_type="components") - Returns: A dictionary of the saved individual components of - either the Pipeline or the pre-trained model. - """ - # TODO: want more ability to give load_model a model path? 
- return mlflow.transformers.load_model( - f"models:/layoutlm_extractor/{version}", return_type="components" - ) + +class LayoutlmIOManager(MlflowBaseIOManager): + """Load and log models with mlflow tracking server.""" + + version: int | None = None + + def handle_output(self, context: OutputContext, finetuned_model: Trainer): + """Load metrics to mlflow run/experiment created by `MlflowInterface`.""" + model = {"model": finetuned_model.model, "tokenizer": finetuned_model.tokenizer} + mlflow.transformers.log_model( + model, artifact_path="layoutlm_extractor", task="token-classification" + ) + + def load_input(self, context: InputContext) -> dict: + """Log metrics to mlflow run/experiment created by `MlflowInterface`.""" + return _load_pretrained_layoutlm(self.version) + + +class LayoutlmResource(ConfigurableResource): + """Dagster resource for loading/using pretrained layoutlm model as a resource.""" + + mlflow_interface: MlflowInterface + version: str | None = None + _model_components: dict = PrivateAttr() + + def setup_for_execution(self, context): + """Load layoutlm from mlflow.""" + self._model_components = _load_pretrained_layoutlm(self.version) + + def get_model_components(self): + """Return model components from loaded model.""" + return self._model_components["model"], self._model_components["tokenizer"] def normalize_bboxes(txt_df, pg_meta_df): diff --git a/src/mozilla_sec_eia/pudl_pipelines.py b/src/mozilla_sec_eia/pudl_pipelines.py index 3a427c4..57b3808 100644 --- a/src/mozilla_sec_eia/pudl_pipelines.py +++ b/src/mozilla_sec_eia/pudl_pipelines.py @@ -3,9 +3,9 @@ import logging import coloredlogs -from dagster import Definitions, EnvVar +from dagster import Definitions -from mozilla_sec_eia.library.mlflow import MlflowInterface +from mozilla_sec_eia.library.mlflow import mlflow_production_interface from mozilla_sec_eia.library.pipeline import ( PUDL_PIPELINE_PRODUCTION_ASSETS, PUDL_PIPELINE_PRODUCTION_JOBS, @@ -16,17 +16,11 @@ log_format = "%(asctime)s 
[%(levelname)8s] %(name)s:%(lineno)s %(message)s" coloredlogs.install(fmt=log_format, logger=logger) - -mlflow_interface = MlflowInterface( - experiment_name="", - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - project=EnvVar("GCS_PROJECT"), -) - production_io_resources = {} | PUDL_PIPELINE_PRODUCTION_RESOURCES production_pipelines = Definitions( assets=PUDL_PIPELINE_PRODUCTION_ASSETS, jobs=PUDL_PIPELINE_PRODUCTION_JOBS, - resources=production_io_resources | {"mlflow_interface": mlflow_interface}, + resources=production_io_resources + | {"mlflow_interface": mlflow_production_interface}, ) diff --git a/src/mozilla_sec_eia/pudl_validation_pipelines.py b/src/mozilla_sec_eia/pudl_validation_pipelines.py index 44e7a55..63e9049 100644 --- a/src/mozilla_sec_eia/pudl_validation_pipelines.py +++ b/src/mozilla_sec_eia/pudl_validation_pipelines.py @@ -5,7 +5,10 @@ import coloredlogs from dagster import Definitions -from mozilla_sec_eia.library.mlflow import MlflowInterface, get_mlflow_io_manager +from mozilla_sec_eia.library.mlflow import ( + get_mlflow_io_manager, + mlflow_train_test_interface, +) from mozilla_sec_eia.library.pipeline import ( PUDL_PIPELINE_VALIDATION_ASSETS, PUDL_PIPELINE_VALIDATION_JOBS, @@ -16,17 +19,16 @@ log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" coloredlogs.install(fmt=log_format, logger=logger) - -# Configure at launch so experiment name can be supplied by config -mlflow_interface = MlflowInterface.configure_at_launch() - validation_io_resources = { - key: get_mlflow_io_manager(key, mlflow_interface=mlflow_interface) + key: get_mlflow_io_manager( + key, mlflow_interface=mlflow_train_test_interface, pandas_file_type="csv" + ) for key in ["mlflow_pandas_artifact_io_manager", "mlflow_metrics_io_manager"] } | PUDL_PIPELINE_VALIDATION_RESOURCES validation_pipelines = Definitions( assets=PUDL_PIPELINE_VALIDATION_ASSETS, jobs=PUDL_PIPELINE_VALIDATION_JOBS, - resources=validation_io_resources | {"mlflow_interface": 
mlflow_interface}, + resources=validation_io_resources + | {"mlflow_interface": mlflow_train_test_interface}, ) diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index 9a25fdb..c1cdac0 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -41,6 +41,8 @@ def test_sec10k_extraction(): # Create fake Sec10kExtractor class TestSec10kExtractor(Sec10kExtractor): + name: str = "test_extractor" + def extract_filings(self, filing_metadata): pd.testing.assert_frame_equal(filing_metadata, fake_filing_metadata) return fake_extraction_metadata, fake_extracted From 520e6d122cf8b06945108018730af75b18d9952e Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 3 Sep 2024 10:57:36 -0400 Subject: [PATCH 020/161] Prep for multiple code locations --- .../library/mlflow/__init__.py | 24 +- src/mozilla_sec_eia/library/model_jobs.py | 88 +++++ src/mozilla_sec_eia/library/pipeline.py | 93 ----- src/mozilla_sec_eia/models/sec10k/__init__.py | 57 +++ .../models/sec10k/basic_10k.py | 85 ++++- .../models/sec10k/ex_21/__init__.py | 172 +++++++++ src/mozilla_sec_eia/models/sec10k/extract.py | 75 +++- src/mozilla_sec_eia/models/sec10k/pipeline.py | 360 ------------------ src/mozilla_sec_eia/pudl_pipelines.py | 26 -- .../pudl_validation_pipelines.py | 34 -- tests/unit/models/sec10k/extract_test.py | 5 +- 11 files changed, 491 insertions(+), 528 deletions(-) create mode 100644 src/mozilla_sec_eia/library/model_jobs.py delete mode 100644 src/mozilla_sec_eia/library/pipeline.py delete mode 100644 src/mozilla_sec_eia/models/sec10k/pipeline.py delete mode 100644 src/mozilla_sec_eia/pudl_pipelines.py delete mode 100644 src/mozilla_sec_eia/pudl_validation_pipelines.py diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py index c5e6642..5987f75 100644 --- a/src/mozilla_sec_eia/library/mlflow/__init__.py +++ b/src/mozilla_sec_eia/library/mlflow/__init__.py @@ 
-1,7 +1,5 @@ """Implement tooling to interface with mlflow experiment tracking.""" -from dagster import EnvVar - from .mlflow_io_managers import ( MlflowBaseIOManager, MlflowMetricsIOManager, @@ -12,14 +10,6 @@ get_most_recent_run, ) -mlflow_production_interface = MlflowInterface( - experiment_name="", - tracking_uri=EnvVar("MLFLOW_TRACKING_URI"), - project=EnvVar("GCS_PROJECT"), - tracking_enabled=False, -) -mlflow_train_test_interface = MlflowInterface.configure_at_launch() - def get_mlflow_io_manager( key: str, @@ -40,3 +30,17 @@ def get_mlflow_io_manager( raise RuntimeError(f"MlFlow IO-manager, {key}, does not exist.") return io_manager + + +mlflow_interface_resource = MlflowInterface.configure_at_launch() +mlflow_validation_io_managers = { + "mlflow_metrics_io_manager": get_mlflow_io_manager( + "mlflow_metrics_io_manager", + mlflow_interface=mlflow_interface_resource, + ), + "mlflow_pandas_artifact_io_manager": get_mlflow_io_manager( + "mlflow_pandas_artifact_io_manager", + mlflow_interface=mlflow_interface_resource, + pandas_file_type="csv", + ), +} diff --git a/src/mozilla_sec_eia/library/model_jobs.py b/src/mozilla_sec_eia/library/model_jobs.py new file mode 100644 index 0000000..8bdfc5c --- /dev/null +++ b/src/mozilla_sec_eia/library/model_jobs.py @@ -0,0 +1,88 @@ +"""Implement helper methods for constructing dagster jobs. + +Methods defined here are the main interface for constructing PUDL model jobs. +`create_production_model_job` will produce a dagster job that will use the default +multi-process executor to run a PUDL model. `create_validation_model_job` is meant for +testing/validating models with an mlflow run backing the dagster run for logging. +To avoid problems with mlflow runs, test/validation jobs are run with the dagster +in process executor. 
+""" + +import mlflow +from dagster import ( + AssetsDefinition, + HookContext, + JobDefinition, + define_asset_job, + failure_hook, + in_process_executor, + success_hook, +) +from mlflow.entities import RunStatus + + +def create_production_model_job( + job_name: str, + assets: list[AssetsDefinition], + **kwargs, +) -> JobDefinition: + """Construct a dagster job and supply Definitions with assets and resources.""" + return define_asset_job( + job_name, + selection=assets, + config={ + "ops": {}, + "resources": { + "mlflow_interface": { + "config": { + "experiment_name": job_name, + "tracking_enabled": False, + } + } + }, + }, + **kwargs, + ) + + +@success_hook(required_resource_keys={"mlflow_interface"}) +def log_op_config(context: HookContext): + """Log any config supplied to ops/assets in validation job to mlflow tracking server.""" + if context.op_config is not None: + mlflow.log_params(context.op_config) + + +@failure_hook(required_resource_keys={"mlflow_interface"}) +def end_run_on_failure(context: HookContext): + """Inform mlflow about job failure.""" + if isinstance(context.op_exception, KeyboardInterrupt): + mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED)) + else: + mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED)) + + +def create_validation_model_job( + job_name: str, + assets: list[AssetsDefinition], + **kwargs, +): + """Construct a dagster job and supply Definitions with assets and resources.""" + return define_asset_job( + job_name, + selection=assets, + executor_def=in_process_executor, + hooks={log_op_config, end_run_on_failure}, + # Configure mlflow_interface for job with appropriate experiment name + config={ + "ops": {}, + "resources": { + "mlflow_interface": { + "config": { + "experiment_name": job_name, + "tracking_enabled": True, + } + } + }, + }, + **kwargs, + ) diff --git a/src/mozilla_sec_eia/library/pipeline.py b/src/mozilla_sec_eia/library/pipeline.py deleted file mode 100644 index e35f1a9..0000000 --- 
a/src/mozilla_sec_eia/library/pipeline.py +++ /dev/null @@ -1,93 +0,0 @@ -"""Implement helper methods for constructing dagster jobs. - -Methods defined here are the main interface for constructing PUDL model jobs. -`create_production_pipeline` will produce a dagster job that will use the default -multi-process executor to run a PUDL model. `create_validation_pipeline` is meant for -testing/validating models with an mlflow run backing the dagster run for logging. -""" - -import mlflow -from dagster import ( - AssetsDefinition, - HookContext, - ResourceDefinition, - define_asset_job, - failure_hook, - in_process_executor, - success_hook, -) -from mlflow.entities import RunStatus - -PUDL_PIPELINE_PRODUCTION_JOBS = [] -PUDL_PIPELINE_PRODUCTION_ASSETS = [] -PUDL_PIPELINE_PRODUCTION_RESOURCES = {} - -PUDL_PIPELINE_VALIDATION_JOBS = [] -PUDL_PIPELINE_VALIDATION_ASSETS = [] -PUDL_PIPELINE_VALIDATION_RESOURCES = {} - - -def create_production_pipeline( - pipeline_name: str, - assets: list[AssetsDefinition], - resources: dict[str, ResourceDefinition], - **kwargs, -): - """Construct a dagster job and supply Definitions with assets and resources.""" - PUDL_PIPELINE_PRODUCTION_JOBS.append( - define_asset_job( - pipeline_name, - selection=assets, - **kwargs, - ) - ) - PUDL_PIPELINE_PRODUCTION_ASSETS.extend(assets) - PUDL_PIPELINE_PRODUCTION_RESOURCES.update(resources) - - -@success_hook(required_resource_keys={"mlflow_interface"}) -def log_op_config(context: HookContext): - """Log any config supplied to ops/assets in validation job to mlflow tracking server.""" - if context.op_config is not None: - mlflow.log_params(context.op_config) - - -@failure_hook(required_resource_keys={"mlflow_interface"}) -def end_run_on_failure(context: HookContext): - """Inform mlflow about job failure.""" - if isinstance(context.op_exception, KeyboardInterrupt): - mlflow.end_run(status=RunStatus.to_string(RunStatus.KILLED)) - else: - mlflow.end_run(status=RunStatus.to_string(RunStatus.FAILED)) - - 
-def create_validation_pipeline( - pipeline_name: str, - assets: list[AssetsDefinition], - resources: dict[str, ResourceDefinition], - **kwargs, -): - """Construct a dagster job and supply Definitions with assets and resources.""" - PUDL_PIPELINE_VALIDATION_JOBS.append( - define_asset_job( - pipeline_name, - selection=assets, - executor_def=in_process_executor, - hooks={log_op_config, end_run_on_failure}, - # Configure mlflow_interface for job with appropriate experiment name - config={ - "ops": {}, - "resources": { - "mlflow_interface": { - "config": { - "experiment_name": pipeline_name, - "tracking_enabled": True, - } - } - }, - }, - **kwargs, - ) - ) - PUDL_PIPELINE_VALIDATION_ASSETS.extend(assets) - PUDL_PIPELINE_VALIDATION_RESOURCES.update(resources) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 001c6ad..1e2d56b 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -1 +1,58 @@ """Implement models to extract data from SEC10k filings.""" + +from dagster import ( + Definitions, + load_assets_from_modules, + load_assets_from_package_module, +) + +from mozilla_sec_eia.library import model_jobs +from mozilla_sec_eia.library.mlflow import ( + MlflowInterface, + mlflow_interface_resource, + mlflow_validation_io_managers, +) + +from . 
import basic_10k, ex_21, extract +from .utils.cloud import cloud_interface_resource + +basic_10k_assets = load_assets_from_modules([basic_10k]) +ex21_assets = load_assets_from_package_module(ex_21) +shared_assets = load_assets_from_modules([extract]) + +basic_10k_production_job = model_jobs.create_production_model_job( + "basic_10k_extraction", + basic_10k.production_assets, +) + +basic_10k_validation_job = model_jobs.create_production_model_job( + "basic_10k_extraction_validation", + basic_10k.validation_assets, +) + + +ex21_production_job = model_jobs.create_production_model_job( + "ex21_extraction", + ex_21.production_assets, +) + +ex21_validation_job = model_jobs.create_validation_model_job( + "ex21_extraction_validation", + ex_21.validation_assets, +) + + +defs = Definitions( + assets=basic_10k_assets + ex21_assets + shared_assets, + jobs=[ + basic_10k_production_job, + basic_10k_validation_job, + ex21_production_job, + ex21_validation_job, + ], + resources={ + "cloud_interface": cloud_interface_resource, + "mlflow_interface": mlflow_interface_resource, + } + | mlflow_validation_io_managers, +) diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index df57ac5..a4c0230 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -3,9 +3,16 @@ import logging import pandas as pd +from dagster import AssetIn, asset -from .extract import Sec10kExtractor -from .utils.cloud import Sec10K +from mozilla_sec_eia.library import validation_helpers + +from .extract import ( + Sec10kExtractor, + sec10k_extraction_asset_factory, + sec10k_filing_metadata, +) +from .utils.cloud import GCSArchive, Sec10K, cloud_interface_resource logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -99,3 +106,77 @@ def extract_filings( ["filename", "filer_count", "block", "block_count", "key"] ), ) + + +@asset +def basic_10k_validation_set() -> pd.DataFrame: + """Return dataframe 
containing basic 10k validation data.""" + return validation_helpers.load_validation_data( + "basic_10k_labels.csv", + index_cols=["filename", "filer_count", "block", "block_count", "key"], + ) + + +basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation" + + +@asset( + ins={ + basic_10k_extracted_validation_asset_name: AssetIn( + basic_10k_extracted_validation_asset_name + ), + "basic_10k_validation_set": AssetIn(), + }, + io_manager_key="mlflow_metrics_io_manager", +) +def basic_10k_extraction_validation_metrics(**kwargs): + """Compute basic 10k extraction validation metrics.""" + computed = kwargs[basic_10k_extracted_validation_asset_name] + validation = kwargs["basic_10k_validation_set"] + + return validation_helpers.pandas_compute_precision_recall( + computed, validation, value_col="value" + ) + + +@asset +def basic_10k_validation_filing_metadata( + cloud_interface: GCSArchive, + basic_10k_validation_set: pd.DataFrame, +) -> pd.DataFrame: + """Get sec 10k filing metadata from validation set.""" + filing_metadata = cloud_interface.get_metadata() + return filing_metadata[ + filing_metadata["filename"].isin( + basic_10k_validation_set.index.get_level_values("filename").unique() + ) + ] + + +basic_10k_extractor_resource = Basic10kExtractor( + cloud_interface=cloud_interface_resource +) +basic_10k_production_extraction = sec10k_extraction_asset_factory( + "basic_10k", + basic_10k_extractor_resource, + extraction_metadata_asset_name="basic_10k_extraction_metadata", + extracted_asset_name="basic_10k_company_info", +) + + +basic_10k_validation_extraction = sec10k_extraction_asset_factory( + "basic_10k_validation", + basic_10k_extractor_resource, + filing_metadata_asset_name="basic_10k_validation_filing_metadata", + extraction_metadata_asset_name="basic_10k_extraction_validation_metadata", + extracted_asset_name=basic_10k_extracted_validation_asset_name, +) + +production_assets = [basic_10k_production_extraction, sec10k_filing_metadata] + 
+validation_assets = [ + basic_10k_validation_extraction, + basic_10k_validation_set, + basic_10k_validation_filing_metadata, + basic_10k_extraction_validation_metrics, +] diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 549c348..52b8a9d 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -1 +1,173 @@ """Module for working with exhibit 21 data.""" + +import pandas as pd +from dagster import AssetIn, AssetOut, asset, multi_asset + +from mozilla_sec_eia.library import validation_helpers +from mozilla_sec_eia.library.mlflow import mlflow_interface_resource + +from ..extract import sec10k_extraction_asset_factory, sec10k_filing_metadata +from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename +from ..utils.layoutlm import LayoutlmResource +from .inference import Exhibit21Extractor, clean_extracted_df + + +@asset +def ex21_validation_set() -> pd.DataFrame: + """Return dataframe containing basic 10k validation data.""" + return clean_ex21_validation_set( + validation_helpers.load_validation_data("ex21_labels.csv") + ) + + +@asset +def ex21_validation_filing_metadata( + cloud_interface: GCSArchive, + ex21_validation_set: pd.DataFrame, +) -> pd.DataFrame: + """Get sec 10k filing metadata from validation set.""" + filing_metadata = cloud_interface.get_metadata() + return filing_metadata[ + filing_metadata["filename"].isin( + ex21_validation_set.index.get_level_values("filename").unique() + ) + ] + + +ex21_extracted_validation_asset_name = "ex21_validation" + + +@multi_asset( + ins={ + "computed_df": AssetIn(ex21_extracted_validation_asset_name), + "validation_df": AssetIn("ex21_validation_set"), + }, + outs={ + "ex21_jaccard_per_table": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + "ex21_precision_recall_per_table": AssetOut( + 
io_manager_key="mlflow_pandas_artifact_io_manager" + ), + "ex21_incorrect_filenames": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + "ex21_extraction_metrics": AssetOut(io_manager_key="mlflow_metrics_io_manager"), + }, +) +def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame): + """Compute validation metrics for Ex. 21 extraction.""" + shared_cols = validation_df.columns.intersection(computed_df.columns) + validation_df = validation_df.astype(computed_df[shared_cols].dtypes) + n_equal = 0 + validation_filenames = validation_df["id"].unique() + n_files = len(validation_filenames) + table_metrics_dict = {} + jaccard_dict = {} + incorrect_files = [] + # iterate through each file and check each extracted table + for filename in validation_filenames: + extracted_table_df = computed_df[computed_df["id"] == filename].reset_index( + drop=True + ) + validation_table_df = validation_df[ + validation_df["id"] == filename + ].reset_index(drop=True) + # check if the tables are exactly equal + if extracted_table_df.equals(validation_table_df): + # TODO: strip llc and other company strings before comparison + n_equal += 1 + else: + incorrect_files.append(filename) + # compute precision and recall for each column + table_metrics_dict[filename] = {} + jaccard_dict[filename] = {} + for col in ["subsidiary", "loc", "own_per"]: + table_prec_recall = validation_helpers.pandas_compute_precision_recall( + extracted_table_df, validation_table_df, value_col=col + ) + table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[ + "precision" + ] + table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"] + # get the jaccard similarity between columns + jaccard_dict[filename][col] = validation_helpers.jaccard_similarity( + computed_df=extracted_table_df, + validation_df=validation_table_df, + value_col=col, + ) + + jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index() + prec_recall_df = 
pd.DataFrame.from_dict( + table_metrics_dict, orient="index" + ).reset_index() + + return ( + jaccard_df, + prec_recall_df, + pd.DataFrame({"filename": incorrect_files}), + { + "table_accuracy": n_equal / n_files, + "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files, + "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files, + "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files, + "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum() + / n_files, + "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files, + "avg_own_per_precision": prec_recall_df["own_per_precision"].sum() + / n_files, + "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum() + / n_files, + "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files, + "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files, + }, + ) + + +def clean_ex21_validation_set(validation_df: pd.DataFrame): + """Clean Ex. 21 validation data to match extracted format.""" + validation_df = validation_df.rename( + columns={ + "Filename": "id", + "Subsidiary": "subsidiary", + "Location of Incorporation": "loc", + "Ownership Percentage": "own_per", + } + ) + validation_df["own_per"] = validation_df["own_per"].astype(str) + validation_df["filename"] = validation_df["id"].apply(get_metadata_filename) + validation_df = clean_extracted_df(validation_df) + return validation_df + + +exhibit_21_extractor_resource = Exhibit21Extractor( + cloud_interface=cloud_interface_resource, + layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource), +) +ex21_production_extraction = sec10k_extraction_asset_factory( + "ex21", + exhibit_21_extractor_resource, + extraction_metadata_asset_name="ex21_extraction_metadata", + extracted_asset_name="ex21_company_info", +) + + +ex21_validation_extraction = sec10k_extraction_asset_factory( + "ex21_validation", + exhibit_21_extractor_resource, + 
filing_metadata_asset_name="ex21_validation_filing_metadata", + extraction_metadata_asset_name="ex21_extraction_validation_metadata", + extracted_asset_name=ex21_extracted_validation_asset_name, +) + +production_assets = [ + sec10k_filing_metadata, + ex21_production_extraction, +] + +validation_assets = [ + ex21_validation_set, + ex21_validation_filing_metadata, + ex21_validation_extraction, + ex21_validation_metrics, +] diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 4461b49..8904b53 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -1,7 +1,16 @@ """Implement base class for an SEC10k extractor.""" import pandas as pd -from dagster import ConfigurableResource +from dagster import ( + AssetExecutionContext, + AssetIn, + AssetOut, + ConfigurableResource, + StaticPartitionsDefinition, + asset, + multi_asset, + with_resources, +) from .utils.cloud import GCSArchive @@ -19,3 +28,67 @@ def extract_filings( raise NotImplementedError( "extract_filings must be implemented by any subclass!" 
) + + +# Create year_quarter partitions +year_quarter_partitions = StaticPartitionsDefinition( + [f"{year}q{quarter}" for year in range(1994, 2024) for quarter in range(1, 5)] +) + + +@asset(partitions_def=year_quarter_partitions) +def sec10k_filing_metadata( + context: AssetExecutionContext, + cloud_interface: GCSArchive, +) -> pd.DataFrame: + """Return filing metadata for year_quarter partition.""" + year_quarter = context.partition_key + df = cloud_interface.get_metadata(year_quarter=year_quarter) + return df + + +def sec10k_extraction_asset_factory( + name: str, + sec10k_extractor: Sec10kExtractor, + partitions_def=year_quarter_partitions, + filing_metadata_asset_name: str = "sec10k_filing_metadata", + extraction_metadata_asset_name: str = "extraction_metadata", + extracted_asset_name: str = "extraction_metadata", +): + """Create asset to extract data from sec10k data. + + Args: + name: Name of extraction asset. + sec10k_extractor: Subclass of Sec10kExtractor used to extract data. + partitions_def: Partitions for asset (production uses year_quarter parts, + validation is not partitioned. + filing_metadata_asset_name: Name of input asset with metadata of filings to + extract. + extraction_metadata_asset_name: Name of output asset containing metadata + from extraction run. + extracted_asset_name: Name of output asset containing extracted data. 
+ """ + + @multi_asset( + name=name, + outs={ + extraction_metadata_asset_name: AssetOut(), + extracted_asset_name: AssetOut(), + }, + ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)}, + partitions_def=partitions_def, + required_resource_keys={sec10k_extractor.name}, + ) + def extract_filings( + context: AssetExecutionContext, sec10k_filing_metadata: pd.DataFrame + ) -> tuple[pd.DataFrame, pd.DataFrame]: + """Run Sec10kExtractor on selected partition and return.""" + extractor = context.resources.original_resource_dict[sec10k_extractor.name] + extraction_metadata, extracted = extractor.extract_filings( + sec10k_filing_metadata + ) + return extraction_metadata, extracted + + return with_resources([extract_filings], {sec10k_extractor.name: sec10k_extractor})[ + 0 + ] diff --git a/src/mozilla_sec_eia/models/sec10k/pipeline.py b/src/mozilla_sec_eia/models/sec10k/pipeline.py deleted file mode 100644 index 5604d6a..0000000 --- a/src/mozilla_sec_eia/models/sec10k/pipeline.py +++ /dev/null @@ -1,360 +0,0 @@ -"""Implement top level extraction methods and tooling.""" - -import logging - -import pandas as pd -import pandera as pa -from dagster import ( - AssetExecutionContext, - AssetIn, - AssetOut, - StaticPartitionsDefinition, - asset, - multi_asset, - with_resources, -) - -from mozilla_sec_eia.library import validation_helpers -from mozilla_sec_eia.library.mlflow import ( - mlflow_production_interface, - mlflow_train_test_interface, -) -from mozilla_sec_eia.library.pipeline import ( - create_production_pipeline, - create_validation_pipeline, -) - -from .basic_10k import Basic10kExtractor -from .ex_21.inference import Exhibit21Extractor, clean_extracted_df -from .extract import Sec10kExtractor -from .utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename -from .utils.layoutlm import LayoutlmResource - -logger = logging.getLogger(f"catalystcoop.{__name__}") - -DATASETS = ["ex21", "basic_10k"] - - -class 
ExtractionMetadataSchema(pa.DataFrameModel): - """Define the required schema for extraction metadata. - - Extra columns are permitted, but these are required for computing extraction metrics. - """ - - filename: pa.typing.Index[str] = pa.Field(check_name=True) - success: bool = pa.Field(coerce=True) - - -# Create year_quarter partitions -partitions_def = StaticPartitionsDefinition( - [f"{year}q{quarter}" for year in range(1994, 2024) for quarter in range(1, 5)] -) - - -@asset(partitions_def=partitions_def) -def sec10k_filing_metadata( - context: AssetExecutionContext, - cloud_interface: GCSArchive, -) -> pd.DataFrame: - """Return filing metadata for year_quarter partition.""" - year_quarter = context.partition_key - df = cloud_interface.get_metadata(year_quarter=year_quarter) - return df - - -def sec10k_extraction_asset_factory( - name: str, - sec10k_extractor: Sec10kExtractor, - partitions_def=None, - filing_metadata_asset_name: str = "sec10k_filing_metadata", - extraction_metadata_asset_name: str = "extraction_metadata", - extracted_asset_name: str = "extraction_metadata", -): - """Create asset to extract data from sec10k data. - - Args: - name: Name of extraction asset. - sec10k_extractor: Subclass of Sec10kExtractor used to extract data. - partitions_def: Partitions for asset (production uses year_quarter parts, - validation is not partitioned. - filing_metadata_asset_name: Name of input asset with metadata of filings to - extract. - extraction_metadata_asset_name: Name of output asset containing metadata - from extraction run. - extracted_asset_name: Name of output asset containing extracted data. 
- """ - - @multi_asset( - name=name, - outs={ - extraction_metadata_asset_name: AssetOut(), - extracted_asset_name: AssetOut(), - }, - ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)}, - partitions_def=partitions_def, - required_resource_keys={sec10k_extractor.name}, - ) - def extract_filings( - context: AssetExecutionContext, sec10k_filing_metadata: pd.DataFrame - ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Run Sec10kExtractor on selected partition and return.""" - extractor = context.resources.original_resource_dict[sec10k_extractor.name] - extraction_metadata, extracted = extractor.extract_filings( - sec10k_filing_metadata - ) - return extraction_metadata, extracted - - return with_resources([extract_filings], {sec10k_extractor.name: sec10k_extractor})[ - 0 - ] - - -@asset -def basic_10k_validation_set() -> pd.DataFrame: - """Return dataframe containing basic 10k validation data.""" - return validation_helpers.load_validation_data( - "basic_10k_labels.csv", - index_cols=["filename", "filer_count", "block", "block_count", "key"], - ) - - -basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation" - - -@asset( - ins={ - basic_10k_extracted_validation_asset_name: AssetIn( - basic_10k_extracted_validation_asset_name - ), - "basic_10k_validation_set": AssetIn(), - }, - io_manager_key="mlflow_metrics_io_manager", -) -def basic_10k_extraction_validation_metrics(**kwargs): - """Compute basic 10k extraction validation metrics.""" - computed = kwargs[basic_10k_extracted_validation_asset_name] - validation = kwargs["basic_10k_validation_set"] - - return validation_helpers.pandas_compute_precision_recall( - computed, validation, value_col="value" - ) - - -@asset -def basic_10k_validation_filing_metadata( - cloud_interface: GCSArchive, - basic_10k_validation_set: pd.DataFrame, -) -> pd.DataFrame: - """Get sec 10k filing metadata from validation set.""" - filing_metadata = cloud_interface.get_metadata() - return filing_metadata[ - 
filing_metadata["filename"].isin( - basic_10k_validation_set.index.get_level_values("filename").unique() - ) - ] - - -# Register basic 10k extraction pipeline -create_production_pipeline( - "basic_10k_extraction", - [ - sec10k_filing_metadata, - sec10k_extraction_asset_factory( - "basic_10k", - Basic10kExtractor(cloud_interface=cloud_interface_resource), - partitions_def=partitions_def, - extraction_metadata_asset_name="basic_10k_extraction_metadata", - extracted_asset_name="basic_10k_company_info", - ), - ], - resources={"cloud_interface": cloud_interface_resource}, -) - - -# Register basic 10k extraction validation pipeline -create_validation_pipeline( - "basic_10k_extraction", - [ - basic_10k_validation_filing_metadata, - sec10k_extraction_asset_factory( - "basic_10k", - Basic10kExtractor(cloud_interface=cloud_interface_resource), - filing_metadata_asset_name="basic_10k_validation_filing_metadata", - extraction_metadata_asset_name="basic_10k_extraction_validation_metadata", - extracted_asset_name=basic_10k_extracted_validation_asset_name, - ), - basic_10k_validation_set, - basic_10k_extraction_validation_metrics, - ], - resources={"cloud_interface": cloud_interface_resource}, -) - - -@asset -def ex21_validation_set() -> pd.DataFrame: - """Return dataframe containing basic 10k validation data.""" - return clean_ex21_validation_set( - validation_helpers.load_validation_data("ex21_labels.csv") - ) - - -@asset -def ex21_validation_filing_metadata( - cloud_interface: GCSArchive, - ex21_validation_set: pd.DataFrame, -) -> pd.DataFrame: - """Get sec 10k filing metadata from validation set.""" - filing_metadata = cloud_interface.get_metadata() - return filing_metadata[ - filing_metadata["filename"].isin( - ex21_validation_set.index.get_level_values("filename").unique() - ) - ] - - -ex21_extracted_validation_asset_name = "ex21_validation" - - -@multi_asset( - ins={ - "computed_df": AssetIn(ex21_extracted_validation_asset_name), - "validation_df": 
AssetIn("ex21_validation_set"), - }, - outs={ - "ex21_jaccard_per_table": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" - ), - "ex21_precision_recall_per_table": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" - ), - "ex21_incorrect_filenames": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" - ), - "ex21_extraction_metrics": AssetOut(io_manager_key="mlflow_metrics_io_manager"), - }, -) -def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame): - """Compute validation metrics for Ex. 21 extraction.""" - shared_cols = validation_df.columns.intersection(computed_df.columns) - validation_df = validation_df.astype(computed_df[shared_cols].dtypes) - n_equal = 0 - validation_filenames = validation_df["id"].unique() - n_files = len(validation_filenames) - table_metrics_dict = {} - jaccard_dict = {} - incorrect_files = [] - # iterate through each file and check each extracted table - for filename in validation_filenames: - extracted_table_df = computed_df[computed_df["id"] == filename].reset_index( - drop=True - ) - validation_table_df = validation_df[ - validation_df["id"] == filename - ].reset_index(drop=True) - # check if the tables are exactly equal - if extracted_table_df.equals(validation_table_df): - # TODO: strip llc and other company strings before comparison - n_equal += 1 - else: - incorrect_files.append(filename) - # compute precision and recall for each column - table_metrics_dict[filename] = {} - jaccard_dict[filename] = {} - for col in ["subsidiary", "loc", "own_per"]: - table_prec_recall = validation_helpers.pandas_compute_precision_recall( - extracted_table_df, validation_table_df, value_col=col - ) - table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[ - "precision" - ] - table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"] - # get the jaccard similarity between columns - jaccard_dict[filename][col] = validation_helpers.jaccard_similarity( - 
computed_df=extracted_table_df, - validation_df=validation_table_df, - value_col=col, - ) - - jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index() - prec_recall_df = pd.DataFrame.from_dict( - table_metrics_dict, orient="index" - ).reset_index() - - return ( - jaccard_df, - prec_recall_df, - pd.DataFrame({"filename": incorrect_files}), - { - "table_accuracy": n_equal / n_files, - "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files, - "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files, - "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files, - "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum() - / n_files, - "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files, - "avg_own_per_precision": prec_recall_df["own_per_precision"].sum() - / n_files, - "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum() - / n_files, - "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files, - "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files, - }, - ) - - -def clean_ex21_validation_set(validation_df: pd.DataFrame): - """Clean Ex. 
21 validation data to match extracted format.""" - validation_df = validation_df.rename( - columns={ - "Filename": "id", - "Subsidiary": "subsidiary", - "Location of Incorporation": "loc", - "Ownership Percentage": "own_per", - } - ) - validation_df["own_per"] = validation_df["own_per"].astype(str) - validation_df["filename"] = validation_df["id"].apply(get_metadata_filename) - validation_df = clean_extracted_df(validation_df) - return validation_df - - -# Register ex21 extraction pipeline -create_production_pipeline( - "ex21_extraction", - [ - sec10k_filing_metadata, - sec10k_extraction_asset_factory( - "ex21", - Exhibit21Extractor( - cloud_interface=cloud_interface_resource, - layoutlm=LayoutlmResource(mlflow_interface=mlflow_production_interface), - ), - partitions_def=partitions_def, - extraction_metadata_asset_name="ex21_extraction_metadata", - extracted_asset_name="ex21_company_info", - ), - ], - resources={"cloud_interface": cloud_interface_resource}, -) - - -# Register ex21 extraction validation pipeline -create_validation_pipeline( - "ex21_extraction", - [ - ex21_validation_filing_metadata, - sec10k_extraction_asset_factory( - "ex21", - Exhibit21Extractor( - cloud_interface=cloud_interface_resource, - layoutlm=LayoutlmResource(mlflow_interface=mlflow_train_test_interface), - ), - filing_metadata_asset_name="ex21_validation_filing_metadata", - extraction_metadata_asset_name="ex21_extraction_validation_metadata", - extracted_asset_name=ex21_extracted_validation_asset_name, - ), - ex21_validation_set, - ex21_validation_metrics, - ], - resources={"cloud_interface": cloud_interface_resource}, -) diff --git a/src/mozilla_sec_eia/pudl_pipelines.py b/src/mozilla_sec_eia/pudl_pipelines.py deleted file mode 100644 index 57b3808..0000000 --- a/src/mozilla_sec_eia/pudl_pipelines.py +++ /dev/null @@ -1,26 +0,0 @@ -"""Define production pipelines for running PUDL models.""" - -import logging - -import coloredlogs -from dagster import Definitions - -from 
mozilla_sec_eia.library.mlflow import mlflow_production_interface -from mozilla_sec_eia.library.pipeline import ( - PUDL_PIPELINE_PRODUCTION_ASSETS, - PUDL_PIPELINE_PRODUCTION_JOBS, - PUDL_PIPELINE_PRODUCTION_RESOURCES, -) - -logger = logging.getLogger("catalystcoop") -log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" -coloredlogs.install(fmt=log_format, logger=logger) - -production_io_resources = {} | PUDL_PIPELINE_PRODUCTION_RESOURCES - -production_pipelines = Definitions( - assets=PUDL_PIPELINE_PRODUCTION_ASSETS, - jobs=PUDL_PIPELINE_PRODUCTION_JOBS, - resources=production_io_resources - | {"mlflow_interface": mlflow_production_interface}, -) diff --git a/src/mozilla_sec_eia/pudl_validation_pipelines.py b/src/mozilla_sec_eia/pudl_validation_pipelines.py deleted file mode 100644 index 63e9049..0000000 --- a/src/mozilla_sec_eia/pudl_validation_pipelines.py +++ /dev/null @@ -1,34 +0,0 @@ -"""Define jobs to test/validate PUDL models.""" - -import logging - -import coloredlogs -from dagster import Definitions - -from mozilla_sec_eia.library.mlflow import ( - get_mlflow_io_manager, - mlflow_train_test_interface, -) -from mozilla_sec_eia.library.pipeline import ( - PUDL_PIPELINE_VALIDATION_ASSETS, - PUDL_PIPELINE_VALIDATION_JOBS, - PUDL_PIPELINE_VALIDATION_RESOURCES, -) - -logger = logging.getLogger("catalystcoop") -log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s" -coloredlogs.install(fmt=log_format, logger=logger) - -validation_io_resources = { - key: get_mlflow_io_manager( - key, mlflow_interface=mlflow_train_test_interface, pandas_file_type="csv" - ) - for key in ["mlflow_pandas_artifact_io_manager", "mlflow_metrics_io_manager"] -} | PUDL_PIPELINE_VALIDATION_RESOURCES - -validation_pipelines = Definitions( - assets=PUDL_PIPELINE_VALIDATION_ASSETS, - jobs=PUDL_PIPELINE_VALIDATION_JOBS, - resources=validation_io_resources - | {"mlflow_interface": mlflow_train_test_interface}, -) diff --git 
a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index c1cdac0..a413767 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -5,8 +5,8 @@ import pandas as pd from dagster import asset, build_asset_context, materialize -from mozilla_sec_eia.models.sec10k.extract import Sec10kExtractor -from mozilla_sec_eia.models.sec10k.pipeline import ( +from mozilla_sec_eia.models.sec10k.extract import ( + Sec10kExtractor, sec10k_extraction_asset_factory, sec10k_filing_metadata, ) @@ -71,6 +71,7 @@ def fake_filing_metadata_asset(): filing_metadata_asset_name="fake_filing_metadata_asset", extracted_asset_name="test_sec10k_extraction", extraction_metadata_asset_name="test_sec10k_extraction_metadata", + partitions_def=None, ) # Run assets and review results From e99ee1ae6063cb976191a2937e90ac10f61d64ca Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 3 Sep 2024 11:04:58 -0400 Subject: [PATCH 021/161] Add top-level worksapce file --- workspace.yaml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 workspace.yaml diff --git a/workspace.yaml b/workspace.yaml new file mode 100644 index 0000000..144aada --- /dev/null +++ b/workspace.yaml @@ -0,0 +1,2 @@ +load_from: + - python_module: mozilla_sec_eia.models.sec10k From 559c0e6c813733f8a57cf9242fc3ac1e22680bf2 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 3 Sep 2024 12:55:14 -0400 Subject: [PATCH 022/161] Restructure docs --- README.rst | 155 ++++++++----------- src/mozilla_sec_eia/models/sec10k/README.rst | 99 ++++++++++++ 2 files changed, 160 insertions(+), 94 deletions(-) create mode 100644 src/mozilla_sec_eia/models/sec10k/README.rst diff --git a/README.rst b/README.rst index 6516ee4..d036e6c 100644 --- a/README.rst +++ b/README.rst @@ -1,4 +1,4 @@ -mozilla-sec-eia: Developing a linkage between SEC and EIA +pudl-models: ML models developed for PUDL ======================================================================================= .. 
readme-intro @@ -27,100 +27,67 @@ mozilla-sec-eia: Developing a linkage between SEC and EIA :target: https://github.com/psf/black> :alt: Any color you want, so long as it's black. -This repo contains exploratory development for an SEC-EIA linkage. - -Usage +About ----- - -CLI -^^^ -The CLI uses a sub-command structure, so new commands and workflows can easily be -added during development. It's usage is as following: - -``mozilla_dev {COMMAND} {OPTIONS}`` - -The available commands are ``validate_archive``, which validates that all filings on -the GCS archive align with those described in the metadata DB, ``finetune_ex21``, -which will finetune the exhibit 21 extractor and log the model using mlflow, and -``rename_filings``, which will rename labeled filings on GCS. - -Experiment/Model Tracking -^^^^^^^^^^^^^^^^^^^^^^^^^ -We've setup a remote tracking server using `mlflow `_ -to manage tracking, caching, and versioning models developed as a part of this project. -To interact with the server through the UI, go `here `_ -and login using the username and password stored in gcloud secret manager. -There is currently a finetuned layoutlm model for extracting exhibit 21 data stored -on the server. This model can be accessed using the method -``src/mozilla_sec_eia/utils/cloud.py:load_model``. This will return a dictionary -containing ``model`` and ``tokenizer`` fields. - -Helper Tools -^^^^^^^^^^^^ -Utility functions for accessing and working with 10k filings as well as their exhibit -21 attachments can be found in 'src/mozilla_sec_eia/utils/cloud.py'. The base class is -the ``GCSArchive`` which provides an interface to archived filings on GCS. 
To -instantiate this class, the following environment variables need to be set, or defined -in a ``.env`` file: - -``GCS_BUCKET_NAME`` -``GCS_METADATA_DB_INSTANCE_CONNECTION`` -``GCS_IAM_USER`` -``GCS_METADATA_DB_NAME`` -``GCS_PROJECT`` -``MLFLOW_TRACKING_URI`` - -This code sample shows how to use the class to fetch filings from the archive: - -.. code-block:: python - - from mozilla_sec_eia.utils.cloud import GCSArchive - archive = GCSArchive() - - # Get metadata from postgres instance - metadata_df = archive.get_metadata() - - # Do some filtering to get filings of interest - filings = metadata_df.loc[... # Get rows from original df - - # This will download and cache filings locally for later use - # Successive calls to get_filings will not re-download filings which are already cahced - downloaded_filings = archive.get_filings(filings) - - # Get exhibit 21's and extract subsidiary data - for filing in downloaded_filings: - cool_extraction_model(filing.get_ex_21().as_image()) - -Labeling --------- -We are using `Label Studio `_ to create training data -for fine-tuning the Ex. 21 extraction model. The very preliminary workflow -for labeling data is as follows: - -* For each filing that you want to label, follow notebook 7 to create the - inputs for Label Studio. This notebook first creates a PDF of the filing. - Then, it extracts the bounding boxes around each word and create a "task" - JSON and image for each Ex. 21 table that will be used in Label Studio. -* Upload these JSONs and images to the same bucket in GCS (the "unlabeled" - bucket by default). -* `Install Label Studio `_ -* Start Label Studio locally and create a project. -* Under Settings, set the template/config for the project with the config - found in ``labeling-configs/labeling-config.xml``. This should create the - correct entity labels and UI setup. 
-* Connect GCS to Label Studio by following `these directions - `_ -* Specific Label Studio settings: Filter files for only JSONs - (these are your tasks). Leave "Treat every bucket object as a source file" - disabled. Add the service account authentication JSON for your bucket. -* Additionally add a Target Storage bucket (the "labeled" bucket by - default). -* Import data and label Ex. 21 tables. -* Sync with target storage. -* Update the ``labeled_data_tracking.csv`` with the new filings you've - labeled. -* Run the ``rename_labeled_filings.py`` script to update labeled file - names in the GCS bucket with their SEC filename. +The `PUDL `__ project makes US energy data free and open +for all. For more information, see the PUDL repo and `website `__. + +This repo implements machine learning models which support PUDL. The types of +modelling performed here include record linkage between datasets, and extracting +structured data from unstructured documents. The outputs of these models then feed +into PUDL tables, and are distributed in the PUDL data warehouse. + +Project Structure +----------------- +This repo is split into two main sections, with shared tooling being implemented in +``src/mozilla_sec_eia/library`` and actual models implemented in +``src/mozilla_sec_eia/models``. + +Models +^^^^^^ +Each model is contained in its own Dagster +`code location `__. This keeps models +isolated from each other, allowing finetuned dependency management, and provides useful +organization in the Dagster UI. To add a new model, you must create a new python module +in the ``src/mozilla_sec_eia/models/`` directory. This module should define a single +Dagster ``Definitions`` object which can be imported from the top-level of the module. +For reference on how to structure a code location, see +``src/mozilla_sec_eia/models/sec10k/`` for an example. After creating a new model, +it must be added to +`workspace.yaml `__. 
+ +There are three types of dagster `jobs `__ +expected in a model code location: + +* **Production Jobs**: Production jobs define a pipeline to execute a model and produce + outputs which typically feed into PUDL. +* **Validation Jobs**: Validation jobs are used to test/validate models. They will be + run in a single process with an + `mlflow `__ run backing + them to allow logging results to a tracking server. +* **Training Jobs**: Training jobs are meant to train models and log results with + mlflow for use in production jobs. + +There are helper functions in ``src/mozilla_sec_eia/library/model_jobs.py`` for +constructing each of these jobs. These functions help to ensure each job will +use the appropriate executor and supply the job with necessary resources. + +Library +^^^^^^^ +There's generic shared tooling for ``pudl-models`` defined in +``src/mozilla_sec_eia/library/``. This includes the helper functions for +constructing dagster jobs discussed above, as well as useful methods for computing +validation metrics, and an interface to our mlflow tracking server integrated with +our tracking server. + +MlFlow +"""""" +We use a remote `mlflow tracking `__ to aid in the +development and management of ``pudl-models``. In the ``mlflow`` module, there are +several dagster resources and IO-managers that can be used in any models to allow simple +seamless interface to the server. + +.. TODO: Add mlflow resource/io-manager examples About Catalyst Cooperative diff --git a/src/mozilla_sec_eia/models/sec10k/README.rst b/src/mozilla_sec_eia/models/sec10k/README.rst new file mode 100644 index 0000000..ffecf28 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/README.rst @@ -0,0 +1,99 @@ +sec10k: Extracting company ownership data from sec10k documents +======================================================================================= + +This repo contains exploratory development for an SEC-EIA linkage.
+ +Usage +----- + +Helper Tools +^^^^^^^^^^^^ +Utility functions for accessing and working with 10k filings as well as their exhibit +21 attachments can be found in 'src/mozilla_sec_eia/utils/cloud.py'. The base class is +the ``GCSArchive`` which provides an interface to archived filings on GCS. To +instantiate this class, the following environment variables need to be set, or defined +in a ``.env`` file: + +``GCS_BUCKET_NAME`` +``GCS_METADATA_DB_INSTANCE_CONNECTION`` +``GCS_IAM_USER`` +``GCS_METADATA_DB_NAME`` +``GCS_PROJECT`` +``MLFLOW_TRACKING_URI`` + +This code sample shows how to use the class to fetch filings from the archive: + +.. code-block:: python + + from mozilla_sec_eia.utils.cloud import GCSArchive + archive = GCSArchive() + + # Get metadata from postgres instance + metadata_df = archive.get_metadata() + + # Do some filtering to get filings of interest + filings = metadata_df.loc[... # Get rows from original df + + # This will download and cache filings locally for later use + # Successive calls to get_filings will not re-download filings which are already cached + downloaded_filings = archive.get_filings(filings) + + # Get exhibit 21's and extract subsidiary data + for filing in downloaded_filings: + cool_extraction_model(filing.get_ex_21().as_image()) + +Labeling +-------- +We are using `Label Studio `_ to create training data +for fine-tuning the Ex. 21 extraction model. The very preliminary workflow +for labeling data is as follows: + +* For each filing that you want to label, follow notebook 7 to create the + inputs for Label Studio. This notebook first creates a PDF of the filing. + Then, it extracts the bounding boxes around each word and creates a "task" + JSON and image for each Ex. 21 table that will be used in Label Studio. +* Upload these JSONs and images to the same bucket in GCS (the "unlabeled" + bucket by default). +* `Install Label Studio `_ +* Start Label Studio locally and create a project.
+* Under Settings, set the template/config for the project with the config + found in ``labeling-configs/labeling-config.xml``. This should create the + correct entity labels and UI setup. +* Connect GCS to Label Studio by following `these directions + `_ +* Specific Label Studio settings: Filter files for only JSONs + (these are your tasks). Leave "Treat every bucket object as a source file" + disabled. Add the service account authentication JSON for your bucket. +* Additionally add a Target Storage bucket (the "labeled" bucket by + default). +* Import data and label Ex. 21 tables. +* Sync with target storage. +* Update the ``labeled_data_tracking.csv`` with the new filings you've + labeled. +* Run the ``rename_labeled_filings.py`` script to update labeled file + names in the GCS bucket with their SEC filename. + + +About Catalyst Cooperative +--------------------------------------------------------------------------------------- +`Catalyst Cooperative `__ is a small group of data +wranglers and policy wonks organized as a worker-owned cooperative consultancy. +Our goal is a more just, livable, and sustainable world. We integrate public +data and perform custom analyses to inform public policy (`Hire us! +`__). Our focus is primarily on mitigating +climate change and improving electric utility regulation in the United States. + +Contact Us +^^^^^^^^^^ +* For general support, questions, or other conversations around the project + that might be of interest to others, check out the + `GitHub Discussions `__ +* If you'd like to get occasional updates about our projects + `sign up for our email list `__. +* Want to schedule a time to chat with us one-on-one? 
Join us for + `Office Hours `__ +* Follow us on Twitter: `@CatalystCoop `__ +* More info on our website: https://catalyst.coop +* For private communication about the project or to hire us to provide customized data + extraction and analysis, you can email the maintainers: + `pudl@catalyst.coop `__ From 93d02f3aac6aebb3486221eb4280a143e186864d Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 3 Sep 2024 16:14:55 -0400 Subject: [PATCH 023/161] Add train model job --- .../library/mlflow/__init__.py | 2 +- src/mozilla_sec_eia/library/model_jobs.py | 10 ++++++++++ src/mozilla_sec_eia/models/sec10k/__init__.py | 20 +++++++++++++++---- .../models/sec10k/basic_10k.py | 2 ++ .../models/sec10k/ex_21/__init__.py | 3 ++- .../models/sec10k/ex_21/train_extractor.py | 14 ++++++------- src/mozilla_sec_eia/models/sec10k/extract.py | 13 ++++++------ tests/unit/models/sec10k/extract_test.py | 8 ++++++-- 8 files changed, 50 insertions(+), 22 deletions(-) diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py index 5987f75..f07997a 100644 --- a/src/mozilla_sec_eia/library/mlflow/__init__.py +++ b/src/mozilla_sec_eia/library/mlflow/__init__.py @@ -33,7 +33,7 @@ def get_mlflow_io_manager( mlflow_interface_resource = MlflowInterface.configure_at_launch() -mlflow_validation_io_managers = { +mlflow_train_test_io_managers = { "mlflow_metrics_io_manager": get_mlflow_io_manager( "mlflow_metrics_io_manager", mlflow_interface=mlflow_interface_resource, diff --git a/src/mozilla_sec_eia/library/model_jobs.py b/src/mozilla_sec_eia/library/model_jobs.py index 8bdfc5c..45602b4 100644 --- a/src/mozilla_sec_eia/library/model_jobs.py +++ b/src/mozilla_sec_eia/library/model_jobs.py @@ -86,3 +86,13 @@ def create_validation_model_job( }, **kwargs, ) + + +def create_training_job( + job_name: str, + assets: list[AssetsDefinition], + **kwargs, +): + """Construct a dagster job meant to train a model and log with mlflow.""" + # For now training job config is 
the same as validation + return create_validation_model_job(job_name, assets, **kwargs) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 1e2d56b..7abbb4e 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -10,14 +10,16 @@ from mozilla_sec_eia.library.mlflow import ( MlflowInterface, mlflow_interface_resource, - mlflow_validation_io_managers, + mlflow_train_test_io_managers, ) from . import basic_10k, ex_21, extract from .utils.cloud import cloud_interface_resource +from .utils.layoutlm import LayoutlmIOManager basic_10k_assets = load_assets_from_modules([basic_10k]) ex21_assets = load_assets_from_package_module(ex_21) +layoutlm_assets = load_assets_from_modules([ex_21.train_extractor]) shared_assets = load_assets_from_modules([extract]) basic_10k_production_job = model_jobs.create_production_model_job( @@ -25,7 +27,7 @@ basic_10k.production_assets, ) -basic_10k_validation_job = model_jobs.create_production_model_job( +basic_10k_validation_job = model_jobs.create_validation_model_job( "basic_10k_extraction_validation", basic_10k.validation_assets, ) @@ -41,18 +43,28 @@ ex_21.validation_assets, ) +layoutlm_finetune_job = model_jobs.create_training_job( + "layoutlm_finetune", + layoutlm_assets, +) + defs = Definitions( - assets=basic_10k_assets + ex21_assets + shared_assets, + assets=basic_10k_assets + ex21_assets + shared_assets + layoutlm_assets, jobs=[ basic_10k_production_job, basic_10k_validation_job, ex21_production_job, ex21_validation_job, + layoutlm_finetune_job, ], resources={ "cloud_interface": cloud_interface_resource, "mlflow_interface": mlflow_interface_resource, + "layoutlm_io_manager": LayoutlmIOManager( + mlflow_interface=mlflow_interface_resource + ), } - | mlflow_validation_io_managers, + | mlflow_train_test_io_managers + | extract.SEC10k_EXTRACTOR_RESOURCES, ) diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py 
b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index a4c0230..4c943c6 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -170,6 +170,8 @@ def basic_10k_validation_filing_metadata( filing_metadata_asset_name="basic_10k_validation_filing_metadata", extraction_metadata_asset_name="basic_10k_extraction_validation_metadata", extracted_asset_name=basic_10k_extracted_validation_asset_name, + partitions_def=None, + io_manager_key="mlflow_pandas_artifact_io_manager", ) production_assets = [basic_10k_production_extraction, sec10k_filing_metadata] diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 52b8a9d..11817c4 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -148,7 +148,7 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame): "ex21", exhibit_21_extractor_resource, extraction_metadata_asset_name="ex21_extraction_metadata", - extracted_asset_name="ex21_company_info", + extracted_asset_name="ex21_company_ownership_info", ) @@ -158,6 +158,7 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame): filing_metadata_asset_name="ex21_validation_filing_metadata", extraction_metadata_asset_name="ex21_extraction_validation_metadata", extracted_asset_name=ex21_extracted_validation_asset_name, + partitions_def=None, ) production_assets = [ diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py index 00c80f3..cb37619 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py @@ -7,9 +7,8 @@ from pathlib import Path -import mlflow import numpy as np -from dagster import Config +from dagster import Config, asset from datasets import ( Array2D, Array3D, @@ -137,15 +136,15 @@ def load_test_train_set( class 
FineTuneConfig(Config): """Configuration to supply to `train_model`.""" - labeled_json_path: str + labeled_json_path: str = "sec10k_filings/labeled_jsons/" gcs_training_data_dir: str = "labeled" output_dir: str = "layoutlm_trainer" test_size: float = 0.2 -def train_model( +@asset(io_manager_key="layoutlm_io_manager") +def layoutlm( config: FineTuneConfig, - layoutlm_mlflow_interface, ): """Train LayoutLM model with labeled data.""" # Prepare model @@ -189,6 +188,5 @@ def train_model( ) # Train inside mlflow run. Mlflow will automatically handle logging training metrcis - with mlflow.start_run(): - trainer.train() - # log_model(trainer) + trainer.train() + return trainer diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 8904b53..9db88bb 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -9,11 +9,12 @@ StaticPartitionsDefinition, asset, multi_asset, - with_resources, ) from .utils.cloud import GCSArchive +SEC10k_EXTRACTOR_RESOURCES = {} + class Sec10kExtractor(ConfigurableResource): """Base class for extracting SEC 10k data.""" @@ -54,6 +55,7 @@ def sec10k_extraction_asset_factory( filing_metadata_asset_name: str = "sec10k_filing_metadata", extraction_metadata_asset_name: str = "extraction_metadata", extracted_asset_name: str = "extraction_metadata", + io_manager_key: str | None = None, ): """Create asset to extract data from sec10k data. 
@@ -72,8 +74,8 @@ def sec10k_extraction_asset_factory( @multi_asset( name=name, outs={ - extraction_metadata_asset_name: AssetOut(), - extracted_asset_name: AssetOut(), + extraction_metadata_asset_name: AssetOut(io_manager_key=io_manager_key), + extracted_asset_name: AssetOut(io_manager_key=io_manager_key), }, ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)}, partitions_def=partitions_def, @@ -89,6 +91,5 @@ def extract_filings( ) return extraction_metadata, extracted - return with_resources([extract_filings], {sec10k_extractor.name: sec10k_extractor})[ - 0 - ] + SEC10k_EXTRACTOR_RESOURCES[sec10k_extractor.name] = sec10k_extractor + return extract_filings diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index a413767..82efac4 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -65,9 +65,10 @@ def fake_filing_metadata_asset(): return fake_filing_metadata # Create fake extraction asset with configured inputs + test_extractor = TestSec10kExtractor(cloud_interface=FakeArchive()) extraction_multi_asset = sec10k_extraction_asset_factory( name="test_sec10k_extraction", - sec10k_extractor=TestSec10kExtractor(cloud_interface=FakeArchive()), + sec10k_extractor=test_extractor, filing_metadata_asset_name="fake_filing_metadata_asset", extracted_asset_name="test_sec10k_extraction", extraction_metadata_asset_name="test_sec10k_extraction_metadata", @@ -75,7 +76,10 @@ def fake_filing_metadata_asset(): ) # Run assets and review results - result = materialize([fake_filing_metadata_asset, extraction_multi_asset]) + result = materialize( + [fake_filing_metadata_asset, extraction_multi_asset], + resources={test_extractor.name: test_extractor}, + ) pd.testing.assert_frame_equal( result.asset_value("test_sec10k_extraction_metadata"), fake_extraction_metadata ) From 5190bf99d8b634cfe164d9d51db71b54cbe8a3d3 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 3 Sep 2024 
16:24:29 -0400 Subject: [PATCH 024/161] Log mlflow artifacts as parquet until csv is fixed --- src/mozilla_sec_eia/library/mlflow/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py index f07997a..a7a65d6 100644 --- a/src/mozilla_sec_eia/library/mlflow/__init__.py +++ b/src/mozilla_sec_eia/library/mlflow/__init__.py @@ -41,6 +41,5 @@ def get_mlflow_io_manager( "mlflow_pandas_artifact_io_manager": get_mlflow_io_manager( "mlflow_pandas_artifact_io_manager", mlflow_interface=mlflow_interface_resource, - pandas_file_type="csv", ), } From ca9599e70e0ef9cc95e40125e06e17ee5c2ab619 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 3 Sep 2024 21:07:52 -0400 Subject: [PATCH 025/161] Fix ex21 extraction --- src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 56648eb..235b9c3 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -251,8 +251,14 @@ def extract_filings( that is the filename of the extracted Ex. 21. Dataframe contains columns id, subsidiary, loc, own_per. 
""" + filings_with_ex21 = filing_metadata[ + ~filing_metadata["exhibit_21_version"].isna() + ] + self.cloud_interface.get_filings( + filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True + ) dataset = create_inference_dataset( - pdfs_dir=self._pdf_dir, + pdfs_dir=Path(self._pdf_dir), labeled_json_dir=self._labeled_json_dir, has_labels=self.has_labels, ) @@ -286,7 +292,7 @@ def extract_filings( all_output_df = clean_extracted_df(all_output_df) all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]] all_output_df = all_output_df.reset_index(drop=True) - return logits, predictions, all_output_df, extraction_metadata + return extraction_metadata, all_output_df class LayoutLMInferencePipeline(Pipeline): From 7e7a50319fc1365ba1ea4f948ea536d4f11d968d Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 4 Sep 2024 09:28:05 -0400 Subject: [PATCH 026/161] Add development section to docs --- README.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.rst b/README.rst index d036e6c..c85c013 100644 --- a/README.rst +++ b/README.rst @@ -89,6 +89,12 @@ seamless interface to the server. .. TODO: Add mlflow resource/io-manager examples +Development +----------- +To launch the dagster UI to load all ``pudl-models``, run the command ``dagster dev`` +in the top-level of this repo. This will load the file ``workspace.yaml``, which points +to each model. You can also work on a single model in isolation by running the command: +``dagster dev -m mozilla_sec_eia.models.{your_cool_model}``.
About Catalyst Cooperative --------------------------------------------------------------------------------------- From 61f48c36bdc92fd1def79e18fa667b33ed093108 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 4 Sep 2024 11:56:57 -0400 Subject: [PATCH 027/161] Fix integration tests --- .../library/mlflow/mlflow_resource.py | 19 ++- .../models/sec10k/ex_21/__init__.py | 4 +- .../integration/models/sec10k/extract_test.py | 141 +++--------------- 3 files changed, 29 insertions(+), 135 deletions(-) diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py index 35dbf26..1060b9b 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py @@ -77,17 +77,20 @@ def mlflow_run_id(self) -> str | None: def _get_tracking_password(self, version_id: str = "latest"): """Get tracking server password from gcloud secrets.""" - # Create the Secret Manager client. - client = secretmanager.SecretManagerServiceClient() + # Password not required for local use + if "sqlite" not in self.tracking_uri: + # Create the Secret Manager client. + client = secretmanager.SecretManagerServiceClient() - # Build the resource name of the secret version. - name = f"projects/{self.project}/secrets/mlflow_admin_password/versions/{version_id}" + # Build the resource name of the secret version. + name = f"projects/{self.project}/secrets/mlflow_admin_password/versions/{version_id}" - # Access the secret version. - response = client.access_secret_version(name=name) + # Access the secret version. + response = client.access_secret_version(name=name) - # Return the decoded payload. - return response.payload.data.decode("UTF-8") + # Return the decoded payload. 
+ return response.payload.data.decode("UTF-8") + return "" def _configure_mlflow(self): """Do runtime configuration of mlflow.""" diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 11817c4..39cd2d6 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -28,9 +28,7 @@ def ex21_validation_filing_metadata( """Get sec 10k filing metadata from validation set.""" filing_metadata = cloud_interface.get_metadata() return filing_metadata[ - filing_metadata["filename"].isin( - ex21_validation_set.index.get_level_values("filename").unique() - ) + filing_metadata["filename"].isin(ex21_validation_set["filename"].unique()) ] diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py index 5da0bb5..dce8577 100644 --- a/tests/integration/models/sec10k/extract_test.py +++ b/tests/integration/models/sec10k/extract_test.py @@ -1,143 +1,36 @@ """Validate basic 10k and exhibit 21 extraction.""" import logging -import unittest import dotenv -import numpy as np -import pandas as pd -import pytest -from dagster import EnvVar, build_asset_context -from mozilla_sec_eia.ex_21.inference import ( - clean_extracted_df, - create_inference_dataset, - perform_inference, -) -from mozilla_sec_eia.extract import ( - _get_most_recent_run, - basic_10k_validate, - ex21_validate, -) -from mozilla_sec_eia.utils.cloud import GCSArchive, MlflowInterface -from mozilla_sec_eia.utils.layoutlm import load_model -from pandas.testing import assert_frame_equal +from mozilla_sec_eia.models import sec10k logger = logging.getLogger(f"catalystcoop.{__name__}") -def test_basic_10k_extraction(): - """Run full 10k extraction on validation set and verify desired metrics are met.""" +def test_basic_10k_validation( + test_tracker_factory, + get_most_recent_mlflow_run_factory, +): + """Test basic_10k_validation_job.""" 
dotenv.load_dotenv() - experiment_name = "basic_10k_validate_test" - cloud_interface = GCSArchive( - filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), - labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), - metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"), - user=EnvVar("GCS_IAM_USER"), - metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), - project=EnvVar("GCS_PROJECT"), - ) + sec10k.defs.get_job_def("basic_10k_extraction_validation").execute_in_process() - with build_asset_context( - resources={ - "basic_10k_extract_validate_mlflow": MlflowInterface( - experiment_name=experiment_name, - continue_run=False, - tracking_uri="sqlite:///:memory:", - cloud_interface=cloud_interface, - ), - "cloud_interface": cloud_interface, - } - ) as context: - basic_10k_validate(context) - run = _get_most_recent_run(experiment_name) + run = get_most_recent_mlflow_run_factory("basic_10k_extraction_validation") assert run.data.metrics["precision"] == 1 assert run.data.metrics["recall"] == 1 -@pytest.mark.xfail -def test_ex21_validation(test_mlflow_init_func): - """Run full Ex. 
21 extraction on validation set and verify metrics are met.""" +def test_ex21_validation( + test_tracker_factory, + get_most_recent_mlflow_run_factory, +): + """Test ex21_validation_job.""" dotenv.load_dotenv() - experiment_name = "ex21_validate_test" - cloud_interface = GCSArchive( - filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), - labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), - metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"), - user=EnvVar("GCS_IAM_USER"), - metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), - project=EnvVar("GCS_PROJECT"), - ) - - with build_asset_context( - resources={ - "ex21_validate_test": MlflowInterface( - experiment_name=experiment_name, - continue_run=False, - tracking_uri="sqlite:///:memory:", - cloud_interface=cloud_interface, - ), - "cloud_interface": cloud_interface, - } - ) as context: - ex21_validate(context) - run = _get_most_recent_run(experiment_name) - # TODO: add in actual metric checks once validation is ready - assert run.data.metrics["ratio_extracted"] == 1 - - -@pytest.fixture -def model_checkpoint(): - """Load model from tracking server and return.""" - return load_model() + sec10k.defs.get_job_def("ex21_extraction_validation").execute_in_process() + run = get_most_recent_mlflow_run_factory("ex21_extraction_validation") -def test_model_loading(model_checkpoint): - """Test loading a fine-tuned LayoutLM model from MLFlow.""" - assert "model" in model_checkpoint - assert "tokenizer" in model_checkpoint - - -def test_dataset_creation(test_dir): - pdf_dir = test_dir / "data/test_pdfs" - dataset = create_inference_dataset(pdfs_dir=pdf_dir) - assert dataset.shape == (2, 4) - - -def test_ex21_inference_and_table_extraction( - test_dir, test_mlflow_init_func, model_checkpoint -): - """Test performing inference and extracting an Ex. 
21 table.""" - model = model_checkpoint["model"] - processor = model_checkpoint["tokenizer"] - pdf_dir = test_dir / "data" / "test_pdfs" - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - with unittest.mock.patch("mozilla_sec_eia.extract.initialize_mlflow"): - test_mlflow_init_func() - logit_list, pred_list, output_df, extraction_metadata = perform_inference( - pdfs_dir=pdf_dir, - model=model, - processor=processor, - extraction_metadata=extraction_metadata, - device="cpu", - ) - # we don't normally want to sort by id and subsidiary - # but sort here for the sake of just testing whether dataframe - # row values are the same without worrying about order - output_df = output_df.sort_values(by=["id", "subsidiary"]).reset_index(drop=True) - # TODO: uncomment with new model checkpoint and 7th label included - # assert logit_list[0].shape == (1, 512, len(LABELS)) - expected_out_path = test_dir / "data" / "inference_and_extraction_expected_out.csv" - expected_out_df = pd.read_csv( - expected_out_path, - dtype={"id": str, "subsidiary": str, "loc": str, "own_per": np.float64}, - ) - expected_out_df["own_per"] = expected_out_df["own_per"].astype(str) - expected_out_df = clean_extracted_df(expected_out_df) - expected_out_df = expected_out_df.sort_values(by=["id", "subsidiary"]).reset_index( - drop=True - ) - assert_frame_equal(expected_out_df, output_df, check_like=True) + assert run.data.metrics["avg_subsidiary_jaccard_sim"] > 0.85 + assert run.data.metrics["avg_location_jaccard_sim"] > 0.9 From 0fd8ffc0c336a381eb8776d63d5f524acdf425f4 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 4 Sep 2024 12:15:51 -0400 Subject: [PATCH 028/161] Don't run ruff on notebooks --- tox.ini | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tox.ini b/tox.ini index 4b516d1..7c1fe9f 100644 --- a/tox.ini +++ b/tox.ini @@ -33,7 +33,7 @@ skip_install = false extras = test commands = - ruff check 
./ + ruff check ./src/ [testenv:pre_commit] description = Run git pre-commit hooks not covered by the other linters. From 97d558721dda2c6b8291aa583683d98b0e0d903e Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 4 Sep 2024 12:29:03 -0400 Subject: [PATCH 029/161] xfail ex21 integration test --- tests/integration/models/sec10k/extract_test.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py index dce8577..21be89b 100644 --- a/tests/integration/models/sec10k/extract_test.py +++ b/tests/integration/models/sec10k/extract_test.py @@ -3,6 +3,7 @@ import logging import dotenv +import pytest from mozilla_sec_eia.models import sec10k logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -22,6 +23,7 @@ def test_basic_10k_validation( assert run.data.metrics["recall"] == 1 +@pytest.mark.xfail def test_ex21_validation( test_tracker_factory, get_most_recent_mlflow_run_factory, From ace268bcdf185eed620bfc8fe2d76bbeaf126287 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 4 Sep 2024 23:01:29 -0400 Subject: [PATCH 030/161] Add parquet upath io-manager --- .../library/generic_io_managers.py | 21 +++++++++++++++++++ src/mozilla_sec_eia/models/sec10k/__init__.py | 5 +++++ .../models/sec10k/basic_10k.py | 1 + .../models/sec10k/ex_21/__init__.py | 1 + 4 files changed, 28 insertions(+) create mode 100644 src/mozilla_sec_eia/library/generic_io_managers.py diff --git a/src/mozilla_sec_eia/library/generic_io_managers.py b/src/mozilla_sec_eia/library/generic_io_managers.py new file mode 100644 index 0000000..e85aa68 --- /dev/null +++ b/src/mozilla_sec_eia/library/generic_io_managers.py @@ -0,0 +1,21 @@ +"""Implement useful generic io-managers.""" + +import pandas as pd +from dagster import InputContext, OutputContext, UPathIOManager +from upath import UPath + + +class PandasParquetIOManager(UPathIOManager): + """Read and write pandas dataframes as parquet files on local or remote 
filesystem.""" + + extension: str = ".parquet" + + def dump_to_path(self, context: OutputContext, obj: pd.DataFrame, path: UPath): + """Write parquet.""" + with path.open("wb") as file: + obj.to_parquet(file) + + def load_from_path(self, context: InputContext, path: UPath) -> pd.DataFrame: + """Read parquet.""" + with path.open("rb") as file: + return pd.read_parquet(file) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 7abbb4e..a04adc0 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -5,8 +5,10 @@ load_assets_from_modules, load_assets_from_package_module, ) +from upath import UPath from mozilla_sec_eia.library import model_jobs +from mozilla_sec_eia.library.generic_io_managers import PandasParquetIOManager from mozilla_sec_eia.library.mlflow import ( MlflowInterface, mlflow_interface_resource, @@ -64,6 +66,9 @@ "layoutlm_io_manager": LayoutlmIOManager( mlflow_interface=mlflow_interface_resource ), + "pandas_parquet_io_manager": PandasParquetIOManager( + base_path=UPath("gs://sec10k-outputs") + ), } | mlflow_train_test_io_managers | extract.SEC10k_EXTRACTOR_RESOURCES, diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index 4c943c6..5768279 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -161,6 +161,7 @@ def basic_10k_validation_filing_metadata( basic_10k_extractor_resource, extraction_metadata_asset_name="basic_10k_extraction_metadata", extracted_asset_name="basic_10k_company_info", + io_manager_key="pandas_parquet_io_manager", ) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 39cd2d6..f27b48a 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -147,6 +147,7 @@ def 
clean_ex21_validation_set(validation_df: pd.DataFrame): exhibit_21_extractor_resource, extraction_metadata_asset_name="ex21_extraction_metadata", extracted_asset_name="ex21_company_ownership_info", + io_manager_key="pandas_parquet_io_manager", ) From fb1feeb9926c03fad5cb4d3336e5b5669e1d8b95 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 4 Sep 2024 23:02:33 -0400 Subject: [PATCH 031/161] Remove nb-output clear --- .pre-commit-config.yaml | 9 --------- 1 file changed, 9 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 54d4a43..543d414 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -64,15 +64,6 @@ repos: ######################################################################################## - repo: local hooks: - # clear outputs from Jupyter notebooks - - id: nb-output-clear - name: nb-output-clear - stages: [commit] - language: system - verbose: false - pass_filenames: false - always_run: true - entry: find notebooks \( -name \*.ipynb -not -name \*checkpoint.ipynb \) -type f -exec jupyter nbconvert --clear-output {} \; # Run the unit tests - id: unit-tests name: unit-tests From 294ec721cec05aee4d8e81fca7f1e186d045b340 Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 5 Sep 2024 12:50:52 -0400 Subject: [PATCH 032/161] Test docker deployment --- dagster.yaml | 48 ++++++++++++++++++++++++++++ docker/deployment/Dockerfile | 10 ++++++ docker/deployment/docker-compose.yml | 34 ++++++++++++++++++++ 3 files changed, 92 insertions(+) create mode 100644 dagster.yaml create mode 100644 docker/deployment/Dockerfile create mode 100644 docker/deployment/docker-compose.yml diff --git a/dagster.yaml b/dagster.yaml new file mode 100644 index 0000000..92abbe6 --- /dev/null +++ b/dagster.yaml @@ -0,0 +1,48 @@ +run_coordinator: + module: dagster.core.run_coordinator + class: QueuedRunCoordinator + config: + tag_concurrency_limits: + - key: "dagster/backfill" + limit: 4 +run_storage: + module: dagster_postgres.run_storage + 
class: PostgresRunStorage + config: + postgres_db: + hostname: dagster_postgresql + username: + env: DAGSTER_POSTGRES_USER + password: + env: DAGSTER_POSTGRES_PASSWORD + db_name: + env: DAGSTER_POSTGRES_DB + port: 5432 + +schedule_storage: + module: dagster_postgres.schedule_storage + class: PostgresScheduleStorage + config: + postgres_db: + hostname: docker_example_postgresql + username: + env: DAGSTER_POSTGRES_USER + password: + env: DAGSTER_POSTGRES_PASSWORD + db_name: + env: DAGSTER_POSTGRES_DB + port: 5432 + +event_log_storage: + module: dagster_postgres.event_log + class: PostgresEventLogStorage + config: + postgres_db: + hostname: docker_example_postgresql + username: + env: DAGSTER_POSTGRES_USER + password: + env: DAGSTER_POSTGRES_PASSWORD + db_name: + env: DAGSTER_POSTGRES_DB + port: 5432 diff --git a/docker/deployment/Dockerfile b/docker/deployment/Dockerfile new file mode 100644 index 0000000..facc384 --- /dev/null +++ b/docker/deployment/Dockerfile @@ -0,0 +1,10 @@ +FROM continuumio/miniconda3:24.7.1-0 + +WORKDIR /opt/dagster/app + +# Build environment +COPY . . +RUN conda env create -f environment.yml && conda activate mozilla-sec-eia + +EXPOSE 3000 +ENTRYPOINT ["dagster", "dev"] diff --git a/docker/deployment/docker-compose.yml b/docker/deployment/docker-compose.yml new file mode 100644 index 0000000..abed8bc --- /dev/null +++ b/docker/deployment/docker-compose.yml @@ -0,0 +1,34 @@ +services: + # This service runs the postgres DB used by dagster for run storage, schedule storage, + # and event log storage. 
+ dagster_postgresql: + image: postgres:16 + environment: + POSTGRES_USER: "postgres_user" + POSTGRES_PASSWORD: "postgres_password" + POSTGRES_DB: "postgres_db" + networks: + - dagster + + dagster_pudl_models: + build: + context: ../../ + dockerfile: ./docker/deployment/Dockerfile + restart: always + environment: + DAGSTER_POSTGRES_USER: "postgres_user" + DAGSTER_POSTGRES_PASSWORD: "postgres_password" + DAGSTER_POSTGRES_DB: "postgres_db" + GCS_FILINGS_BUCKET_NAME: "2de2b9f52c99a240-bucket-sec-10ks" + GCS_LABELS_BUCKET_NAME: "labeled-ex21-filings" + GCS_METADATA_DB_INSTANCE_CONNECTION: "catalyst-cooperative-mozilla:us-central1:pg-mozilla" + GCS_METADATA_DB_NAME: "postgres" + GCS_IAM_USER: "mozilla-dev-sa@catalyst-cooperative-mozilla.iam.gserviceaccount.com" + MLFLOW_TRACKING_URI: "https://mlflow-ned2up6sra-uc.a.run.app" + GCS_PROJECT: "catalyst-cooperative-mozilla" + networks: + - dagster + +networks: + dagster: + driver: bridge From 4de51b34e2652d0b063143d04e9074b9b33c4303 Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 5 Sep 2024 17:53:06 -0400 Subject: [PATCH 033/161] Chunk ex 21 extraction --- .../models/sec10k/ex_21/inference.py | 87 +++++++++++-------- .../models/sec10k/utils/pdf.py | 4 +- 2 files changed, 53 insertions(+), 38 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 235b9c3..4c68638 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -1,6 +1,7 @@ """Module for formatting inputs and performing inference with a fine-tuned LayoutLM model.""" import logging +import math import os import tempfile from contextlib import contextmanager @@ -195,6 +196,7 @@ class Exhibit21Extractor(Sec10kExtractor): device: str = "cpu" has_labels: bool = False dataset_ind: list | None = None + filing_chunk_size: int = 8 _pdf_dir: Path = PrivateAttr() _labeled_json_dir: Path | None = PrivateAttr(default=None) @@ 
-254,45 +256,56 @@ def extract_filings( filings_with_ex21 = filing_metadata[ ~filing_metadata["exhibit_21_version"].isna() ] - self.cloud_interface.get_filings( - filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True - ) - dataset = create_inference_dataset( - pdfs_dir=Path(self._pdf_dir), - labeled_json_dir=self._labeled_json_dir, - has_labels=self.has_labels, - ) - if self.dataset_ind: - dataset = dataset.select(self.dataset_ind) - - # TODO: figure out device argument - model, processor = self.layoutlm.get_model_components() - pipe = pipeline( - "token-classification", - model=model, - tokenizer=processor, - pipeline_class=LayoutLMInferencePipeline, - device=self.device, + + filing_chunks = np.array_split( + filings_with_ex21, + math.ceil(len(filings_with_ex21) / self.filing_chunk_size), ) - logits = [] - predictions = [] - all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"]) - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - for logit, pred, output_df in pipe(_get_data(dataset)): - logits.append(logit) - predictions.append(pred) - if not output_df.empty: - filename = get_metadata_filename(output_df["id"].iloc[0]) - extraction_metadata.loc[filename, ["success"]] = True - all_output_df = pd.concat([all_output_df, output_df]) - all_output_df.columns.name = None - all_output_df = clean_extracted_df(all_output_df) - all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]] - all_output_df = all_output_df.reset_index(drop=True) - return extraction_metadata, all_output_df + all_outputs_dfs = [] + extraction_metadata_dfs = [] + for filings in filing_chunks: + self.cloud_interface.get_filings( + filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True + ) + dataset = create_inference_dataset( + pdfs_dir=Path(self._pdf_dir), + labeled_json_dir=self._labeled_json_dir, + has_labels=self.has_labels, + ) + if self.dataset_ind: + dataset 
= dataset.select(self.dataset_ind) + + # TODO: figure out device argument + model, processor = self.layoutlm.get_model_components() + pipe = pipeline( + "token-classification", + model=model, + tokenizer=processor, + pipeline_class=LayoutLMInferencePipeline, + device=self.device, + ) + + logits = [] + predictions = [] + all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"]) + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + for logit, pred, output_df in pipe(_get_data(dataset)): + logits.append(logit) + predictions.append(pred) + if not output_df.empty: + filename = get_metadata_filename(output_df["id"].iloc[0]) + extraction_metadata.loc[filename, ["success"]] = True + all_output_df = pd.concat([all_output_df, output_df]) + all_output_df.columns.name = None + all_output_df = clean_extracted_df(all_output_df) + all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]] + all_output_df = all_output_df.reset_index(drop=True) + all_outputs_dfs.append(all_output_df) + extraction_metadata_dfs.append(extraction_metadata) + return pd.concat(extraction_metadata_dfs), pd.concat(all_output_df) class LayoutLMInferencePipeline(Pipeline): diff --git a/src/mozilla_sec_eia/models/sec10k/utils/pdf.py b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py index a8c6411..df9be07 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/pdf.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py @@ -122,7 +122,9 @@ def combine_doc_pages(doc): combined_height += pg_txt_height output_pdf = fitz.open() - combined_page = output_pdf.new_page(width=combined_width, height=combined_height) + combined_page = output_pdf.new_page( + width=float(combined_width), height=float(combined_height) + ) for i in range(len(doc)): if i in blank_page_nums: From 214e28f02f6417cb282cf64594f2e46aaf74dade Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 5 Sep 2024 20:06:55 -0400 Subject: [PATCH 034/161] 
Fix asign copy --- dagster.yaml | 41 ------------------- .../models/sec10k/ex_21/inference.py | 4 +- 2 files changed, 3 insertions(+), 42 deletions(-) diff --git a/dagster.yaml b/dagster.yaml index 92abbe6..eba0c5f 100644 --- a/dagster.yaml +++ b/dagster.yaml @@ -5,44 +5,3 @@ run_coordinator: tag_concurrency_limits: - key: "dagster/backfill" limit: 4 -run_storage: - module: dagster_postgres.run_storage - class: PostgresRunStorage - config: - postgres_db: - hostname: dagster_postgresql - username: - env: DAGSTER_POSTGRES_USER - password: - env: DAGSTER_POSTGRES_PASSWORD - db_name: - env: DAGSTER_POSTGRES_DB - port: 5432 - -schedule_storage: - module: dagster_postgres.schedule_storage - class: PostgresScheduleStorage - config: - postgres_db: - hostname: docker_example_postgresql - username: - env: DAGSTER_POSTGRES_USER - password: - env: DAGSTER_POSTGRES_PASSWORD - db_name: - env: DAGSTER_POSTGRES_DB - port: 5432 - -event_log_storage: - module: dagster_postgres.event_log - class: PostgresEventLogStorage - config: - postgres_db: - hostname: docker_example_postgresql - username: - env: DAGSTER_POSTGRES_USER - password: - env: DAGSTER_POSTGRES_PASSWORD - db_name: - env: DAGSTER_POSTGRES_DB - port: 5432 diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 4c68638..76116c3 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -426,7 +426,9 @@ def extract_table(self, all_outputs): first_in_group_df = df[ (df["pred"].ne(df["pred"].shift())) & (df["pred"] != "other") ] - first_in_group_df["iob_pred"] = "B" + first_in_group_df["iob_pred"].str[1:] + first_in_group_df.loc[:, "iob_pred"] = ( + "B" + first_in_group_df[:, "iob_pred"].str[1:] + ) df.update(first_in_group_df) # filter for just words that were labeled with non "other" entities entities_df = df.sort_values(by=["top_left_y", "top_left_x"]) From 
c5736e04f402d9be04398150706dfee8159090a1 Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 6 Sep 2024 12:49:57 -0400 Subject: [PATCH 035/161] Add job for testing ex21 resource usage --- src/mozilla_sec_eia/models/sec10k/__init__.py | 5 +++++ .../models/sec10k/ex_21/__init__.py | 22 ++++++++++++++++++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index a04adc0..17c3758 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -45,6 +45,10 @@ ex_21.validation_assets, ) +ex21_test_job = model_jobs.create_validation_model_job( + "ex21_test", [ex_21.test_extraction_metrics] +) + layoutlm_finetune_job = model_jobs.create_training_job( "layoutlm_finetune", layoutlm_assets, @@ -58,6 +62,7 @@ basic_10k_validation_job, ex21_production_job, ex21_validation_job, + ex21_test_job, layoutlm_finetune_job, ], resources={ diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index f27b48a..4b6b3cf 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -1,10 +1,11 @@ """Module for working with exhibit 21 data.""" +import mlflow import pandas as pd from dagster import AssetIn, AssetOut, asset, multi_asset from mozilla_sec_eia.library import validation_helpers -from mozilla_sec_eia.library.mlflow import mlflow_interface_resource +from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource from ..extract import sec10k_extraction_asset_factory, sec10k_filing_metadata from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename @@ -138,6 +139,25 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame): return validation_df +@asset +def test_extraction_metrics( + cloud_interface: GCSArchive, + exhibit21_extractor: Exhibit21Extractor, + 
mlflow_interface: MlflowInterface, +): + """Run extraction with various numbers of filings to view resource usage.""" + filings = cloud_interface.get_metadata() + for num_filings in [8, 16, 32, 64, 128]: + with mlflow.start_run( + run_name=f"extract_{num_filings}_filings", + nested=True, + parent_run_id=mlflow_interface.mlflow_run_id, + experiment_id=MlflowInterface.get_or_create_experiment("ex21_test"), + ): + mlflow.log_param("num_filings", num_filings) + exhibit21_extractor.extract_filings(filings.sample(num_filings)) + + exhibit_21_extractor_resource = Exhibit21Extractor( cloud_interface=cloud_interface_resource, layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource), From ec396334830301c45cbbb36618a1f484780e7493 Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 6 Sep 2024 12:51:06 -0400 Subject: [PATCH 036/161] Remove test docker files --- docker/deployment/Dockerfile | 10 -------- docker/deployment/docker-compose.yml | 34 ---------------------------- 2 files changed, 44 deletions(-) delete mode 100644 docker/deployment/Dockerfile delete mode 100644 docker/deployment/docker-compose.yml diff --git a/docker/deployment/Dockerfile b/docker/deployment/Dockerfile deleted file mode 100644 index facc384..0000000 --- a/docker/deployment/Dockerfile +++ /dev/null @@ -1,10 +0,0 @@ -FROM continuumio/miniconda3:24.7.1-0 - -WORKDIR /opt/dagster/app - -# Build environment -COPY . . -RUN conda env create -f environment.yml && conda activate mozilla-sec-eia - -EXPOSE 3000 -ENTRYPOINT ["dagster", "dev"] diff --git a/docker/deployment/docker-compose.yml b/docker/deployment/docker-compose.yml deleted file mode 100644 index abed8bc..0000000 --- a/docker/deployment/docker-compose.yml +++ /dev/null @@ -1,34 +0,0 @@ -services: - # This service runs the postgres DB used by dagster for run storage, schedule storage, - # and event log storage. 
- dagster_postgresql: - image: postgres:16 - environment: - POSTGRES_USER: "postgres_user" - POSTGRES_PASSWORD: "postgres_password" - POSTGRES_DB: "postgres_db" - networks: - - dagster - - dagster_pudl_models: - build: - context: ../../ - dockerfile: ./docker/deployment/Dockerfile - restart: always - environment: - DAGSTER_POSTGRES_USER: "postgres_user" - DAGSTER_POSTGRES_PASSWORD: "postgres_password" - DAGSTER_POSTGRES_DB: "postgres_db" - GCS_FILINGS_BUCKET_NAME: "2de2b9f52c99a240-bucket-sec-10ks" - GCS_LABELS_BUCKET_NAME: "labeled-ex21-filings" - GCS_METADATA_DB_INSTANCE_CONNECTION: "catalyst-cooperative-mozilla:us-central1:pg-mozilla" - GCS_METADATA_DB_NAME: "postgres" - GCS_IAM_USER: "mozilla-dev-sa@catalyst-cooperative-mozilla.iam.gserviceaccount.com" - MLFLOW_TRACKING_URI: "https://mlflow-ned2up6sra-uc.a.run.app" - GCS_PROJECT: "catalyst-cooperative-mozilla" - networks: - - dagster - -networks: - dagster: - driver: bridge From 101ccf17d28c4bba489a66b3603c821ec70955da Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 6 Sep 2024 14:24:05 -0400 Subject: [PATCH 037/161] Remove complex asset factory --- .../library/mlflow/mlflow_io_managers.py | 1 + src/mozilla_sec_eia/models/sec10k/__init__.py | 5 +- .../models/sec10k/basic_10k.py | 249 +++++++++--------- .../models/sec10k/ex_21/__init__.py | 68 +++-- .../models/sec10k/ex_21/inference.py | 7 +- src/mozilla_sec_eia/models/sec10k/extract.py | 69 ----- tests/unit/models/sec10k/extract_test.py | 95 ++----- 7 files changed, 199 insertions(+), 295 deletions(-) diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py index 7aa05d7..0fa03b7 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py @@ -66,6 +66,7 @@ def _get_dagster_run_id(self, context: InputContext | OutputContext) -> str: def handle_output(self, context: OutputContext, df: pd.DataFrame): """Attach dataframe 
to run as artifact.""" + print("HERE") if self.file_type == "csv": self._log_artifact_as_csv(df, artifact_name=f"{context.name}.csv") else: diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 17c3758..b01d144 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -10,7 +10,6 @@ from mozilla_sec_eia.library import model_jobs from mozilla_sec_eia.library.generic_io_managers import PandasParquetIOManager from mozilla_sec_eia.library.mlflow import ( - MlflowInterface, mlflow_interface_resource, mlflow_train_test_io_managers, ) @@ -74,7 +73,7 @@ "pandas_parquet_io_manager": PandasParquetIOManager( base_path=UPath("gs://sec10k-outputs") ), + "exhibit21_extractor": ex_21.exhibit_21_extractor_resource, } - | mlflow_train_test_io_managers - | extract.SEC10k_EXTRACTOR_RESOURCES, + | mlflow_train_test_io_managers, ) diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index 5768279..b790de5 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -3,109 +3,99 @@ import logging import pandas as pd -from dagster import AssetIn, asset +from dagster import AssetIn, AssetOut, asset, multi_asset from mozilla_sec_eia.library import validation_helpers from .extract import ( - Sec10kExtractor, - sec10k_extraction_asset_factory, sec10k_filing_metadata, ) -from .utils.cloud import GCSArchive, Sec10K, cloud_interface_resource +from .utils.cloud import GCSArchive, Sec10K logger = logging.getLogger(f"catalystcoop.{__name__}") -class Basic10kExtractor(Sec10kExtractor): - """Implement Sec10kExtractor for basic 10k company info data.""" - - name: str = "basic_10k_extractor" - - def _extract_10k(self, filing: Sec10K): - """Extract basic company data from filing.""" - logger.info(f"Extracting 10K company data from filing: {filing.filename}") - header = True - 
current_block = None - values = [] - filer_count = 0 - block_counts = { - "company data": 0, - "filing values": 0, - "business address": 0, - "mail address": 0, - "former company": 0, - } - unmatched_keys = [] - for line in filing.filing_text.splitlines(): - match line.replace("\t", "").lower().split(":"): - case ["filer", ""]: - filer_count += 1 - header = False - case [ - ( - "company data" - | "filing values" - | "business address" - | "mail address" - | "former company" - ) as block, - "", - ] if not header: - current_block = block - block_counts[current_block] += 1 - case [key, ""] if current_block is not None: - key = f"{block}_{key}".replace(" ", "_") - logger.warning( - f"No value found for {key} for filing {filing.filename}" - ) - unmatched_keys.append(key) - case [key, value] if current_block is not None: - key = key.replace(" ", "_") - values.append( - { - "filename": filing.filename, - "filer_count": filer_count - 1, - "block": current_block.replace(" ", "_"), - "block_count": block_counts[current_block] - 1, - "key": key.replace(" ", "_"), - "value": value, - } - ) - case ["" | ""]: - break - case _ if header: - continue - - return pd.DataFrame(values), filing.filename, unmatched_keys - - def extract_filings( - self, - filings_to_extract: pd.DataFrame, - ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Extract basic 10K data and return extracted data/metadata.""" - logger.info("Starting basic 10K extraction.") - logger.info(f"Extracting {len(filings_to_extract)} filings.") - - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - extracted = pd.DataFrame() - - for filing in self.cloud_interface.iterate_filings(filings_to_extract): - ext, filename, unmatched_keys = self._extract_10k(filing) - extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [ - len(ext) > 0, - ",".join(unmatched_keys), - ] - extracted = pd.concat([extracted, ext]) - - return ( - 
extraction_metadata, - extracted.set_index( - ["filename", "filer_count", "block", "block_count", "key"] - ), - ) +def _extract_10k(filing: Sec10K): + """Extract basic company data from filing.""" + logger.info(f"Extracting 10K company data from filing: {filing.filename}") + header = True + current_block = None + values = [] + filer_count = 0 + block_counts = { + "company data": 0, + "filing values": 0, + "business address": 0, + "mail address": 0, + "former company": 0, + } + unmatched_keys = [] + for line in filing.filing_text.splitlines(): + match line.replace("\t", "").lower().split(":"): + case ["filer", ""]: + filer_count += 1 + header = False + case [ + ( + "company data" + | "filing values" + | "business address" + | "mail address" + | "former company" + ) as block, + "", + ] if not header: + current_block = block + block_counts[current_block] += 1 + case [key, ""] if current_block is not None: + key = f"{block}_{key}".replace(" ", "_") + logger.warning(f"No value found for {key} for filing {filing.filename}") + unmatched_keys.append(key) + case [key, value] if current_block is not None: + key = key.replace(" ", "_") + values.append( + { + "filename": filing.filename, + "filer_count": filer_count - 1, + "block": current_block.replace(" ", "_"), + "block_count": block_counts[current_block] - 1, + "key": key.replace(" ", "_"), + "value": value, + } + ) + case ["" | ""]: + break + case _ if header: + continue + + return pd.DataFrame(values), filing.filename, unmatched_keys + + +def extract_filings( + cloud_interface: GCSArchive, + filings_to_extract: pd.DataFrame, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Extract basic 10K data and return extracted data/metadata.""" + logger.info("Starting basic 10K extraction.") + logger.info(f"Extracting {len(filings_to_extract)} filings.") + + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + extracted = pd.DataFrame() + + for filing in 
cloud_interface.iterate_filings(filings_to_extract): + ext, filename, unmatched_keys = _extract_10k(filing) + extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [ + len(ext) > 0, + ",".join(unmatched_keys), + ] + extracted = pd.concat([extracted, ext]) + + return ( + extraction_metadata, + extracted.set_index(["filename", "filer_count", "block", "block_count", "key"]), + ) @asset @@ -117,25 +107,20 @@ def basic_10k_validation_set() -> pd.DataFrame: ) -basic_10k_extracted_validation_asset_name = "basic_10k_company_info_validation" - - @asset( ins={ - basic_10k_extracted_validation_asset_name: AssetIn( - basic_10k_extracted_validation_asset_name - ), - "basic_10k_validation_set": AssetIn(), + "computed_df": AssetIn("basic_10k_company_info_validation"), + "validation_df": AssetIn("basic_10k_validation_set"), }, io_manager_key="mlflow_metrics_io_manager", ) -def basic_10k_extraction_validation_metrics(**kwargs): +def basic_10k_extraction_validation_metrics( + computed_df: pd.DataFrame, + validation_df: pd.DataFrame, +): """Compute basic 10k extraction validation metrics.""" - computed = kwargs[basic_10k_extracted_validation_asset_name] - validation = kwargs["basic_10k_validation_set"] - return validation_helpers.pandas_compute_precision_recall( - computed, validation, value_col="value" + computed_df, validation_df, value_col="value" ) @@ -153,32 +138,48 @@ def basic_10k_validation_filing_metadata( ] -basic_10k_extractor_resource = Basic10kExtractor( - cloud_interface=cloud_interface_resource -) -basic_10k_production_extraction = sec10k_extraction_asset_factory( - "basic_10k", - basic_10k_extractor_resource, - extraction_metadata_asset_name="basic_10k_extraction_metadata", - extracted_asset_name="basic_10k_company_info", - io_manager_key="pandas_parquet_io_manager", +@multi_asset( + outs={ + "basic_10k_extraction_metadata": AssetOut( + io_manager_key="pandas_parquet_io_manager" + ), + "basic_10k_company_info": 
AssetOut(io_manager_key="pandas_parquet_io_manager"), + }, ) +def basic_10k_extract( + cloud_interface: GCSArchive, + sec10k_filing_metadata: pd.DataFrame, +): + """Production asset for extracting basic 10k company info.""" + metadata, extracted = extract_filings(cloud_interface, sec10k_filing_metadata) + return metadata, extracted -basic_10k_validation_extraction = sec10k_extraction_asset_factory( - "basic_10k_validation", - basic_10k_extractor_resource, - filing_metadata_asset_name="basic_10k_validation_filing_metadata", - extraction_metadata_asset_name="basic_10k_extraction_validation_metadata", - extracted_asset_name=basic_10k_extracted_validation_asset_name, - partitions_def=None, - io_manager_key="mlflow_pandas_artifact_io_manager", +@multi_asset( + outs={ + "basic_10k_extraction_metadata_validation": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + "basic_10k_company_info_validation": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + }, ) +def basic_10k_extract_validation( + cloud_interface: GCSArchive, + basic_10k_validation_filing_metadata: pd.DataFrame, +): + """Production asset for extracting basic 10k company info.""" + metadata, extracted = extract_filings( + cloud_interface, basic_10k_validation_filing_metadata + ) + return metadata, extracted + -production_assets = [basic_10k_production_extraction, sec10k_filing_metadata] +production_assets = [basic_10k_extract, sec10k_filing_metadata] validation_assets = [ - basic_10k_validation_extraction, + basic_10k_extract_validation, basic_10k_validation_set, basic_10k_validation_filing_metadata, basic_10k_extraction_validation_metrics, diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 4b6b3cf..7287084 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -7,7 +7,7 @@ from mozilla_sec_eia.library import validation_helpers from 
mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource -from ..extract import sec10k_extraction_asset_factory, sec10k_filing_metadata +from ..extract import sec10k_filing_metadata from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename from ..utils.layoutlm import LayoutlmResource from .inference import Exhibit21Extractor, clean_extracted_df @@ -15,7 +15,7 @@ @asset def ex21_validation_set() -> pd.DataFrame: - """Return dataframe containing basic 10k validation data.""" + """Return dataframe containing exhibit 21 validation data.""" return clean_ex21_validation_set( validation_helpers.load_validation_data("ex21_labels.csv") ) @@ -33,12 +33,9 @@ def ex21_validation_filing_metadata( ] -ex21_extracted_validation_asset_name = "ex21_validation" - - @multi_asset( ins={ - "computed_df": AssetIn(ex21_extracted_validation_asset_name), + "computed_df": AssetIn("ex21_company_ownership_info_validation"), "validation_df": AssetIn("ex21_validation_set"), }, outs={ @@ -158,36 +155,59 @@ def test_extraction_metrics( exhibit21_extractor.extract_filings(filings.sample(num_filings)) -exhibit_21_extractor_resource = Exhibit21Extractor( - cloud_interface=cloud_interface_resource, - layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource), +@multi_asset( + outs={ + "ex21_extraction_metadata": AssetOut( + io_manager_key="pandas_parquet_io_manager" + ), + "ex21_company_ownership_info": AssetOut( + io_manager_key="pandas_parquet_io_manager" + ), + } ) -ex21_production_extraction = sec10k_extraction_asset_factory( - "ex21", - exhibit_21_extractor_resource, - extraction_metadata_asset_name="ex21_extraction_metadata", - extracted_asset_name="ex21_company_ownership_info", - io_manager_key="pandas_parquet_io_manager", +def ex21_extract( + sec10k_filing_metadata: pd.DataFrame, + exhibit21_extractor: Exhibit21Extractor, +): + """Extract ownership info from exhibit 21 docs.""" + metadata, extracted = 
exhibit21_extractor.extract_filings(sec10k_filing_metadata) + return metadata, extracted + + +@multi_asset( + outs={ + "ex21_extraction_metadata_validation": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + "ex21_company_ownership_info_validation": AssetOut( + io_manager_key="mlflow_pandas_artifact_io_manager" + ), + } ) +def ex21_extract_validation( + ex21_validation_filing_metadata: pd.DataFrame, + exhibit21_extractor: Exhibit21Extractor, +): + """Extract ownership info from exhibit 21 docs.""" + metadata, extracted = exhibit21_extractor.extract_filings( + ex21_validation_filing_metadata + ) + return metadata, extracted -ex21_validation_extraction = sec10k_extraction_asset_factory( - "ex21_validation", - exhibit_21_extractor_resource, - filing_metadata_asset_name="ex21_validation_filing_metadata", - extraction_metadata_asset_name="ex21_extraction_validation_metadata", - extracted_asset_name=ex21_extracted_validation_asset_name, - partitions_def=None, +exhibit_21_extractor_resource = Exhibit21Extractor( + cloud_interface=cloud_interface_resource, + layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource), ) production_assets = [ sec10k_filing_metadata, - ex21_production_extraction, + ex21_extract, ] validation_assets = [ ex21_validation_set, ex21_validation_filing_metadata, - ex21_validation_extraction, + ex21_extract_validation, ex21_validation_metrics, ] diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 76116c3..e3615de 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -10,6 +10,7 @@ import numpy as np import pandas as pd import torch +from dagster import ConfigurableResource from datasets import Dataset from pydantic import PrivateAttr from transformers import ( @@ -18,8 +19,7 @@ ) from transformers.tokenization_utils_base import BatchEncoding -from ..extract import 
Sec10kExtractor -from ..utils.cloud import get_metadata_filename +from ..utils.cloud import GCSArchive, get_metadata_filename from ..utils.layoutlm import ( LayoutlmResource, get_id_label_conversions, @@ -188,9 +188,10 @@ def _get_data(dataset): yield from dataset -class Exhibit21Extractor(Sec10kExtractor): +class Exhibit21Extractor(ConfigurableResource): """Implement `Sec10kExtractor` interface for exhibit 21 data.""" + cloud_interface: GCSArchive layoutlm: LayoutlmResource name: str = "exhibit21_extractor" device: str = "cpu" diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 9db88bb..246c1b5 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -3,34 +3,12 @@ import pandas as pd from dagster import ( AssetExecutionContext, - AssetIn, - AssetOut, - ConfigurableResource, StaticPartitionsDefinition, asset, - multi_asset, ) from .utils.cloud import GCSArchive -SEC10k_EXTRACTOR_RESOURCES = {} - - -class Sec10kExtractor(ConfigurableResource): - """Base class for extracting SEC 10k data.""" - - cloud_interface: GCSArchive - name: str - - def extract_filings( - self, filing_metadata: pd.DataFrame - ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Method must be implemented by subclasses to extract SEC10k filings.""" - raise NotImplementedError( - "extract_filings must be implemented by any subclass!" 
- ) - - # Create year_quarter partitions year_quarter_partitions = StaticPartitionsDefinition( [f"{year}q{quarter}" for year in range(1994, 2024) for quarter in range(1, 5)] @@ -46,50 +24,3 @@ def sec10k_filing_metadata( year_quarter = context.partition_key df = cloud_interface.get_metadata(year_quarter=year_quarter) return df - - -def sec10k_extraction_asset_factory( - name: str, - sec10k_extractor: Sec10kExtractor, - partitions_def=year_quarter_partitions, - filing_metadata_asset_name: str = "sec10k_filing_metadata", - extraction_metadata_asset_name: str = "extraction_metadata", - extracted_asset_name: str = "extraction_metadata", - io_manager_key: str | None = None, -): - """Create asset to extract data from sec10k data. - - Args: - name: Name of extraction asset. - sec10k_extractor: Subclass of Sec10kExtractor used to extract data. - partitions_def: Partitions for asset (production uses year_quarter parts, - validation is not partitioned. - filing_metadata_asset_name: Name of input asset with metadata of filings to - extract. - extraction_metadata_asset_name: Name of output asset containing metadata - from extraction run. - extracted_asset_name: Name of output asset containing extracted data. 
- """ - - @multi_asset( - name=name, - outs={ - extraction_metadata_asset_name: AssetOut(io_manager_key=io_manager_key), - extracted_asset_name: AssetOut(io_manager_key=io_manager_key), - }, - ins={"sec10k_filing_metadata": AssetIn(filing_metadata_asset_name)}, - partitions_def=partitions_def, - required_resource_keys={sec10k_extractor.name}, - ) - def extract_filings( - context: AssetExecutionContext, sec10k_filing_metadata: pd.DataFrame - ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Run Sec10kExtractor on selected partition and return.""" - extractor = context.resources.original_resource_dict[sec10k_extractor.name] - extraction_metadata, extracted = extractor.extract_filings( - sec10k_filing_metadata - ) - return extraction_metadata, extracted - - SEC10k_EXTRACTOR_RESOURCES[sec10k_extractor.name] = sec10k_extractor - return extract_filings diff --git a/tests/unit/models/sec10k/extract_test.py b/tests/unit/models/sec10k/extract_test.py index 82efac4..a654392 100644 --- a/tests/unit/models/sec10k/extract_test.py +++ b/tests/unit/models/sec10k/extract_test.py @@ -3,86 +3,37 @@ import logging from unittest.mock import Mock +import dagster import pandas as pd -from dagster import asset, build_asset_context, materialize -from mozilla_sec_eia.models.sec10k.extract import ( - Sec10kExtractor, - sec10k_extraction_asset_factory, - sec10k_filing_metadata, -) -from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive +import pytest +from dagster import materialize +from mozilla_sec_eia.models.sec10k.extract import sec10k_filing_metadata logger = logging.getLogger(f"catalystcoop.{__name__}") def test_sec10k_filing_metadata(): """Test loading sec10k filing metadata.""" - # Prepare inputs to sec10k_filing_metadata - context = build_asset_context(partition_key="2024q1") - cloud_interface = Mock() - output_df = pd.DataFrame({"col": ["fake_col"]}) - cloud_interface.get_metadata.return_value = output_df - - returned_df = sec10k_filing_metadata( - context=context, - 
cloud_interface=cloud_interface, + fake_cloud_interface = Mock() + df = pd.DataFrame({"fake_df": ["fake_col"]}) + fake_cloud_interface.get_metadata.return_value = df + output = materialize( + [sec10k_filing_metadata], + partition_key="2020q1", + resources={"cloud_interface": fake_cloud_interface}, ) - # Check that GCSArchive.get_metadata was called correctly - cloud_interface.get_metadata.assert_called_once_with(year_quarter="2024q1") - pd.testing.assert_frame_equal(returned_df, output_df) - - -def test_sec10k_extraction(): - """Test loading sec10k filing metadata.""" - fake_extraction_metadata = pd.DataFrame({"extraction_metadata": ["fake_col"]}) - fake_extracted = pd.DataFrame({"extracted": ["fake_col"]}) - fake_filing_metadata = pd.DataFrame({"filing_metadata": ["fake_col"]}) - - # Create fake Sec10kExtractor - class TestSec10kExtractor(Sec10kExtractor): - name: str = "test_extractor" - - def extract_filings(self, filing_metadata): - pd.testing.assert_frame_equal(filing_metadata, fake_filing_metadata) - return fake_extraction_metadata, fake_extracted - - # Create fake GCSArchive - class FakeArchive(GCSArchive): - filings_bucket_name: str = "" - labels_bucket_name: str = "" - metadata_db_instance_connection: str = "" - user: str = "" - metadata_db_name: str = "" - project: str = "" + fake_cloud_interface.get_metadata.assert_called_once_with(year_quarter="2020q1") + pd.testing.assert_frame_equal(df, output.asset_value("sec10k_filing_metadata")) - def setup_for_execution(self, context): - pass - # Asset to return fake filing metadata - @asset - def fake_filing_metadata_asset(): - return fake_filing_metadata - - # Create fake extraction asset with configured inputs - test_extractor = TestSec10kExtractor(cloud_interface=FakeArchive()) - extraction_multi_asset = sec10k_extraction_asset_factory( - name="test_sec10k_extraction", - sec10k_extractor=test_extractor, - filing_metadata_asset_name="fake_filing_metadata_asset", - extracted_asset_name="test_sec10k_extraction", 
- extraction_metadata_asset_name="test_sec10k_extraction_metadata", - partitions_def=None, - ) - - # Run assets and review results - result = materialize( - [fake_filing_metadata_asset, extraction_multi_asset], - resources={test_extractor.name: test_extractor}, - ) - pd.testing.assert_frame_equal( - result.asset_value("test_sec10k_extraction_metadata"), fake_extraction_metadata - ) - pd.testing.assert_frame_equal( - result.asset_value("test_sec10k_extraction"), fake_extracted - ) +def test_sec10k_filing_metadata_bad_return_type(): + """Test loading sec10k_filing_metadata with bad return type.""" + fake_cloud_interface = Mock() + fake_cloud_interface.get_metadata.return_value = "should be DataFrame" + with pytest.raises(dagster._core.errors.DagsterTypeCheckDidNotPass): + materialize( + [sec10k_filing_metadata], + partition_key="2020q1", + resources={"cloud_interface": fake_cloud_interface}, + ) From 7e0c5a5dabc24b71fe926e91fe228520028e8ace Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 6 Sep 2024 16:12:31 -0400 Subject: [PATCH 038/161] Parallelize ex21 extraction --- src/mozilla_sec_eia/library/model_jobs.py | 29 +++++++++------ src/mozilla_sec_eia/models/sec10k/__init__.py | 1 + .../models/sec10k/ex_21/__init__.py | 35 +++++++++++++++---- .../models/sec10k/ex_21/inference.py | 2 +- src/mozilla_sec_eia/models/sec10k/extract.py | 22 ++++++++++++ 5 files changed, 71 insertions(+), 18 deletions(-) diff --git a/src/mozilla_sec_eia/library/model_jobs.py b/src/mozilla_sec_eia/library/model_jobs.py index 45602b4..87f6d15 100644 --- a/src/mozilla_sec_eia/library/model_jobs.py +++ b/src/mozilla_sec_eia/library/model_jobs.py @@ -24,23 +24,30 @@ def create_production_model_job( job_name: str, assets: list[AssetsDefinition], + concurrency_limit: int | None = None, **kwargs, ) -> JobDefinition: """Construct a dagster job and supply Definitions with assets and resources.""" + config = { + "ops": {}, + "resources": { + "mlflow_interface": { + "config": { + "experiment_name": 
job_name, + "tracking_enabled": False, + } + } + }, + } + if concurrency_limit is not None: + config["execution"] = { + "config": {"multiprocess": {"max_concurrent": concurrency_limit}} + } + return define_asset_job( job_name, selection=assets, - config={ - "ops": {}, - "resources": { - "mlflow_interface": { - "config": { - "experiment_name": job_name, - "tracking_enabled": False, - } - } - }, - }, + config=config, **kwargs, ) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index b01d144..4148007 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -37,6 +37,7 @@ ex21_production_job = model_jobs.create_production_model_job( "ex21_extraction", ex_21.production_assets, + concurrency_limit=4, ) ex21_validation_job = model_jobs.create_validation_model_job( diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 7287084..06416b8 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -2,12 +2,12 @@ import mlflow import pandas as pd -from dagster import AssetIn, AssetOut, asset, multi_asset +from dagster import AssetIn, AssetOut, Out, asset, graph_multi_asset, multi_asset, op from mozilla_sec_eia.library import validation_helpers from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource -from ..extract import sec10k_filing_metadata +from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename from ..utils.layoutlm import LayoutlmResource from .inference import Exhibit21Extractor, clean_extracted_df @@ -155,7 +155,25 @@ def test_extraction_metrics( exhibit21_extractor.extract_filings(filings.sample(num_filings)) -@multi_asset( +@op(out={"metadata": Out(), "extracted": Out()}) +def 
extract_filing_chunk( + exhibit21_extractor: Exhibit21Extractor, filings: pd.DataFrame +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Extract a set of filings and return results.""" + metadata, extracted = exhibit21_extractor.extract_filings(filings) + return metadata, extracted + + +@op(out={"metadata": Out(), "extracted": Out()}) +def collect_extracted_chunks( + metadata_dfs: list[pd.DataFrame], + extracted_dfs: list[pd.DataFrame], +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Collect chunks of extracted filings.""" + return pd.concat(metadata_dfs), pd.concat(extracted_dfs) + + +@graph_multi_asset( outs={ "ex21_extraction_metadata": AssetOut( io_manager_key="pandas_parquet_io_manager" @@ -163,14 +181,19 @@ def test_extraction_metrics( "ex21_company_ownership_info": AssetOut( io_manager_key="pandas_parquet_io_manager" ), - } + }, + partitions_def=year_quarter_partitions, ) def ex21_extract( sec10k_filing_metadata: pd.DataFrame, - exhibit21_extractor: Exhibit21Extractor, ): """Extract ownership info from exhibit 21 docs.""" - metadata, extracted = exhibit21_extractor.extract_filings(sec10k_filing_metadata) + filing_chunks = chunk_filings(sec10k_filing_metadata) + metadata_chunks, extracted_chunks = filing_chunks.map(extract_filing_chunk) + metadata, extracted = collect_extracted_chunks( + metadata_chunks.collect(), extracted_chunks.collect() + ) + return metadata, extracted diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index e3615de..afdbab4 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -428,7 +428,7 @@ def extract_table(self, all_outputs): (df["pred"].ne(df["pred"].shift())) & (df["pred"] != "other") ] first_in_group_df.loc[:, "iob_pred"] = ( - "B" + first_in_group_df[:, "iob_pred"].str[1:] + "B" + first_in_group_df["iob_pred"].str[1:] ) df.update(first_in_group_df) # filter for just words that were labeled 
with non "other" entities diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 246c1b5..5547be0 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -1,10 +1,17 @@ """Implement base class for an SEC10k extractor.""" +import math + +import numpy as np import pandas as pd from dagster import ( AssetExecutionContext, + Config, + DynamicOut, + DynamicOutput, StaticPartitionsDefinition, asset, + op, ) from .utils.cloud import GCSArchive @@ -24,3 +31,18 @@ def sec10k_filing_metadata( year_quarter = context.partition_key df = cloud_interface.get_metadata(year_quarter=year_quarter) return df + + +class ChunkFilingsConfig(Config): + """Set chunk size for chunk_filings.""" + + chunk_size: int = 128 + + +@op(out=DynamicOut()) +def chunk_filings(config: ChunkFilingsConfig, filings: pd.DataFrame) -> pd.DataFrame: + """Split filings into chunks for parallel processing.""" + for i, filing_chunk in enumerate( + np.array_split(filings, math.ceil(len(filings) / config.chunk_size)) + ): + yield DynamicOutput(filing_chunk, mapping_key=str(i)) From 080d7908bfe00e284cd52603f3156efd0a3ff0e6 Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 6 Sep 2024 16:22:07 -0400 Subject: [PATCH 039/161] Don't chunk in inference module --- .../models/sec10k/ex_21/inference.py | 86 ++++++++----------- 1 file changed, 37 insertions(+), 49 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index afdbab4..80c2440 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -1,7 +1,6 @@ """Module for formatting inputs and performing inference with a fine-tuned LayoutLM model.""" import logging -import math import os import tempfile from contextlib import contextmanager @@ -197,7 +196,6 @@ class Exhibit21Extractor(ConfigurableResource): device: 
str = "cpu" has_labels: bool = False dataset_ind: list | None = None - filing_chunk_size: int = 8 _pdf_dir: Path = PrivateAttr() _labeled_json_dir: Path | None = PrivateAttr(default=None) @@ -258,55 +256,45 @@ def extract_filings( ~filing_metadata["exhibit_21_version"].isna() ] - filing_chunks = np.array_split( - filings_with_ex21, - math.ceil(len(filings_with_ex21) / self.filing_chunk_size), + self.cloud_interface.get_filings( + filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True + ) + dataset = create_inference_dataset( + pdfs_dir=Path(self._pdf_dir), + labeled_json_dir=self._labeled_json_dir, + has_labels=self.has_labels, + ) + if self.dataset_ind: + dataset = dataset.select(self.dataset_ind) + + # TODO: figure out device argument + model, processor = self.layoutlm.get_model_components() + pipe = pipeline( + "token-classification", + model=model, + tokenizer=processor, + pipeline_class=LayoutLMInferencePipeline, + device=self.device, ) - all_outputs_dfs = [] - extraction_metadata_dfs = [] - for filings in filing_chunks: - self.cloud_interface.get_filings( - filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True - ) - dataset = create_inference_dataset( - pdfs_dir=Path(self._pdf_dir), - labeled_json_dir=self._labeled_json_dir, - has_labels=self.has_labels, - ) - if self.dataset_ind: - dataset = dataset.select(self.dataset_ind) - - # TODO: figure out device argument - model, processor = self.layoutlm.get_model_components() - pipe = pipeline( - "token-classification", - model=model, - tokenizer=processor, - pipeline_class=LayoutLMInferencePipeline, - device=self.device, - ) - - logits = [] - predictions = [] - all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"]) - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") - for logit, pred, output_df in pipe(_get_data(dataset)): - logits.append(logit) - predictions.append(pred) - if not 
output_df.empty: - filename = get_metadata_filename(output_df["id"].iloc[0]) - extraction_metadata.loc[filename, ["success"]] = True - all_output_df = pd.concat([all_output_df, output_df]) - all_output_df.columns.name = None - all_output_df = clean_extracted_df(all_output_df) - all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]] - all_output_df = all_output_df.reset_index(drop=True) - all_outputs_dfs.append(all_output_df) - extraction_metadata_dfs.append(extraction_metadata) - return pd.concat(extraction_metadata_dfs), pd.concat(all_output_df) + logits = [] + predictions = [] + all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"]) + extraction_metadata = pd.DataFrame( + {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + ).set_index("filename") + for logit, pred, output_df in pipe(_get_data(dataset)): + logits.append(logit) + predictions.append(pred) + if not output_df.empty: + filename = get_metadata_filename(output_df["id"].iloc[0]) + extraction_metadata.loc[filename, ["success"]] = True + all_output_df = pd.concat([all_output_df, output_df]) + all_output_df.columns.name = None + all_output_df = clean_extracted_df(all_output_df) + all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]] + all_output_df = all_output_df.reset_index(drop=True) + return extraction_metadata, all_output_df class LayoutLMInferencePipeline(Pipeline): From 44dfc5299b185741f594f33e9322e18aa61e25c7 Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 6 Sep 2024 17:20:55 -0400 Subject: [PATCH 040/161] Handle failures in converting to pdf --- .../library/mlflow/mlflow_io_managers.py | 1 - .../models/sec10k/ex_21/inference.py | 40 ++++++++++++++++--- .../models/sec10k/utils/cloud.py | 8 +++- 3 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py index 0fa03b7..7aa05d7 100644 --- 
a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py @@ -66,7 +66,6 @@ def _get_dagster_run_id(self, context: InputContext | OutputContext) -> str: def handle_output(self, context: OutputContext, df: pd.DataFrame): """Attach dataframe to run as artifact.""" - print("HERE") if self.file_type == "csv": self._log_artifact_as_csv(df, artifact_name=f"{context.name}.csv") else: diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 80c2440..9912969 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -183,6 +183,34 @@ def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor): return flattened_modes +def _cache_pdfs( + filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path +) -> pd.DataFrame: + """Iterate filings and cache pdfs.""" + extraction_metadata = pd.DataFrame( + { + "filename": pd.Series(dtype=str), + "success": pd.Series(dtype=bool), + "notes": pd.Series(dtype=str), + } + ).set_index("filename") + + for filing in cloud_interface.iterate_filings(filings): + pdf_path = cloud_interface.get_local_filename( + cache_directory=pdf_dir, filing=filing, extension=".pdf" + ) + + # Some filings are poorly formatted and fail in `save_as_pdf` + # We want a record of these but don't want to stop run + try: + with pdf_path.open("wb") as f: + filing.ex_21.save_as_pdf(f) + except Exception as e: + extraction_metadata.loc[filing.filename, ["success"]] = False + extraction_metadata.loc[filing.filename, ["note"]] = str(e) + return extraction_metadata + + def _get_data(dataset): yield from dataset @@ -202,6 +230,9 @@ class Exhibit21Extractor(ConfigurableResource): @contextmanager def yield_for_execution(self, context): """Setup temp path working directories.""" + # Set env variable to improve GPU memory access + 
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" + with ( tempfile.TemporaryDirectory() as pdf_dir, tempfile.TemporaryDirectory() as labeled_json_dir, @@ -256,8 +287,10 @@ def extract_filings( ~filing_metadata["exhibit_21_version"].isna() ] - self.cloud_interface.get_filings( - filings_with_ex21, cache_directory=self._pdf_dir, cache_pdf=True + extraction_metadata = _cache_pdfs( + filings_with_ex21, + cloud_interface=self.cloud_interface, + pdf_dir=self._pdf_dir, ) dataset = create_inference_dataset( pdfs_dir=Path(self._pdf_dir), @@ -280,9 +313,6 @@ def extract_filings( logits = [] predictions = [] all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"]) - extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} - ).set_index("filename") for logit, pred, output_df in pipe(_get_data(dataset)): logits.append(logit) predictions.append(pred) diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py index 81c7a35..5346ec6 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -226,11 +226,15 @@ def get_filing_blob(self, year_quarter: str, path: str) -> storage.Blob: return self._filings_bucket.blob(f"sec10k/sec10k-{year_quarter}/{path}") def get_local_filename( - self, cache_directory: Path, filing: pd.Series, extension=".html" + self, cache_directory: Path, filing: pd.Series | Sec10K, extension=".html" ) -> Path: """Return path to a filing in local cache based on metadata.""" + if isinstance(filing, pd.Series): + filename = filing["filename"] + else: + filename = filing.filename return cache_directory / Path( - f"{filing['filename'].replace('edgar/data/', '').replace('/', '-')}".replace( + f"{filename.replace('edgar/data/', '').replace('/', '-')}".replace( ".txt", extension ) ) From 6e24157c95cb022285dded95f4d09fb6c2150f59 Mon Sep 17 00:00:00 2001 From: zschira 
Date: Fri, 6 Sep 2024 18:26:02 -0400 Subject: [PATCH 041/161] Delete cached pdfs early --- .../models/sec10k/ex_21/inference.py | 41 ++++++++----------- 1 file changed, 16 insertions(+), 25 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 9912969..fe36752 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -11,7 +11,6 @@ import torch from dagster import ConfigurableResource from datasets import Dataset -from pydantic import PrivateAttr from transformers import ( Pipeline, pipeline, @@ -224,24 +223,12 @@ class Exhibit21Extractor(ConfigurableResource): device: str = "cpu" has_labels: bool = False dataset_ind: list | None = None - _pdf_dir: Path = PrivateAttr() - _labeled_json_dir: Path | None = PrivateAttr(default=None) @contextmanager - def yield_for_execution(self, context): - """Setup temp path working directories.""" - # Set env variable to improve GPU memory access + def setup_for_execution(self, context): + """Set env variable to improve GPU memory access.""" os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" - with ( - tempfile.TemporaryDirectory() as pdf_dir, - tempfile.TemporaryDirectory() as labeled_json_dir, - ): - self._pdf_dir = pdf_dir - if self.has_labels: - self._labeled_json_dir = labeled_json_dir - yield self - def extract_filings( self, filing_metadata: pd.DataFrame ) -> tuple[pd.DataFrame, pd.DataFrame]: @@ -287,16 +274,20 @@ def extract_filings( ~filing_metadata["exhibit_21_version"].isna() ] - extraction_metadata = _cache_pdfs( - filings_with_ex21, - cloud_interface=self.cloud_interface, - pdf_dir=self._pdf_dir, - ) - dataset = create_inference_dataset( - pdfs_dir=Path(self._pdf_dir), - labeled_json_dir=self._labeled_json_dir, - has_labels=self.has_labels, - ) + with ( + tempfile.TemporaryDirectory() as pdf_dir, + tempfile.TemporaryDirectory() as labeled_json_dir, + 
): + extraction_metadata = _cache_pdfs( + filings_with_ex21, + cloud_interface=self.cloud_interface, + pdf_dir=pdf_dir, + ) + dataset = create_inference_dataset( + pdfs_dir=Path(pdf_dir), + labeled_json_dir=labeled_json_dir, + has_labels=self.has_labels, + ) if self.dataset_ind: dataset = dataset.select(self.dataset_ind) From cd06d0709dd110ec212b5aff09aadb22d4048727 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 9 Sep 2024 11:53:45 -0400 Subject: [PATCH 042/161] Add metadata to chunk_filings --- src/mozilla_sec_eia/models/sec10k/extract.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index 5547be0..acb885e 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -9,6 +9,7 @@ Config, DynamicOut, DynamicOutput, + OpExecutionContext, StaticPartitionsDefinition, asset, op, @@ -40,9 +41,12 @@ class ChunkFilingsConfig(Config): @op(out=DynamicOut()) -def chunk_filings(config: ChunkFilingsConfig, filings: pd.DataFrame) -> pd.DataFrame: +def chunk_filings( + context: OpExecutionContext, config: ChunkFilingsConfig, filings: pd.DataFrame +) -> pd.DataFrame: """Split filings into chunks for parallel processing.""" for i, filing_chunk in enumerate( np.array_split(filings, math.ceil(len(filings) / config.chunk_size)) ): + context.add_output_metadata(metadata={"filings": list(filing_chunk.filename)}) yield DynamicOutput(filing_chunk, mapping_key=str(i)) From e3e8c45895f9bea607982d287575c255baf5dce3 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 9 Sep 2024 12:35:17 -0400 Subject: [PATCH 043/161] Catch oom errors while extracting ex21 --- .../models/sec10k/ex_21/__init__.py | 22 ++++++++++++++++++- src/mozilla_sec_eia/models/sec10k/extract.py | 6 +---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py 
b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 06416b8..35cdc10 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -1,7 +1,10 @@ """Module for working with exhibit 21 data.""" +import logging + import mlflow import pandas as pd +import torch from dagster import AssetIn, AssetOut, Out, asset, graph_multi_asset, multi_asset, op from mozilla_sec_eia.library import validation_helpers @@ -12,6 +15,8 @@ from ..utils.layoutlm import LayoutlmResource from .inference import Exhibit21Extractor, clean_extracted_df +logger = logging.getLogger(f"catalystcoop.{__name__}") + @asset def ex21_validation_set() -> pd.DataFrame: @@ -160,7 +165,20 @@ def extract_filing_chunk( exhibit21_extractor: Exhibit21Extractor, filings: pd.DataFrame ) -> tuple[pd.DataFrame, pd.DataFrame]: """Extract a set of filings and return results.""" - metadata, extracted = exhibit21_extractor.extract_filings(filings) + try: + metadata, extracted = exhibit21_extractor.extract_filings(filings) + except torch.OutOfMemoryError: + logging.warning( + f"Ran out of memory while extracting filings: {filings['filename']}" + ) + metadata = pd.DataFrame( + { + "filename": filings["filename"], + "success": [False] * len(filings), + "notes": ["Out of memory error"] * len(filings), + } + ).set_index("filename") + extracted = pd.DataFrame() return metadata, extracted @@ -170,6 +188,8 @@ def collect_extracted_chunks( extracted_dfs: list[pd.DataFrame], ) -> tuple[pd.DataFrame, pd.DataFrame]: """Collect chunks of extracted filings.""" + metadata_dfs = [df for df in metadata_dfs if not df.empty] + extracted_dfs = [df for df in extracted_dfs if not df.empty] return pd.concat(metadata_dfs), pd.concat(extracted_dfs) diff --git a/src/mozilla_sec_eia/models/sec10k/extract.py b/src/mozilla_sec_eia/models/sec10k/extract.py index acb885e..5547be0 100644 --- a/src/mozilla_sec_eia/models/sec10k/extract.py +++ 
b/src/mozilla_sec_eia/models/sec10k/extract.py @@ -9,7 +9,6 @@ Config, DynamicOut, DynamicOutput, - OpExecutionContext, StaticPartitionsDefinition, asset, op, @@ -41,12 +40,9 @@ class ChunkFilingsConfig(Config): @op(out=DynamicOut()) -def chunk_filings( - context: OpExecutionContext, config: ChunkFilingsConfig, filings: pd.DataFrame -) -> pd.DataFrame: +def chunk_filings(config: ChunkFilingsConfig, filings: pd.DataFrame) -> pd.DataFrame: """Split filings into chunks for parallel processing.""" for i, filing_chunk in enumerate( np.array_split(filings, math.ceil(len(filings) / config.chunk_size)) ): - context.add_output_metadata(metadata={"filings": list(filing_chunk.filename)}) yield DynamicOutput(filing_chunk, mapping_key=str(i)) From 350defba0eee3576e79cbf81a428fcd210216351 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 9 Sep 2024 13:16:01 -0400 Subject: [PATCH 044/161] Fix ex21 gcs io-manager --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 35cdc10..c2adf11 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -182,7 +182,12 @@ def extract_filing_chunk( return metadata, extracted -@op(out={"metadata": Out(), "extracted": Out()}) +@op( + out={ + "metadata": Out(io_manager_key="pandas_parquet_io_manager"), + "extracted": Out(io_manager_key="pandas_parquet_io_manager"), + } +) def collect_extracted_chunks( metadata_dfs: list[pd.DataFrame], extracted_dfs: list[pd.DataFrame], From 3c80b72fdb69890f05b5d9eabd5838b3784ddc30 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 9 Sep 2024 14:21:53 -0400 Subject: [PATCH 045/161] Fix partitions for basic 10k extraction. 
--- src/mozilla_sec_eia/models/sec10k/basic_10k.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index b790de5..538d477 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -9,6 +9,7 @@ from .extract import ( sec10k_filing_metadata, + year_quarter_partitions, ) from .utils.cloud import GCSArchive, Sec10K @@ -145,6 +146,7 @@ def basic_10k_validation_filing_metadata( ), "basic_10k_company_info": AssetOut(io_manager_key="pandas_parquet_io_manager"), }, + partitions_def=year_quarter_partitions, ) def basic_10k_extract( cloud_interface: GCSArchive, From 31971b7e6b1616a781b5ca59620a1aab10c89c7c Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 9 Sep 2024 16:06:09 -0400 Subject: [PATCH 046/161] Cache layoutlm locally --- src/mozilla_sec_eia/models/sec10k/__init__.py | 7 +++- .../models/sec10k/ex_21/__init__.py | 41 +++++++++++++------ .../models/sec10k/ex_21/inference.py | 4 +- .../models/sec10k/utils/layoutlm.py | 16 ++++++++ 4 files changed, 51 insertions(+), 17 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 4148007..986cf85 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -16,7 +16,7 @@ from . 
import basic_10k, ex_21, extract from .utils.cloud import cloud_interface_resource -from .utils.layoutlm import LayoutlmIOManager +from .utils.layoutlm import LayoutlmIOManager, LayoutlmLocalIOManager basic_10k_assets = load_assets_from_modules([basic_10k]) ex21_assets = load_assets_from_package_module(ex_21) @@ -46,7 +46,7 @@ ) ex21_test_job = model_jobs.create_validation_model_job( - "ex21_test", [ex_21.test_extraction_metrics] + "ex21_test", [ex_21.test_extraction_metrics, ex_21.layoutlm_local_cache] ) layoutlm_finetune_job = model_jobs.create_training_job( @@ -75,6 +75,9 @@ base_path=UPath("gs://sec10k-outputs") ), "exhibit21_extractor": ex_21.exhibit_21_extractor_resource, + "layoutlm_local_io_manager": LayoutlmLocalIOManager( + mlflow_interface=mlflow_interface_resource + ), } | mlflow_train_test_io_managers, ) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index c2adf11..f70940f 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -12,7 +12,6 @@ from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename -from ..utils.layoutlm import LayoutlmResource from .inference import Exhibit21Extractor, clean_extracted_df logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -162,14 +161,20 @@ def test_extraction_metrics( @op(out={"metadata": Out(), "extracted": Out()}) def extract_filing_chunk( - exhibit21_extractor: Exhibit21Extractor, filings: pd.DataFrame + exhibit21_extractor: Exhibit21Extractor, + filings: pd.DataFrame, + layoutlm, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Extract a set of filings and return results.""" try: - metadata, extracted = exhibit21_extractor.extract_filings(filings) - except torch.OutOfMemoryError: + metadata, extracted = exhibit21_extractor.extract_filings( + filings, + 
model=layoutlm["model"], + processor=layoutlm["processor"], + ) + except (torch.OutOfMemoryError, RuntimeError) as e: logging.warning( - f"Ran out of memory while extracting filings: {filings['filename']}" + f"Error {str(e)} while extracting filings: {filings['filename']}" ) metadata = pd.DataFrame( { @@ -198,6 +203,15 @@ def collect_extracted_chunks( return pd.concat(metadata_dfs), pd.concat(extracted_dfs) +@asset( + io_manager_key="layoutlm_local_io_manager", + ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")}, +) +def layoutlm_local_cache(layoutlm): + """Load pretrained layoutlm from mlflow and save to local path.""" + return layoutlm + + @graph_multi_asset( outs={ "ex21_extraction_metadata": AssetOut( @@ -211,10 +225,13 @@ def collect_extracted_chunks( ) def ex21_extract( sec10k_filing_metadata: pd.DataFrame, + layoutlm_local_cache, ): """Extract ownership info from exhibit 21 docs.""" filing_chunks = chunk_filings(sec10k_filing_metadata) - metadata_chunks, extracted_chunks = filing_chunks.map(extract_filing_chunk) + metadata_chunks, extracted_chunks = filing_chunks.map( + lambda filings: extract_filing_chunk(filings, layoutlm_local_cache) + ) metadata, extracted = collect_extracted_chunks( metadata_chunks.collect(), extracted_chunks.collect() ) @@ -235,27 +252,27 @@ def ex21_extract( def ex21_extract_validation( ex21_validation_filing_metadata: pd.DataFrame, exhibit21_extractor: Exhibit21Extractor, + layoutlm_local_cache, ): """Extract ownership info from exhibit 21 docs.""" metadata, extracted = exhibit21_extractor.extract_filings( - ex21_validation_filing_metadata + ex21_validation_filing_metadata, + model=layoutlm_local_cache["model"], + processor=layoutlm_local_cache["processor"], ) return metadata, extracted exhibit_21_extractor_resource = Exhibit21Extractor( cloud_interface=cloud_interface_resource, - layoutlm=LayoutlmResource(mlflow_interface=mlflow_interface_resource), ) -production_assets = [ - sec10k_filing_metadata, - ex21_extract, 
-] +production_assets = [sec10k_filing_metadata, ex21_extract, layoutlm_local_cache] validation_assets = [ ex21_validation_set, ex21_validation_filing_metadata, ex21_extract_validation, ex21_validation_metrics, + layoutlm_local_cache, ] diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index fe36752..96b1672 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -19,7 +19,6 @@ from ..utils.cloud import GCSArchive, get_metadata_filename from ..utils.layoutlm import ( - LayoutlmResource, get_id_label_conversions, iob_to_label, normalize_bboxes, @@ -218,7 +217,6 @@ class Exhibit21Extractor(ConfigurableResource): """Implement `Sec10kExtractor` interface for exhibit 21 data.""" cloud_interface: GCSArchive - layoutlm: LayoutlmResource name: str = "exhibit21_extractor" device: str = "cpu" has_labels: bool = False @@ -230,7 +228,7 @@ def setup_for_execution(self, context): os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" def extract_filings( - self, filing_metadata: pd.DataFrame + self, filing_metadata: pd.DataFrame, model, processor ) -> tuple[pd.DataFrame, pd.DataFrame]: """Predict entities with a fine-tuned model and extract Ex. 21 tables. 
diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py index 1e88052..a10da6d 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py @@ -35,6 +35,22 @@ def load_input(self, context: InputContext) -> dict: return _load_pretrained_layoutlm(self.version) +class LayoutlmLocalIOManager(MlflowBaseIOManager): + """Load and log mlflow models to local path.""" + + local_path: str = "./layoutlm" + + def handle_output(self, context: OutputContext, components: dict): + """Load metrics to mlflow run/experiment created by `MlflowInterface`.""" + mlflow.transformers.save_model( + components, path=self.local_path, task="token-classification" + ) + + def load_input(self, context: InputContext) -> dict: + """Log metrics to mlflow run/experiment created by `MlflowInterface`.""" + return mlflow.transformers.load_model(self.local_path, return_type="components") + + class LayoutlmResource(ConfigurableResource): """Dagster resource for loading/using pretrained layoutlm model as a resource.""" From 634a050a1f598fe9ecd8585c69bd2b8c4b9de726 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 9 Sep 2024 19:24:02 -0400 Subject: [PATCH 047/161] Fix caching model --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 4 ++-- src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index f70940f..5a0991a 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -170,7 +170,7 @@ def extract_filing_chunk( metadata, extracted = exhibit21_extractor.extract_filings( filings, model=layoutlm["model"], - processor=layoutlm["processor"], + processor=layoutlm["tokenizer"], ) except (torch.OutOfMemoryError, RuntimeError) as e: 
logging.warning( @@ -258,7 +258,7 @@ def ex21_extract_validation( metadata, extracted = exhibit21_extractor.extract_filings( ex21_validation_filing_metadata, model=layoutlm_local_cache["model"], - processor=layoutlm_local_cache["processor"], + processor=layoutlm_local_cache["tokenizer"], ) return metadata, extracted diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py index a10da6d..f1fcb48 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py @@ -15,7 +15,7 @@ def _load_pretrained_layoutlm(version: str = "latest") -> dict: """Function to load layoutlm from mlflow.""" path = f"models:/layoutlm_extractor/{version}" - return mlflow.transformers.load_model(path, return_type="components") + return mlflow.transformers.load_model(path, return_type="pipeline") class LayoutlmIOManager(MlflowBaseIOManager): @@ -40,10 +40,10 @@ class LayoutlmLocalIOManager(MlflowBaseIOManager): local_path: str = "./layoutlm" - def handle_output(self, context: OutputContext, components: dict): + def handle_output(self, context: OutputContext, pipeline): """Load metrics to mlflow run/experiment created by `MlflowInterface`.""" mlflow.transformers.save_model( - components, path=self.local_path, task="token-classification" + pipeline, path=self.local_path, task="token-classification" ) def load_input(self, context: InputContext) -> dict: From 69ee4c04229178c1c6da451f9eb71e2d22e810d5 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 9 Sep 2024 19:31:45 -0400 Subject: [PATCH 048/161] Remove bad call --- src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 96b1672..46ab425 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -290,7 
+290,6 @@ def extract_filings( dataset = dataset.select(self.dataset_ind) # TODO: figure out device argument - model, processor = self.layoutlm.get_model_components() pipe = pipeline( "token-classification", model=model, From 63d66006e724671c460dec8f164da2c5bb3e38d6 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 9 Sep 2024 20:03:04 -0400 Subject: [PATCH 049/161] Test own_per conversion --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 5a0991a..fdcf27a 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -200,7 +200,10 @@ def collect_extracted_chunks( """Collect chunks of extracted filings.""" metadata_dfs = [df for df in metadata_dfs if not df.empty] extracted_dfs = [df for df in extracted_dfs if not df.empty] - return pd.concat(metadata_dfs), pd.concat(extracted_dfs) + metadata_df = pd.concat(metadata_dfs) + extracted_df = pd.concat(extracted_dfs) + extracted_df["own_per"] = extracted_df["own_per"].astype("float64", errors="ignore") + return metadata_df, extracted_df @asset( From c8490d470d66ea102c3c64f140aa77832e88c2e7 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 15:26:27 -0400 Subject: [PATCH 050/161] Add pandera types for output tables --- pyproject.toml | 1 + .../models/sec10k/basic_10k.py | 17 +++++++--- .../models/sec10k/ex_21/__init__.py | 33 +++++++++++++++---- .../models/sec10k/ex_21/inference.py | 4 +++ 4 files changed, 43 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 1edfd0f..b026c10 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,6 +23,7 @@ dependencies = [ "dagster>=1.7.15", # 1.7.13 & 1.7.14 were both breaking things "dagster-mlflow", "dagster-webserver", + "dagster-pandera", "datasets>=2.1,<3", # Access Hugging Face datasets 
"seqeval>=1.2,<2", # Sequence labeling evaluation "google-cloud-secret-manager>=2,<3", diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index 538d477..22b7a2a 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -7,6 +7,7 @@ from mozilla_sec_eia.library import validation_helpers +from .entities import basic_10k_extract_type, sec10k_extract_metadata_type from .extract import ( sec10k_filing_metadata, year_quarter_partitions, @@ -99,7 +100,7 @@ def extract_filings( ) -@asset +@asset(dagster_type=basic_10k_extract_type) def basic_10k_validation_set() -> pd.DataFrame: """Return dataframe containing basic 10k validation data.""" return validation_helpers.load_validation_data( @@ -142,9 +143,13 @@ def basic_10k_validation_filing_metadata( @multi_asset( outs={ "basic_10k_extraction_metadata": AssetOut( - io_manager_key="pandas_parquet_io_manager" + io_manager_key="pandas_parquet_io_manager", + dagster_type=sec10k_extract_metadata_type, + ), + "basic_10k_company_info": AssetOut( + io_manager_key="pandas_parquet_io_manager", + dagster_type=basic_10k_extract_type, ), - "basic_10k_company_info": AssetOut(io_manager_key="pandas_parquet_io_manager"), }, partitions_def=year_quarter_partitions, ) @@ -160,10 +165,12 @@ def basic_10k_extract( @multi_asset( outs={ "basic_10k_extraction_metadata_validation": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" + io_manager_key="mlflow_pandas_artifact_io_manager", + dagster_type=sec10k_extract_metadata_type, ), "basic_10k_company_info_validation": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" + io_manager_key="mlflow_pandas_artifact_io_manager", + dagster_type=basic_10k_extract_type, ), }, ) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index fdcf27a..323f115 100644 --- 
a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -10,6 +10,11 @@ from mozilla_sec_eia.library import validation_helpers from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource +from ..entities import ( + Ex21CompanyOwnership, + ex21_extract_type, + sec10k_extract_metadata_type, +) from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename from .inference import Exhibit21Extractor, clean_extracted_df @@ -17,7 +22,7 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -@asset +@asset(dagster_type=ex21_extract_type) def ex21_validation_set() -> pd.DataFrame: """Return dataframe containing exhibit 21 validation data.""" return clean_ex21_validation_set( @@ -159,7 +164,12 @@ def test_extraction_metrics( exhibit21_extractor.extract_filings(filings.sample(num_filings)) -@op(out={"metadata": Out(), "extracted": Out()}) +@op( + out={ + "metadata": Out(dagster_type=sec10k_extract_metadata_type), + "extracted": Out(dagster_type=ex21_extract_type), + } +) def extract_filing_chunk( exhibit21_extractor: Exhibit21Extractor, filings: pd.DataFrame, @@ -183,14 +193,21 @@ def extract_filing_chunk( "notes": ["Out of memory error"] * len(filings), } ).set_index("filename") - extracted = pd.DataFrame() + extracted = Ex21CompanyOwnership.example(size=0) + extracted.own_per = extracted.own_per.astype("float64") return metadata, extracted @op( out={ - "metadata": Out(io_manager_key="pandas_parquet_io_manager"), - "extracted": Out(io_manager_key="pandas_parquet_io_manager"), + "metadata": Out( + io_manager_key="pandas_parquet_io_manager", + dagster_type=sec10k_extract_metadata_type, + ), + "extracted": Out( + io_manager_key="pandas_parquet_io_manager", + dagster_type=ex21_extract_type, + ), } ) def collect_extracted_chunks( @@ -245,10 +262,12 @@ def ex21_extract( @multi_asset( 
outs={ "ex21_extraction_metadata_validation": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" + io_manager_key="mlflow_pandas_artifact_io_manager", + dagster_type=sec10k_extract_metadata_type, ), "ex21_company_ownership_info_validation": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" + io_manager_key="mlflow_pandas_artifact_io_manager", + dagster_type=ex21_extract_type, ), } ) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 46ab425..27bd2dc 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -120,6 +120,10 @@ def clean_extracted_df(extracted_df): extracted_df["own_per"] = extracted_df["own_per"].str.replace( r"[^\d.]", "", regex=True ) + # Find values with multiple decimal points + extracted_df["own_per"] = extracted_df["own_per"].str.replace( + r"(\d*\.\d+)\..*", r"\1", regex=True + ) extracted_df["own_per"] = extracted_df["own_per"].replace("", np.nan) extracted_df["own_per"] = extracted_df["own_per"].astype( "float64", errors="ignore" From fa4f57da3fc9d4ec50570380578dd476728d47d5 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 15:31:31 -0400 Subject: [PATCH 051/161] Add missing entities module --- src/mozilla_sec_eia/models/sec10k/entities.py | 57 +++++++++++++++++++ 1 file changed, 57 insertions(+) create mode 100644 src/mozilla_sec_eia/models/sec10k/entities.py diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py new file mode 100644 index 0000000..6abc4c8 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/entities.py @@ -0,0 +1,57 @@ +"""Define table structure SEC10k extraction.""" + +import pandera as pa +from dagster_pandera import pandera_schema_to_dagster_type +from pandera.typing import Index, Series + + +class Ex21CompanyOwnership(pa.DataFrameModel): + """Define table structure for extracted EX 21 
data.""" + + _id: Series[str] = pa.Field(alias="id", description="ID of extracted filing.") + subsidiary: Series[str] = pa.Field(description="Name of subsidiary company.") + loc: Series[str] = pa.Field( + description="Location of subsidiary company.", nullable=True + ) + own_per: Series[float] = pa.Field( + description="Percent ownership of subsidiary company.", + nullable=True, + coerce=True, + ) + + +class Basic10kCompanyInfo(pa.DataFrameModel): + """Define table structure for extracted basic 10k data.""" + + filename: Index[str] = pa.Field(description="Name of extracted filing.") + filer_count: Index[str] = pa.Field( + description="Some filings have multiple blocks of company data." + ) + block: Index[str] = pa.Field(description="Block of company data.") + block_count: Index[str] = pa.Field(description="Some blocks occur multiple times.") + key: Index[str] = pa.Field(description="Key within block.") + value: Series[str] = pa.Field(description="Company info fact.") + + class Config: + """Provide multi index options in the config.""" + + multiindex_name = "time" + multiindex_strict = True + multiindex_coerce = True + + +class Sec10kExtractionMetadata(pa.DataFrameModel): + """Define table structure extraction metadata.""" + + filename: Index[str] = pa.Field(description="Name of extracted filing.") + success: Series[bool] = pa.Field( + description="Indicates whether filing was successfully extracted.", coerce=True + ) + notes: Series[str] = pa.Field( + description="Optional notes about extraction.", nullable=True + ) + + +ex21_extract_type = pandera_schema_to_dagster_type(Ex21CompanyOwnership) +basic_10k_extract_type = pandera_schema_to_dagster_type(Basic10kCompanyInfo) +sec10k_extract_metadata_type = pandera_schema_to_dagster_type(Sec10kExtractionMetadata) From 35e917d1ca7e498b84ceeeff0b8d688801c8dcf5 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 16:18:35 -0400 Subject: [PATCH 052/161] Don't cache model, load with io manager --- 
src/mozilla_sec_eia/models/sec10k/__init__.py | 2 +- .../models/sec10k/ex_21/__init__.py | 26 +++++++------------ 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 986cf85..f7f5ba0 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -46,7 +46,7 @@ ) ex21_test_job = model_jobs.create_validation_model_job( - "ex21_test", [ex_21.test_extraction_metrics, ex_21.layoutlm_local_cache] + "ex21_test", [ex_21.test_extraction_metrics] ) layoutlm_finetune_job = model_jobs.create_training_job( diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 323f115..42b5dc1 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -223,15 +223,6 @@ def collect_extracted_chunks( return metadata_df, extracted_df -@asset( - io_manager_key="layoutlm_local_io_manager", - ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")}, -) -def layoutlm_local_cache(layoutlm): - """Load pretrained layoutlm from mlflow and save to local path.""" - return layoutlm - - @graph_multi_asset( outs={ "ex21_extraction_metadata": AssetOut( @@ -241,16 +232,17 @@ def layoutlm_local_cache(layoutlm): io_manager_key="pandas_parquet_io_manager" ), }, + ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")}, partitions_def=year_quarter_partitions, ) def ex21_extract( sec10k_filing_metadata: pd.DataFrame, - layoutlm_local_cache, + layoutlm, ): """Extract ownership info from exhibit 21 docs.""" filing_chunks = chunk_filings(sec10k_filing_metadata) metadata_chunks, extracted_chunks = filing_chunks.map( - lambda filings: extract_filing_chunk(filings, layoutlm_local_cache) + lambda filings: extract_filing_chunk(filings, layoutlm) ) metadata, extracted = collect_extracted_chunks( 
metadata_chunks.collect(), extracted_chunks.collect() @@ -269,18 +261,19 @@ def ex21_extract( io_manager_key="mlflow_pandas_artifact_io_manager", dagster_type=ex21_extract_type, ), - } + }, + ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")}, ) def ex21_extract_validation( ex21_validation_filing_metadata: pd.DataFrame, exhibit21_extractor: Exhibit21Extractor, - layoutlm_local_cache, + layoutlm, ): """Extract ownership info from exhibit 21 docs.""" metadata, extracted = exhibit21_extractor.extract_filings( ex21_validation_filing_metadata, - model=layoutlm_local_cache["model"], - processor=layoutlm_local_cache["tokenizer"], + model=layoutlm["model"], + processor=layoutlm["tokenizer"], ) return metadata, extracted @@ -289,12 +282,11 @@ def ex21_extract_validation( cloud_interface=cloud_interface_resource, ) -production_assets = [sec10k_filing_metadata, ex21_extract, layoutlm_local_cache] +production_assets = [sec10k_filing_metadata, ex21_extract] validation_assets = [ ex21_validation_set, ex21_validation_filing_metadata, ex21_extract_validation, ex21_validation_metrics, - layoutlm_local_cache, ] From a7b1c7fe24cea739886b3aa64191c00378ed32fa Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 16:30:05 -0400 Subject: [PATCH 053/161] Remove float conversion --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 42b5dc1..562f7e1 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -194,7 +194,6 @@ def extract_filing_chunk( } ).set_index("filename") extracted = Ex21CompanyOwnership.example(size=0) - extracted.own_per = extracted.own_per.astype("float64") return metadata, extracted From f01911757f6ca9675ea250a79529cafaeac54762 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 16:44:36 -0400 Subject: 
[PATCH 054/161] Add hypothesis to deps --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index b026c10..92796d1 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -28,6 +28,7 @@ dependencies = [ "seqeval>=1.2,<2", # Sequence labeling evaluation "google-cloud-secret-manager>=2,<3", "google-cloud-storage>=2,<3", + "hypothesis", "matplotlib>=3.8,<4", "mlflow>=2.12", "opencv-python", From d7d13d8de55100859b0125036dcf85068b6bdd42 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 16:45:26 -0400 Subject: [PATCH 055/161] Make own_per str --- src/mozilla_sec_eia/models/sec10k/entities.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py index 6abc4c8..b0f6869 100644 --- a/src/mozilla_sec_eia/models/sec10k/entities.py +++ b/src/mozilla_sec_eia/models/sec10k/entities.py @@ -13,7 +13,8 @@ class Ex21CompanyOwnership(pa.DataFrameModel): loc: Series[str] = pa.Field( description="Location of subsidiary company.", nullable=True ) - own_per: Series[float] = pa.Field( + #: Use str to avoid conversion errors + own_per: Series[str] = pa.Field( description="Percent ownership of subsidiary company.", nullable=True, coerce=True, From 70f529371b2c6de3bb80911f0ae3d5b611162ae0 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 17:01:56 -0400 Subject: [PATCH 056/161] Remove astype --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 562f7e1..38a4295 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -218,7 +218,6 @@ def collect_extracted_chunks( extracted_dfs = [df for df in extracted_dfs if not df.empty] metadata_df = pd.concat(metadata_dfs) extracted_df = 
pd.concat(extracted_dfs) - extracted_df["own_per"] = extracted_df["own_per"].astype("float64", errors="ignore") return metadata_df, extracted_df From e4060926e7d4166ce842329db584837643d62c63 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 17:26:52 -0400 Subject: [PATCH 057/161] Validate ex21 return types --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 38a4295..967c76a 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -12,6 +12,7 @@ from ..entities import ( Ex21CompanyOwnership, + Sec10kExtractionMetadata, ex21_extract_type, sec10k_extract_metadata_type, ) @@ -218,7 +219,10 @@ def collect_extracted_chunks( extracted_dfs = [df for df in extracted_dfs if not df.empty] metadata_df = pd.concat(metadata_dfs) extracted_df = pd.concat(extracted_dfs) - return metadata_df, extracted_df + return ( + Sec10kExtractionMetadata.validate(metadata_df), + Ex21CompanyOwnership.validate(extracted_df), + ) @graph_multi_asset( From f3835d999ee60d20ab9112899b75f076392b0757 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 21:09:45 -0400 Subject: [PATCH 058/161] Clean model download temp dir --- src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py index f1fcb48..b45abd2 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py @@ -1,5 +1,7 @@ """Util functions for training and predicting with LayoutLM on Ex. 
21 tables.""" +import tempfile + import mlflow from dagster import ConfigurableResource, InputContext, OutputContext from PIL import ImageDraw, ImageFont @@ -15,7 +17,10 @@ def _load_pretrained_layoutlm(version: str = "latest") -> dict: """Function to load layoutlm from mlflow.""" path = f"models:/layoutlm_extractor/{version}" - return mlflow.transformers.load_model(path, return_type="pipeline") + with tempfile.TemporaryDirectory() as dst_path: + return mlflow.transformers.load_model( + path, dst_path=dst_path, return_type="pipeline" + ) class LayoutlmIOManager(MlflowBaseIOManager): From 3c995cdac21b00a3a00cc477ce8207d04fa7ac7c Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Sep 2024 21:31:09 -0400 Subject: [PATCH 059/161] Fix model return type --- src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py index b45abd2..0e24f8d 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py @@ -19,7 +19,7 @@ def _load_pretrained_layoutlm(version: str = "latest") -> dict: with tempfile.TemporaryDirectory() as dst_path: return mlflow.transformers.load_model( - path, dst_path=dst_path, return_type="pipeline" + path, dst_path=dst_path, return_type="components" ) From ef55e4bfb9dce01e14d4e2da324f2ebc2e912418 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 11 Sep 2024 09:21:03 -0400 Subject: [PATCH 060/161] Catch errors in creating ex 21 dataset --- .../models/sec10k/ex_21/inference.py | 20 ++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 27bd2dc..41f7830 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -17,6 +17,7 @@ ) 
from transformers.tokenization_utils_base import BatchEncoding +from ..entities import Ex21CompanyOwnership from ..utils.cloud import GCSArchive, get_metadata_filename from ..utils.layoutlm import ( get_id_label_conversions, @@ -285,11 +286,20 @@ def extract_filings( cloud_interface=self.cloud_interface, pdf_dir=pdf_dir, ) - dataset = create_inference_dataset( - pdfs_dir=Path(pdf_dir), - labeled_json_dir=labeled_json_dir, - has_labels=self.has_labels, - ) + try: + dataset = create_inference_dataset( + pdfs_dir=Path(pdf_dir), + labeled_json_dir=labeled_json_dir, + has_labels=self.has_labels, + ) + # TODO: Investigate failures in creating dataset + except KeyError: + logger.warning("Failed to create inference dataset!") + extraction_metadata.loc[:, "filename"] = False + extraction_metadata.loc[:, "notes"] = ( + "Failed to create inference dataset." + ) + return extraction_metadata, Ex21CompanyOwnership.example(size=0) if self.dataset_ind: dataset = dataset.select(self.dataset_ind) From b37450ae08a018cb2ad3c6e370834aa066374d4e Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 11 Sep 2024 11:21:49 -0400 Subject: [PATCH 061/161] Fix column name --- src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 41f7830..274ff29 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -295,7 +295,7 @@ def extract_filings( # TODO: Investigate failures in creating dataset except KeyError: logger.warning("Failed to create inference dataset!") - extraction_metadata.loc[:, "filename"] = False + extraction_metadata.loc[:, "success"] = False extraction_metadata.loc[:, "notes"] = ( "Failed to create inference dataset." 
) From 06b18ed4eb5a17660d1f3877f8226f71c6c3b001 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 11 Sep 2024 20:15:54 -0400 Subject: [PATCH 062/161] Try to catch empty pdf errors --- src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 274ff29..5833de1 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -60,7 +60,10 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): continue src_path = pdfs_dir / pdf_filename filename = Path(pdf_filename).stem - extracted, pg = get_pdf_data_from_path(src_path) + try: + extracted, pg = get_pdf_data_from_path(src_path) + except RuntimeError: + continue txt = extracted["pdf_text"] pg_meta = extracted["page"] # normalize bboxes between 0 and 1000 for Hugging Face From abfc006f0f3da4e0db606c448bcb3f08c46d0b18 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 11 Sep 2024 20:56:55 -0400 Subject: [PATCH 063/161] Print traceback in caught exception --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 967c76a..5f7590c 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -1,6 +1,7 @@ """Module for working with exhibit 21 data.""" import logging +import traceback import mlflow import pandas as pd @@ -183,10 +184,9 @@ def extract_filing_chunk( model=layoutlm["model"], processor=layoutlm["tokenizer"], ) - except (torch.OutOfMemoryError, RuntimeError) as e: - logging.warning( - f"Error {str(e)} while extracting filings: {filings['filename']}" - ) + except (torch.OutOfMemoryError, RuntimeError): + 
logger.warning(traceback.format_exc()) + logger.warning(f"Error while extracting filings: {filings['filename']}") metadata = pd.DataFrame( { "filename": filings["filename"], From ff92a55e99960362a8836542465c4a7e0971314a Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 11 Sep 2024 21:03:59 -0400 Subject: [PATCH 064/161] Fix empty pdf check --- .../models/sec10k/ex_21/create_labeled_dataset.py | 5 +++++ src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 8 ++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py index 55e1d5a..63be643 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py @@ -202,6 +202,11 @@ def get_image_dict(pdfs_dir): if pdf_filename.split(".")[-1] != "pdf": continue pdf_file_path = pdfs_dir / pdf_filename + + # Check for empty file + if pdf_file_path.stat().st_size == 0: + continue + _, pg = get_pdf_data_from_path(pdf_file_path) full_pg_img = render_page(pg) filename = pdf_filename.split(".")[0] diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 5833de1..080b072 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -60,10 +60,7 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): continue src_path = pdfs_dir / pdf_filename filename = Path(pdf_filename).stem - try: - extracted, pg = get_pdf_data_from_path(src_path) - except RuntimeError: - continue + extracted, pg = get_pdf_data_from_path(src_path) txt = extracted["pdf_text"] pg_meta = extracted["page"] # normalize bboxes between 0 and 1000 for Hugging Face @@ -82,9 +79,8 @@ def create_inference_dataset(pdfs_dir: Path, labeled_json_dir=None, has_labels=F else: inference_df = 
format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir) image_dict = get_image_dict(pdfs_dir) - doc_filenames = inference_df["id"].unique() annotations = [] - for filename in doc_filenames: + for filename in image_dict: annotation = { "id": filename, "tokens": inference_df.groupby("id")["text"].apply(list).loc[filename], From 8aa8c9544f54c2c5fb822dca6cf4b171f490718b Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 11 Sep 2024 21:16:34 -0400 Subject: [PATCH 065/161] Actually fix empty pdf check? --- .../models/sec10k/ex_21/create_labeled_dataset.py | 5 ----- src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 7 +++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py index 63be643..55e1d5a 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py @@ -202,11 +202,6 @@ def get_image_dict(pdfs_dir): if pdf_filename.split(".")[-1] != "pdf": continue pdf_file_path = pdfs_dir / pdf_filename - - # Check for empty file - if pdf_file_path.stat().st_size == 0: - continue - _, pg = get_pdf_data_from_path(pdf_file_path) full_pg_img = render_page(pg) filename = pdf_filename.split(".")[0] diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 080b072..ff4be90 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -210,6 +210,13 @@ def _cache_pdfs( except Exception as e: extraction_metadata.loc[filing.filename, ["success"]] = False extraction_metadata.loc[filing.filename, ["note"]] = str(e) + + # Some pdfs are empty. 
Check for these and remove from dir + if pdf_path.stat().st_size == 0: + extraction_metadata.loc[filing.filename, ["success"]] = False + extraction_metadata.loc[filing.filename, ["note"]] = "PDF empty" + pdf_path.unlink() + return extraction_metadata From 43600bc2ab128a4b0cff3032877fc9094d4c799f Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 17 Sep 2024 21:26:24 -0400 Subject: [PATCH 066/161] Use UPath in GCSArchive --- .../models/sec10k/basic_10k.py | 12 +- .../models/sec10k/ex_21/__init__.py | 2 +- .../models/sec10k/utils/cloud.py | 201 +++++------------- tests/unit/models/sec10k/utils_test.py | 88 ++++---- 4 files changed, 105 insertions(+), 198 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/basic_10k.py b/src/mozilla_sec_eia/models/sec10k/basic_10k.py index 22b7a2a..a1497b1 100644 --- a/src/mozilla_sec_eia/models/sec10k/basic_10k.py +++ b/src/mozilla_sec_eia/models/sec10k/basic_10k.py @@ -82,15 +82,19 @@ def extract_filings( logger.info(f"Extracting {len(filings_to_extract)} filings.") extraction_metadata = pd.DataFrame( - {"filename": pd.Series(dtype=str), "success": pd.Series(dtype=bool)} + { + "filename": pd.Series(dtype=str), + "success": pd.Series(dtype=bool), + "notes": pd.Series(dtype=str), + } ).set_index("filename") extracted = pd.DataFrame() for filing in cloud_interface.iterate_filings(filings_to_extract): ext, filename, unmatched_keys = _extract_10k(filing) - extraction_metadata.loc[filename, ["success", "unmatched_keys"]] = [ + extraction_metadata.loc[filename, ["success", "notes"]] = [ len(ext) > 0, - ",".join(unmatched_keys), + "Unmatched Keys: " + ",".join(unmatched_keys), ] extracted = pd.concat([extracted, ext]) @@ -134,7 +138,7 @@ def basic_10k_validation_filing_metadata( """Get sec 10k filing metadata from validation set.""" filing_metadata = cloud_interface.get_metadata() return filing_metadata[ - filing_metadata["filename"].isin( + filing_metadata.index.isin( 
basic_10k_validation_set.index.get_level_values("filename").unique() ) ] diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 5f7590c..890f44e 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -40,7 +40,7 @@ def ex21_validation_filing_metadata( """Get sec 10k filing metadata from validation set.""" filing_metadata = cloud_interface.get_metadata() return filing_metadata[ - filing_metadata["filename"].isin(ex21_validation_set["filename"].unique()) + filing_metadata.index.isin(ex21_validation_set["filename"].unique()) ] diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py index 5346ec6..930f44f 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -4,25 +4,18 @@ import io import logging import re -from contextlib import contextmanager from hashlib import md5 from pathlib import Path from typing import BinaryIO, TextIO import fitz import pandas as pd -import pg8000 -from dagster import ConfigurableResource, EnvVar -from google.cloud import storage -from google.cloud.sql.connector import Connector +from dagster import ConfigurableResource from PIL import Image -from pydantic import BaseModel, PrivateAttr -from sqlalchemy import Engine, create_engine, select -from sqlalchemy.orm import Session +from pydantic import BaseModel +from upath import UPath from xhtml2pdf import pisa -from .db_metadata import Base, Sec10kMetadata - logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -142,88 +135,36 @@ def from_file( class GCSArchive(ConfigurableResource): - """Provides an interface for archived filings on GCS. - - This class looks for several environment variables to configure - access to cloud resources. These can be set directly, or be in a - .env file at the top level. 
- - The following variables need to be set: - - GCS_FILINGS_BUCKET_NAME: Name of bucket where 10k filings are stored. - GCS_LABELS_BUCKET_NAME: Name of top-level bucket where labelled training data is stored. - GCS_METADATA_DB_INSTANCE_CONNECTION: instance connection string - in the form 'project:region:instance'. - GCS_IAM_USER: Email of user of service account trying to connect. - GCS_METADATA_DB_NAME: Name of DB in instance to connect to. - GCS_PROJECT: Name of google cloud project. - MLFLOW_TRACKING_URI: URI of mlflow tracking server. - """ - - filings_bucket_name: str - labels_bucket_name: str - metadata_db_instance_connection: str - user: str - metadata_db_name: str - project: str - - _filings_bucket = PrivateAttr() - _labels_bucket = PrivateAttr() - _engine = PrivateAttr() - - def setup_for_execution(self, context): - """Initialize interface to filings archive on GCS.""" - self._engine = self._get_engine() - self._filings_bucket = self._get_bucket(self.filings_bucket_name) - self._labels_bucket = self._get_bucket(self.labels_bucket_name) - - Base.metadata.create_all(self._engine) - - def _get_bucket(self, bucket_name): - """Return cloud storage bucket where SEC10k filings are archived.""" - storage_client = storage.Client() - return storage_client.bucket(bucket_name) - - def _get_engine(self) -> Engine: - """Initialize a connection pool for a Cloud SQL instance of Postgres. - - Uses the Cloud SQL Python Connector with Automatic IAM Database Authentication. 
- """ - # initialize Cloud SQL Python Connector object - connector = Connector() - - def getconn() -> pg8000.dbapi.Connection: - conn: pg8000.dbapi.Connection = connector.connect( - self.metadata_db_instance_connection, - "pg8000", - user=self.user, - db=self.metadata_db_name, - enable_iam_auth=True, - ) - return conn + """Provides an interface for archived filings on GCS.""" - return create_engine( - "postgresql+pg8000://", - creator=getconn, - ) + filings_bucket: str = "gs://2de2b9f52c99a240-bucket-sec-10ks/" + labels_bucket: str = "gs://labeled-ex21-filings/" + outputs_bucket: str = "gs://sec10k-outputs/" + + @property + def filings_bucket_path(self): + """Return UPath of filings bucket.""" + return UPath(self.filings_bucket) - @contextmanager - def create_session(self) -> Session: - """Yield sqlalchemy session.""" - with Session(self._engine) as session: - yield session + @property + def labels_bucket_path(self): + """Return UPath of filings bucket.""" + return UPath(self.labels_bucket) + + @property + def outputs_bucket_path(self): + """Return UPath of filings bucket.""" + return UPath(self.outputs_bucket) def get_metadata(self, year_quarter: str | None = None) -> pd: """Return dataframe of filing metadata.""" - selection = select(Sec10kMetadata) + selection = None if year_quarter is not None: - selection = selection.where(Sec10kMetadata.year_quarter == year_quarter) - - return pd.read_sql(selection, self._engine) + selection = ["year_quarter", "==", year_quarter] - def get_filing_blob(self, year_quarter: str, path: str) -> storage.Blob: - """Return Blob pointing to file in GCS bucket.""" - return self._filings_bucket.blob(f"sec10k/sec10k-{year_quarter}/{path}") + return pd.read_parquet( + self.outputs_bucket_path / "sec10k_filing_metadata", filters=selection + ) def get_local_filename( self, cache_directory: Path, filing: pd.Series | Sec10K, extension=".html" @@ -239,29 +180,6 @@ def get_local_filename( ) ) - def cache_blob( - self, - blob: storage.Blob, - 
local_path: Path, - ) -> Path: - """Cache a single filing in cache_directory and return path.""" - # Create cache directory - local_path.parent.mkdir(parents=True, exist_ok=True) - - if exists := local_path.exists(): - blob.update() - local_hash = _compute_md5(local_path) - remote_hash = blob.md5_hash - refresh = remote_hash != local_hash - - if (not exists) or refresh: - logger.info(f"Downloading to {local_path}") - blob.download_to_filename(local_path) - else: - logger.info(f"{local_path} is already cached") - - return local_path - def get_filings( self, filing_selection: pd.DataFrame, @@ -278,11 +196,12 @@ def get_filings( """ filings = [] for _, filing in filing_selection.iterrows(): - blob = self.get_filing_blob(filing["year_quarter"], filing["filename"]) local_path = self.get_local_filename(cache_directory, filing) - filing_path = self.cache_blob(blob, local_path) + if not local_path.exists(): + with local_path.open("w") as f: + f.write((self.filings_bucket_path / filing.filename).read_text()) - with filing_path.open() as f: + with local_path.open() as f: sec10k_filing = Sec10K.from_file( file=f, filename=filing["filename"], @@ -317,14 +236,11 @@ def iterate_filings( filing_selection: Pandas dataframe with same schema as metadata df where each row is a filing to return. 
""" - for _, filing in filing_selection.iterrows(): + for filename, filing in filing_selection.iterrows(): + filepath = f"sec10k/sec10k-{filing.year_quarter}/{filename}" yield Sec10K.from_file( - file=io.StringIO( - self.get_filing_blob( - filing["year_quarter"], filing["filename"] - ).download_as_text() - ), - filename=filing["filename"], + file=io.StringIO((self.filings_bucket_path / filepath).read_text()), + filename=filename, cik=filing["cik"], year_quarter=filing["year_quarter"], ex_21_version=filing["exhibit_21_version"], @@ -334,7 +250,7 @@ def cache_training_data( self, json_cache_path: Path, pdf_cache_path: Path, - gcs_folder_name: str = "labeled/", + gcs_folder_name: str = "labeledv0.2", overwrite_pdfs: bool = False, ): """Cache labeled training data stored on GCS for local use.""" @@ -342,36 +258,34 @@ def cache_training_data( pdf_cache_path.mkdir(parents=True, exist_ok=True) metadata_df = self.get_metadata() label_name_pattern = re.compile(r"(\d+)-\d{4}q[1-4]-\d+-(.+)") - if gcs_folder_name[-1] != "/": - gcs_folder_name += "/" - for blob in self._labels_bucket.list_blobs(match_glob=f"{gcs_folder_name}*"): - if blob.name == gcs_folder_name: - continue + # Cache filings and labels + filenames = [] + direc = self.labels_bucket_path / gcs_folder_name + for file in direc.iterdir(): + if file.name == gcs_folder_name: + continue # Cache labels - self.cache_blob( - blob, json_cache_path / blob.name.replace(gcs_folder_name, "") - ) + with (json_cache_path / file.name).open("w") as f: + f.write(file.read_text()) # Cache filing - match = label_name_pattern.search(blob.name) - filename = f"edgar/data/{match.group(1)}/{match.group(2)}.txt" - filing_metadata = metadata_df[metadata_df["filename"] == filename] - filing = self.get_filings(filing_metadata)[0] - pdf_path = self.get_local_filename( - pdf_cache_path, filing_metadata.iloc[0], extension=".pdf" - ) - if not pdf_path.exists() or overwrite_pdfs: - with pdf_path.open("wb") as f: - filing.ex_21.save_as_pdf(f) + 
match = label_name_pattern.search(file.name) + filenames.append(f"edgar/data/{match.group(1)}/{match.group(2)}.txt") + + filings = metadata_df[metadata_df["filename"].isin(filenames)] + self.get_filings( + filings, + cache_path=pdf_cache_path, + cache_pdf=True, + ) def validate_archive(self) -> bool: """Validate that all filings described in metadata table exist in GCS bucket.""" # Get files in archive logger.info("Get list of files in archive.") archive_filenames = { - re.sub(r"sec10k/sec10k-\d{4}q\d/", "", blob.name) - for blob in self._filings_bucket.list_blobs() + filing.name for filing in self.filings_bucket_path.iterdir() } # Get metadata df @@ -400,11 +314,4 @@ def get_metadata_filename(local_filename: str): return "edgar/data/" + local_filename.replace("-", "/", 1) + ".txt" -cloud_interface_resource = GCSArchive( - filings_bucket_name=EnvVar("GCS_FILINGS_BUCKET_NAME"), - labels_bucket_name=EnvVar("GCS_LABELS_BUCKET_NAME"), - metadata_db_instance_connection=EnvVar("GCS_METADATA_DB_INSTANCE_CONNECTION"), - user=EnvVar("GCS_IAM_USER"), - metadata_db_name=EnvVar("GCS_METADATA_DB_NAME"), - project=EnvVar("GCS_PROJECT"), -) +cloud_interface_resource = GCSArchive() diff --git a/tests/unit/models/sec10k/utils_test.py b/tests/unit/models/sec10k/utils_test.py index de9fbd7..b31fc63 100644 --- a/tests/unit/models/sec10k/utils_test.py +++ b/tests/unit/models/sec10k/utils_test.py @@ -3,6 +3,7 @@ import io import unittest from dataclasses import dataclass +from pathlib import Path import pandas as pd import pytest @@ -16,40 +17,29 @@ @pytest.fixture def test_archive(): """Return test GCSArchive class.""" - with ( - unittest.mock.patch( - "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive._get_engine" - ), - unittest.mock.patch( - "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive._get_bucket" - ), - ): - archive = GCSArchive( - filings_bucket_name="filings_bucket_name", - labels_bucket_name="labels_bucket_name", - 
metadata_db_instance_connection="metadata_db_instance_connection", - user="user", - metadata_db_name="metadata_db_name", - project="project_name", - ) - archive.setup_for_execution("fake_context") - return archive + return GCSArchive() @dataclass -class _FakeBlob: - name: str +class _FakePath: + files: list[str] + + def iterdir(self): + """Fake iterdir""" + yield from self.files @pytest.mark.parametrize( "archive_files,metadata_files,valid", [ ( - [ - _FakeBlob("sec10k/sec10k-1993q1/filing1.txt"), - _FakeBlob("sec10k/sec10k-1996q2/filing2.txt"), - _FakeBlob("sec10k/sec10k-2000q4/filing3.txt"), - ], + _FakePath( + files=[ + Path("sec10k/sec10k-1993q1/filing1.txt"), + Path("sec10k/sec10k-1996q2/filing2.txt"), + Path("sec10k/sec10k-2000q4/filing3.txt"), + ] + ), [ "filing1.txt", "filing2.txt", @@ -58,12 +48,14 @@ class _FakeBlob: True, ), ( - [ - _FakeBlob("sec10k/sec10k-1993q1/filing1.txt"), - _FakeBlob("sec10k/sec10k-1996q2/filing2.txt"), - _FakeBlob("sec10k/sec10k-2000q4/filing3.txt"), - _FakeBlob("sec10k/sec10k-2001q3/filing4.txt"), - ], + _FakePath( + files=[ + Path("sec10k/sec10k-1993q1/filing1.txt"), + Path("sec10k/sec10k-1996q2/filing2.txt"), + Path("sec10k/sec10k-2000q4/filing3.txt"), + Path("sec10k/sec10k-2001q3/filing4.txt"), + ] + ), [ "filing1.txt", "filing2.txt", @@ -72,11 +64,13 @@ class _FakeBlob: False, ), ( - [ - _FakeBlob("sec10k/sec10k-1993q1/filing1.txt"), - _FakeBlob("sec10k/sec10k-1996q2/filing2.txt"), - _FakeBlob("sec10k/sec10k-2000q4/filing3.txt"), - ], + _FakePath( + files=[ + Path("sec10k/sec10k-1993q1/filing1.txt"), + Path("sec10k/sec10k-1996q2/filing2.txt"), + Path("sec10k/sec10k-2000q4/filing3.txt"), + ] + ), [ "filing1.txt", "filing2.txt", @@ -89,17 +83,19 @@ class _FakeBlob: ) def test_validate_archive(test_archive, archive_files, metadata_files, valid, mocker): """Test archive validation functionality.""" - test_archive._filings_bucket.list_blobs.return_value = archive_files - - metadata_mock = mocker.MagicMock( - 
return_value=pd.DataFrame({"filename": metadata_files}) - ) - mocker.patch( - "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive.get_metadata", - new=metadata_mock, - ) + with unittest.mock.patch( + "mozilla_sec_eia.models.sec10k.utils.GCSArchive.filings_bucket_path", + new=archive_files, + ): + metadata_mock = mocker.MagicMock( + return_value=pd.DataFrame({"filename": metadata_files}) + ) + mocker.patch( + "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive.get_metadata", + new=metadata_mock, + ) - assert test_archive.validate_archive() == valid + assert test_archive.validate_archive() == valid @pytest.mark.parametrize( From 05ad82cd97b5e6aee76dec738a79bce303cc2f5f Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 18 Sep 2024 10:34:10 -0400 Subject: [PATCH 067/161] Make _configure_mlflow a standalone function --- .../library/mlflow/mlflow_resource.py | 60 ++++++++++--------- 1 file changed, 32 insertions(+), 28 deletions(-) diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py index 1060b9b..82710de 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py @@ -21,6 +21,37 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") +def _configure_mlflow(tracking_uri: str, project: str): + """Do runtime configuration of mlflow.""" + os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" + os.environ["MLFLOW_TRACKING_PASSWORD"] = _get_tracking_password( + tracking_uri, project + ) + os.environ["MLFLOW_TRACKING_URI"] = tracking_uri + os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520" + os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520" + os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900" + os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true" + + +def _get_tracking_password(tracking_uri: str, project: str, version_id: str = "latest"): + """Get tracking server password from gcloud secrets.""" + # Password not required 
for local use + if "sqlite" not in tracking_uri: + # Create the Secret Manager client. + client = secretmanager.SecretManagerServiceClient() + + # Build the resource name of the secret version. + name = f"projects/{project}/secrets/mlflow_admin_password/versions/{version_id}" + + # Access the secret version. + response = client.access_secret_version(name=name) + + # Return the decoded payload. + return response.payload.data.decode("UTF-8") + return "" + + class MlflowInterface(ConfigurableResource): """Dagster resource to interface with mlflow tracking server. @@ -52,7 +83,7 @@ def yield_for_execution( """Create experiment tracker for specified experiment.""" dagster_run_id = context.run_id self._mlflow_run_id = None - self._configure_mlflow() + _configure_mlflow(self.tracking_uri, self.project) if self.tracking_enabled: # Get run_id associated with current dagster run @@ -75,33 +106,6 @@ def mlflow_run_id(self) -> str | None: """Return run id of current run.""" return self._mlflow_run_id - def _get_tracking_password(self, version_id: str = "latest"): - """Get tracking server password from gcloud secrets.""" - # Password not required for local use - if "sqlite" not in self.tracking_uri: - # Create the Secret Manager client. - client = secretmanager.SecretManagerServiceClient() - - # Build the resource name of the secret version. - name = f"projects/{self.project}/secrets/mlflow_admin_password/versions/{version_id}" - - # Access the secret version. - response = client.access_secret_version(name=name) - - # Return the decoded payload. 
- return response.payload.data.decode("UTF-8") - return "" - - def _configure_mlflow(self): - """Do runtime configuration of mlflow.""" - os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" - os.environ["MLFLOW_TRACKING_PASSWORD"] = self._get_tracking_password() - os.environ["MLFLOW_TRACKING_URI"] = self.tracking_uri - os.environ["MLFLOW_GCS_DOWNLOAD_CHUNK_SIZE"] = "20971520" - os.environ["MLFLOW_GCS_UPLOAD_CHUNK_SIZE"] = "20971520" - os.environ["MLFLOW_HTTP_REQUEST_TIMEOUT"] = "900" - os.environ["MLFLOW_ENABLE_SYSTEM_METRICS_LOGGING"] = "true" - @staticmethod def get_or_create_experiment( experiment_name: str, artifact_location: str = "" From 99fc7edac96e8d8db5f4ebe934bcef6bbce5cd67 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 18 Sep 2024 12:39:59 -0400 Subject: [PATCH 068/161] Try to skip notebooks in ruff check --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 92796d1..dd39186 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -155,6 +155,7 @@ doctest_optionflags = [ ] [tool.ruff] +exclude = ["notebooks/*"] select = [ "A", # flake8-builtins # "ARG", # unused arguments From b13550052c39112ff3dd88cc2569978a3d5aef1e Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 19 Sep 2024 10:43:57 -0400 Subject: [PATCH 069/161] Pull integration test fixes from main --- tests/conftest.py | 44 +++--------------- .../integration/models/sec10k/extract_test.py | 46 +++++++++++++------ 2 files changed, 39 insertions(+), 51 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 87e44e5..d1a47d4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,13 +1,11 @@ """PyTest configuration module. 
Defines useful fixtures, command line args.""" import logging +import os from pathlib import Path -import mlflow import pytest -from mozilla_sec_eia.library.mlflow import MlflowInterface - logger = logging.getLogger(__name__) @@ -38,41 +36,11 @@ def test_dir() -> Path: return Path(__file__).parent -class TestTracker(MlflowInterface): - """Create sub-class of `MlflowInterface` to use in testing context. - - Test class creates an in-memory sqlite db for tracking, and a temporary directory - for artifact storage. - """ - - def _get_tracking_password(self): - return "password" - - @pytest.fixture -def test_tracker_factory(tmp_path): - def factory(experiment_name: str) -> TestTracker: - return TestTracker( - artifact_location=str(tmp_path), - tracking_uri="sqlite:///:memory:", - experiment_name=experiment_name, - project="", - ) +def set_test_mlflow_env_vars_factory(): + def factory(): + # Use in memory tracking backend unless USE_TRACKING_SERVER is set + if not os.getenv("USE_TRACKING_SERVER"): + os.environ["MLFLOW_TRACKING_URI"] = "sqlite:///:memory:" return factory - - -@pytest.fixture -def get_most_recent_mlflow_run_factory(): - def _get_run(experiment_name: str): - """Search mlflow for most recent run with specified experiment name.""" - run_metadata = mlflow.search_runs( - experiment_names=[experiment_name], - ) - - # Mlflow returns runs ordered by their runtime, so it's easy to grab the latest run - # This assert will ensure this doesn't silently break if the ordering changes - assert run_metadata.loc[0, "end_time"] == run_metadata["end_time"].max() - return mlflow.get_run(run_metadata.loc[0, "run_id"]) - - return _get_run diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py index 1e2d6a8..26f7fd6 100644 --- a/tests/integration/models/sec10k/extract_test.py +++ b/tests/integration/models/sec10k/extract_test.py @@ -1,39 +1,59 @@ """Validate basic 10k and exhibit 21 extraction.""" import logging +import os 
+import unittest import dotenv -import pytest +from mozilla_sec_eia.library.mlflow.mlflow_resource import ( + _configure_mlflow, + get_most_recent_run, +) from mozilla_sec_eia.models import sec10k logger = logging.getLogger(f"catalystcoop.{__name__}") +# TODO: Make validation tests log to tracking server on merge to main + def test_basic_10k_validation( - test_tracker_factory, - get_most_recent_mlflow_run_factory, + set_test_mlflow_env_vars_factory, ): """Test basic_10k_validation_job.""" - dotenv.load_dotenv() - sec10k.defs.get_job_def("basic_10k_extraction_validation").execute_in_process() + dotenv.load_dotenv(override=True) + set_test_mlflow_env_vars_factory() + result = sec10k.defs.get_job_def( + "basic_10k_extraction_validation" + ).execute_in_process() - run = get_most_recent_mlflow_run_factory("basic_10k_extraction_validation") + run = get_most_recent_run("basic_10k_extraction_validation", result.run_id) assert run.data.metrics["precision"] == 1 assert run.data.metrics["recall"] == 1 -@pytest.mark.xfail def test_ex21_validation( - test_tracker_factory, - get_most_recent_mlflow_run_factory, + set_test_mlflow_env_vars_factory, ): """Test ex21_validation_job.""" - dotenv.load_dotenv() - sec10k.defs.get_job_def("ex21_extraction_validation").execute_in_process() - - run = get_most_recent_mlflow_run_factory("ex21_extraction_validation") + dotenv.load_dotenv(override=True) + _configure_mlflow( + os.getenv("MLFLOW_TRACKING_URI"), + os.getenv("GCS_PROJECT"), + ) + pretrained_model = sec10k.utils.layoutlm._load_pretrained_layoutlm() + + with unittest.mock.patch( + "mozilla_sec_eia.models.sec10k.utils.layoutlm._load_pretrained_layoutlm", + new=lambda _: pretrained_model, + ): + set_test_mlflow_env_vars_factory() + result = sec10k.defs.get_job_def( + "ex21_extraction_validation" + ).execute_in_process() + + run = get_most_recent_run("ex21_extraction_validation", result.run_id) assert run.data.metrics["avg_subsidiary_jaccard_sim"] > 0.85 assert 
run.data.metrics["avg_location_jaccard_sim"] > 0.9 From 6e868f22e7f081d83e6c80fdb06f1680cc8b13b5 Mon Sep 17 00:00:00 2001 From: Zach Schira Date: Thu, 19 Sep 2024 12:00:34 -0400 Subject: [PATCH 070/161] Fix typos in README.rst --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index af06075..948010c 100644 --- a/README.rst +++ b/README.rst @@ -75,14 +75,14 @@ use the appropriate executor and supply the job with necessary resources. Library ^^^^^^^ There's generic shared tooling for ``pudl-models`` defined in -``src/mozilla_sec_eia/library/``. This includes the helper fucntions for +``src/mozilla_sec_eia/library/``. This includes the helper functions for constructing dagster jobs discussed above, as well as useful methods for computing validation metrics, and an interface to our mlflow tracking server integrated with our tracking server. MlFlow """""" -We use a remote `mlflow tracking `__ to aide in the +We use a remote `mlflow tracking `__ to aid in the development and management of ``pudl-models``. In the ``mlflow`` module, there are several dagster resources and IO-managers that can be used in any models to allow simple seamless interface to the server. 
From df4fd095e023cfa7644ea9c19f09777cf3912bc2 Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 19 Sep 2024 14:45:07 -0400 Subject: [PATCH 071/161] Cache downloaded layoutlm in dagster home --- .../library/mlflow/mlflow_resource.py | 7 ++++ .../models/sec10k/utils/layoutlm.py | 41 +++++++------------ 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py index e3428d7..d0fa62b 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py @@ -12,6 +12,7 @@ import logging import os from contextlib import contextmanager +from pathlib import Path import mlflow from dagster import ConfigurableResource, EnvVar, InitResourceContext @@ -72,9 +73,15 @@ class MlflowInterface(ConfigurableResource): experiment_name: str tags: dict = {} project: str = EnvVar("GCS_PROJECT") + dagster_home: str = EnvVar("DAGSTER_HOME") _mlflow_run_id: str = PrivateAttr() + @property + def dagster_home_path(self): + """Return `dagster_home` as a Path.""" + return Path(self.dagster_home) + @contextmanager def yield_for_execution( self, diff --git a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py index d358e33..6efc310 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/layoutlm.py @@ -1,26 +1,22 @@ """Util functions for training and predicting with LayoutLM on Ex. 
21 tables.""" -import tempfile - import mlflow -from dagster import ConfigurableResource, InputContext, OutputContext +from dagster import InputContext, OutputContext from PIL import ImageDraw, ImageFont -from pydantic import PrivateAttr from transformers import ( Trainer, ) -from mozilla_sec_eia.library.mlflow import MlflowBaseIOManager, MlflowInterface +from mozilla_sec_eia.library.mlflow import MlflowBaseIOManager -def _load_pretrained_layoutlm(version: str = "latest") -> dict: +def _load_pretrained_layoutlm(cache_path: str, version: str = "latest") -> dict: """Function to load layoutlm from mlflow.""" path = f"models:/layoutlm_extractor/{version}" - with tempfile.TemporaryDirectory() as dst_path: - return mlflow.transformers.load_model( - path, dst_path=dst_path, return_type="components" - ) + return mlflow.transformers.load_model( + path, dst_path=cache_path, return_type="components" + ) class LayoutlmIOManager(MlflowBaseIOManager): @@ -37,23 +33,14 @@ def handle_output(self, context: OutputContext, finetuned_model: Trainer): def load_input(self, context: InputContext) -> dict: """Log metrics to mlflow run/experiment created by `MlflowInterface`.""" - return _load_pretrained_layoutlm(self.version) - - -class LayoutlmResource(ConfigurableResource): - """Dagster resource for loading/using pretrained layoutlm model as a resource.""" - - mlflow_interface: MlflowInterface - version: str | None = None - _model_components: dict = PrivateAttr() - - def setup_for_execution(self, context): - """Load layoutlm from mlflow.""" - self._model_components = _load_pretrained_layoutlm(self.version) - - def get_model_components(self): - """Return model components from loaded model.""" - return self._model_components["model"], self._model_components["tokenizer"] + cache_path = ( + self.mlflow_interface.dagster_home_path / "model_cache" / "layoutlm" + ) + cache_path.mkdir(exist_ok=True, parents=True) + return _load_pretrained_layoutlm( + cache_path=cache_path, + 
version=self.version, + ) def normalize_bboxes(txt_df, pg_meta_df): From 364276559c61d696b656c1640c731299622df475 Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 19 Sep 2024 16:40:16 -0400 Subject: [PATCH 072/161] Fix broken test --- tests/integration/models/sec10k/extract_test.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py index 26f7fd6..750c969 100644 --- a/tests/integration/models/sec10k/extract_test.py +++ b/tests/integration/models/sec10k/extract_test.py @@ -34,6 +34,7 @@ def test_basic_10k_validation( def test_ex21_validation( + tmp_path, set_test_mlflow_env_vars_factory, ): """Test ex21_validation_job.""" @@ -42,7 +43,9 @@ def test_ex21_validation( os.getenv("MLFLOW_TRACKING_URI"), os.getenv("GCS_PROJECT"), ) - pretrained_model = sec10k.utils.layoutlm._load_pretrained_layoutlm() + pretrained_model = sec10k.utils.layoutlm._load_pretrained_layoutlm( + cache_path=tmp_path + ) with unittest.mock.patch( "mozilla_sec_eia.models.sec10k.utils.layoutlm._load_pretrained_layoutlm", From 830bd74288d3887129878a35db2e9db081fe286b Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Fri, 20 Sep 2024 08:25:12 +0100 Subject: [PATCH 073/161] fix rename filings --- .../sec10k/ex_21/rename_labeled_filings.py | 24 ++++++++++--------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py b/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py index 182dd04..1de455f 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/rename_labeled_filings.py @@ -10,7 +10,7 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -def rename_filings(): +def rename_filings(labeled_bucket_name="labeledv0.1"): """Rename labeled filings in GCS after importing from Label Studio. 
After importing labeled documents from Label Studio into GCS the @@ -22,18 +22,20 @@ def rename_filings(): filename. """ archive = GCSArchive() - bucket = archive._labels_bucket - - labeled_bucket_name = "labeled/" + bucket_path = archive.labels_bucket_path / labeled_bucket_name - for blob in bucket.list_blobs(prefix=labeled_bucket_name): - if blob.name != labeled_bucket_name: - logger.info(blob.name) - file_dict = json.loads(blob.download_as_text()) + for file in bucket_path.iterdir(): + filename = file.parts[-1] + if filename != labeled_bucket_name: + logger.info(filename) + file_dict = json.loads(file.read_text()) archive_name = file_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0] - archive_filepath = f"{labeled_bucket_name}/{archive_name}" - logger.info(archive_filepath) - bucket.rename_blob(blob, archive_filepath) + # check if name uses the old local filing naming schema + if len(archive_name.split("-")) == 6: + archive_name = "-".join(archive_name.split("-")[2:]) + new_name = file.with_name(archive_name) + logger.info(new_name) + file.move(new_name) def copy_labeled_jsons_to_new_version_folder( From 2cd1fe629a27b591d50145d0b0f87a58b2bc4a8a Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Fri, 20 Sep 2024 12:36:56 +0100 Subject: [PATCH 074/161] fix paths to cache training data --- .../models/sec10k/utils/cloud.py | 27 +++++++++++++++---- 1 file changed, 22 insertions(+), 5 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py index 930f44f..64e5dc3 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -55,6 +55,13 @@ def from_10k(cls, filename: str, sec10k_text: str, ex_21_version: str): def save_as_pdf(self, file: BinaryIO): """Save Exhibit 21 as a PDF in `file`, which can be in memory or on disk.""" + # TODO: probably should make a "corrections" file that has CSS/HTML replacements + # to make PDF render + # TODO: 
should probably also catch errors and not fail + if "border-bottom: black thin solid;" in self.ex_21_text: + self.ex_21_text = self.ex_21_text.replace( + "border-bottom: black thin solid;", "border-bottom: 1px solid black;" + ) res = pisa.CreatePDF(self.ex_21_text, file) if res.err: logger.warning( @@ -197,9 +204,17 @@ def get_filings( filings = [] for _, filing in filing_selection.iterrows(): local_path = self.get_local_filename(cache_directory, filing) + year_quarter = filing["year_quarter"] if not local_path.exists(): with local_path.open("w") as f: - f.write((self.filings_bucket_path / filing.filename).read_text()) + f.write( + ( + self.filings_bucket_path + / "sec10k" + / f"sec10k-{year_quarter}" + / filing.filename + ).read_text() + ) with local_path.open() as f: sec10k_filing = Sec10K.from_file( @@ -257,11 +272,12 @@ def cache_training_data( json_cache_path.mkdir(parents=True, exist_ok=True) pdf_cache_path.mkdir(parents=True, exist_ok=True) metadata_df = self.get_metadata() - label_name_pattern = re.compile(r"(\d+)-\d{4}q[1-4]-\d+-(.+)") - + # label_name_pattern = re.compile(r"(\d+)-\d{4}q[1-4]-\d+-(.+)") + label_name_pattern = re.compile(r"(\d+)-(.+)") # Cache filings and labels filenames = [] direc = self.labels_bucket_path / gcs_folder_name + logger.info(direc.is_dir()) for file in direc.iterdir(): if file.name == gcs_folder_name: continue @@ -273,10 +289,11 @@ def cache_training_data( match = label_name_pattern.search(file.name) filenames.append(f"edgar/data/{match.group(1)}/{match.group(2)}.txt") + metadata_df = metadata_df.reset_index() filings = metadata_df[metadata_df["filename"].isin(filenames)] self.get_filings( - filings, - cache_path=pdf_cache_path, + filing_selection=filings, + cache_directory=pdf_cache_path, cache_pdf=True, ) From 64dc8c580bc8a3554c8aa743116dd9bb7aec5461 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Fri, 20 Sep 2024 13:39:25 +0100 Subject: [PATCH 075/161] update root dir path --- 
.../models/sec10k/ex_21/create_labeled_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py index 55e1d5a..47d5ee8 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py @@ -16,7 +16,7 @@ ) logger = logging.getLogger(f"catalystcoop.{__name__}") -ROOT_DIR = Path(__file__).parent.parent.parent.parent.resolve() +ROOT_DIR = Path(__file__).parent.parent.parent.parent.parent.parent.resolve() BBOX_COLS_PDF = [ From 226d91ce52696e0c3b33b54023f3c626ee5ee8ef Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 20 Sep 2024 10:10:17 -0400 Subject: [PATCH 076/161] Fix UPath initialization --- src/mozilla_sec_eia/models/sec10k/utils/cloud.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py index 930f44f..493c69a 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -144,17 +144,23 @@ class GCSArchive(ConfigurableResource): @property def filings_bucket_path(self): """Return UPath of filings bucket.""" - return UPath(self.filings_bucket) + path = UPath(self.filings_bucket) + assert path.exists(), "Filings bucket path does not exist" + return path @property def labels_bucket_path(self): """Return UPath of filings bucket.""" - return UPath(self.labels_bucket) + path = UPath(self.labels_bucket) + assert path.exists(), "Labels bucket path does not exist" + return path @property def outputs_bucket_path(self): """Return UPath of filings bucket.""" - return UPath(self.outputs_bucket) + path = UPath(self.outputs_bucket) + assert path.exists(), "Outputs bucket path does not exist" + return path def get_metadata(self, year_quarter: str | None = None) 
-> pd: """Return dataframe of filing metadata.""" From 3c17d33a71ca78547e3541453e660fdc1336002e Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 20 Sep 2024 10:32:23 -0400 Subject: [PATCH 077/161] Fix path in test --- tests/conftest.py | 3 ++- tests/integration/models/sec10k/extract_test.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index d1a47d4..1339467 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -37,8 +37,9 @@ def test_dir() -> Path: @pytest.fixture -def set_test_mlflow_env_vars_factory(): +def set_test_mlflow_env_vars_factory(tmp_path): def factory(): + os.environ["DAGSTER_HOME"] = str(tmp_path) # Use in memory tracking backend unless USE_TRACKING_SERVER is set if not os.getenv("USE_TRACKING_SERVER"): os.environ["MLFLOW_TRACKING_URI"] = "sqlite:///:memory:" diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py index 750c969..88e5415 100644 --- a/tests/integration/models/sec10k/extract_test.py +++ b/tests/integration/models/sec10k/extract_test.py @@ -49,7 +49,7 @@ def test_ex21_validation( with unittest.mock.patch( "mozilla_sec_eia.models.sec10k.utils.layoutlm._load_pretrained_layoutlm", - new=lambda _: pretrained_model, + new=lambda cache_path, version: pretrained_model, ): set_test_mlflow_env_vars_factory() result = sec10k.defs.get_job_def( From df69f42c222c4f1de7dcbf3de66b73ca0b83b01b Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 20 Sep 2024 11:23:03 -0400 Subject: [PATCH 078/161] Create huggingface dataset outside model execution --- src/mozilla_sec_eia/models/sec10k/__init__.py | 5 - .../models/sec10k/ex_21/__init__.py | 49 +----- .../models/sec10k/ex_21/inference.py | 140 ++++++++---------- 3 files changed, 68 insertions(+), 126 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 4148007..fd9a866 100644 --- 
a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -45,10 +45,6 @@ ex_21.validation_assets, ) -ex21_test_job = model_jobs.create_validation_model_job( - "ex21_test", [ex_21.test_extraction_metrics] -) - layoutlm_finetune_job = model_jobs.create_training_job( "layoutlm_finetune", layoutlm_assets, @@ -62,7 +58,6 @@ basic_10k_validation_job, ex21_production_job, ex21_validation_job, - ex21_test_job, layoutlm_finetune_job, ], resources={ diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 890f44e..ce971c5 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -1,11 +1,9 @@ """Module for working with exhibit 21 data.""" import logging -import traceback import mlflow import pandas as pd -import torch from dagster import AssetIn, AssetOut, Out, asset, graph_multi_asset, multi_asset, op from mozilla_sec_eia.library import validation_helpers @@ -19,7 +17,7 @@ ) from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename -from .inference import Exhibit21Extractor, clean_extracted_df +from .inference import Exhibit21Extractor, clean_extracted_df, extract_filings logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -147,25 +145,6 @@ def clean_ex21_validation_set(validation_df: pd.DataFrame): return validation_df -@asset -def test_extraction_metrics( - cloud_interface: GCSArchive, - exhibit21_extractor: Exhibit21Extractor, - mlflow_interface: MlflowInterface, -): - """Run extraction with various numbers of filings to view resource usage.""" - filings = cloud_interface.get_metadata() - for num_filings in [8, 16, 32, 64, 128]: - with mlflow.start_run( - run_name=f"extract_{num_filings}_filings", - nested=True, - parent_run_id=mlflow_interface.mlflow_run_id, - 
experiment_id=MlflowInterface.get_or_create_experiment("ex21_test"), - ): - mlflow.log_param("num_filings", num_filings) - exhibit21_extractor.extract_filings(filings.sample(num_filings)) - - @op( out={ "metadata": Out(dagster_type=sec10k_extract_metadata_type), @@ -178,24 +157,7 @@ def extract_filing_chunk( layoutlm, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Extract a set of filings and return results.""" - try: - metadata, extracted = exhibit21_extractor.extract_filings( - filings, - model=layoutlm["model"], - processor=layoutlm["tokenizer"], - ) - except (torch.OutOfMemoryError, RuntimeError): - logger.warning(traceback.format_exc()) - logger.warning(f"Error while extracting filings: {filings['filename']}") - metadata = pd.DataFrame( - { - "filename": filings["filename"], - "success": [False] * len(filings), - "notes": ["Out of memory error"] * len(filings), - } - ).set_index("filename") - extracted = Ex21CompanyOwnership.example(size=0) - return metadata, extracted + return extract_filings(exhibit21_extractor, filings, layoutlm) @op( @@ -272,12 +234,9 @@ def ex21_extract_validation( layoutlm, ): """Extract ownership info from exhibit 21 docs.""" - metadata, extracted = exhibit21_extractor.extract_filings( - ex21_validation_filing_metadata, - model=layoutlm["model"], - processor=layoutlm["tokenizer"], + return extract_filings( + exhibit21_extractor, ex21_validation_filing_metadata, layoutlm ) - return metadata, extracted exhibit_21_extractor_resource = Exhibit21Extractor( diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index ff4be90..d69ddb5 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -3,6 +3,7 @@ import logging import os import tempfile +import traceback from contextlib import contextmanager from pathlib import Path @@ -17,7 +18,7 @@ ) from transformers.tokenization_utils_base import BatchEncoding -from 
..entities import Ex21CompanyOwnership +from ..entities import Ex21CompanyOwnership, Sec10kExtractionMetadata from ..utils.cloud import GCSArchive, get_metadata_filename from ..utils.layoutlm import ( get_id_label_conversions, @@ -70,15 +71,33 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): return inference_df -def create_inference_dataset(pdfs_dir: Path, labeled_json_dir=None, has_labels=False): +def create_inference_dataset( + filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False +) -> tuple[pd.DataFrame, Dataset]: """Create a Hugging Face Dataset from PDFs for inference.""" - if has_labels: - inference_df = format_label_studio_output( - labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir + filings_with_ex21 = filing_metadata[~filing_metadata["exhibit_21_version"].isna()] + + # Parse PDFS + with ( + tempfile.TemporaryDirectory() as pdfs_dir, + tempfile.TemporaryDirectory() as labeled_json_dir, + ): + pdfs_dir = Path(pdfs_dir) + labeled_json_dir = Path(labeled_json_dir) + + extraction_metadata = _cache_pdfs( + filings_with_ex21, + cloud_interface=cloud_interface, + pdf_dir=pdfs_dir, ) - else: - inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir) - image_dict = get_image_dict(pdfs_dir) + if has_labels: + inference_df = format_label_studio_output( + labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir + ) + else: + inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir) + image_dict = get_image_dict(pdfs_dir) + annotations = [] for filename in image_dict: annotation = { @@ -96,7 +115,7 @@ def create_inference_dataset(pdfs_dir: Path, labeled_json_dir=None, has_labels=F annotations.append(annotation) dataset = Dataset.from_list(annotations) - return dataset + return extraction_metadata, dataset def clean_extracted_df(extracted_df): @@ -239,73 +258,9 @@ def setup_for_execution(self, context): os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" def extract_filings( - self, filing_metadata: 
pd.DataFrame, model, processor + self, dataset: Dataset, model, processor ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Predict entities with a fine-tuned model and extract Ex. 21 tables. - - This function starts by creating a HuggingFace dataset from PDFs in `pdfs_dir` - that the model can then perform inference on (`create_inference_dataset`). - Then it creates an instance of the custom LayoutLM inference pipeline and - runs the dataset through the pipeline. The pipeline outputs logits, predictions, - and an output dataframe with extracted Ex. 21 table. - - Arguments: - pdfs_dir: Path to the directory with PDFs that are being used for inference. - model: A fine-tuned LayoutLM model. - processor: The tokenizer and encoder for model inputs. - extraction_metadata: A dataframe to track extraction success metrics. Should - have columns 'filename' and 'success'. - dataset_ind: A list of index numbers of dataset records to be used for inference - Default is None, in which the entire dataset created from the PDF directory - is used. - labeled_json_dir: Path to the directory with labeled JSONs from Label Studio. Cannot - be None if has_labels is True. - has_labels: Boolean, true if the data has associated labels that can be used in - visualizing and validating results. - device: String or int, specify what computation device to use for inference - i.e. "mps", "cpu", "cuda" - - Returns: - logits: A list of logits. The list is the length of the number of documents in the - dataset (number of PDFs in pdfs_dir). Each logit object in the list is of - shape (batch_size, seq_len, num_labels). Seq_len is - the same as token length (512 in this case). - predictions: A list of predictions. The list is the length of the number of documents - in the dataset (number of PDFs in pdfs_dir). - From the logits, we take the highest score for each token, using argmax. - This serves as the predicted label for each token. It is shape (seq_len) or token - length. - output_dfs: The extracted Ex. 
21 tables. This is one big dataframe with an ID column - that is the filename of the extracted Ex. 21. Dataframe contains columns id, - subsidiary, loc, own_per. - """ - filings_with_ex21 = filing_metadata[ - ~filing_metadata["exhibit_21_version"].isna() - ] - - with ( - tempfile.TemporaryDirectory() as pdf_dir, - tempfile.TemporaryDirectory() as labeled_json_dir, - ): - extraction_metadata = _cache_pdfs( - filings_with_ex21, - cloud_interface=self.cloud_interface, - pdf_dir=pdf_dir, - ) - try: - dataset = create_inference_dataset( - pdfs_dir=Path(pdf_dir), - labeled_json_dir=labeled_json_dir, - has_labels=self.has_labels, - ) - # TODO: Investigate failures in creating dataset - except KeyError: - logger.warning("Failed to create inference dataset!") - extraction_metadata.loc[:, "success"] = False - extraction_metadata.loc[:, "notes"] = ( - "Failed to create inference dataset." - ) - return extraction_metadata, Ex21CompanyOwnership.example(size=0) + """Predict entities with a fine-tuned model and extract Ex. 
21 tables.""" if self.dataset_ind: dataset = dataset.select(self.dataset_ind) @@ -320,7 +275,8 @@ def extract_filings( logits = [] predictions = [] - all_output_df = pd.DataFrame(columns=["id", "subsidiary", "loc", "own_per"]) + all_output_df = Ex21CompanyOwnership.example(size=0) + extraction_metadata = Sec10kExtractionMetadata.example(size=0) for logit, pred, output_df in pipe(_get_data(dataset)): logits.append(logit) predictions.append(pred) @@ -335,6 +291,38 @@ def extract_filings( return extraction_metadata, all_output_df +def extract_filings( + exhibit21_extractor: Exhibit21Extractor, + filings: pd.DataFrame, + layoutlm, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Create huggingface dataset from filings and perform extraction.""" + try: + failed_metadata, dataset = create_inference_dataset( + filing_metadata=filings, + cloud_interface=exhibit21_extractor.cloud_interface, + has_labels=exhibit21_extractor.has_labels, + ) + metadata, extracted = exhibit21_extractor.extract_filings( + dataset, + model=layoutlm["model"], + processor=layoutlm["tokenizer"], + ) + metadata = pd.concat([failed_metadata, metadata]) + except Exception as e: + logger.warning(traceback.format_exc()) + logger.warning(f"Error while extracting filings: {filings.index}") + metadata = pd.DataFrame( + { + "filename": filings.index, + "success": [False] * len(filings), + "notes": [str(e)] * len(filings), + } + ).set_index("filename") + extracted = Ex21CompanyOwnership.example(size=0) + return metadata, extracted + + class LayoutLMInferencePipeline(Pipeline): """Pipeline for performing inference with fine-tuned LayoutLM.""" From 2d3345cc7f9a3e6f3570c0ec1ea807afd5777d47 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Fri, 20 Sep 2024 17:23:27 +0100 Subject: [PATCH 079/161] small fixes to path handling --- labeled_data_tracking.csv | 259 +++++++++++------- .../sec10k/ex_21/create_labeled_dataset.py | 9 +- 2 files changed, 169 insertions(+), 99 deletions(-) diff --git a/labeled_data_tracking.csv 
b/labeled_data_tracking.csv index 31a7950..0e9f642 100644 --- a/labeled_data_tracking.csv +++ b/labeled_data_tracking.csv @@ -1,97 +1,162 @@ -CIK,Filename,Initials,Notes -107815,edgar/data/107815/0000107815-17-000106.txt,KL -354707,edgar/data/354707/0000354707-19-000043.txt,KL -61339,edgar/data/61339/0001161728-17-000004.txt,KL -1317577,edgar/data/1317577/0001193125-13-356794.txt,KL -59527,edgar/data/59527/0000059527-20-000007.txt,KL -40545,edgar/data/40545/0000040545-04-000013.txt,KL -84557,edgar/data/84557/0001046861-06-000007.txt,KL -100826,edgar/data/100826/0001193125-09-042636.txt,KL -81033,edgar/data/81033/0000950117-06-000927.txt,KL -4904,edgar/data/4904/0000004904-09-000040.txt,KL -46207,edgar/data/46207/0001104659-13-011461.txt,KL -205402,edgar/data/205402/0000950114-99-000043.txt,KL -77227,edgar/data/77227/0001031296-09-000008.txt,KL -92487,edgar/data/92487/0000004904-21-000010.txt,KL -922237,edgar/data/922237/0000950005-99-000915.txt,KL -106170,edgar/data/106170/0000072741-98-000076.txt,KL -1223037,edgar/data/1223037/0001193125-09-249998.txt,KL -3146,edgar/data/3146/0001193125-06-055140.txt,KL -932628,edgar/data/932628/0000932628-16-000045.txt,KL -804212,edgar/data/804212/0000804212-14-000014.txt,KL -92416,edgar/data/92416/0000892569-94-000102.txt,KL -38079,edgar/data/38079/0001558370-16-004332.txt,KL -933157,edgar/data/933157/0001144204-08-021779.txt,KL -869495,edgar/data/869495/0001144204-13-002380.txt,KL -80812,edgar/data/80812/0000927016-98-004349.txt,KL -1582244,edgar/data/1582244/0001582244-16-000187.txt,KL -1166847,edgar/data/1166847/0001117768-12-000118.txt,KL -86521,edgar/data/86521/0000086521-10-000019.txt,KL -1012493,edgar/data/1012493/0000922358-99-000021.txt,KL -1170154,edgar/data/1170154/0001193125-11-062378.txt,KL -1140414,edgar/data/1140414/0001387131-16-004912.txt,KL -1158053,edgar/data/1158053/0000893220-04-001186.txt,KL -71675,edgar/data/71675/0001046861-02-000012.txt,KL -96271,edgar/data/96271/0001193125-07-042781.txt,KL 
-710182,edgar/data/710182/0000930661-97-000576.txt,KL -1029528,edgar/data/1029528/0001193125-04-043994.txt,KL -1043186,edgar/data/1043186/0001564590-19-011739.txt,KL -9342,edgar/data/9342/0000009342-95-000008.txt,KL -18647,edgar/data/18647/0001169232-08-000603.txt,KL -20947,edgar/data/20947/0001031296-06-000044.txt,KL -916529,edgar/data/916529/0001144204-03-001333.txt,KL -60549,edgar/data/60549/0001047469-98-012481.txt,KL -38725,edgar/data/38725/0000038725-17-000042.txt,KL -100122,edgar/data/100122/0000941138-03-000007.txt,KL -355811,edgar/data/355811/0000355811-18-000009.txt,KL -1039065,edgar/data/1039065/0001558370-15-001687.txt,KL -1008654,edgar/data/1008654/0001008654-20-000018.txt,KL -9534,edgar/data/9534/0000897069-05-000574.txt -1085866,edgar/data/1085866/0001072613-06-000748.txt -1045425,edgar/data/1045425/0000893220-05-000599.txt -1090908,edgar/data/1090908/0001437749-16-034757.txt -1546640,edgar/data/1546640/0001546640-14-000023.txt -844143,edgar/data/844143/0001104659-07-008735.txt -722056,edgar/data/722056/0001012870-99-002106.txt -1599298,edgar/data/1599298/0001599298-21-000011.txt -1010961,edgar/data/1010961/0001010961-01-500013.txt -802781,edgar/data/802781/0000950116-97-000760.txt -742126,edgar/data/742126/0001015402-05-001005.txt -930835,edgar/data/930835/0001047469-04-007773.txt -1174922,edgar/data/1174922/0001193125-10-043336.txt -1433270,edgar/data/1433270/0001047469-14-001424.txt -1275229,edgar/data/1275229/0001558370-19-002331.txt -18230,edgar/data/18230/0000950131-98-002084.txt -940942,edgar/data/940942/0001564590-21-009409.txt -320575,edgar/data/320575/0001193125-07-117419.txt -78778,edgar/data/78778/0000078778-97-000019.txt -1627811,edgar/data/1627811/0001493152-19-004568.txt -78890,edgar/data/78890/0000078890-14-000004.txt -99250,edgar/data/99250/0000099250-00-000002.txt -78100,edgar/data/78100/0001109357-20-000053.txt -700949,edgar/data/700949/0000892626-96-000081.txt -1468174,edgar/data/1468174/0001468174-21-000011.txt 
-805730,edgar/data/805730/0001104659-05-009806.txt -820242,edgar/data/820242/0000912057-01-517770.txt -52795,edgar/data/52795/0000950137-00-000865.txt -944130,edgar/data/944130/0001432093-11-000164.txt -66901,edgar/data/66901/0000065984-96-000046.txt -722077,edgar/data/722077/0001047469-15-002056.txt -103872,edgar/data/103872/0001193125-13-444053.txt -1065201,edgar/data/1065201/0001193125-10-070085.txt -729213,edgar/data/729213/0001038838-01-000141.txt -1383414,edgar/data/1383414/0001193125-14-409216.txt -1493594,edgar/data/1493594/0001493594-19-000064.txt -1039399,edgar/data/1039399/0001039399-20-000011.txt -943452,edgar/data/943452/0001193125-07-043570.txt -944739,edgar/data/944739/0001193125-06-035399.txt -61986,edgar/data/61986/0000061986-99-000003.txt -6769,edgar/data/6769/0000950129-03-001523.txt -319201,edgar/data/319201/0000891618-98-004336.txt -34067,edgar/data/34067/0001104659-06-016592.txt -1265245,edgar/data/1265245/0000770944-04-000004.txt -1066134,edgar/data/1066134/0001193125-08-186978.txt -789570,edgar/data/789570/0000898430-95-000343.txt -1273013,edgar/data/1273013/0001104659-07-020456.txt -88205,edgar/data/88205/0000950168-03-000755.txt -1286613,edgar/data/1286613/0001140361-18-012880.txt +,CIK,Filename,Initials,Notes +0,107815,edgar/data/107815/0000107815-17-000106.txt,KL, +1,354707,edgar/data/354707/0000354707-19-000043.txt,KL, +2,61339,edgar/data/61339/0001161728-17-000004.txt,KL, +3,1317577,edgar/data/1317577/0001193125-13-356794.txt,KL, +4,59527,edgar/data/59527/0000059527-20-000007.txt,KL, +5,40545,edgar/data/40545/0000040545-04-000013.txt,KL, +6,84557,edgar/data/84557/0001046861-06-000007.txt,KL, +7,100826,edgar/data/100826/0001193125-09-042636.txt,KL, +8,81033,edgar/data/81033/0000950117-06-000927.txt,KL, +9,4904,edgar/data/4904/0000004904-09-000040.txt,KL, +10,46207,edgar/data/46207/0001104659-13-011461.txt,KL, +11,205402,edgar/data/205402/0000950114-99-000043.txt,KL, +12,77227,edgar/data/77227/0001031296-09-000008.txt,KL, 
+13,92487,edgar/data/92487/0000004904-21-000010.txt,KL, +14,922237,edgar/data/922237/0000950005-99-000915.txt,KL, +15,106170,edgar/data/106170/0000072741-98-000076.txt,KL, +16,1223037,edgar/data/1223037/0001193125-09-249998.txt,KL, +17,3146,edgar/data/3146/0001193125-06-055140.txt,KL, +18,932628,edgar/data/932628/0000932628-16-000045.txt,KL, +19,804212,edgar/data/804212/0000804212-14-000014.txt,KL, +20,92416,edgar/data/92416/0000892569-94-000102.txt,KL, +21,38079,edgar/data/38079/0001558370-16-004332.txt,KL, +22,933157,edgar/data/933157/0001144204-08-021779.txt,KL, +23,869495,edgar/data/869495/0001144204-13-002380.txt,KL, +24,80812,edgar/data/80812/0000927016-98-004349.txt,KL, +25,1582244,edgar/data/1582244/0001582244-16-000187.txt,KL, +26,1166847,edgar/data/1166847/0001117768-12-000118.txt,KL, +27,86521,edgar/data/86521/0000086521-10-000019.txt,KL, +28,1012493,edgar/data/1012493/0000922358-99-000021.txt,KL, +29,1170154,edgar/data/1170154/0001193125-11-062378.txt,KL, +30,1140414,edgar/data/1140414/0001387131-16-004912.txt,KL, +31,1158053,edgar/data/1158053/0000893220-04-001186.txt,KL, +32,71675,edgar/data/71675/0001046861-02-000012.txt,KL, +33,96271,edgar/data/96271/0001193125-07-042781.txt,KL, +34,710182,edgar/data/710182/0000930661-97-000576.txt,KL, +35,1029528,edgar/data/1029528/0001193125-04-043994.txt,KL, +36,1043186,edgar/data/1043186/0001564590-19-011739.txt,KL, +37,9342,edgar/data/9342/0000009342-95-000008.txt,KL, +38,18647,edgar/data/18647/0001169232-08-000603.txt,KL, +39,20947,edgar/data/20947/0001031296-06-000044.txt,KL, +40,916529,edgar/data/916529/0001144204-03-001333.txt,KL, +41,60549,edgar/data/60549/0001047469-98-012481.txt,KL, +42,38725,edgar/data/38725/0000038725-17-000042.txt,KL, +43,100122,edgar/data/100122/0000941138-03-000007.txt,KL, +44,355811,edgar/data/355811/0000355811-18-000009.txt,KL, +45,1039065,edgar/data/1039065/0001558370-15-001687.txt,KL, +46,1008654,edgar/data/1008654/0001008654-20-000018.txt,KL, 
+47,9534,edgar/data/9534/0000897069-05-000574.txt,KL, +48,1085866,edgar/data/1085866/0001072613-06-000748.txt,KL, +49,1045425,edgar/data/1045425/0000893220-05-000599.txt,KL, +50,1090908,edgar/data/1090908/0001437749-16-034757.txt,KL, +51,1546640,edgar/data/1546640/0001546640-14-000023.txt,KL, +52,844143,edgar/data/844143/0001104659-07-008735.txt,KL, +53,1599298,edgar/data/1599298/0001599298-21-000011.txt,KL, +54,1010961,edgar/data/1010961/0001010961-01-500013.txt,KL, +55,802781,edgar/data/802781/0000950116-97-000760.txt,KL, +56,742126,edgar/data/742126/0001015402-05-001005.txt,KL, +57,1174922,edgar/data/1174922/0001193125-10-043336.txt,KL, +58,1433270,edgar/data/1433270/0001047469-14-001424.txt,KL, +59,1275229,edgar/data/1275229/0001558370-19-002331.txt,KL, +60,940942,edgar/data/940942/0001564590-21-009409.txt,KL, +61,320575,edgar/data/320575/0001193125-07-117419.txt,KL, +62,78778,edgar/data/78778/0000078778-97-000019.txt,KL, +63,1627811,edgar/data/1627811/0001493152-19-004568.txt,KL, +64,99250,edgar/data/99250/0000099250-00-000002.txt,KL, +65,700949,edgar/data/700949/0000892626-96-000081.txt,KL, +66,805730,edgar/data/805730/0001104659-05-009806.txt,KL, +67,820242,edgar/data/820242/0000912057-01-517770.txt,KL, +68,944130,edgar/data/944130/0001432093-11-000164.txt,KL, +69,66901,edgar/data/66901/0000065984-96-000046.txt,KL, +70,722077,edgar/data/722077/0001047469-15-002056.txt,KL, +71,103872,edgar/data/103872/0001193125-13-444053.txt,KL, +72,1065201,edgar/data/1065201/0001193125-10-070085.txt,KL, +73,729213,edgar/data/729213/0001038838-01-000141.txt,KL, +74,1383414,edgar/data/1383414/0001193125-14-409216.txt,KL, +75,1493594,edgar/data/1493594/0001493594-19-000064.txt,KL, +76,1039399,edgar/data/1039399/0001039399-20-000011.txt,KL, +77,943452,edgar/data/943452/0001193125-07-043570.txt,KL, +78,944739,edgar/data/944739/0001193125-06-035399.txt,KL, +79,61986,edgar/data/61986/0000061986-99-000003.txt,KL, +80,319201,edgar/data/319201/0000891618-98-004336.txt,KL, 
+81,34067,edgar/data/34067/0001104659-06-016592.txt,KL, +82,1265245,edgar/data/1265245/0000770944-04-000004.txt,KL, +83,1066134,edgar/data/1066134/0001193125-08-186978.txt,KL, +84,789570,edgar/data/789570/0000898430-95-000343.txt,KL, +85,1273013,edgar/data/1273013/0001104659-07-020456.txt,KL, +86,88205,edgar/data/88205/0000950168-03-000755.txt,KL, +87,1286613,edgar/data/1286613/0001140361-18-012880.txt,KL, +88,92416,edgar/data/92416/0001193125-17-062419.txt,KL, +89,68589,edgar/data/68589/0000068589-11-000002.txt,KL, +90,1738827,edgar/data/1738827/0001558370-19-002349.txt,KL, +91,1283140,edgar/data/1283140/0000950134-08-002891.txt,KL, +92,72903,edgar/data/72903/0001104659-07-013272.txt,KL, +93,104819,edgar/data/104819/0001193125-14-422013.txt,KL, +94,872248,edgar/data/872248/0000950123-10-018189.txt,KL, +95,1035002,edgar/data/1035002/0001035002-19-000008.txt,KL, +96,1300514,edgar/data/1300514/0000950123-12-004305.txt,KL, +98,1868941,edgar/data/1868941/0001868941-22-000120.txt,KL, +99,1004155,edgar/data/1004155/0000092122-23-000012.txt,KL, +100,1013871,edgar/data/1013871/0000950123-08-002271.txt,KL, +101,202584,edgar/data/202584/0000065984-10-000035.txt,KL, +102,1573166,edgar/data/1573166/0001047469-16-010925.txt,KL, +103,1106935,edgar/data/1106935/0000945234-03-000135.txt,KL, +104,70145,edgar/data/70145/0001193125-11-321222.txt,KL, +105,32689,edgar/data/32689/0001047469-09-001643.txt,KL, +106,20290,edgar/data/20290/0001326160-19-000057.txt,KL, +107,1581552,edgar/data/1581552/0001185185-22-000284.txt,KL, +108,866829,edgar/data/866829/0000866829-12-000009.txt,KL, +109,1361937,edgar/data/1361937/0001144204-14-012672.txt,KL, +110,721693,edgar/data/721693/0001213900-19-005898.txt,KL, +111,1445146,edgar/data/1445146/0001445146-16-000019.txt,KL, +112,700997,edgar/data/700997/0000950134-96-000227.txt,KL, +113,1040736,edgar/data/1040736/0001040736-05-000005.txt,KL, +114,315189,edgar/data/315189/0000940180-02-001741.txt,KL, 
+115,6314,edgar/data/6314/0000891092-07-004187.txt,KL, +116,1069157,edgar/data/1069157/0001047469-04-007313.txt,KL, +117,1436161,edgar/data/1436161/0001553350-15-000363.txt,KL, +118,1047098,edgar/data/1047098/0001193125-04-161613.txt,KL, +119,842635,edgar/data/842635/0000899243-98-000530.txt,KL, +120,29644,edgar/data/29644/0000897101-11-000318.txt,KL, +121,804269,edgar/data/804269/0001193125-06-188013.txt,KL, +122,46738,edgar/data/46738/0000950131-01-001406.txt,KL, +123,99780,edgar/data/99780/0000099780-20-000025.txt,KL, +124,822662,edgar/data/822662/0000822662-18-000021.txt,KL, +125,108516,edgar/data/108516/0001564590-19-027053.txt,KL, +127,1430306,edgar/data/1430306/0001387131-20-003189.txt,KL, +128,909413,edgar/data/909413/0000950129-99-001323.txt,KL, +129,725058,edgar/data/725058/0000950109-97-002390.txt,KL, +130,1668370,edgar/data/1668370/0001575872-22-000280.txt,KL, +131,1035688,edgar/data/1035688/0000950123-10-025544.txt,KL, +132,865911,edgar/data/865911/0000891554-01-501498.txt,KL, +133,790708,edgar/data/790708/0000950168-03-001327.txt,KL, +134,881665,edgar/data/881665/0001193125-07-062920.txt,KL, +135,1440799,edgar/data/1440799/0001144204-11-045638.txt,KL, +136,32604,edgar/data/32604/0000032604-97-000015.txt,KL, +137,1421517,edgar/data/1421517/0000950123-10-024776.txt,KL, +138,1032208,edgar/data/1032208/0000086521-16-000091.txt,KL, +139,936340,edgar/data/936340/0000950123-11-015771.txt,KL, +140,728385,edgar/data/728385/0001477932-16-009335.txt,KL, +141,731802,edgar/data/731802/0000731802-19-000037.txt,KL, +142,30371,edgar/data/30371/0001326160-17-000016.txt,KL, +143,1623360,edgar/data/1623360/0001640334-18-002417.txt,KL, +144,57183,edgar/data/57183/0001068800-04-000659.txt,KL, +145,43350,edgar/data/43350/0001144204-17-014878.txt,KL, +146,319019,edgar/data/319019/0000319019-96-000013.txt,KL, +147,726435,edgar/data/726435/0001406774-11-000031.txt,KL, +148,811156,edgar/data/811156/0001047469-13-001373.txt,KL, 
+149,1368802,edgar/data/1368802/0001144204-07-014837.txt,KL, +150,1273441,edgar/data/1273441/0001273441-13-000012.txt,KL, +152,1135338,edgar/data/1135338/0000950116-05-001941.txt,KL, +153,1375063,edgar/data/1375063/0001144204-14-055659.txt,KL, +154,103682,edgar/data/103682/0001193125-05-038710.txt,KL, +155,1081316,edgar/data/1081316/0001081316-14-000006.txt,KL, +156,54507,edgar/data/54507/0000054507-18-000012.txt,KL, +157,1130310,edgar/data/1130310/0001130310-16-000031.txt,KL, +158,884504,edgar/data/884504/0001144204-10-017335.txt,KL, +159,1555177,edgar/data/1555177/0001555177-17-000011.txt,KL, +160,1142129,edgar/data/1142129/0001493152-17-005793.txt,KL, +161,1059025,edgar/data/1059025/0000934665-99-000002.txt,KL, +162,318996,edgar/data/318996/0000318996-18-000007.txt,KL, +163,350563,edgar/data/350563/0001193125-12-078254.txt,KL, diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py index 47d5ee8..39ac161 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py @@ -140,8 +140,6 @@ def _is_cik_in_training_data(labeled_json_filename, tracking_df): return cik in tracking_df.CIK.unique() -# TODO: make this work with GCS input directory not local -# TODO: have default paths? 
def format_label_studio_output( labeled_json_dir=ROOT_DIR / "sec10k_filings/labeled_jsons", pdfs_dir=ROOT_DIR / "sec10k_filings/pdfs", @@ -157,6 +155,9 @@ def format_label_studio_output( with Path.open(json_file_path) as j: doc_dict = json.loads(j.read()) filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0] + # check if old local naming schema is being used + if len(filename.split("-")) == 6: + filename = "-".join(filename.split("-")[2:]) if not _is_cik_in_training_data(filename, tracking_df=tracking_df): continue pdf_filename = filename + ".pdf" @@ -180,6 +181,10 @@ def format_label_studio_output( # combine the bounding boxes for each word doc_df = doc_df.groupby(level=0).first() txt.loc[:, "id"] = filename + # TODO: probably want to filter out these empty Ex. 21 docs + # the doc might not have any labels in it if it was an empty Ex. 21 + if "labels" not in doc_df: + doc_df.loc[:, "labels"] = pd.Series() output_df = pd.concat([txt, doc_df[["labels"]]], axis=1) labeled_df = pd.concat([labeled_df, output_df]) From 6f9d34a0ee90dbe08cea04b3cd319f7dccbe6f95 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 23 Sep 2024 10:22:14 -0400 Subject: [PATCH 080/161] Minor fixes --- labeled_data_tracking.csv | 97 ------------------- .../library/mlflow/__init__.py | 1 + .../library/mlflow/mlflow_resource.py | 7 +- .../library/validation_helpers.py | 12 +++ .../models/sec10k/utils/cloud.py | 24 +++-- tests/unit/models/sec10k/utils_test.py | 4 +- 6 files changed, 32 insertions(+), 113 deletions(-) delete mode 100644 labeled_data_tracking.csv diff --git a/labeled_data_tracking.csv b/labeled_data_tracking.csv deleted file mode 100644 index 31a7950..0000000 --- a/labeled_data_tracking.csv +++ /dev/null @@ -1,97 +0,0 @@ -CIK,Filename,Initials,Notes -107815,edgar/data/107815/0000107815-17-000106.txt,KL -354707,edgar/data/354707/0000354707-19-000043.txt,KL -61339,edgar/data/61339/0001161728-17-000004.txt,KL -1317577,edgar/data/1317577/0001193125-13-356794.txt,KL 
-59527,edgar/data/59527/0000059527-20-000007.txt,KL -40545,edgar/data/40545/0000040545-04-000013.txt,KL -84557,edgar/data/84557/0001046861-06-000007.txt,KL -100826,edgar/data/100826/0001193125-09-042636.txt,KL -81033,edgar/data/81033/0000950117-06-000927.txt,KL -4904,edgar/data/4904/0000004904-09-000040.txt,KL -46207,edgar/data/46207/0001104659-13-011461.txt,KL -205402,edgar/data/205402/0000950114-99-000043.txt,KL -77227,edgar/data/77227/0001031296-09-000008.txt,KL -92487,edgar/data/92487/0000004904-21-000010.txt,KL -922237,edgar/data/922237/0000950005-99-000915.txt,KL -106170,edgar/data/106170/0000072741-98-000076.txt,KL -1223037,edgar/data/1223037/0001193125-09-249998.txt,KL -3146,edgar/data/3146/0001193125-06-055140.txt,KL -932628,edgar/data/932628/0000932628-16-000045.txt,KL -804212,edgar/data/804212/0000804212-14-000014.txt,KL -92416,edgar/data/92416/0000892569-94-000102.txt,KL -38079,edgar/data/38079/0001558370-16-004332.txt,KL -933157,edgar/data/933157/0001144204-08-021779.txt,KL -869495,edgar/data/869495/0001144204-13-002380.txt,KL -80812,edgar/data/80812/0000927016-98-004349.txt,KL -1582244,edgar/data/1582244/0001582244-16-000187.txt,KL -1166847,edgar/data/1166847/0001117768-12-000118.txt,KL -86521,edgar/data/86521/0000086521-10-000019.txt,KL -1012493,edgar/data/1012493/0000922358-99-000021.txt,KL -1170154,edgar/data/1170154/0001193125-11-062378.txt,KL -1140414,edgar/data/1140414/0001387131-16-004912.txt,KL -1158053,edgar/data/1158053/0000893220-04-001186.txt,KL -71675,edgar/data/71675/0001046861-02-000012.txt,KL -96271,edgar/data/96271/0001193125-07-042781.txt,KL -710182,edgar/data/710182/0000930661-97-000576.txt,KL -1029528,edgar/data/1029528/0001193125-04-043994.txt,KL -1043186,edgar/data/1043186/0001564590-19-011739.txt,KL -9342,edgar/data/9342/0000009342-95-000008.txt,KL -18647,edgar/data/18647/0001169232-08-000603.txt,KL -20947,edgar/data/20947/0001031296-06-000044.txt,KL -916529,edgar/data/916529/0001144204-03-001333.txt,KL 
-60549,edgar/data/60549/0001047469-98-012481.txt,KL -38725,edgar/data/38725/0000038725-17-000042.txt,KL -100122,edgar/data/100122/0000941138-03-000007.txt,KL -355811,edgar/data/355811/0000355811-18-000009.txt,KL -1039065,edgar/data/1039065/0001558370-15-001687.txt,KL -1008654,edgar/data/1008654/0001008654-20-000018.txt,KL -9534,edgar/data/9534/0000897069-05-000574.txt -1085866,edgar/data/1085866/0001072613-06-000748.txt -1045425,edgar/data/1045425/0000893220-05-000599.txt -1090908,edgar/data/1090908/0001437749-16-034757.txt -1546640,edgar/data/1546640/0001546640-14-000023.txt -844143,edgar/data/844143/0001104659-07-008735.txt -722056,edgar/data/722056/0001012870-99-002106.txt -1599298,edgar/data/1599298/0001599298-21-000011.txt -1010961,edgar/data/1010961/0001010961-01-500013.txt -802781,edgar/data/802781/0000950116-97-000760.txt -742126,edgar/data/742126/0001015402-05-001005.txt -930835,edgar/data/930835/0001047469-04-007773.txt -1174922,edgar/data/1174922/0001193125-10-043336.txt -1433270,edgar/data/1433270/0001047469-14-001424.txt -1275229,edgar/data/1275229/0001558370-19-002331.txt -18230,edgar/data/18230/0000950131-98-002084.txt -940942,edgar/data/940942/0001564590-21-009409.txt -320575,edgar/data/320575/0001193125-07-117419.txt -78778,edgar/data/78778/0000078778-97-000019.txt -1627811,edgar/data/1627811/0001493152-19-004568.txt -78890,edgar/data/78890/0000078890-14-000004.txt -99250,edgar/data/99250/0000099250-00-000002.txt -78100,edgar/data/78100/0001109357-20-000053.txt -700949,edgar/data/700949/0000892626-96-000081.txt -1468174,edgar/data/1468174/0001468174-21-000011.txt -805730,edgar/data/805730/0001104659-05-009806.txt -820242,edgar/data/820242/0000912057-01-517770.txt -52795,edgar/data/52795/0000950137-00-000865.txt -944130,edgar/data/944130/0001432093-11-000164.txt -66901,edgar/data/66901/0000065984-96-000046.txt -722077,edgar/data/722077/0001047469-15-002056.txt -103872,edgar/data/103872/0001193125-13-444053.txt 
-1065201,edgar/data/1065201/0001193125-10-070085.txt -729213,edgar/data/729213/0001038838-01-000141.txt -1383414,edgar/data/1383414/0001193125-14-409216.txt -1493594,edgar/data/1493594/0001493594-19-000064.txt -1039399,edgar/data/1039399/0001039399-20-000011.txt -943452,edgar/data/943452/0001193125-07-043570.txt -944739,edgar/data/944739/0001193125-06-035399.txt -61986,edgar/data/61986/0000061986-99-000003.txt -6769,edgar/data/6769/0000950129-03-001523.txt -319201,edgar/data/319201/0000891618-98-004336.txt -34067,edgar/data/34067/0001104659-06-016592.txt -1265245,edgar/data/1265245/0000770944-04-000004.txt -1066134,edgar/data/1066134/0001193125-08-186978.txt -789570,edgar/data/789570/0000898430-95-000343.txt -1273013,edgar/data/1273013/0001104659-07-020456.txt -88205,edgar/data/88205/0000950168-03-000755.txt -1286613,edgar/data/1286613/0001140361-18-012880.txt diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py index a7a65d6..6c32768 100644 --- a/src/mozilla_sec_eia/library/mlflow/__init__.py +++ b/src/mozilla_sec_eia/library/mlflow/__init__.py @@ -7,6 +7,7 @@ ) from .mlflow_resource import ( MlflowInterface, + configure_mlflow, get_most_recent_run, ) diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py index d0fa62b..2e015af 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_resource.py @@ -22,8 +22,11 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -def _configure_mlflow(tracking_uri: str, project: str): +def configure_mlflow(tracking_uri: str | None = None, project: str | None = None): """Do runtime configuration of mlflow.""" + tracking_uri = tracking_uri if tracking_uri else os.getenv("MLFLOW_TRACKING_URI") + project = project if project else os.getenv("GCS_PROJECT") + os.environ["MLFLOW_TRACKING_USERNAME"] = "admin" os.environ["MLFLOW_TRACKING_PASSWORD"] = 
_get_tracking_password( tracking_uri, project @@ -90,7 +93,7 @@ def yield_for_execution( """Create experiment tracker for specified experiment.""" dagster_run_id = context.run_id self._mlflow_run_id = None - _configure_mlflow(self.tracking_uri, self.project) + configure_mlflow(self.tracking_uri, self.project) if self.tracking_enabled: # Get run_id associated with current dagster run diff --git a/src/mozilla_sec_eia/library/validation_helpers.py b/src/mozilla_sec_eia/library/validation_helpers.py index 62c1825..dfe5afb 100644 --- a/src/mozilla_sec_eia/library/validation_helpers.py +++ b/src/mozilla_sec_eia/library/validation_helpers.py @@ -5,6 +5,18 @@ import pandas as pd +def load_training_data( + filename: str, index_cols: list[str] | None = None +) -> pd.DataFrame: + """Load csv with validation data from `package_data` directory.""" + df = pd.read_csv( + resources.files("mozilla_sec_eia.package_data.training_data") / filename + ) + if index_cols is not None: + df = df.set_index(index_cols) + return df + + def load_validation_data( filename: str, index_cols: list[str] | None = None ) -> pd.DataFrame: diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py index 493c69a..b531308 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -162,7 +162,7 @@ def outputs_bucket_path(self): assert path.exists(), "Outputs bucket path does not exist" return path - def get_metadata(self, year_quarter: str | None = None) -> pd: + def get_metadata(self, year_quarter: str | None = None) -> pd.DataFrame: """Return dataframe of filing metadata.""" selection = None if year_quarter is not None: @@ -176,10 +176,7 @@ def get_local_filename( self, cache_directory: Path, filing: pd.Series | Sec10K, extension=".html" ) -> Path: """Return path to a filing in local cache based on metadata.""" - if isinstance(filing, pd.Series): - filename = filing["filename"] - else: - 
filename = filing.filename + filename = filing.name if isinstance(filing, pd.Series) else filing.filename return cache_directory / Path( f"{filename.replace('edgar/data/', '').replace('/', '-')}".replace( ".txt", extension @@ -201,16 +198,17 @@ def get_filings( cache_pdf: Boolean indicating whether to also cache a PDF of the Ex. 21 """ filings = [] - for _, filing in filing_selection.iterrows(): + for filename, filing in filing_selection.iterrows(): local_path = self.get_local_filename(cache_directory, filing) + filepath = f"sec10k/sec10k-{filing.year_quarter}/{filename}" if not local_path.exists(): with local_path.open("w") as f: - f.write((self.filings_bucket_path / filing.filename).read_text()) + f.write((self.filings_bucket_path / filepath).read_text()) with local_path.open() as f: sec10k_filing = Sec10K.from_file( file=f, - filename=filing["filename"], + filename=filename, cik=filing["cik"], year_quarter=filing["year_quarter"], ex_21_version=filing["exhibit_21_version"], @@ -263,13 +261,13 @@ def cache_training_data( json_cache_path.mkdir(parents=True, exist_ok=True) pdf_cache_path.mkdir(parents=True, exist_ok=True) metadata_df = self.get_metadata() - label_name_pattern = re.compile(r"(\d+)-\d{4}q[1-4]-\d+-(.+)") + label_name_pattern = re.compile(r"(\d+)-(.+)") # Cache filings and labels filenames = [] direc = self.labels_bucket_path / gcs_folder_name for file in direc.iterdir(): - if file.name == gcs_folder_name: + if file.name in gcs_folder_name: continue # Cache labels with (json_cache_path / file.name).open("w") as f: @@ -279,10 +277,10 @@ def cache_training_data( match = label_name_pattern.search(file.name) filenames.append(f"edgar/data/{match.group(1)}/{match.group(2)}.txt") - filings = metadata_df[metadata_df["filename"].isin(filenames)] + filings = metadata_df[metadata_df.index.isin(filenames)] self.get_filings( filings, - cache_path=pdf_cache_path, + cache_directory=pdf_cache_path, cache_pdf=True, ) @@ -296,7 +294,7 @@ def validate_archive(self) -> 
bool: # Get metadata df logger.info("Get list of files in metadata.") - metadata_filenames = set(self.get_metadata()["filename"]) + metadata_filenames = set(self.get_metadata().index) if not (valid := archive_filenames == metadata_filenames): logger.warning("Archive validation failed.") diff --git a/tests/unit/models/sec10k/utils_test.py b/tests/unit/models/sec10k/utils_test.py index 1bb7905..845ef2f 100644 --- a/tests/unit/models/sec10k/utils_test.py +++ b/tests/unit/models/sec10k/utils_test.py @@ -89,7 +89,9 @@ def test_validate_archive(test_archive, archive_files, metadata_files, valid, mo new=archive_files, ): metadata_mock = mocker.MagicMock( - return_value=pd.DataFrame({"filename": metadata_files}) + return_value=pd.DataFrame({"filename": metadata_files}).set_index( + "filename" + ) ) mocker.patch( "mozilla_sec_eia.models.sec10k.utils.cloud.GCSArchive.get_metadata", From 81813a7b3331f981577d4586f7576748598aabc6 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 24 Sep 2024 12:25:56 -0400 Subject: [PATCH 081/161] Create dataset as dataframe for logging --- src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index d69ddb5..658ab72 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -73,7 +73,7 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): def create_inference_dataset( filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False -) -> tuple[pd.DataFrame, Dataset]: +) -> tuple[pd.DataFrame, pd.DataFrame]: """Create a Hugging Face Dataset from PDFs for inference.""" filings_with_ex21 = filing_metadata[~filing_metadata["exhibit_21_version"].isna()] @@ -114,8 +114,7 @@ def create_inference_dataset( ) annotations.append(annotation) - dataset = Dataset.from_list(annotations) - return 
extraction_metadata, dataset + return extraction_metadata, pd.DataFrame(annotations) def clean_extracted_df(extracted_df): From 5174ed776b726890eb1aacf0efb7f9545a6b6b3e Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 24 Sep 2024 14:51:39 -0400 Subject: [PATCH 082/161] Modify dataset return type --- src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 658ab72..7c743bd 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -99,14 +99,16 @@ def create_inference_dataset( image_dict = get_image_dict(pdfs_dir) annotations = [] - for filename in image_dict: + for filename, image in image_dict.items(): annotation = { "id": filename, "tokens": inference_df.groupby("id")["text"].apply(list).loc[filename], "bboxes": inference_df.loc[inference_df["id"] == filename, :][BBOX_COLS_PDF] .to_numpy() .tolist(), - "image": image_dict[filename], + "image": image.tobytes(), + "mode": image.mode, + "size": image.size, } if has_labels: annotation["ner_tags"] = ( From 7a572c07967840da51cda298c9c2216d4c1202dd Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 24 Sep 2024 15:03:01 -0400 Subject: [PATCH 083/161] Fix dataset types for model signature --- src/mozilla_sec_eia/models/sec10k/ex_21/inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 7c743bd..5de6eb9 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -108,7 +108,8 @@ def create_inference_dataset( .tolist(), "image": image.tobytes(), "mode": image.mode, - "size": image.size, + "width": image.size[0], + "height": image.size[1], } if has_labels: 
annotation["ner_tags"] = ( From 5728026605781f098c1c40cd1b076d84c0f1b5ba Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 25 Sep 2024 14:24:51 -0400 Subject: [PATCH 084/161] Migrate ex 21 model training to a notebook --- .../library/mlflow/__init__.py | 1 + .../library/mlflow/mlflow_io_managers.py | 26 + src/mozilla_sec_eia/models/sec10k/__init__.py | 46 +- .../models/sec10k/ex_21/__init__.py | 184 +-- .../models/sec10k/ex_21/data.py | 1 + .../models/sec10k/ex_21/inference.py | 385 ++---- .../models/sec10k/ex_21/train_extractor.py | 192 --- .../notebooks/exhibit21_extractor.ipynb | 1084 +++++++++++++++++ .../train_exhibit21_extraction.ipynb | 1045 ---------------- .../models/sec10k/utils/cloud.py | 2 +- tests/unit/models/sec10k/ex21_model_test.py | 6 +- 11 files changed, 1239 insertions(+), 1733 deletions(-) create mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data.py delete mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py create mode 100644 src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb delete mode 100644 src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py index 6c32768..17a765d 100644 --- a/src/mozilla_sec_eia/library/mlflow/__init__.py +++ b/src/mozilla_sec_eia/library/mlflow/__init__.py @@ -4,6 +4,7 @@ MlflowBaseIOManager, MlflowMetricsIOManager, MlflowPandasArtifactIOManager, + MlflowPyfuncModelIOManager, ) from .mlflow_resource import ( MlflowInterface, diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py index 7aa05d7..94468f5 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py @@ -26,6 +26,32 @@ def _get_run_info(self) -> Run: return mlflow.get_run(self.mlflow_interface.mlflow_run_id) +class 
MlflowPyfuncModelIOManager(MlflowBaseIOManager): + """IO Manager to load pyfunc models from tracking server.""" + + uri: str | None = None + + def handle_output(self, context, obj): + """Outputs not implemented.""" + raise NotImplementedError("Logging models not supported by io manager.") + + def load_input(self, context: InputContext): + """Load pyfunc model with mlflow server.""" + cache_path = ( + self.mlflow_interface.dagster_home_path / "model_cache" / context.name + ) + cache_path.mkdir(exist_ok=True, parents=True) + + model_uri = self.uri + if model_uri is None: + model_uri = f"models:/{context.name}" + + mlflow.pyfunc.load_model( + model_uri, + dst_path=cache_path, + ) + + class MlflowPandasArtifactIOManager(MlflowBaseIOManager): """Implement IO manager for logging/loading dataframes as mlflow artifacts.""" diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index fd9a866..b482aec 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -1,26 +1,33 @@ """Implement models to extract data from SEC10k filings.""" from dagster import ( + Config, Definitions, + define_asset_job, + file_relative_path, + in_process_executor, load_assets_from_modules, load_assets_from_package_module, ) +from dagstermill import ( + ConfigurableLocalOutputNotebookIOManager, + define_dagstermill_asset, +) from upath import UPath from mozilla_sec_eia.library import model_jobs from mozilla_sec_eia.library.generic_io_managers import PandasParquetIOManager from mozilla_sec_eia.library.mlflow import ( + MlflowPyfuncModelIOManager, mlflow_interface_resource, mlflow_train_test_io_managers, ) from . 
import basic_10k, ex_21, extract from .utils.cloud import cloud_interface_resource -from .utils.layoutlm import LayoutlmIOManager basic_10k_assets = load_assets_from_modules([basic_10k]) ex21_assets = load_assets_from_package_module(ex_21) -layoutlm_assets = load_assets_from_modules([ex_21.train_extractor]) shared_assets = load_assets_from_modules([extract]) basic_10k_production_job = model_jobs.create_production_model_job( @@ -40,36 +47,45 @@ concurrency_limit=4, ) -ex21_validation_job = model_jobs.create_validation_model_job( - "ex21_extraction_validation", - ex_21.validation_assets, -) -layoutlm_finetune_job = model_jobs.create_training_job( - "layoutlm_finetune", - layoutlm_assets, +class TrainConfig(Config): + """Config for training notebook.""" + + uri: str = "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor" + training_set: str = "labeledv0.2" + + +exhibit21_extractor = define_dagstermill_asset( + name="exhibit21_extractor", + notebook_path=file_relative_path(__file__, "notebooks/exhibit21_extractor.ipynb"), + config_schema=TrainConfig.to_config_schema(), +) +ex21_training_job = define_asset_job( + "ex21_training", + selection=[exhibit21_extractor], + executor_def=in_process_executor, ) defs = Definitions( - assets=basic_10k_assets + ex21_assets + shared_assets + layoutlm_assets, + assets=basic_10k_assets + ex21_assets + shared_assets + [exhibit21_extractor], jobs=[ basic_10k_production_job, basic_10k_validation_job, ex21_production_job, - ex21_validation_job, - layoutlm_finetune_job, + ex21_training_job, ], resources={ "cloud_interface": cloud_interface_resource, "mlflow_interface": mlflow_interface_resource, - "layoutlm_io_manager": LayoutlmIOManager( - mlflow_interface=mlflow_interface_resource + "layoutlm_io_manager": MlflowPyfuncModelIOManager( + mlflow_interface=mlflow_interface_resource, + uri="runs:/b959cfa0ba3c4b91a0f8fe158cd0109f/exhibit21_extractor", ), "pandas_parquet_io_manager": PandasParquetIOManager( 
base_path=UPath("gs://sec10k-outputs") ), - "exhibit21_extractor": ex_21.exhibit_21_extractor_resource, + "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), } | mlflow_train_test_io_managers, ) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index ce971c5..2daf5ae 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -4,9 +4,17 @@ import mlflow import pandas as pd -from dagster import AssetIn, AssetOut, Out, asset, graph_multi_asset, multi_asset, op +from dagster import ( + AssetIn, + AssetOut, + In, + Out, + asset, + graph_multi_asset, + multi_asset, + op, +) -from mozilla_sec_eia.library import validation_helpers from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource from ..entities import ( @@ -17,147 +25,24 @@ ) from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename -from .inference import Exhibit21Extractor, clean_extracted_df, extract_filings +from .inference import extract_filings logger = logging.getLogger(f"catalystcoop.{__name__}") -@asset(dagster_type=ex21_extract_type) -def ex21_validation_set() -> pd.DataFrame: - """Return dataframe containing exhibit 21 validation data.""" - return clean_ex21_validation_set( - validation_helpers.load_validation_data("ex21_labels.csv") - ) - - -@asset -def ex21_validation_filing_metadata( - cloud_interface: GCSArchive, - ex21_validation_set: pd.DataFrame, -) -> pd.DataFrame: - """Get sec 10k filing metadata from validation set.""" - filing_metadata = cloud_interface.get_metadata() - return filing_metadata[ - filing_metadata.index.isin(ex21_validation_set["filename"].unique()) - ] - - -@multi_asset( - ins={ - "computed_df": AssetIn("ex21_company_ownership_info_validation"), - "validation_df": 
AssetIn("ex21_validation_set"), - }, - outs={ - "ex21_jaccard_per_table": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" - ), - "ex21_precision_recall_per_table": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" - ), - "ex21_incorrect_filenames": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager" - ), - "ex21_extraction_metrics": AssetOut(io_manager_key="mlflow_metrics_io_manager"), - }, -) -def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame): - """Compute validation metrics for Ex. 21 extraction.""" - shared_cols = validation_df.columns.intersection(computed_df.columns) - validation_df = validation_df.astype(computed_df[shared_cols].dtypes) - n_equal = 0 - validation_filenames = validation_df["id"].unique() - n_files = len(validation_filenames) - table_metrics_dict = {} - jaccard_dict = {} - incorrect_files = [] - # iterate through each file and check each extracted table - for filename in validation_filenames: - extracted_table_df = computed_df[computed_df["id"] == filename].reset_index( - drop=True - ) - validation_table_df = validation_df[ - validation_df["id"] == filename - ].reset_index(drop=True) - # check if the tables are exactly equal - if extracted_table_df.equals(validation_table_df): - # TODO: strip llc and other company strings before comparison - n_equal += 1 - else: - incorrect_files.append(filename) - # compute precision and recall for each column - table_metrics_dict[filename] = {} - jaccard_dict[filename] = {} - for col in ["subsidiary", "loc", "own_per"]: - table_prec_recall = validation_helpers.pandas_compute_precision_recall( - extracted_table_df, validation_table_df, value_col=col - ) - table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[ - "precision" - ] - table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"] - # get the jaccard similarity between columns - jaccard_dict[filename][col] = validation_helpers.jaccard_similarity( - 
computed_df=extracted_table_df, - validation_df=validation_table_df, - value_col=col, - ) - - jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index() - prec_recall_df = pd.DataFrame.from_dict( - table_metrics_dict, orient="index" - ).reset_index() - - return ( - jaccard_df, - prec_recall_df, - pd.DataFrame({"filename": incorrect_files}), - { - "table_accuracy": n_equal / n_files, - "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files, - "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files, - "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files, - "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum() - / n_files, - "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files, - "avg_own_per_precision": prec_recall_df["own_per_precision"].sum() - / n_files, - "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum() - / n_files, - "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files, - "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files, - }, - ) - - -def clean_ex21_validation_set(validation_df: pd.DataFrame): - """Clean Ex. 
21 validation data to match extracted format.""" - validation_df = validation_df.rename( - columns={ - "Filename": "id", - "Subsidiary": "subsidiary", - "Location of Incorporation": "loc", - "Ownership Percentage": "own_per", - } - ) - validation_df["own_per"] = validation_df["own_per"].astype(str) - validation_df["filename"] = validation_df["id"].apply(get_metadata_filename) - validation_df = clean_extracted_df(validation_df) - return validation_df - - @op( out={ "metadata": Out(dagster_type=sec10k_extract_metadata_type), "extracted": Out(dagster_type=ex21_extract_type), - } + }, + ins={"exhibit21_extractor": In(input_manager_key="layoutlm_io_manager")}, ) def extract_filing_chunk( - exhibit21_extractor: Exhibit21Extractor, filings: pd.DataFrame, - layoutlm, + exhibit21_extractor, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Extract a set of filings and return results.""" - return extract_filings(exhibit21_extractor, filings, layoutlm) + return extract_filings(filings, exhibit21_extractor) @op( @@ -196,17 +81,15 @@ def collect_extracted_chunks( io_manager_key="pandas_parquet_io_manager" ), }, - ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")}, partitions_def=year_quarter_partitions, ) def ex21_extract( sec10k_filing_metadata: pd.DataFrame, - layoutlm, ): """Extract ownership info from exhibit 21 docs.""" filing_chunks = chunk_filings(sec10k_filing_metadata) metadata_chunks, extracted_chunks = filing_chunks.map( - lambda filings: extract_filing_chunk(filings, layoutlm) + lambda filings: extract_filing_chunk(filings) ) metadata, extracted = collect_extracted_chunks( metadata_chunks.collect(), extracted_chunks.collect() @@ -215,39 +98,4 @@ def ex21_extract( return metadata, extracted -@multi_asset( - outs={ - "ex21_extraction_metadata_validation": AssetOut( - io_manager_key="mlflow_pandas_artifact_io_manager", - dagster_type=sec10k_extract_metadata_type, - ), - "ex21_company_ownership_info_validation": AssetOut( - 
io_manager_key="mlflow_pandas_artifact_io_manager", - dagster_type=ex21_extract_type, - ), - }, - ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")}, -) -def ex21_extract_validation( - ex21_validation_filing_metadata: pd.DataFrame, - exhibit21_extractor: Exhibit21Extractor, - layoutlm, -): - """Extract ownership info from exhibit 21 docs.""" - return extract_filings( - exhibit21_extractor, ex21_validation_filing_metadata, layoutlm - ) - - -exhibit_21_extractor_resource = Exhibit21Extractor( - cloud_interface=cloud_interface_resource, -) - production_assets = [sec10k_filing_metadata, ex21_extract] - -validation_assets = [ - ex21_validation_set, - ex21_validation_filing_metadata, - ex21_extract_validation, - ex21_validation_metrics, -] diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data.py new file mode 100644 index 0000000..4e331c8 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data.py @@ -0,0 +1 @@ +"""Define methods and assets for handling datasets used by.""" diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 5de6eb9..5633e40 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -4,25 +4,15 @@ import os import tempfile import traceback -from contextlib import contextmanager from pathlib import Path import numpy as np import pandas as pd -import torch -from dagster import ConfigurableResource -from datasets import Dataset -from transformers import ( - Pipeline, - pipeline, -) -from transformers.tokenization_utils_base import BatchEncoding +from mlflow.pyfunc import PyFuncModel -from ..entities import Ex21CompanyOwnership, Sec10kExtractionMetadata -from ..utils.cloud import GCSArchive, get_metadata_filename +from ..entities import Ex21CompanyOwnership +from ..utils.cloud import GCSArchive from ..utils.layoutlm import ( - 
get_id_label_conversions, - iob_to_label, normalize_bboxes, ) from ..utils.pdf import ( @@ -33,7 +23,6 @@ format_label_studio_output, get_image_dict, ) -from .train_extractor import BBOX_COLS, LABELS # When handling multi page documents LayoutLM uses a sliding 'frame' # with some overlap between frames. The overlap creates multiple @@ -50,6 +39,19 @@ "O", ] +LABELS = [ + "O", + "B-Subsidiary", + "I-Subsidiary", + "B-Loc", + "I-Loc", + "B-Own_Per", + "I-Own_Per", +] + +BBOX_COLS = ["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"] +label2id = {v: k for k, v in enumerate(LABELS)} + logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -71,6 +73,41 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): return inference_df +def _cache_pdfs( + filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path +) -> pd.DataFrame: + """Iterate filings and cache pdfs.""" + extraction_metadata = pd.DataFrame( + { + "filename": pd.Series(dtype=str), + "success": pd.Series(dtype=bool), + "notes": pd.Series(dtype=str), + } + ).set_index("filename") + + for filing in cloud_interface.iterate_filings(filings): + pdf_path = cloud_interface.get_local_filename( + cache_directory=pdf_dir, filing=filing, extension=".pdf" + ) + + # Some filings are poorly formatted and fail in `save_as_pdf` + # We want a record of these but don't want to stop run + try: + with pdf_path.open("wb") as f: + filing.ex_21.save_as_pdf(f) + except Exception as e: + extraction_metadata.loc[filing.filename, ["success"]] = False + extraction_metadata.loc[filing.filename, ["note"]] = str(e) + + # Some pdfs are empty. 
Check for these and remove from dir + if pdf_path.stat().st_size == 0: + extraction_metadata.loc[filing.filename, ["success"]] = False + extraction_metadata.loc[filing.filename, ["note"]] = "PDF empty" + pdf_path.unlink() + + return extraction_metadata + + def create_inference_dataset( filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False ) -> tuple[pd.DataFrame, pd.DataFrame]: @@ -120,42 +157,34 @@ def create_inference_dataset( return extraction_metadata, pd.DataFrame(annotations) -def clean_extracted_df(extracted_df): - """Perform basic cleaning on a dataframe extracted from an Ex. 21.""" - if extracted_df.empty: - return extracted_df - if "row" in extracted_df.columns: - extracted_df = extracted_df.drop(columns=["row"]) - extracted_df["subsidiary"] = extracted_df["subsidiary"].str.strip().str.lower() - # strip special chars from the start and end of the string - extracted_df["subsidiary"] = extracted_df["subsidiary"].str.replace( - r"^[^\w&\s]+|[^\w&\s]+$", "", regex=True - ) - if "loc" in extracted_df.columns: - extracted_df["loc"] = extracted_df["loc"].str.strip().str.lower() - extracted_df["loc"] = extracted_df["loc"].str.replace( - r"[^a-zA-Z&,\s]", "", regex=True - ) - if "own_per" in extracted_df.columns: - # remove special chars and letters - extracted_df["own_per"] = extracted_df["own_per"].str.replace( - r"[^\d.]", "", regex=True - ) - # Find values with multiple decimal points - extracted_df["own_per"] = extracted_df["own_per"].str.replace( - r"(\d*\.\d+)\..*", r"\1", regex=True - ) - extracted_df["own_per"] = extracted_df["own_per"].replace("", np.nan) - extracted_df["own_per"] = extracted_df["own_per"].astype( - "float64", errors="ignore" +def extract_filings( + filings: pd.DataFrame, + layoutlm: PyFuncModel, +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Create huggingface dataset from filings and perform extraction.""" + try: + failed_metadata, dataset = create_inference_dataset( + filing_metadata=filings, + 
cloud_interface=GCSArchive(), + has_labels=False, ) - # drop rows that have a null subsidiary value - extracted_df = extracted_df.dropna(subset="subsidiary") - return extracted_df + metadata, extracted = layoutlm.predict(dataset) + metadata = pd.concat([failed_metadata, metadata]) + except Exception as e: + logger.warning(traceback.format_exc()) + logger.warning(f"Error while extracting filings: {filings.index}") + metadata = pd.DataFrame( + { + "filename": filings.index, + "success": [False] * len(filings), + "notes": [str(e)] * len(filings), + } + ).set_index("filename") + extracted = Ex21CompanyOwnership.example(size=0) + return metadata, extracted def _sort_by_label_priority(target_array): - _, label2id = get_id_label_conversions(LABELS) id_priority = [label2id[label] for label in LABEL_PRIORITY] # Create a priority map from the label priority priority_map = {val: idx for idx, val in enumerate(id_priority)} @@ -204,267 +233,3 @@ def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor): flattened_modes = modes[inverse_indices] return flattened_modes - - -def _cache_pdfs( - filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path -) -> pd.DataFrame: - """Iterate filings and cache pdfs.""" - extraction_metadata = pd.DataFrame( - { - "filename": pd.Series(dtype=str), - "success": pd.Series(dtype=bool), - "notes": pd.Series(dtype=str), - } - ).set_index("filename") - - for filing in cloud_interface.iterate_filings(filings): - pdf_path = cloud_interface.get_local_filename( - cache_directory=pdf_dir, filing=filing, extension=".pdf" - ) - - # Some filings are poorly formatted and fail in `save_as_pdf` - # We want a record of these but don't want to stop run - try: - with pdf_path.open("wb") as f: - filing.ex_21.save_as_pdf(f) - except Exception as e: - extraction_metadata.loc[filing.filename, ["success"]] = False - extraction_metadata.loc[filing.filename, ["note"]] = str(e) - - # Some pdfs are empty. 
Check for these and remove from dir - if pdf_path.stat().st_size == 0: - extraction_metadata.loc[filing.filename, ["success"]] = False - extraction_metadata.loc[filing.filename, ["note"]] = "PDF empty" - pdf_path.unlink() - - return extraction_metadata - - -def _get_data(dataset): - yield from dataset - - -class Exhibit21Extractor(ConfigurableResource): - """Implement `Sec10kExtractor` interface for exhibit 21 data.""" - - cloud_interface: GCSArchive - name: str = "exhibit21_extractor" - device: str = "cpu" - has_labels: bool = False - dataset_ind: list | None = None - - @contextmanager - def setup_for_execution(self, context): - """Set env variable to improve GPU memory access.""" - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True" - - def extract_filings( - self, dataset: Dataset, model, processor - ) -> tuple[pd.DataFrame, pd.DataFrame]: - """Predict entities with a fine-tuned model and extract Ex. 21 tables.""" - if self.dataset_ind: - dataset = dataset.select(self.dataset_ind) - - # TODO: figure out device argument - pipe = pipeline( - "token-classification", - model=model, - tokenizer=processor, - pipeline_class=LayoutLMInferencePipeline, - device=self.device, - ) - - logits = [] - predictions = [] - all_output_df = Ex21CompanyOwnership.example(size=0) - extraction_metadata = Sec10kExtractionMetadata.example(size=0) - for logit, pred, output_df in pipe(_get_data(dataset)): - logits.append(logit) - predictions.append(pred) - if not output_df.empty: - filename = get_metadata_filename(output_df["id"].iloc[0]) - extraction_metadata.loc[filename, ["success"]] = True - all_output_df = pd.concat([all_output_df, output_df]) - all_output_df.columns.name = None - all_output_df = clean_extracted_df(all_output_df) - all_output_df = all_output_df[["id", "subsidiary", "loc", "own_per"]] - all_output_df = all_output_df.reset_index(drop=True) - return extraction_metadata, all_output_df - - -def extract_filings( - exhibit21_extractor: Exhibit21Extractor, - 
filings: pd.DataFrame, - layoutlm, -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Create huggingface dataset from filings and perform extraction.""" - try: - failed_metadata, dataset = create_inference_dataset( - filing_metadata=filings, - cloud_interface=exhibit21_extractor.cloud_interface, - has_labels=exhibit21_extractor.has_labels, - ) - metadata, extracted = exhibit21_extractor.extract_filings( - dataset, - model=layoutlm["model"], - processor=layoutlm["tokenizer"], - ) - metadata = pd.concat([failed_metadata, metadata]) - except Exception as e: - logger.warning(traceback.format_exc()) - logger.warning(f"Error while extracting filings: {filings.index}") - metadata = pd.DataFrame( - { - "filename": filings.index, - "success": [False] * len(filings), - "notes": [str(e)] * len(filings), - } - ).set_index("filename") - extracted = Ex21CompanyOwnership.example(size=0) - return metadata, extracted - - -class LayoutLMInferencePipeline(Pipeline): - """Pipeline for performing inference with fine-tuned LayoutLM.""" - - def __init__(self, *args, **kwargs): - """Initialize LayoutLMInferencePipeline.""" - super().__init__(*args, **kwargs) - - def _sanitize_parameters(self, **kwargs): - preprocess_kwargs = {} - if "maybe_arg" in kwargs: - preprocess_kwargs["maybe_arg"] = kwargs["maybe_arg"] - return preprocess_kwargs, {}, {} - - def preprocess(self, doc_dict): - """Encode and tokenize model inputs.""" - image = doc_dict["image"] - words = doc_dict["tokens"] - boxes = doc_dict["bboxes"] - encoding = self.tokenizer( - image, - words, - boxes=boxes, - return_tensors="pt", - truncation=True, - padding="max_length", - max_length=512, # this is the maximum max_length - stride=128, - return_offsets_mapping=True, - return_overflowing_tokens=True, - ) - model_inputs = {} - model_inputs["raw_encoding"] = encoding.copy() - model_inputs["doc_dict"] = doc_dict - model_inputs["offset_mapping"] = encoding.pop("offset_mapping") - model_inputs["sample_mapping"] = 
encoding.pop("overflow_to_sample_mapping") - # TODO: do we actually need to make these into ints? - encoding["input_ids"] = encoding["input_ids"].to(torch.int64) - encoding["attention_mask"] = encoding["attention_mask"].to(torch.int64) - encoding["bbox"] = encoding["bbox"].to(torch.int64) - encoding["pixel_values"] = torch.stack(encoding["pixel_values"]) - model_inputs["encoding"] = encoding - return model_inputs - - def _forward(self, model_inputs): - # encoding is passed as a UserDict in the model_inputs dictionary - # turn it back into a BatchEncoding - encoding = BatchEncoding(model_inputs["encoding"]) - if torch.cuda.is_available(): - encoding.to("cuda") - self.model.to("cuda") - # since we're doing inference, we don't need gradient computation - with torch.no_grad(): - output = self.model(**encoding) - return { - "logits": output.logits, - "predictions": output.logits.argmax(-1).squeeze().tolist(), - "raw_encoding": model_inputs["raw_encoding"], - "doc_dict": model_inputs["doc_dict"], - } - - def postprocess(self, all_outputs): - """Return logits, model predictions, and the extracted dataframe.""" - logits = all_outputs["logits"] - predictions = all_outputs["logits"].argmax(-1).squeeze().tolist() - output_df = self.extract_table(all_outputs) - return logits, predictions, output_df - - def extract_table(self, all_outputs): - """Extract a structured table from a set of inference predictions. - - This function essentially works by stacking bounding boxes and predictions - into a dataframe and going from left to right and top to bottom. Then, every - every time a new subsidiary entity is encountered, it assigns a new group or - "row" to that subsidiary. Next, location and ownership percentage words/labeled - entities in between these subsidiary groups are assigned to a subsidiary row/group. - Finally, this is all formatted into a dataframe with an ID column from the original - filename and a basic cleaning function normalizes strings. 
- """ - # TODO: when model more mature, break this into sub functions to make it - # clearer what's going on - predictions = all_outputs["predictions"] - encoding = all_outputs["raw_encoding"] - doc_dict = all_outputs["doc_dict"] - - token_boxes_tensor = encoding["bbox"].flatten(start_dim=0, end_dim=1) - predictions_tensor = torch.tensor(predictions) - mode_predictions = get_flattened_mode_predictions( - token_boxes_tensor, predictions_tensor - ) - token_boxes = encoding["bbox"].flatten(start_dim=0, end_dim=1).tolist() - predicted_labels = [ - self.model.config.id2label[pred] for pred in mode_predictions - ] - simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels] - - df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS) - df.loc[:, "iob_pred"] = predicted_labels - df.loc[:, "pred"] = simple_preds - invalid_mask = ( - (df["top_left_x"] == 0) - & (df["top_left_y"] == 0) - & (df["bottom_right_x"] == 0) - & (df["bottom_right_y"] == 0) - ) - df = df[~invalid_mask] - # we want to get actual words on the dataframe, not just subwords that correspond to tokens - # subwords from the same word share the same bounding box coordinates - # so we merge the original words onto our dataframe on bbox coordinates - words_df = pd.DataFrame(data=doc_dict["bboxes"], columns=BBOX_COLS) - words_df.loc[:, "word"] = doc_dict["tokens"] - df = df.merge(words_df, how="left", on=BBOX_COLS).drop_duplicates( - subset=BBOX_COLS + ["pred", "word"] - ) - # rows that are the first occurrence in a new group (subsidiary, loc, own_per) - # should always have a B entity label. Manually override labels so this is true. 
- first_in_group_df = df[ - (df["pred"].ne(df["pred"].shift())) & (df["pred"] != "other") - ] - first_in_group_df.loc[:, "iob_pred"] = ( - "B" + first_in_group_df["iob_pred"].str[1:] - ) - df.update(first_in_group_df) - # filter for just words that were labeled with non "other" entities - entities_df = df.sort_values(by=["top_left_y", "top_left_x"]) - entities_df = entities_df[entities_df["pred"] != "other"] - # words are labeled with IOB format which stands for inside, outside, beginning - # merge B and I entities to form one entity group - # (i.e. "B-Subsidiary" and "I-Subsidiary" become just "subsidiary"), assign a group ID - entities_df["group"] = (entities_df["iob_pred"].str.startswith("B-")).cumsum() - grouped_df = ( - entities_df.groupby(["group", "pred"])["word"] - .apply(" ".join) - .reset_index()[["pred", "word"]] - ) - # assign a new row every time there's a new subsidiary - grouped_df["row"] = (grouped_df["pred"].str.startswith("subsidiary")).cumsum() - output_df = grouped_df.pivot_table( - index="row", columns="pred", values="word", aggfunc=lambda x: " ".join(x) - ).reset_index() - if output_df.empty: - return output_df - output_df.loc[:, "id"] = doc_dict["id"] - return output_df diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py b/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py deleted file mode 100644 index cb37619..0000000 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/train_extractor.py +++ /dev/null @@ -1,192 +0,0 @@ -"""Fine-tune LayoutLM to extract Ex. 21 tables. - -This module uses a labeled training dataset to fine-tune -a LayoutLM model to extract unstructured Exhibit 21 tables -from SEC 10K filings. 
-""" - -from pathlib import Path - -import numpy as np -from dagster import Config, asset -from datasets import ( - Array2D, - Array3D, - Dataset, - Features, - Sequence, - Value, - load_metric, -) -from transformers import ( - AutoProcessor, - LayoutLMv3ForTokenClassification, - Trainer, - TrainingArguments, -) -from transformers.data.data_collator import default_data_collator - -from ..utils.layoutlm import get_id_label_conversions -from .create_labeled_dataset import format_as_ner_annotations - -LABELS = [ - "O", - "B-Subsidiary", - "I-Subsidiary", - "B-Loc", - "I-Loc", - "B-Own_Per", - "I-Own_Per", -] - -BBOX_COLS = ["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"] - - -def compute_metrics(p, metric, label_list, return_entity_level_metrics=False): - """Compute metrics to train and evaluate the model on.""" - predictions, labels = p - predictions = np.argmax(predictions, axis=2) - - # Remove ignored index (special tokens) - true_predictions = [ - [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100] - for prediction, label in zip(predictions, labels) - ] - true_labels = [ - [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100] - for prediction, label in zip(predictions, labels) - ] - - results = metric.compute(predictions=true_predictions, references=true_labels) - if return_entity_level_metrics: - # Unpack nested dictionaries - final_results = {} - for key, value in results.items(): - if isinstance(value, dict): - for n, v in value.items(): - final_results[f"{key}_{n}"] = v - else: - final_results[key] = value - return final_results - return { - "precision": results["overall_precision"], - "recall": results["overall_recall"], - "f1": results["overall_f1"], - "accuracy": results["overall_accuracy"], - } - - -def _prepare_dataset(annotations, processor, label2id): - """Put the dataset in its final format for training LayoutLM.""" - - def _convert_ner_tags_to_id(ner_tags, label2id): - return 
[int(label2id[ner_tag]) for ner_tag in ner_tags] - - images = annotations["image"] - words = annotations["tokens"] - boxes = annotations["bboxes"] - # Map over labels and convert to numeric id for each ner_tag - ner_tags = [ - _convert_ner_tags_to_id(ner_tags, label2id) - for ner_tags in annotations["ner_tags"] - ] - - encoding = processor( - images, - words, - boxes=boxes, - word_labels=ner_tags, - truncation=True, - padding="max_length", - ) - - return encoding - - -def load_test_train_set( - processor: AutoProcessor, test_size: float, ner_annotations: list[dict] -): - """Load training/test set and prepare for training or evaluation.""" - id2label, label2id = get_id_label_conversions(LABELS) - # Cache/prepare training data - dataset = Dataset.from_list(ner_annotations) - - # Prepare our train & eval dataset - column_names = dataset.column_names - features = Features( - { - "pixel_values": Array3D(dtype="float32", shape=(3, 224, 224)), - "input_ids": Sequence(feature=Value(dtype="int64")), - "attention_mask": Sequence(Value(dtype="int64")), - "bbox": Array2D(dtype="int64", shape=(512, 4)), - "labels": Sequence(feature=Value(dtype="int64")), - } - ) - dataset = dataset.map( - lambda annotations: _prepare_dataset(annotations, processor, label2id), - batched=True, - remove_columns=column_names, - features=features, - ) - dataset.set_format("torch") - split_dataset = dataset.train_test_split(test_size=test_size) - return split_dataset["train"], split_dataset["test"] - - -class FineTuneConfig(Config): - """Configuration to supply to `train_model`.""" - - labeled_json_path: str = "sec10k_filings/labeled_jsons/" - gcs_training_data_dir: str = "labeled" - output_dir: str = "layoutlm_trainer" - test_size: float = 0.2 - - -@asset(io_manager_key="layoutlm_io_manager") -def layoutlm( - config: FineTuneConfig, -): - """Train LayoutLM model with labeled data.""" - # Prepare model - id2label, label2id = get_id_label_conversions(LABELS) - model = 
LayoutLMv3ForTokenClassification.from_pretrained( - "microsoft/layoutlmv3-base", id2label=id2label, label2id=label2id - ) - processor = AutoProcessor.from_pretrained( - "microsoft/layoutlmv3-base", apply_ocr=False - ) - ner_annotations = format_as_ner_annotations( - labeled_json_path=Path(config.labeled_json_path), - gcs_folder_name=config.gcs_training_data_dir, - ) - # Get training/test data using pre-trained processor to prepare data - train_dataset, eval_dataset = load_test_train_set( - processor=processor, test_size=config.test_size, ner_annotations=ner_annotations - ) - - # Initialize our Trainer - metric = load_metric("seqeval") - training_args = TrainingArguments( - output_dir=config.output_dir, - max_steps=1000, - per_device_train_batch_size=1, - per_device_eval_batch_size=1, - learning_rate=1e-5, - evaluation_strategy="steps", - eval_steps=100, - load_best_model_at_end=True, - metric_for_best_model="f1", - ) - trainer = Trainer( - model=model, - args=training_args, - train_dataset=train_dataset, - eval_dataset=eval_dataset, - tokenizer=processor, - data_collator=default_data_collator, - compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS), - ) - - # Train inside mlflow run. Mlflow will automatically handle logging training metrcis - trainer.train() - return trainer diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb new file mode 100644 index 0000000..4efc905 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -0,0 +1,1084 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "0da8c588-2d09-464b-945f-168704c0cdac", + "metadata": { + "tags": [] + }, + "source": [ + "# Exhibit 21 extraction\n", + "\n", + "This notebook implements a model built on top of [layoutlmv3](https://huggingface.co/microsoft/layoutlmv3-base/tree/main)\n", + "from Exhibit 21 attachments to SEC-10k filings. 
These documents contain a list of all subsidiary companies owned by a filing\n", + "company." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "48f185de-95ef-4194-9245-93f8d603d2e6", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [], + "source": [ + "import dagstermill\n", + "\n", + "context = dagstermill.get_context(op_config={\n", + " \"uri\": \"runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor\",\n", + " \"training_set\": \"labeledv0.2\",\n", + "})" + ] + }, + { + "cell_type": "markdown", + "id": "7f299b2b-2358-4526-b023-f29c817316d9", + "metadata": { + "tags": [] + }, + "source": [ + "## Train Layoutlmv3" + ] + }, + { + "cell_type": "markdown", + "id": "32edcce1-ab18-40b6-9da8-ce0ea53c2f72", + "metadata": { + "tags": [] + }, + "source": [ + "### Setup training/test sets" + ] + }, + { + "cell_type": "markdown", + "id": "8b389646-c4af-4c92-a29e-b4b23f4c391b", + "metadata": {}, + "source": [ + "Download training data and convert to NER annotations. This involves converting exhibit 21 filings into PDF's, then using labels generated by label studio to produce the annotations. These annotations are then used to create a huggingface dataset that will be used for training.\n", + "\n", + "First define several helper functions to do the conversion." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from pathlib import Path\n", + "from tempfile import TemporaryDirectory\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "\n", + "from mozilla_sec_eia.library import validation_helpers\n", + "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, get_metadata_filename\n", + "from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes\n", + "from mozilla_sec_eia.models.sec10k.utils.pdf import (\n", + " get_pdf_data_from_path,\n", + " render_page,\n", + ")\n", + "\n", + "# Set some constants\n", + "LABELS = [\n", + " \"O\",\n", + " \"B-Subsidiary\",\n", + " \"I-Subsidiary\",\n", + " \"B-Loc\",\n", + " \"I-Loc\",\n", + " \"B-Own_Per\",\n", + " \"I-Own_Per\",\n", + "]\n", + "LABEL_PRIORITY = [\n", + " \"I-Subsidiary\",\n", + " \"I-Loc\",\n", + " \"I-Own_Per\",\n", + " \"B-Subsidiary\",\n", + " \"B-Loc\",\n", + " \"B-Own_Per\",\n", + " \"O\",\n", + "]\n", + "\n", + "BBOX_COLS = [\"top_left_x\", \"top_left_y\", \"bottom_right_x\", \"bottom_right_y\"]\n", + "BBOX_COLS_PDF = [\n", + " \"top_left_x_pdf\",\n", + " \"top_left_y_pdf\",\n", + " \"bottom_right_x_pdf\",\n", + " \"bottom_right_y_pdf\",\n", + "]\n", + "\n", + "# Map back and forth between id's and labels\n", + "id2label = dict(enumerate(LABELS))\n", + "label2id = {v: k for k, v in enumerate(LABELS)}\n", + "\n", + "def _is_cik_in_training_data(labeled_json_filename, tracking_df):\n", + " # TODO: for now CIK is stored as an int, update when fixed\n", + " cik = int(labeled_json_filename.split(\"/\")[-1].split(\"-\")[0])\n", + " return cik in tracking_df.CIK.unique()\n", + "\n", + "\n", + "def format_label_studio_output(\n", + " labeled_json_dir: Path,\n", + " pdfs_dir: Path,\n", + ") -> pd.DataFrame:\n", + " \"\"\"Format Label Studio output JSONs into 
dataframe.\"\"\"\n", + " labeled_df = pd.DataFrame()\n", + " # TODO: make this path stuff less janky?\n", + " tracking_df = validation_helpers.load_training_data(\"ex21_labels.csv\")\n", + " for json_filename in os.listdir(labeled_json_dir):\n", + " if not json_filename[0].isdigit() or json_filename.endswith(\".json\"):\n", + " continue\n", + " json_file_path = labeled_json_dir / json_filename\n", + " with Path.open(json_file_path) as j:\n", + " doc_dict = json.loads(j.read())\n", + "\n", + " filename = doc_dict[\"task\"][\"data\"][\"ocr\"].split(\"/\")[-1].split(\".\")[0]\n", + " # check if old local naming schema is being used\n", + " if len(filename.split(\"-\")) == 6:\n", + " filename = \"-\".join(filename.split(\"-\")[2:])\n", + " if not _is_cik_in_training_data(filename, tracking_df=tracking_df):\n", + " continue\n", + "\n", + " pdf_filename = filename + \".pdf\"\n", + " src_path = pdfs_dir / pdf_filename\n", + " extracted, pg = get_pdf_data_from_path(src_path)\n", + " txt = extracted[\"pdf_text\"]\n", + " pg_meta = extracted[\"page\"]\n", + " # normalize bboxes between 0 and 1000 for Hugging Face\n", + " txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)\n", + " # parse the output dictionary of labeled bounding boxes from Label Studio\n", + " doc_df = pd.DataFrame()\n", + " for item in doc_dict[\"result\"]:\n", + " value = item[\"value\"]\n", + " # sometimes Label Studio will fill in an empty list as a label\n", + " # when there is really no label\n", + " # TODO: do this without dict comprehension?\n", + " if (\"labels\" in value) and value[\"labels\"] == []:\n", + " value = {k: v for k, v in value.items() if k != \"labels\"}\n", + " ind = int(item[\"id\"].split(\"_\")[-1])\n", + " doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])\n", + "\n", + " # combine the bounding boxes for each word\n", + " doc_df = doc_df.groupby(level=0).first()\n", + " txt.loc[:, \"id\"] = filename\n", + " # TODO: probably want to filter out these empty Ex. 
21 docs\n", + " # the doc might not have any labels in it if it was an empty Ex. 21\n", + " if \"labels\" not in doc_df:\n", + " doc_df.loc[:, \"labels\"] = pd.Series()\n", + "\n", + " output_df = pd.concat([txt, doc_df[[\"labels\"]]], axis=1)\n", + " labeled_df = pd.concat([labeled_df, output_df])\n", + "\n", + " # fill in unlabeled words and clean up labeled dataframe\n", + " labeled_df[\"labels\"] = labeled_df[\"labels\"].fillna(\"O\")\n", + " labeled_df = labeled_df.rename(columns={\"labels\": \"ner_tag\"})\n", + " non_id_columns = [col for col in labeled_df.columns if col != \"id\"]\n", + " labeled_df = labeled_df.loc[:, [\"id\"] + non_id_columns]\n", + "\n", + " # TODO: add in sanity checks on labeled_df bounding boxes to make sure\n", + " # that no value is above 1000 or below 0\n", + "\n", + " return labeled_df\n", + "\n", + "\n", + "def get_image_dict(pdfs_dir):\n", + " \"\"\"Create a dictionary with filenames and their Ex. 21 images.\"\"\"\n", + " image_dict = {}\n", + " for pdf_filename in os.listdir(pdfs_dir):\n", + " if pdf_filename.split(\".\")[-1] != \"pdf\":\n", + " continue\n", + " pdf_file_path = pdfs_dir / pdf_filename\n", + " _, pg = get_pdf_data_from_path(pdf_file_path)\n", + " full_pg_img = render_page(pg)\n", + " filename = pdf_filename.split(\".\")[0]\n", + " image_dict[filename] = full_pg_img\n", + " return image_dict\n", + "\n", + "\n", + "def format_as_ner_annotations(\n", + " labeled_json_path: Path,\n", + " pdfs_path: Path,\n", + " gcs_folder_name: Path,\n", + ") -> list[dict]:\n", + " \"\"\"Format a Label Studio output JSONs as NER annotations.\n", + "\n", + " Formats the dataframe as named entity recognition annotations.\n", + " # TODO: say more about this format\n", + "\n", + " Returns:\n", + " ner_annotations: a list of dicts, with one dict for each doc.\n", + " \"\"\"\n", + " GCSArchive().cache_training_data(\n", + " json_cache_path=labeled_json_path,\n", + " pdf_cache_path=pdfs_path,\n", + " gcs_folder_name=gcs_folder_name\n", + " 
)\n", + "\n", + " labeled_df = format_label_studio_output(\n", + " labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path\n", + " )\n", + " # convert dataframe/dictionary into NER format\n", + " # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py\n", + " # complete dataset is a list of dicts, with one dict for each doc\n", + " doc_filenames = labeled_df[\"id\"].unique()\n", + " image_dict = get_image_dict(pdfs_dir=pdfs_path)\n", + " ner_annotations = []\n", + " for filename in doc_filenames:\n", + " annotation = {\n", + " \"id\": filename,\n", + " \"tokens\": labeled_df.groupby(\"id\")[\"text\"].apply(list).loc[filename],\n", + " \"ner_tags\": labeled_df.groupby(\"id\")[\"ner_tag\"].apply(list).loc[filename],\n", + " \"bboxes\": labeled_df.loc[labeled_df[\"id\"] == filename, :][BBOX_COLS_PDF]\n", + " .to_numpy()\n", + " .tolist(),\n", + " \"image\": image_dict[filename],\n", + " }\n", + " ner_annotations.append(annotation)\n", + "\n", + " return ner_annotations\n", + "\n", + "def _prepare_dataset(annotations, processor, label2id):\n", + " \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n", + "\n", + " def _convert_ner_tags_to_id(ner_tags, label2id):\n", + " return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n", + "\n", + " images = annotations[\"image\"]\n", + " words = annotations[\"tokens\"]\n", + " boxes = annotations[\"bboxes\"]\n", + " # Map over labels and convert to numeric id for each ner_tag\n", + " ner_tags = [\n", + " _convert_ner_tags_to_id(ner_tags, label2id)\n", + " for ner_tags in annotations[\"ner_tags\"]\n", + " ]\n", + "\n", + " encoding = processor(\n", + " images,\n", + " words,\n", + " boxes=boxes,\n", + " word_labels=ner_tags,\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + " )\n", + "\n", + " return encoding\n", + "\n", + "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n", + " \"\"\"Compute metrics to train and 
evaluate the model on.\"\"\"\n", + " predictions, labels = p\n", + " predictions = np.argmax(predictions, axis=2)\n", + "\n", + " # Remove ignored index (special tokens)\n", + " true_predictions = [\n", + " [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + " true_labels = [\n", + " [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]\n", + " for prediction, label in zip(predictions, labels)\n", + " ]\n", + "\n", + " results = metric.compute(predictions=true_predictions, references=true_labels)\n", + " if return_entity_level_metrics:\n", + " # Unpack nested dictionaries\n", + " final_results = {}\n", + " for key, value in results.items():\n", + " if isinstance(value, dict):\n", + " for n, v in value.items():\n", + " final_results[f\"{key}_{n}\"] = v\n", + " else:\n", + " final_results[key] = value\n", + " return final_results\n", + " return {\n", + " \"precision\": results[\"overall_precision\"],\n", + " \"recall\": results[\"overall_recall\"],\n", + " \"f1\": results[\"overall_f1\"],\n", + " \"accuracy\": results[\"overall_accuracy\"],\n", + " }" + ] + }, + { + "cell_type": "markdown", + "id": "8160263c-8f69-437c-918b-e56ad007961a", + "metadata": { + "tags": [] + }, + "source": [ + "#### Finetune Model\n", + "The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.\n", + "\n", + "Model training contains several steps implemented below:\n", + "1. Use temporary path to convert filings to PDF's and stash labels\n", + "2. Use PDF's and labels to convert PDF's and labels to NER annotations\n", + "3. Construct huggingface dataset from NER annotations and split into train and test sets\n", + "4. 
Load pretrained model from huggingface\n", + "5. Finetune model on training data and evaluate on test data" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "71d205b2-e6ea-4ad0-982c-22e762269119", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import mlflow\n", + "from datasets import (\n", + " Array2D,\n", + " Array3D,\n", + " Dataset,\n", + " Features,\n", + " Sequence,\n", + " Value,\n", + " load_metric,\n", + ")\n", + "from dotenv import load_dotenv\n", + "from transformers import (\n", + " AutoProcessor,\n", + " LayoutLMv3ForTokenClassification,\n", + " Trainer,\n", + " TrainingArguments,\n", + ")\n", + "from transformers.data.data_collator import default_data_collator\n", + "\n", + "from mozilla_sec_eia.library.mlflow import configure_mlflow\n", + "\n", + "load_dotenv()\n", + "\n", + "\n", + "configure_mlflow()\n", + "mlflow.set_experiment(\"exhibit21_extraction_test\")\n", + "\n", + "# Only finetune if configured to do so\n", + "training_run_id = None\n", + "if context.op_config[\"uri\"] is None:\n", + " # Change temp_dir to save training data locally for inspection\n", + " with TemporaryDirectory() as temp_dir:\n", + " ner_annotations = format_as_ner_annotations(\n", + " labeled_json_path=Path(temp_dir) / \"sec10k_filings\" / \"labeled_jsons\",\n", + " pdfs_path=Path(temp_dir) / \"sec10k_filings\" / \"pdfs\",\n", + " gcs_folder_name=context.op_config[\"training_set\"],\n", + " )\n", + "\n", + " # Cache/prepare training data\n", + " dataset = Dataset.from_list(ner_annotations)\n", + "\n", + " # Load pretrained model\n", + " model = LayoutLMv3ForTokenClassification.from_pretrained(\n", + " \"microsoft/layoutlmv3-base\", id2label=id2label, label2id=label2id\n", + " )\n", + " processor = AutoProcessor.from_pretrained(\n", + " \"microsoft/layoutlmv3-base\", apply_ocr=False\n", + " )\n", + "\n", + " # Prepare our train & eval dataset\n", + " column_names = dataset.column_names\n", + " features = Features(\n", + " {\n", + " 
\"pixel_values\": Array3D(dtype=\"float32\", shape=(3, 224, 224)),\n", + " \"input_ids\": Sequence(feature=Value(dtype=\"int64\")),\n", + " \"attention_mask\": Sequence(Value(dtype=\"int64\")),\n", + " \"bbox\": Array2D(dtype=\"int64\", shape=(512, 4)),\n", + " \"labels\": Sequence(feature=Value(dtype=\"int64\")),\n", + " }\n", + " )\n", + " dataset = dataset.map(\n", + " lambda annotations: _prepare_dataset(annotations, processor, label2id),\n", + " batched=True,\n", + " remove_columns=column_names,\n", + " features=features,\n", + " )\n", + " dataset.set_format(\"torch\")\n", + " split_dataset = dataset.train_test_split(test_size=0.2)\n", + " train_dataset, eval_dataset = split_dataset[\"train\"], split_dataset[\"test\"]\n", + "\n", + " # Initialize our Trainer\n", + " metric = load_metric(\"seqeval\")\n", + " training_args = TrainingArguments(\n", + " max_steps=1000,\n", + " per_device_train_batch_size=1,\n", + " per_device_eval_batch_size=1,\n", + " learning_rate=1e-5,\n", + " evaluation_strategy=\"steps\",\n", + " eval_steps=100,\n", + " load_best_model_at_end=True,\n", + " metric_for_best_model=\"f1\",\n", + " output_dir=\"./layoutlm\",\n", + " )\n", + " trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=train_dataset,\n", + " eval_dataset=eval_dataset,\n", + " tokenizer=processor,\n", + " data_collator=default_data_collator,\n", + " compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),\n", + " )\n", + "\n", + " with mlflow.start_run() as training_run:\n", + " # Train inside mlflow run. Mlflow will automatically handle logging training metrcis\n", + " trainer.train()\n", + "\n", + " # Log finetuend model with mlflow\n", + " model = {\"model\": trainer.model, \"tokenizer\": trainer.tokenizer}\n", + " mlflow.transformers.log_model(\n", + " model, artifact_path=\"layoutlm_extractor\", task=\"token-classification\"\n", + " )\n", + " training_run_id = training_run.info. 
run_id" + ] + }, + { + "cell_type": "markdown", + "id": "ee9b4e20-7781-43a7-b7aa-caf0690a201e", + "metadata": {}, + "source": [ + "## Model inference\n", + "Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "42c8e920-d671-40c2-b5db-c43611a33897", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import torch\n", + "from transformers import Pipeline, pipeline\n", + "from transformers.tokenization_utils_base import BatchEncoding\n", + "\n", + "from mozilla_sec_eia.models.sec10k.inference import get_flattened_mode_predictions\n", + "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n", + " iob_to_label,\n", + ")\n", + "\n", + "\n", + "class LayoutLMInferencePipeline(Pipeline):\n", + " \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n", + "\n", + " def __init__(self, *args, **kwargs):\n", + " \"\"\"Initialize LayoutLMInferencePipeline.\"\"\"\n", + " super().__init__(*args, **kwargs)\n", + "\n", + " def _sanitize_parameters(self, **kwargs):\n", + " preprocess_kwargs = {}\n", + " if \"maybe_arg\" in kwargs:\n", + " preprocess_kwargs[\"maybe_arg\"] = kwargs[\"maybe_arg\"]\n", + " return preprocess_kwargs, {}, {}\n", + "\n", + " def preprocess(self, doc_dict):\n", + " \"\"\"Encode and tokenize model inputs.\"\"\"\n", + " image = doc_dict[\"image\"]\n", + " words = doc_dict[\"tokens\"]\n", + " boxes = doc_dict[\"bboxes\"]\n", + " encoding = self.tokenizer(\n", + " image,\n", + " words,\n", + " boxes=boxes,\n", + " return_tensors=\"pt\",\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + " max_length=512, # this is the maximum max_length\n", + " stride=128,\n", + " return_offsets_mapping=True,\n", + " return_overflowing_tokens=True,\n", + " )\n", + " model_inputs = {}\n", + " model_inputs[\"raw_encoding\"] = 
encoding.copy()\n", + " model_inputs[\"doc_dict\"] = doc_dict\n", + " model_inputs[\"offset_mapping\"] = encoding.pop(\"offset_mapping\")\n", + " model_inputs[\"sample_mapping\"] = encoding.pop(\"overflow_to_sample_mapping\")\n", + " # TODO: do we actually need to make these into ints?\n", + " encoding[\"input_ids\"] = encoding[\"input_ids\"].to(torch.int64)\n", + " encoding[\"attention_mask\"] = encoding[\"attention_mask\"].to(torch.int64)\n", + " encoding[\"bbox\"] = encoding[\"bbox\"].to(torch.int64)\n", + " encoding[\"pixel_values\"] = torch.stack(encoding[\"pixel_values\"])\n", + " model_inputs[\"encoding\"] = encoding\n", + " return model_inputs\n", + "\n", + " def _forward(self, model_inputs):\n", + " # encoding is passed as a UserDict in the model_inputs dictionary\n", + " # turn it back into a BatchEncoding\n", + " encoding = BatchEncoding(model_inputs[\"encoding\"])\n", + " if torch.cuda.is_available():\n", + " encoding.to(\"cuda\")\n", + " self.model.to(\"cuda\")\n", + " # since we're doing inference, we don't need gradient computation\n", + " with torch.no_grad():\n", + " output = self.model(**encoding)\n", + " return {\n", + " \"logits\": output.logits,\n", + " \"predictions\": output.logits.argmax(-1).squeeze().tolist(),\n", + " \"raw_encoding\": model_inputs[\"raw_encoding\"],\n", + " \"doc_dict\": model_inputs[\"doc_dict\"],\n", + " }\n", + "\n", + " def postprocess(self, all_outputs):\n", + " \"\"\"Return logits, model predictions, and the extracted dataframe.\"\"\"\n", + " logits = all_outputs[\"logits\"]\n", + " predictions = all_outputs[\"logits\"].argmax(-1).squeeze().tolist()\n", + " output_df = self.extract_table(all_outputs)\n", + " return logits, predictions, output_df\n", + "\n", + " def extract_table(self, all_outputs):\n", + " \"\"\"Extract a structured table from a set of inference predictions.\n", + "\n", + " This function essentially works by stacking bounding boxes and predictions\n", + " into a dataframe and going from left to right 
and top to bottom. Then,\n", + " every time a new subsidiary entity is encountered, it assigns a new group or\n", + " \"row\" to that subsidiary. Next, location and ownership percentage words/labeled\n", + " entities in between these subsidiary groups are assigned to a subsidiary row/group.\n", + " Finally, this is all formatted into a dataframe with an ID column from the original\n", + " filename and a basic cleaning function normalizes strings.\n", + " \"\"\"\n", + " # TODO: when the model is more mature, break this into sub functions to make it\n", + " # clearer what's going on\n", + " predictions = all_outputs[\"predictions\"]\n", + " encoding = all_outputs[\"raw_encoding\"]\n", + " doc_dict = all_outputs[\"doc_dict\"]\n", + "\n", + " token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n", + " predictions_tensor = torch.tensor(predictions)\n", + " mode_predictions = get_flattened_mode_predictions(\n", + " token_boxes_tensor, predictions_tensor\n", + " )\n", + " token_boxes = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1).tolist()\n", + " predicted_labels = [\n", + " self.model.config.id2label[pred] for pred in mode_predictions\n", + " ]\n", + " simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]\n", + "\n", + " df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)\n", + " df.loc[:, \"iob_pred\"] = predicted_labels\n", + " df.loc[:, \"pred\"] = simple_preds\n", + " invalid_mask = (\n", + " (df[\"top_left_x\"] == 0)\n", + " & (df[\"top_left_y\"] == 0)\n", + " & (df[\"bottom_right_x\"] == 0)\n", + " & (df[\"bottom_right_y\"] == 0)\n", + " )\n", + " df = df[~invalid_mask]\n", + " # we want to get actual words on the dataframe, not just subwords that correspond to tokens\n", + " # subwords from the same word share the same bounding box coordinates\n", + " # so we merge the original words onto our dataframe on bbox coordinates\n", + " words_df = pd.DataFrame(data=doc_dict[\"bboxes\"], columns=BBOX_COLS)\n", + " 
words_df.loc[:, \"word\"] = doc_dict[\"tokens\"]\n", + " df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n", + " subset=BBOX_COLS + [\"pred\", \"word\"]\n", + " )\n", + " # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n", + " # should always have a B entity label. Manually override labels so this is true.\n", + " first_in_group_df = df[\n", + " (df[\"pred\"].ne(df[\"pred\"].shift())) & (df[\"pred\"] != \"other\")\n", + " ]\n", + " first_in_group_df.loc[:, \"iob_pred\"] = (\n", + " \"B\" + first_in_group_df[\"iob_pred\"].str[1:]\n", + " )\n", + " df.update(first_in_group_df)\n", + " # filter for just words that were labeled with non \"other\" entities\n", + " entities_df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n", + " entities_df = entities_df[entities_df[\"pred\"] != \"other\"]\n", + " # words are labeled with IOB format which stands for inside, outside, beginning\n", + " # merge B and I entities to form one entity group\n", + " # (i.e. 
\"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n", + " entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n", + " grouped_df = (\n", + " entities_df.groupby([\"group\", \"pred\"])[\"word\"]\n", + " .apply(\" \".join)\n", + " .reset_index()[[\"pred\", \"word\"]]\n", + " )\n", + " # assign a new row every time there's a new subsidiary\n", + " grouped_df[\"row\"] = (grouped_df[\"pred\"].str.startswith(\"subsidiary\")).cumsum()\n", + " output_df = grouped_df.pivot_table(\n", + " index=\"row\", columns=\"pred\", values=\"word\", aggfunc=lambda x: \" \".join(x)\n", + " ).reset_index()\n", + " if output_df.empty:\n", + " return output_df\n", + " output_df.loc[:, \"id\"] = doc_dict[\"id\"]\n", + " return output_df" + ] + }, + { + "cell_type": "markdown", + "id": "ea9fe887-43ca-43e2-85e3-bf5371bd165f", + "metadata": {}, + "source": [ + "Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "4d802e00-1ca4-40b3-b15b-561711a9db70", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "ff844a110fb04ddcbe788e647651786c", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/1 [00:00, skipping schema inference\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "004ac3503c77461f9ce7938949a660c5", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/17 [00:00`_ for more details.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "02516db30cd241ed97c08df920368bf8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/17 [00:00 pd.DataFrame:\n", - " \"\"\"Format Label Studio output JSONs into dataframe.\"\"\"\n", - " labeled_df = pd.DataFrame()\n", - " # TODO: make this path stuff less janky?\n", - " tracking_df = validation_helpers.load_training_data(\"ex21_labels.csv\")\n", - " for json_filename in os.listdir(labeled_json_dir):\n", - " if not json_filename[0].isdigit() or json_filename.endswith(\".json\"):\n", - " continue\n", - " json_file_path = labeled_json_dir / json_filename\n", - " with Path.open(json_file_path) as j:\n", - " doc_dict = json.loads(j.read())\n", - "\n", - " filename = doc_dict[\"task\"][\"data\"][\"ocr\"].split(\"/\")[-1].split(\".\")[0]\n", - " # check if old local naming schema is being used\n", - " if len(filename.split(\"-\")) == 6:\n", - " filename = \"-\".join(filename.split(\"-\")[2:])\n", - " if not _is_cik_in_training_data(filename, tracking_df=tracking_df):\n", - " continue\n", - "\n", - " pdf_filename = filename + \".pdf\"\n", - " src_path = pdfs_dir / pdf_filename\n", - " extracted, pg = get_pdf_data_from_path(src_path)\n", - " txt = 
extracted[\"pdf_text\"]\n", - " pg_meta = extracted[\"page\"]\n", - " # normalize bboxes between 0 and 1000 for Hugging Face\n", - " txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)\n", - " # parse the output dictionary of labeled bounding boxes from Label Studio\n", - " doc_df = pd.DataFrame()\n", - " for item in doc_dict[\"result\"]:\n", - " value = item[\"value\"]\n", - " # sometimes Label Studio will fill in an empty list as a label\n", - " # when there is really no label\n", - " # TODO: do this without dict comprehension?\n", - " if (\"labels\" in value) and value[\"labels\"] == []:\n", - " value = {k: v for k, v in value.items() if k != \"labels\"}\n", - " ind = int(item[\"id\"].split(\"_\")[-1])\n", - " doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])\n", - "\n", - " # combine the bounding boxes for each word\n", - " doc_df = doc_df.groupby(level=0).first()\n", - " txt.loc[:, \"id\"] = filename\n", - " # TODO: probably want to filter out these empty Ex. 21 docs\n", - " # the doc might not have any labels in it if it was an empty Ex. 21\n", - " if \"labels\" not in doc_df:\n", - " doc_df.loc[:, \"labels\"] = pd.Series()\n", - "\n", - " output_df = pd.concat([txt, doc_df[[\"labels\"]]], axis=1)\n", - " labeled_df = pd.concat([labeled_df, output_df])\n", - "\n", - " # fill in unlabeled words and clean up labeled dataframe\n", - " labeled_df[\"labels\"] = labeled_df[\"labels\"].fillna(\"O\")\n", - " labeled_df = labeled_df.rename(columns={\"labels\": \"ner_tag\"})\n", - " non_id_columns = [col for col in labeled_df.columns if col != \"id\"]\n", - " labeled_df = labeled_df.loc[:, [\"id\"] + non_id_columns]\n", - "\n", - " # TODO: add in sanity checks on labeled_df bounding boxes to make sure\n", - " # that no value is above 1000 or below 0\n", - "\n", - " return labeled_df\n", - "\n", - "\n", - "def get_image_dict(pdfs_dir):\n", - " \"\"\"Create a dictionary with filenames and their Ex. 
21 images.\"\"\"\n", - " image_dict = {}\n", - " for pdf_filename in os.listdir(pdfs_dir):\n", - " if pdf_filename.split(\".\")[-1] != \"pdf\":\n", - " continue\n", - " pdf_file_path = pdfs_dir / pdf_filename\n", - " _, pg = get_pdf_data_from_path(pdf_file_path)\n", - " full_pg_img = render_page(pg)\n", - " filename = pdf_filename.split(\".\")[0]\n", - " image_dict[filename] = full_pg_img\n", - " return image_dict\n", - "\n", - "\n", - "def format_as_ner_annotations(\n", - " labeled_json_path: Path,\n", - " pdfs_path: Path,\n", - " gcs_folder_name: Path,\n", - ") -> list[dict]:\n", - " \"\"\"Format a Label Studio output JSONs as NER annotations.\n", - "\n", - " Formats the dataframe as named entity recognition annotations.\n", - " # TODO: say more about this format\n", - "\n", - " Returns:\n", - " ner_annotations: a list of dicts, with one dict for each doc.\n", - " \"\"\"\n", - " GCSArchive().cache_training_data(\n", - " json_cache_path=labeled_json_path,\n", - " pdf_cache_path=pdfs_path,\n", - " gcs_folder_name=gcs_folder_name\n", - " )\n", - "\n", - " labeled_df = format_label_studio_output(\n", - " labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path\n", - " )\n", - " # convert dataframe/dictionary into NER format\n", - " # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py\n", - " # complete dataset is a list of dicts, with one dict for each doc\n", - " doc_filenames = labeled_df[\"id\"].unique()\n", - " image_dict = get_image_dict(pdfs_dir=pdfs_path)\n", - " ner_annotations = []\n", - " for filename in doc_filenames:\n", - " annotation = {\n", - " \"id\": filename,\n", - " \"tokens\": labeled_df.groupby(\"id\")[\"text\"].apply(list).loc[filename],\n", - " \"ner_tags\": labeled_df.groupby(\"id\")[\"ner_tag\"].apply(list).loc[filename],\n", - " \"bboxes\": labeled_df.loc[labeled_df[\"id\"] == filename, :][BBOX_COLS_PDF]\n", - " .to_numpy()\n", - " .tolist(),\n", - " \"image\": image_dict[filename],\n", - " 
}\n", - " ner_annotations.append(annotation)\n", - "\n", - " return ner_annotations\n", - "\n", - "def _prepare_dataset(annotations, processor, label2id):\n", - " \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n", - "\n", - " def _convert_ner_tags_to_id(ner_tags, label2id):\n", - " return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n", - "\n", - " images = annotations[\"image\"]\n", - " words = annotations[\"tokens\"]\n", - " boxes = annotations[\"bboxes\"]\n", - " # Map over labels and convert to numeric id for each ner_tag\n", - " ner_tags = [\n", - " _convert_ner_tags_to_id(ner_tags, label2id)\n", - " for ner_tags in annotations[\"ner_tags\"]\n", - " ]\n", - "\n", - " encoding = processor(\n", - " images,\n", - " words,\n", - " boxes=boxes,\n", - " word_labels=ner_tags,\n", - " truncation=True,\n", - " padding=\"max_length\",\n", - " )\n", - "\n", - " return encoding\n", - "\n", - "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n", - " \"\"\"Compute metrics to train and evaluate the model on.\"\"\"\n", - " predictions, labels = p\n", - " predictions = np.argmax(predictions, axis=2)\n", - "\n", - " # Remove ignored index (special tokens)\n", - " true_predictions = [\n", - " [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]\n", - " for prediction, label in zip(predictions, labels)\n", - " ]\n", - " true_labels = [\n", - " [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]\n", - " for prediction, label in zip(predictions, labels)\n", - " ]\n", - "\n", - " results = metric.compute(predictions=true_predictions, references=true_labels)\n", - " if return_entity_level_metrics:\n", - " # Unpack nested dictionaries\n", - " final_results = {}\n", - " for key, value in results.items():\n", - " if isinstance(value, dict):\n", - " for n, v in value.items():\n", - " final_results[f\"{key}_{n}\"] = v\n", - " else:\n", - " final_results[key] = value\n", - " return 
final_results\n", - " return {\n", - " \"precision\": results[\"overall_precision\"],\n", - " \"recall\": results[\"overall_recall\"],\n", - " \"f1\": results[\"overall_f1\"],\n", - " \"accuracy\": results[\"overall_accuracy\"],\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "8160263c-8f69-437c-918b-e56ad007961a", - "metadata": { - "tags": [] - }, - "source": [ - "#### Finetune Model\n", - "The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.\n", - "\n", - "Model training contains several steps implemented below:\n", - "1. Use temporary path to convert filings to PDF's and stash labels\n", - "2. Use PDF's and labels to convert PDF's and labels to NER annotations\n", - "3. Construct huggingface dataset from NER annotations and split into train and test sets\n", - "4. Load pretrained model from huggingface\n", - "5. Finetune model on training data and evaluate on test data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "71d205b2-e6ea-4ad0-982c-22e762269119", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " is empty\n", - "' The Southwest Companies Nevada PriMerit Bank Federally chartered stock savings bank Paiute Pipeline Company Nevada Carson Water Company Nevada Southwest Gas Transmission Company Partnership between Southwest Gas Corporation and Utility Financial Corp. Utility Financial Corp. Nevada Southwest Gas Corporation of Arizona Nevada PRIMERIT BANK SUBSIDIARIES AT DECEMBER 31, 1993'\n", - "
is empty\n", - "' TCA Management Company.................................................... Texas Teleservice Corporation of America........................................ Texas Texas Community Antennas, Inc............................................. Texas Texas Telecable, Inc...................................................... Texas TCA Cable of Amarillo, Inc................................................ Texas Telecable Associates, Inc................................................. Texas Delta Cablevision, Inc.................................................... Arkansas Sun Valley Cablevision, Inc............................................... Idaho VPI Communications, Inc................................................... Texas AvComm Corporation........................................................ Texas Tele-Communications of Arkansas L. P......................................'\n", - "
is empty\n", - "' DOMESTIC SUBSIDIARIES International Sales & Business, Inc. California KLA-Tencor Building Corporation California KLA-Tencor Disc Corporation California KLA-Tencor International Corporation California KLA-Tencor Klinnik Corporation California KLA-Tencor Management Corporation California KLA-Tencor (Thailand Branch) Corporation California VLSI Standards, Inc. California Amray, Inc. Delaware Groff Associates, Inc. California DeviceWare, Inc. California INTERNATIONAL SUBSIDIARIES'\n", - "
is empty\n", - "' 1. Northeast Energy, LLC (100%-Owned) .................................................... Florida 2. Northeast Energy Associates, A Limited Partnership (99%-Owned) (a) .................... Massachusetts 3. North Jersey Energy Associates, A Limited Partnership (99%-Owned) (a) ................. New Jersey (a) Northeast Energy, LLC owns the remaining 1% interest. '\n", - "
is empty\n", - "' 1. ESI Tractebel Urban Renewal Corporation (100%-Owned) .................................. New Jersey '\n", - "
is empty\n", - "' IVANHOE ENERGY HOLDINGS INC. (Nevada) 100% IVANHOE ENERGY (USA) INC. (Nevada) 100% (indirect) IVANHOE ENERGY ROYALTY INC. (Nevada) 100% (indirect) IVANHOE ENERGY INTERNATIONAL VENTURES INC. (BVI) 100% Ivanhoe Energy Sweetwater Limited (Malta) 100% (Indirect) Ivanhoe Energy (Qatar) Inc. (BVI) 100% (Indirect) GTL Japan Corporation (Japan) 100% (Indirect) IVANHOE ENERGY'\n", - "
is empty\n", - "' Airgas Canada, Inc. Canada Airgas Carbonic, Inc. DE Airgas Data, LLC DE Airgas East, Inc. DE Airgas Great Lakes, Inc. DE Airgas Gulf States, Inc. DE Airgas Intermountain, Inc. CO Airgas International, Inc. VI Airgas Mid America, Inc. DE Airgas Mid South, Inc. DE Airgas Nor Pac, Inc. DE'\n", - "
is empty\n", - "' Subsidiary Name State of Formation - --------------- ------------------- American Ecology Environmental Services Corporation Texas Corporation American Ecology Holdings Corporation Delaware Corporation American Ecology Recycle Center, Inc. Delaware Corporation American Ecology Services Corporation Delaware Corporation Texas Ecologists, Inc. Texas Corporation US Ecology, Inc. California Corporation US Ecology Idaho, Inc. Delaware'\n", - "Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bae617cb831d4b2593c0fa4a874f1592", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Map: 0%| | 0/159 [00:00\n", - " \n", - " \n", - " [ 2/1000 : < :, Epoch 0.01/8]\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining LossValidation Loss

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/09/23 14:14:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run bedecked-trout-555 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/573e64992704411c9013937d849e1504.\n", - "2024/09/23 14:14:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n", - "2024/09/23 14:14:51 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n", - "2024/09/23 14:14:51 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n" - ] - }, - { - "ename": "OutOfMemoryError", - "evalue": "CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 20.12 MiB is free. Including non-PyTorch memory, this process has 2.72 GiB memory in use. Of the allocated memory 2.53 GiB is allocated by PyTorch, and 104.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 94\u001b[0m\n\u001b[1;32m 91\u001b[0m mlflow\u001b[38;5;241m.\u001b[39mset_experiment(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexhibit21_extraction_test\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mlflow\u001b[38;5;241m.\u001b[39mstart_run():\n\u001b[1;32m 93\u001b[0m \u001b[38;5;66;03m# Train inside mlflow run. 
Mlflow will automatically handle logging training metrcis\u001b[39;00m\n\u001b[0;32m---> 94\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# Log finetuend model with mlflow\u001b[39;00m\n\u001b[1;32m 97\u001b[0m mlflow\u001b[38;5;241m.\u001b[39mtransformers\u001b[38;5;241m.\u001b[39mlog_model(\n\u001b[1;32m 98\u001b[0m trainer, artifact_path\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm_extractor\u001b[39m\u001b[38;5;124m\"\u001b[39m, task\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken-classification\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 99\u001b[0m )\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1936\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1939\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1940\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1941\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1942\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
1943\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:2279\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2276\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 2278\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2279\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 2282\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 2283\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m 2284\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 2285\u001b[0m ):\n\u001b[1;32m 2286\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 2287\u001b[0m tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:3318\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 3315\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m 3317\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3318\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3320\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m inputs\n\u001b[1;32m 3321\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 3322\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 3323\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 3324\u001b[0m ):\n", - "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:3363\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 3361\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 3362\u001b[0m labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3363\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3364\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 3365\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 3366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:1099\u001b[0m, in \u001b[0;36mLayoutLMv3ForTokenClassification.forward\u001b[0;34m(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, pixel_values)\u001b[0m\n\u001b[1;32m 1069\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1070\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\u001b[39;00m\n\u001b[1;32m 1071\u001b[0m \u001b[38;5;124;03m Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1095\u001b[0m \u001b[38;5;124;03m>>> logits = outputs.logits\u001b[39;00m\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;124;03m```\"\"\"\u001b[39;00m\n\u001b[1;32m 1097\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m-> 1099\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlayoutlmv3\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1100\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1101\u001b[0m \u001b[43m \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1102\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1103\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1106\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1107\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1108\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1109\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1110\u001b[0m \u001b[43m \u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1111\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1112\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m input_ids \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1113\u001b[0m input_shape \u001b[38;5;241m=\u001b[39m input_ids\u001b[38;5;241m.\u001b[39msize()\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in 
\u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m 
_global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:975\u001b[0m, in \u001b[0;36mLayoutLMv3Model.forward\u001b[0;34m(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, pixel_values, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[1;32m 969\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[1;32m 970\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[1;32m 971\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[1;32m 972\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[1;32m 973\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[0;32m--> 975\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfinal_bbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfinal_position_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 980\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 981\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 982\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 983\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 984\u001b[0m \u001b[43m \u001b[49m\u001b[43mpatch_height\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpatch_height\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 985\u001b[0m \u001b[43m \u001b[49m\u001b[43mpatch_width\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpatch_width\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 986\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 988\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 990\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m return_dict:\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m 
\u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:681\u001b[0m, in \u001b[0;36mLayoutLMv3Encoder.forward\u001b[0;34m(self, hidden_states, bbox, attention_mask, head_mask, output_attentions, output_hidden_states, return_dict, position_ids, patch_height, patch_width)\u001b[0m\n\u001b[1;32m 671\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 672\u001b[0m layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 673\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 678\u001b[0m rel_2d_pos,\n\u001b[1;32m 679\u001b[0m )\n\u001b[1;32m 680\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 681\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 682\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 683\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 684\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 685\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 686\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 687\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 688\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 690\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 691\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any 
hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:532\u001b[0m, in \u001b[0;36mLayoutLMv3Layer.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 525\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 
530\u001b[0m rel_2d_pos\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 531\u001b[0m ):\n\u001b[0;32m--> 532\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 535\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 536\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 537\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 538\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 539\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 540\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 542\u001b[0m outputs \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add self attentions if we output attention weights\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: 
ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:500\u001b[0m, in \u001b[0;36mLayoutLMv3Attention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 492\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 493\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 498\u001b[0m rel_2d_pos\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 499\u001b[0m ):\n\u001b[0;32m--> 500\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 501\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 502\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 503\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 504\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 505\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 506\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 507\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 508\u001b[0m attention_output 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m 509\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:448\u001b[0m, in \u001b[0;36mLayoutLMv3SelfAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m 444\u001b[0m attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m+\u001b[39m attention_mask\n\u001b[1;32m 446\u001b[0m \u001b[38;5;66;03m# Normalize the attention scores to probabilities.\u001b[39;00m\n\u001b[1;32m 447\u001b[0m \u001b[38;5;66;03m# Use the trick of the CogView paper to stablize training\u001b[39;00m\n\u001b[0;32m--> 448\u001b[0m attention_probs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcogview_attention\u001b[49m\u001b[43m(\u001b[49m\u001b[43mattention_scores\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;66;03m# This is actually dropping out entire tokens to attend to, which might\u001b[39;00m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;66;03m# seem a bit unusual, but is taken from the original Transformer paper.\u001b[39;00m\n\u001b[1;32m 452\u001b[0m attention_probs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(attention_probs)\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:414\u001b[0m, in \u001b[0;36mLayoutLMv3SelfAttention.cogview_attention\u001b[0;34m(self, attention_scores, alpha)\u001b[0m\n\u001b[1;32m 412\u001b[0m scaled_attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m/\u001b[39m alpha\n\u001b[1;32m 413\u001b[0m max_value \u001b[38;5;241m=\u001b[39m scaled_attention_scores\u001b[38;5;241m.\u001b[39mamax(dim\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m))\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m--> 414\u001b[0m new_attention_scores \u001b[38;5;241m=\u001b[39m \u001b[43m(\u001b[49m\u001b[43mscaled_attention_scores\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmax_value\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m nn\u001b[38;5;241m.\u001b[39mSoftmax(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)(new_attention_scores)\n", - "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 24.00 MiB. 
GPU 0 has a total capacity of 3.93 GiB of which 20.12 MiB is free. Including non-PyTorch memory, this process has 2.72 GiB memory in use. Of the allocated memory 2.53 GiB is allocated by PyTorch, and 104.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" - ] - } - ], - "source": [ - "import mlflow\n", - "from datasets import (\n", - " Array2D,\n", - " Array3D,\n", - " Dataset,\n", - " Features,\n", - " Sequence,\n", - " Value,\n", - " load_metric,\n", - ")\n", - "from dotenv import load_dotenv\n", - "from transformers import (\n", - " AutoProcessor,\n", - " LayoutLMv3ForTokenClassification,\n", - " Trainer,\n", - " TrainingArguments,\n", - ")\n", - "from transformers.data.data_collator import default_data_collator\n", - "\n", - "from mozilla_sec_eia.library.mlflow import configure_mlflow\n", - "\n", - "load_dotenv()\n", - "\n", - "\n", - "configure_mlflow()\n", - "\n", - "# Only finetune if configured to do so\n", - "if context.op_config[\"train_model\"]:\n", - " # Change temp_dir to save training data locally for inspection\n", - " with TemporaryDirectory() as temp_dir:\n", - " ner_annotations = format_as_ner_annotations(\n", - " labeled_json_path=Path(temp_dir) / \"sec10k_filings\" / \"labeled_jsons\",\n", - " pdfs_path=Path(temp_dir) / \"sec10k_filings\" / \"pdfs\",\n", - " gcs_folder_name=\"labeledv0.2/\",\n", - " )\n", - "\n", - " # Cache/prepare training data\n", - " dataset = Dataset.from_list(ner_annotations)\n", - "\n", - " # Load pretrained model\n", - " model = LayoutLMv3ForTokenClassification.from_pretrained(\n", - " \"microsoft/layoutlmv3-base\", id2label=id2label, label2id=label2id\n", - " )\n", - " processor = AutoProcessor.from_pretrained(\n", - " \"microsoft/layoutlmv3-base\", apply_ocr=False\n", - " )\n", - "\n", 
- " # Prepare our train & eval dataset\n", - " column_names = dataset.column_names\n", - " features = Features(\n", - " {\n", - " \"pixel_values\": Array3D(dtype=\"float32\", shape=(3, 224, 224)),\n", - " \"input_ids\": Sequence(feature=Value(dtype=\"int64\")),\n", - " \"attention_mask\": Sequence(Value(dtype=\"int64\")),\n", - " \"bbox\": Array2D(dtype=\"int64\", shape=(512, 4)),\n", - " \"labels\": Sequence(feature=Value(dtype=\"int64\")),\n", - " }\n", - " )\n", - " dataset = dataset.map(\n", - " lambda annotations: _prepare_dataset(annotations, processor, label2id),\n", - " batched=True,\n", - " remove_columns=column_names,\n", - " features=features,\n", - " )\n", - " dataset.set_format(\"torch\")\n", - " split_dataset = dataset.train_test_split(test_size=0.2)\n", - " train_dataset, eval_dataset = split_dataset[\"train\"], split_dataset[\"test\"]\n", - "\n", - " # Initialize our Trainer\n", - " metric = load_metric(\"seqeval\")\n", - " training_args = TrainingArguments(\n", - " max_steps=1000,\n", - " per_device_train_batch_size=1,\n", - " per_device_eval_batch_size=1,\n", - " learning_rate=1e-5,\n", - " evaluation_strategy=\"steps\",\n", - " eval_steps=100,\n", - " load_best_model_at_end=True,\n", - " metric_for_best_model=\"f1\",\n", - " output_dir=\"./layoutlm\",\n", - " )\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " tokenizer=processor,\n", - " data_collator=default_data_collator,\n", - " compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),\n", - " )\n", - "\n", - " mlflow.set_experiment(\"exhibit21_extraction_test\")\n", - " with mlflow.start_run():\n", - " # Train inside mlflow run. 
Mlflow will automatically handle logging training metrcis\n", - " trainer.train()\n", - "\n", - " # Log finetuend model with mlflow\n", - " mlflow.transformers.log_model(\n", - " trainer, artifact_path=\"layoutlm_extractor\", task=\"token-classification\"\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "ee9b4e20-7781-43a7-b7aa-caf0690a201e", - "metadata": {}, - "source": [ - "## Model inference\n", - "Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42c8e920-d671-40c2-b5db-c43611a33897", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import torch\n", - "from transformers import Pipeline, pipeline\n", - "from transformers.tokenization_utils_base import BatchEncoding\n", - "\n", - "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n", - " iob_to_label,\n", - ")\n", - "\n", - "\n", - "def _sort_by_label_priority(target_array):\n", - " id_priority = [label2id[label] for label in LABEL_PRIORITY]\n", - " # Create a priority map from the label priority\n", - " priority_map = {val: idx for idx, val in enumerate(id_priority)}\n", - " # Sort the target array based on the priority map\n", - " sorted_array = sorted(target_array, key=lambda x: priority_map.get(x, float(\"inf\")))\n", - " return sorted_array\n", - "\n", - "def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):\n", - " \"\"\"Get the mode prediction for each box in an Ex. 21.\n", - "\n", - " When handling multi page documents LayoutLM uses a sliding 'frame'\n", - " with some overlap between frames. The overlap creates multiple\n", - " predictions for the same bounding boxes. Thus it's necessary to find\n", - " the mode of all the predictions for a bounding box and use that as the\n", - " single prediction for each box. 
If there are multiple mode\n", - " predictions for a bounding box, then ties are broken by setting\n", - " a priority for the labels (LABEL_PRIORITY) and choosing the highest priority\n", - " label.\n", - " \"\"\"\n", - " # Flatten the tensors\n", - " flat_token_boxes = token_boxes_tensor.view(-1, 4)\n", - " flat_predictions = predictions_tensor.view(-1)\n", - "\n", - " boxes = flat_token_boxes.numpy()\n", - " predictions = flat_predictions.numpy()\n", - "\n", - " # Find unique boxes and indices\n", - " unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True)\n", - "\n", - " # Compute the mode for each unique bounding box\n", - " # for each unique box in boxes, create a list with all predictions for that box\n", - " # get the indices in predictions where the corresponding index in boxes is\n", - " unique_box_predictions = [\n", - " predictions[np.where(inverse_indices == i)[0]] for i in range(len(unique_boxes))\n", - " ]\n", - " pred_counts = [np.bincount(arr) for arr in unique_box_predictions]\n", - " # Compute the mode of predictions for each group\n", - " # break ties by taking into account LABEL_PRIORITY\n", - " modes = np.array(\n", - " [\n", - " _sort_by_label_priority(np.where(arr == np.max(arr))[0])[0]\n", - " for arr in pred_counts\n", - " ]\n", - " )\n", - " flattened_modes = modes[inverse_indices]\n", - "\n", - " return flattened_modes\n", - "\n", - "class LayoutLMInferencePipeline(Pipeline):\n", - " \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n", - "\n", - " def __init__(self, *args, **kwargs):\n", - " \"\"\"Initialize LayoutLMInferencePipeline.\"\"\"\n", - " super().__init__(*args, **kwargs)\n", - "\n", - " def _sanitize_parameters(self, **kwargs):\n", - " preprocess_kwargs = {}\n", - " if \"maybe_arg\" in kwargs:\n", - " preprocess_kwargs[\"maybe_arg\"] = kwargs[\"maybe_arg\"]\n", - " return preprocess_kwargs, {}, {}\n", - "\n", - " def preprocess(self, doc_dict):\n", - " \"\"\"Encode and tokenize model 
inputs.\"\"\"\n", - " image = doc_dict[\"image\"]\n", - " words = doc_dict[\"tokens\"]\n", - " boxes = doc_dict[\"bboxes\"]\n", - " encoding = self.tokenizer(\n", - " image,\n", - " words,\n", - " boxes=boxes,\n", - " return_tensors=\"pt\",\n", - " truncation=True,\n", - " padding=\"max_length\",\n", - " max_length=512, # this is the maximum max_length\n", - " stride=128,\n", - " return_offsets_mapping=True,\n", - " return_overflowing_tokens=True,\n", - " )\n", - " model_inputs = {}\n", - " model_inputs[\"raw_encoding\"] = encoding.copy()\n", - " model_inputs[\"doc_dict\"] = doc_dict\n", - " model_inputs[\"offset_mapping\"] = encoding.pop(\"offset_mapping\")\n", - " model_inputs[\"sample_mapping\"] = encoding.pop(\"overflow_to_sample_mapping\")\n", - " # TODO: do we actually need to make these into ints?\n", - " encoding[\"input_ids\"] = encoding[\"input_ids\"].to(torch.int64)\n", - " encoding[\"attention_mask\"] = encoding[\"attention_mask\"].to(torch.int64)\n", - " encoding[\"bbox\"] = encoding[\"bbox\"].to(torch.int64)\n", - " encoding[\"pixel_values\"] = torch.stack(encoding[\"pixel_values\"])\n", - " model_inputs[\"encoding\"] = encoding\n", - " return model_inputs\n", - "\n", - " def _forward(self, model_inputs):\n", - " # encoding is passed as a UserDict in the model_inputs dictionary\n", - " # turn it back into a BatchEncoding\n", - " encoding = BatchEncoding(model_inputs[\"encoding\"])\n", - " if torch.cuda.is_available():\n", - " encoding.to(\"cuda\")\n", - " self.model.to(\"cuda\")\n", - " # since we're doing inference, we don't need gradient computation\n", - " with torch.no_grad():\n", - " output = self.model(**encoding)\n", - " return {\n", - " \"logits\": output.logits,\n", - " \"predictions\": output.logits.argmax(-1).squeeze().tolist(),\n", - " \"raw_encoding\": model_inputs[\"raw_encoding\"],\n", - " \"doc_dict\": model_inputs[\"doc_dict\"],\n", - " }\n", - "\n", - " def postprocess(self, all_outputs):\n", - " \"\"\"Return logits, model 
predictions, and the extracted dataframe.\"\"\"\n", - " logits = all_outputs[\"logits\"]\n", - " predictions = all_outputs[\"logits\"].argmax(-1).squeeze().tolist()\n", - " output_df = self.extract_table(all_outputs)\n", - " return logits, predictions, output_df\n", - "\n", - " def extract_table(self, all_outputs):\n", - " \"\"\"Extract a structured table from a set of inference predictions.\n", - "\n", - " This function essentially works by stacking bounding boxes and predictions\n", - " into a dataframe and going from left to right and top to bottom. Then, every\n", - " every time a new subsidiary entity is encountered, it assigns a new group or\n", - " \"row\" to that subsidiary. Next, location and ownership percentage words/labeled\n", - " entities in between these subsidiary groups are assigned to a subsidiary row/group.\n", - " Finally, this is all formatted into a dataframe with an ID column from the original\n", - " filename and a basic cleaning function normalizes strings.\n", - " \"\"\"\n", - " # TODO: when model more mature, break this into sub functions to make it\n", - " # clearer what's going on\n", - " predictions = all_outputs[\"predictions\"]\n", - " encoding = all_outputs[\"raw_encoding\"]\n", - " doc_dict = all_outputs[\"doc_dict\"]\n", - "\n", - " token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n", - " predictions_tensor = torch.tensor(predictions)\n", - " mode_predictions = get_flattened_mode_predictions(\n", - " token_boxes_tensor, predictions_tensor\n", - " )\n", - " token_boxes = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1).tolist()\n", - " predicted_labels = [\n", - " self.model.config.id2label[pred] for pred in mode_predictions\n", - " ]\n", - " simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]\n", - "\n", - " df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)\n", - " df.loc[:, \"iob_pred\"] = predicted_labels\n", - " df.loc[:, \"pred\"] = simple_preds\n", - " invalid_mask = (\n", - 
" (df[\"top_left_x\"] == 0)\n", - " & (df[\"top_left_y\"] == 0)\n", - " & (df[\"bottom_right_x\"] == 0)\n", - " & (df[\"bottom_right_y\"] == 0)\n", - " )\n", - " df = df[~invalid_mask]\n", - " # we want to get actual words on the dataframe, not just subwords that correspond to tokens\n", - " # subwords from the same word share the same bounding box coordinates\n", - " # so we merge the original words onto our dataframe on bbox coordinates\n", - " words_df = pd.DataFrame(data=doc_dict[\"bboxes\"], columns=BBOX_COLS)\n", - " words_df.loc[:, \"word\"] = doc_dict[\"tokens\"]\n", - " df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n", - " subset=BBOX_COLS + [\"pred\", \"word\"]\n", - " )\n", - " # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n", - " # should always have a B entity label. Manually override labels so this is true.\n", - " first_in_group_df = df[\n", - " (df[\"pred\"].ne(df[\"pred\"].shift())) & (df[\"pred\"] != \"other\")\n", - " ]\n", - " first_in_group_df.loc[:, \"iob_pred\"] = (\n", - " \"B\" + first_in_group_df[\"iob_pred\"].str[1:]\n", - " )\n", - " df.update(first_in_group_df)\n", - " # filter for just words that were labeled with non \"other\" entities\n", - " entities_df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n", - " entities_df = entities_df[entities_df[\"pred\"] != \"other\"]\n", - " # words are labeled with IOB format which stands for inside, outside, beginning\n", - " # merge B and I entities to form one entity group\n", - " # (i.e. 
\"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n", - " entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n", - " grouped_df = (\n", - " entities_df.groupby([\"group\", \"pred\"])[\"word\"]\n", - " .apply(\" \".join)\n", - " .reset_index()[[\"pred\", \"word\"]]\n", - " )\n", - " # assign a new row every time there's a new subsidiary\n", - " grouped_df[\"row\"] = (grouped_df[\"pred\"].str.startswith(\"subsidiary\")).cumsum()\n", - " output_df = grouped_df.pivot_table(\n", - " index=\"row\", columns=\"pred\", values=\"word\", aggfunc=lambda x: \" \".join(x)\n", - " ).reset_index()\n", - " if output_df.empty:\n", - " return output_df\n", - " output_df.loc[:, \"id\"] = doc_dict[\"id\"]\n", - " return output_df" - ] - }, - { - "cell_type": "markdown", - "id": "ea9fe887-43ca-43e2-85e3-bf5371bd165f", - "metadata": {}, - "source": [ - "Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d802e00-1ca4-40b3-b15b-561711a9db70", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from mozilla_sec_eia.models.sec10k.entities import (\n", - " Ex21CompanyOwnership,\n", - " Sec10kExtractionMetadata,\n", - ")\n", - "from mozilla_sec_eia.models.sec10k.ex_21.inference import clean_extracted_df\n", - "\n", - "# Construct model_uri from model_version\n", - "model_uri = f\"models:/layoutlm_extractor/{context.op_config['model_version']}\"\n", - "model_info = mlflow.models.get_model_info(model_uri)\n", - "\n", - "def _get_data(dataset):\n", - " yield from dataset\n", - "\n", - "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n", - " \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n", - " def load_context(self, context):\n", - " \"\"\"Load pretrained model.\"\"\"\n", - " os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n", - " self.model_components = mlflow.transformers.load_model(\n", - " context.artifacts[\"layoutlm_extractor\"], return_type=\"components\"\n", - " )\n", - "\n", - " def predict(self, context, model_input: Dataset, params=None):\n", - " \"\"\"Use pretrained model and inference pipeline to perform inference.\"\"\"\n", - " # TODO: figure out device argument\n", - " pipe = pipeline(\n", - " \"token-classification\",\n", - " model=self.model_components[\"model\"],\n", - " tokenizer=self.model_components[\"tokenizer\"],\n", - " pipeline_class=LayoutLMInferencePipeline,\n", - " )\n", - "\n", - " logits = []\n", - " predictions = []\n", - " all_output_df = Ex21CompanyOwnership.example(size=0)\n", - " extraction_metadata = Sec10kExtractionMetadata.example(size=0)\n", - " for logit, pred, output_df in pipe(_get_data(model_input)):\n", - " logits.append(logit)\n", - " predictions.append(pred)\n", - " if not output_df.empty:\n", - " filename = get_metadata_filename(output_df[\"id\"].iloc[0])\n", - " 
extraction_metadata.loc[filename, [\"success\"]] = True\n", - " all_output_df = pd.concat([all_output_df, output_df])\n", - " all_output_df.columns.name = None\n", - " all_output_df = clean_extracted_df(all_output_df)\n", - " all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]]\n", - " all_output_df = all_output_df.reset_index(drop=True)\n", - " return extraction_metadata, all_output_df\n", - "\n", - "# Save model to local temp dir with artifacts, then reload for evaluation\n", - "with TemporaryDirectory() as tmp_dir:\n", - " mlflow.pyfunc.save_model(\n", - " path=tmp_dir,\n", - " python_model=Ex21Extractor(),\n", - " artifacts={\"model_components\": model_uri},\n", - " )\n", - " ex21_extraction_model = mlflow.pyfunc.load_model(tmp_dir)" - ] - }, - { - "cell_type": "markdown", - "id": "fee84b13-6c37-4afe-8faa-003ff149aa2d", - "metadata": {}, - "source": [ - "### Model Evaluation\n", - "Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run to the run used to train `layoutlm`. This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47c19b41-131f-4059-8f42-931237565a20", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def clean_ex21_validation_set(validation_df: pd.DataFrame):\n", - " \"\"\"Clean Ex. 
21 validation data to match extracted format.\"\"\"\n", - " validation_df = validation_df.rename(\n", - " columns={\n", - " \"Filename\": \"id\",\n", - " \"Subsidiary\": \"subsidiary\",\n", - " \"Location of Incorporation\": \"loc\",\n", - " \"Ownership Percentage\": \"own_per\",\n", - " }\n", - " )\n", - " validation_df[\"own_per\"] = validation_df[\"own_per\"].astype(str)\n", - " validation_df[\"filename\"] = validation_df[\"id\"].apply(get_metadata_filename)\n", - " validation_df = clean_extracted_df(validation_df)\n", - " return validation_df\n", - "\n", - "# Load labeled validation set\n", - "validation_set = clean_ex21_validation_set(\n", - " validation_helpers.load_validation_data(\"ex21_labels.csv\")\n", - ")\n", - "\n", - "# Get filing metadata for filings in validation set\n", - "cloud_interface = GCSArchive()\n", - "filing_metadata = cloud_interface.get_metadata()\n", - "ex21_validation_filing_metadata = filing_metadata[\n", - " filing_metadata.index.isin(validation_set[\"filename\"].unique())\n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "eddcc912-324a-42e9-9841-3a916c6ece6b", - "metadata": {}, - "source": [ - "Next define methods evaluating model output, then run extraction and log in child run." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from mozilla_sec_eia.models.sec10k.ex_21.inference import create_inference_dataset\n", - "\n", - "\n", - "def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):\n", - " \"\"\"Compute validation metrics for Ex. 
21 extraction.\"\"\"\n", - " shared_cols = validation_df.columns.intersection(computed_df.columns)\n", - " validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n", - " n_equal = 0\n", - " validation_filenames = validation_df[\"id\"].unique()\n", - " n_files = len(validation_filenames)\n", - " table_metrics_dict = {}\n", - " jaccard_dict = {}\n", - " incorrect_files = []\n", - " # iterate through each file and check each extracted table\n", - " for filename in validation_filenames:\n", - " extracted_table_df = computed_df[computed_df[\"id\"] == filename].reset_index(\n", - " drop=True\n", - " )\n", - " validation_table_df = validation_df[\n", - " validation_df[\"id\"] == filename\n", - " ].reset_index(drop=True)\n", - " # check if the tables are exactly equal\n", - " if extracted_table_df.equals(validation_table_df):\n", - " # TODO: strip llc and other company strings before comparison\n", - " n_equal += 1\n", - " else:\n", - " incorrect_files.append(filename)\n", - " # compute precision and recall for each column\n", - " table_metrics_dict[filename] = {}\n", - " jaccard_dict[filename] = {}\n", - " for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n", - " table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n", - " extracted_table_df, validation_table_df, value_col=col\n", - " )\n", - " table_metrics_dict[filename][f\"{col}_precision\"] = table_prec_recall[\n", - " \"precision\"\n", - " ]\n", - " table_metrics_dict[filename][f\"{col}_recall\"] = table_prec_recall[\"recall\"]\n", - " # get the jaccard similarity between columns\n", - " jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(\n", - " computed_df=extracted_table_df,\n", - " validation_df=validation_table_df,\n", - " value_col=col,\n", - " )\n", - "\n", - " jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient=\"index\").reset_index()\n", - " prec_recall_df = pd.DataFrame.from_dict(\n", - " table_metrics_dict, orient=\"index\"\n", - " 
).reset_index()\n", - "\n", - " return (\n", - " jaccard_df,\n", - " prec_recall_df,\n", - " pd.DataFrame({\"filename\": incorrect_files}),\n", - " {\n", - " \"table_accuracy\": n_equal / n_files,\n", - " \"avg_subsidiary_jaccard_sim\": jaccard_df[\"subsidiary\"].sum() / n_files,\n", - " \"avg_location_jaccard_sim\": jaccard_df[\"loc\"].sum() / n_files,\n", - " \"avg_own_per_jaccard_sim\": jaccard_df[\"own_per\"].sum() / n_files,\n", - " \"avg_subsidiary_precision\": prec_recall_df[\"subsidiary_precision\"].sum()\n", - " / n_files,\n", - " \"avg_location_precision\": prec_recall_df[\"loc_precision\"].sum() / n_files,\n", - " \"avg_own_per_precision\": prec_recall_df[\"own_per_precision\"].sum()\n", - " / n_files,\n", - " \"avg_subsidiary_recall\": prec_recall_df[\"subsidiary_recall\"].sum()\n", - " / n_files,\n", - " \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n", - " \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n", - " },\n", - " )\n", - "\n", - "\n", - "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n", - " failed_metadata, dataset = create_inference_dataset(\n", - " filings=ex21_validation_filing_metadata,\n", - " cloud_interface=cloud_interface,\n", - " has_labels=True,\n", - " )\n", - " metadata, extracted = ex21_extraction_model.predict(dataset)\n", - " metadata = pd.concat([failed_metadata, metadata])\n", - "\n", - " jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, validation_set)\n", - " mlflow.log_metrics(metrics)\n", - " mlflow.pyfunc.log_model(\"exhibit21_extractor\", python_model=ex21_extraction_model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45a5b13a-2276-4fb2-80dd-76e3f1184bea", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - 
"codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py index b685063..1a5ec96 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -173,7 +173,7 @@ def get_metadata(self, year_quarter: str | None = None) -> pd.DataFrame: """Return dataframe of filing metadata.""" selection = None if year_quarter is not None: - selection = ["year_quarter", "==", year_quarter] + selection = [("year_quarter", "==", year_quarter)] return pd.read_parquet( self.outputs_bucket_path / "sec10k_filing_metadata", filters=selection diff --git a/tests/unit/models/sec10k/ex21_model_test.py b/tests/unit/models/sec10k/ex21_model_test.py index ebfe63d..0e23fed 100644 --- a/tests/unit/models/sec10k/ex21_model_test.py +++ b/tests/unit/models/sec10k/ex21_model_test.py @@ -2,8 +2,10 @@ import torch -from mozilla_sec_eia.models.sec10k.ex_21.inference import get_flattened_mode_predictions -from mozilla_sec_eia.models.sec10k.ex_21.train_extractor import LABELS +from mozilla_sec_eia.models.sec10k.ex_21.inference import ( + LABELS, + get_flattened_mode_predictions, +) from mozilla_sec_eia.models.sec10k.utils.layoutlm import get_id_label_conversions From 37edd50bc1030082a24551113ac4fd18910d1dd3 Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 4 Oct 2024 11:24:33 -0400 Subject: [PATCH 085/161] Split dataset loading into separate assets --- src/mozilla_sec_eia/models/sec10k/__init__.py | 16 +- .../models/sec10k/ex_21/__init__.py | 12 - .../models/sec10k/ex_21/data.py | 1 - .../models/sec10k/ex_21/data/__init__.py | 72 ++ .../models/sec10k/ex_21/data/common.py | 203 ++++ .../models/sec10k/ex_21/data/inference.py 
| 119 ++ .../training.py} | 101 +- .../sec10k/ex_21/ex21_validation_helpers.py | 35 + .../models/sec10k/ex_21/inference.py | 197 +--- .../notebooks/exhibit21_extractor.ipynb | 699 ++++------- .../train_exhibit21_extraction.ipynb | 1045 ----------------- .../models/sec10k/utils/pdf.py | 15 + tests/unit/models/sec10k/ex21_model_test.py | 8 +- 13 files changed, 705 insertions(+), 1818 deletions(-) delete mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data.py create mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py create mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py create mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py rename src/mozilla_sec_eia/models/sec10k/ex_21/{create_labeled_dataset.py => data/training.py} (57%) delete mode 100644 src/mozilla_sec_eia/models/sec10k/notebooks/train_exhibit21_extraction.ipynb diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index b482aec..63097e9 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -1,6 +1,7 @@ """Implement models to extract data from SEC10k filings.""" from dagster import ( + AssetIn, Config, Definitions, define_asset_job, @@ -28,6 +29,7 @@ basic_10k_assets = load_assets_from_modules([basic_10k]) ex21_assets = load_assets_from_package_module(ex_21) +ex21_training_data_assets = load_assets_from_modules([ex_21.data]) shared_assets = load_assets_from_modules([extract]) basic_10k_production_job = model_jobs.create_production_model_job( @@ -59,16 +61,26 @@ class TrainConfig(Config): name="exhibit21_extractor", notebook_path=file_relative_path(__file__, "notebooks/exhibit21_extractor.ipynb"), config_schema=TrainConfig.to_config_schema(), + ins={ + "ex21_training_data": AssetIn(), + "ex21_validation_set": AssetIn(), + "ex21_failed_parsing_metadata": AssetIn(), + "ex21_inference_dataset": AssetIn(), + }, ) ex21_training_job = 
define_asset_job( "ex21_training", - selection=[exhibit21_extractor], + selection=[exhibit21_extractor] + ex21_training_data_assets, executor_def=in_process_executor, ) defs = Definitions( - assets=basic_10k_assets + ex21_assets + shared_assets + [exhibit21_extractor], + assets=basic_10k_assets + + ex21_assets + + shared_assets + + [exhibit21_extractor] + + ex21_training_data_assets, jobs=[ basic_10k_production_job, basic_10k_validation_job, diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 60e7c97..574074d 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -2,24 +2,15 @@ import logging -import mlflow import pandas as pd from dagster import ( - AssetIn, AssetOut, In, Out, - asset, graph_multi_asset, - multi_asset, op, ) -from mozilla_sec_eia.library.mlflow import MlflowInterface, mlflow_interface_resource -from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import ( - clean_ex21_validation_set, -) - from ..entities import ( Ex21CompanyOwnership, Sec10kExtractionMetadata, @@ -27,7 +18,6 @@ sec10k_extract_metadata_type, ) from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions -from ..utils.cloud import GCSArchive, cloud_interface_resource, get_metadata_filename from .inference import extract_filings logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -84,12 +74,10 @@ def collect_extracted_chunks( io_manager_key="pandas_parquet_io_manager" ), }, - ins={"layoutlm": AssetIn(input_manager_key="layoutlm_io_manager")}, partitions_def=year_quarter_partitions, ) def ex21_extract( sec10k_filing_metadata: pd.DataFrame, - layoutlm, ): """Extract ownership info from exhibit 21 docs.""" filing_chunks = chunk_filings(sec10k_filing_metadata) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data.py deleted file mode 100644 index 
@asset(dagster_type=ex21_extract_type)
def ex21_validation_set() -> pd.DataFrame:
    """Return the labeled Exhibit 21 validation set in the extracted-table format.

    Loads ``ex21_labels.csv`` through ``validation_helpers.load_validation_data``
    and passes it through ``clean_ex21_validation_set`` so that it can be compared
    against model output.
    """
    # NOTE: the previous docstring described the basic 10-K validation data,
    # but this asset only ever reads the Exhibit 21 label file.
    raw_labels = validation_helpers.load_validation_data("ex21_labels.csv")
    return clean_ex21_validation_set(raw_labels)
def normalize_bboxes(txt_df, pg_meta_df):
    """Normalize bboxes between 0 and 1000.

    Each ``*_x_pdf`` column is divided by the page width and each ``*_y_pdf``
    column by the page height, then multiplied by 1000.

    Args:
        txt_df: Dataframe holding the ``top_left_x_pdf``, ``top_left_y_pdf``,
            ``bottom_right_x_pdf`` and ``bottom_right_y_pdf`` columns.
        pg_meta_df: Dataframe holding ``width_pdf_coord`` and
            ``height_pdf_coord``; only its first row is used.

    Returns:
        ``txt_df`` itself: the four columns are overwritten in place and the
        same object is returned.
    """
    # Look the page dimensions up once instead of once per coordinate column.
    page_width = pg_meta_df["width_pdf_coord"].iloc[0]
    page_height = pg_meta_df["height_pdf_coord"].iloc[0]
    page_extent_by_column = {
        "top_left_x_pdf": page_width,
        "top_left_y_pdf": page_height,
        "bottom_right_x_pdf": page_width,
        "bottom_right_y_pdf": page_height,
    }
    for column, page_extent in page_extent_by_column.items():
        # Divide first, then scale: keeps the exact floating point results
        # of the original expression.
        txt_df[column] = txt_df[column] / page_extent * 1000
    return txt_df
get_pdf_data_from_path(src_path) + txt = extracted["pdf_text"] + pg_meta = extracted["page"] + # normalize bboxes between 0 and 1000 for Hugging Face + txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta) + # parse the output dictionary of labeled bounding boxes from Label Studio + doc_df = pd.DataFrame() + for item in doc_dict["result"]: + value = item["value"] + # sometimes Label Studio will fill in an empty list as a label + # when there is really no label + # TODO: do this without dict comprehension? + if ("labels" in value) and value["labels"] == []: + value = {k: v for k, v in value.items() if k != "labels"} + ind = int(item["id"].split("_")[-1]) + doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])]) + # combine the bounding boxes for each word + doc_df = doc_df.groupby(level=0).first() + txt.loc[:, "id"] = filename + # TODO: probably want to filter out these empty Ex. 21 docs + # the doc might not have any labels in it if it was an empty Ex. 21 + if "labels" not in doc_df: + doc_df.loc[:, "labels"] = pd.Series() + output_df = pd.concat([txt, doc_df[["labels"]]], axis=1) + labeled_df = pd.concat([labeled_df, output_df]) + + # fill in unlabeled words and clean up labeled dataframe + labeled_df["labels"] = labeled_df["labels"].fillna("O") + labeled_df = labeled_df.rename(columns={"labels": "ner_tag"}) + non_id_columns = [col for col in labeled_df.columns if col != "id"] + labeled_df = labeled_df.loc[:, ["id"] + non_id_columns] + + # TODO: add in sanity checks on labeled_df bounding boxes to make sure + # that no value is above 1000 or below 0 + + return labeled_df + + +def _sort_by_label_priority(target_array): + _, label2id = get_id_label_conversions(LABELS) + id_priority = [label2id[label] for label in LABEL_PRIORITY] + # Create a priority map from the label priority + priority_map = {val: idx for idx, val in enumerate(id_priority)} + # Sort the target array based on the priority map + sorted_array = sorted(target_array, key=lambda x: 
def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):
    """Get the mode prediction for each box in an Ex. 21.

    When handling multi page documents LayoutLM uses a sliding 'frame'
    with some overlap between frames. The overlap creates multiple
    predictions for the same bounding boxes. Thus it's necessary to find
    the mode of all the predictions for a bounding box and use that as the
    single prediction for each box. If there are multiple mode
    predictions for a bounding box, then ties are broken by setting
    a priority for the labels (LABEL_PRIORITY) and choosing the highest priority
    label.

    Args:
        token_boxes_tensor: Tensor of bounding boxes; it is flattened to rows
            of 4 coordinates.
        predictions_tensor: Tensor of predicted label ids; it is flattened to
            one id per box row and must hold non-negative integers because the
            votes are counted with ``np.bincount``.

    Returns:
        1-D numpy array with one label id per flattened box row, where every
        occurrence of the same box carries that box's mode prediction.
    """
    # ``reshape`` (unlike ``view``) also accepts non-contiguous tensors, and
    # ``detach().cpu()`` lets the NumPy conversion succeed for tensors that live
    # on an accelerator or are still attached to the autograd graph.
    boxes = token_boxes_tensor.reshape(-1, 4).detach().cpu().numpy()
    predictions = predictions_tensor.reshape(-1).detach().cpu().numpy()

    # Find unique boxes and, for every row, the index of its unique box
    unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True)
    # Keep the inverse 1-D so both the masks below and the returned array stay
    # flat even if np.unique hands back an (N, 1) inverse for ``axis=0``
    # (NOTE(review): believed to affect NumPy 2.0.0 only - confirm).
    inverse_indices = inverse_indices.reshape(-1)

    # For each unique box, count how often every label id was predicted for it
    pred_counts = [
        np.bincount(predictions[inverse_indices == box_idx])
        for box_idx in range(len(unique_boxes))
    ]
    # The mode is the label id with the highest count;
    # ties are broken by taking into account LABEL_PRIORITY
    modes = np.array(
        [
            _sort_by_label_priority(np.flatnonzero(counts == counts.max()))[0]
            for counts in pred_counts
        ]
    )
    # Broadcast each unique box's mode back onto every row of that box
    return modes[inverse_indices]
def _cache_pdfs(
    filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path
) -> pd.DataFrame:
    """Iterate filings and cache their exhibit 21 as PDFs in ``pdf_dir``.

    Args:
        filings: Filing metadata handed to ``cloud_interface.iterate_filings``.
        cloud_interface: Archive used to iterate filings and to build the local
            PDF path for each of them.
        pdf_dir: Directory the PDFs are written to.

    Returns:
        Dataframe indexed by ``filename`` with ``success`` and ``notes``
        columns. A row is only added for filings whose PDF could not be
        produced; ``notes`` then holds the error message or ``"PDF empty"``.
    """
    extraction_metadata = pd.DataFrame(
        {
            "filename": pd.Series(dtype=str),
            "success": pd.Series(dtype=bool),
            "notes": pd.Series(dtype=str),
        }
    ).set_index("filename")

    def _record_failure(filename: str, reason: str) -> None:
        # Write to the declared "notes" column; the previous "note" key created
        # a stray column and left "notes" empty.
        extraction_metadata.loc[filename, ["success"]] = False
        extraction_metadata.loc[filename, ["notes"]] = reason

    for filing in cloud_interface.iterate_filings(filings):
        # presumably a pathlib.Path (it is used via .open/.stat/.unlink)
        pdf_path = cloud_interface.get_local_filename(
            cache_directory=pdf_dir, filing=filing, extension=".pdf"
        )

        # Some filings are poorly formatted and fail in `save_as_pdf`
        # We want a record of these but don't want to stop run
        try:
            with pdf_path.open("wb") as f:
                filing.ex_21.save_as_pdf(f)
        except Exception as e:
            _record_failure(filing.filename, str(e))
            # Drop whatever was (partially) written so a broken PDF is not
            # parsed later, and skip the empty-file check so the real error
            # message is not replaced by "PDF empty".
            pdf_path.unlink(missing_ok=True)
            continue

        # Some pdfs are empty. Check for these and remove from dir
        if pdf_path.stat().st_size == 0:
            _record_failure(filing.filename, "PDF empty")
            pdf_path.unlink()

    return extraction_metadata
src/mozilla_sec_eia/models/sec10k/ex_21/data/training.py index 8530705..e354a66 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/create_labeled_dataset.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/training.py @@ -1,4 +1,4 @@ -"""Module handling Label Studio inputs and outputs and preparing a dataset for fine-tuning.""" +"""Create training dataset for layoutlm extraction.""" import json import logging @@ -7,26 +7,19 @@ import pandas as pd -from ..utils.cloud import GCSArchive -from ..utils.layoutlm import normalize_bboxes -from ..utils.pdf import ( +from ...utils.cloud import GCSArchive +from ...utils.pdf import ( + get_image_dict, get_pdf_data_from_path, pil_to_cv2, render_page, ) +from .common import BBOX_COLS_PDF, format_label_studio_output logger = logging.getLogger(f"catalystcoop.{__name__}") ROOT_DIR = Path(__file__).parent.parent.parent.parent.parent.parent.resolve() -BBOX_COLS_PDF = [ - "top_left_x_pdf", - "top_left_y_pdf", - "bottom_right_x_pdf", - "bottom_right_y_pdf", -] - - def create_inputs_for_label_studio( model_version: str = "v1.0", pdfs_dir: Path = ROOT_DIR / "sec10k_filings/pdfs", @@ -135,89 +128,9 @@ def get_bbox_dicts( return [box_dict, word_dict] -def _is_cik_in_training_data(labeled_json_filename, tracking_df): - # TODO: for now CIK is stored as an int, update when fixed - cik = int(labeled_json_filename.split("/")[-1].split("-")[0]) - return cik in tracking_df.CIK.unique() - - -def format_label_studio_output( - labeled_json_dir=ROOT_DIR / "sec10k_filings/labeled_jsons", - pdfs_dir=ROOT_DIR / "sec10k_filings/pdfs", -) -> pd.DataFrame: - """Format Label Studio output JSONs into dataframe.""" - labeled_df = pd.DataFrame() - # TODO: make this path stuff less janky? 
- tracking_df = pd.read_csv(ROOT_DIR / "labeled_data_tracking.csv") - for json_filename in os.listdir(labeled_json_dir): - if not json_filename[0].isdigit() or json_filename.endswith(".json"): - continue - json_file_path = labeled_json_dir / json_filename - with Path.open(json_file_path) as j: - doc_dict = json.loads(j.read()) - filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0] - # check if old local naming schema is being used - if len(filename.split("-")) == 6: - filename = "-".join(filename.split("-")[2:]) - if not _is_cik_in_training_data(filename, tracking_df=tracking_df): - continue - pdf_filename = filename + ".pdf" - src_path = pdfs_dir / pdf_filename - extracted, pg = get_pdf_data_from_path(src_path) - txt = extracted["pdf_text"] - pg_meta = extracted["page"] - # normalize bboxes between 0 and 1000 for Hugging Face - txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta) - # parse the output dictionary of labeled bounding boxes from Label Studio - doc_df = pd.DataFrame() - for item in doc_dict["result"]: - value = item["value"] - # sometimes Label Studio will fill in an empty list as a label - # when there is really no label - # TODO: do this without dict comprehension? - if ("labels" in value) and value["labels"] == []: - value = {k: v for k, v in value.items() if k != "labels"} - ind = int(item["id"].split("_")[-1]) - doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])]) - # combine the bounding boxes for each word - doc_df = doc_df.groupby(level=0).first() - txt.loc[:, "id"] = filename - # TODO: probably want to filter out these empty Ex. 21 docs - # the doc might not have any labels in it if it was an empty Ex. 
def clean_extracted_df(extracted_df):
    """Perform basic cleaning on a dataframe extracted from an Ex. 21.

    The caller's dataframe is left untouched; a cleaned copy is returned.

    Cleaning steps:
      * drop the ``row`` column if present,
      * strip, lower-case and trim leading/trailing special characters from
        ``subsidiary``,
      * strip, lower-case and remove everything but letters, ``&``, ``,`` and
        whitespace from ``loc`` (if present),
      * reduce ``own_per`` (if present) to its numeric part and convert it to
        ``float64``; values that still are not a number become NaN,
      * drop rows with a null ``subsidiary``.

    Args:
        extracted_df: Dataframe with a ``subsidiary`` column and optional
            ``row``, ``loc`` and ``own_per`` columns holding strings.

    Returns:
        The cleaned dataframe. An empty input is returned as is.
    """
    if extracted_df.empty:
        return extracted_df
    # Without a "row" column the original code assigned straight into the
    # caller's frame; always work on a copy instead.
    extracted_df = extracted_df.copy()
    if "row" in extracted_df.columns:
        extracted_df = extracted_df.drop(columns=["row"])
    extracted_df["subsidiary"] = extracted_df["subsidiary"].str.strip().str.lower()
    # strip special chars from the start and end of the string
    extracted_df["subsidiary"] = extracted_df["subsidiary"].str.replace(
        r"^[^\w&\s]+|[^\w&\s]+$", "", regex=True
    )
    if "loc" in extracted_df.columns:
        extracted_df["loc"] = extracted_df["loc"].str.strip().str.lower()
        extracted_df["loc"] = extracted_df["loc"].str.replace(
            r"[^a-zA-Z&,\s]", "", regex=True
        )
    if "own_per" in extracted_df.columns:
        # remove special chars and letters
        own_per = extracted_df["own_per"].str.replace(r"[^\d.]", "", regex=True)
        # Find values with multiple decimal points and keep the first number
        own_per = own_per.str.replace(r"(\d*\.\d+)\..*", r"\1", regex=True)
        own_per = own_per.replace("", np.nan)
        # `astype(..., errors="ignore")` silently left the whole column as
        # strings when a single value (e.g. ".") was not a number; coerce
        # such values to NaN so the column is always float64.
        extracted_df["own_per"] = pd.to_numeric(own_per, errors="coerce").astype(
            "float64"
        )
    # drop rows that have a null subsidiary value
    extracted_df = extracted_df.dropna(subset=["subsidiary"])
    return extracted_df
21 validation data to match extracted format.""" validation_df = validation_df.rename( diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py index 5633e40..6f517a3 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py @@ -1,162 +1,18 @@ """Module for formatting inputs and performing inference with a fine-tuned LayoutLM model.""" import logging -import os -import tempfile import traceback -from pathlib import Path -import numpy as np import pandas as pd from mlflow.pyfunc import PyFuncModel from ..entities import Ex21CompanyOwnership from ..utils.cloud import GCSArchive -from ..utils.layoutlm import ( - normalize_bboxes, -) -from ..utils.pdf import ( - get_pdf_data_from_path, -) -from .create_labeled_dataset import ( - BBOX_COLS_PDF, - format_label_studio_output, - get_image_dict, -) - -# When handling multi page documents LayoutLM uses a sliding 'frame' -# with some overlap between frames. The overlap creates multiple -# predictions for the same bounding boxes. If there are multiple mode -# predictions for a bounding box, then ties are broken by setting -# a priority for the labels and choosing the highest priority label. 
-LABEL_PRIORITY = [ - "I-Subsidiary", - "I-Loc", - "I-Own_Per", - "B-Subsidiary", - "B-Loc", - "B-Own_Per", - "O", -] - -LABELS = [ - "O", - "B-Subsidiary", - "I-Subsidiary", - "B-Loc", - "I-Loc", - "B-Own_Per", - "I-Own_Per", -] - -BBOX_COLS = ["top_left_x", "top_left_y", "bottom_right_x", "bottom_right_y"] -label2id = {v: k for k, v in enumerate(LABELS)} +from .data.inference import create_inference_dataset logger = logging.getLogger(f"catalystcoop.{__name__}") -def format_unlabeled_pdf_dataframe(pdfs_dir: Path): - """Read and format PDFs into a dataframe (without labels).""" - inference_df = pd.DataFrame() - for pdf_filename in os.listdir(pdfs_dir): - if not pdf_filename.endswith(".pdf"): - continue - src_path = pdfs_dir / pdf_filename - filename = Path(pdf_filename).stem - extracted, pg = get_pdf_data_from_path(src_path) - txt = extracted["pdf_text"] - pg_meta = extracted["page"] - # normalize bboxes between 0 and 1000 for Hugging Face - txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta) - txt.loc[:, "id"] = filename - inference_df = pd.concat([inference_df, txt]) - return inference_df - - -def _cache_pdfs( - filings: pd.DataFrame, cloud_interface: GCSArchive, pdf_dir: Path -) -> pd.DataFrame: - """Iterate filings and cache pdfs.""" - extraction_metadata = pd.DataFrame( - { - "filename": pd.Series(dtype=str), - "success": pd.Series(dtype=bool), - "notes": pd.Series(dtype=str), - } - ).set_index("filename") - - for filing in cloud_interface.iterate_filings(filings): - pdf_path = cloud_interface.get_local_filename( - cache_directory=pdf_dir, filing=filing, extension=".pdf" - ) - - # Some filings are poorly formatted and fail in `save_as_pdf` - # We want a record of these but don't want to stop run - try: - with pdf_path.open("wb") as f: - filing.ex_21.save_as_pdf(f) - except Exception as e: - extraction_metadata.loc[filing.filename, ["success"]] = False - extraction_metadata.loc[filing.filename, ["note"]] = str(e) - - # Some pdfs are empty. 
Check for these and remove from dir - if pdf_path.stat().st_size == 0: - extraction_metadata.loc[filing.filename, ["success"]] = False - extraction_metadata.loc[filing.filename, ["note"]] = "PDF empty" - pdf_path.unlink() - - return extraction_metadata - - -def create_inference_dataset( - filing_metadata: pd.DataFrame, cloud_interface: GCSArchive, has_labels: bool = False -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Create a Hugging Face Dataset from PDFs for inference.""" - filings_with_ex21 = filing_metadata[~filing_metadata["exhibit_21_version"].isna()] - - # Parse PDFS - with ( - tempfile.TemporaryDirectory() as pdfs_dir, - tempfile.TemporaryDirectory() as labeled_json_dir, - ): - pdfs_dir = Path(pdfs_dir) - labeled_json_dir = Path(labeled_json_dir) - - extraction_metadata = _cache_pdfs( - filings_with_ex21, - cloud_interface=cloud_interface, - pdf_dir=pdfs_dir, - ) - if has_labels: - inference_df = format_label_studio_output( - labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir - ) - else: - inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir) - image_dict = get_image_dict(pdfs_dir) - - annotations = [] - for filename, image in image_dict.items(): - annotation = { - "id": filename, - "tokens": inference_df.groupby("id")["text"].apply(list).loc[filename], - "bboxes": inference_df.loc[inference_df["id"] == filename, :][BBOX_COLS_PDF] - .to_numpy() - .tolist(), - "image": image.tobytes(), - "mode": image.mode, - "width": image.size[0], - "height": image.size[1], - } - if has_labels: - annotation["ner_tags"] = ( - inference_df.groupby("id")["ner_tag"].apply(list).loc[filename] - ) - annotations.append(annotation) - - return extraction_metadata, pd.DataFrame(annotations) - - def extract_filings( filings: pd.DataFrame, layoutlm: PyFuncModel, @@ -182,54 +38,3 @@ def extract_filings( ).set_index("filename") extracted = Ex21CompanyOwnership.example(size=0) return metadata, extracted - - -def _sort_by_label_priority(target_array): - id_priority = 
[label2id[label] for label in LABEL_PRIORITY] - # Create a priority map from the label priority - priority_map = {val: idx for idx, val in enumerate(id_priority)} - # Sort the target array based on the priority map - sorted_array = sorted(target_array, key=lambda x: priority_map.get(x, float("inf"))) - return sorted_array - - -def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor): - """Get the mode prediction for each box in an Ex. 21. - - When handling multi page documents LayoutLM uses a sliding 'frame' - with some overlap between frames. The overlap creates multiple - predictions for the same bounding boxes. Thus it's necessary to find - the mode of all the predictions for a bounding box and use that as the - single prediction for each box. If there are multiple mode - predictions for a bounding box, then ties are broken by setting - a priority for the labels (LABEL_PRIORITY) and choosing the highest priority - label. - """ - # Flatten the tensors - flat_token_boxes = token_boxes_tensor.view(-1, 4) - flat_predictions = predictions_tensor.view(-1) - - boxes = flat_token_boxes.numpy() - predictions = flat_predictions.numpy() - - # Find unique boxes and indices - unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True) - - # Compute the mode for each unique bounding box - # for each unique box in boxes, create a list with all predictions for that box - # get the indices in predictions where the corresponding index in boxes is - unique_box_predictions = [ - predictions[np.where(inverse_indices == i)[0]] for i in range(len(unique_boxes)) - ] - pred_counts = [np.bincount(arr) for arr in unique_box_predictions] - # Compute the mode of predictions for each group - # break ties by taking into account LABEL_PRIORITY - modes = np.array( - [ - _sort_by_label_priority(np.where(arr == np.max(arr))[0])[0] - for arr in pred_counts - ] - ) - flattened_modes = modes[inverse_indices] - - return flattened_modes diff --git 
a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index 4efc905..c03cc4e 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -28,8 +28,7 @@ "import dagstermill\n", "\n", "context = dagstermill.get_context(op_config={\n", - " \"uri\": \"runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor\",\n", - " \"training_set\": \"labeledv0.2\",\n", + " \"uri\": None,\n", "})" ] }, @@ -50,17 +49,8 @@ "tags": [] }, "source": [ - "### Setup training/test sets" - ] - }, - { - "cell_type": "markdown", - "id": "8b389646-c4af-4c92-a29e-b4b23f4c391b", - "metadata": {}, - "source": [ - "Download training data and convert to NER annotations. This involves converting exhibit 21 filings into PDF's, then using labels generated by label studio to produce the annotations. These annotations are then used to create a huggingface dataset that will be used for training.\n", - "\n", - "First define several helper functions to do the conversion." + "### Define training metrics\n", + "The method `compute_metrics` will be used to score the model. It computes precision, recall, f1 score, and accuracy on bounding box labels output by `layoutlm`." 
] }, { @@ -72,205 +62,15 @@ }, "outputs": [], "source": [ - "import json\n", "import os\n", - "from pathlib import Path\n", "from tempfile import TemporaryDirectory\n", "\n", "import numpy as np\n", "import pandas as pd\n", "\n", "from mozilla_sec_eia.library import validation_helpers\n", - "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, get_metadata_filename\n", - "from mozilla_sec_eia.models.sec10k.utils.layoutlm import normalize_bboxes\n", - "from mozilla_sec_eia.models.sec10k.utils.pdf import (\n", - " get_pdf_data_from_path,\n", - " render_page,\n", - ")\n", - "\n", - "# Set some constants\n", - "LABELS = [\n", - " \"O\",\n", - " \"B-Subsidiary\",\n", - " \"I-Subsidiary\",\n", - " \"B-Loc\",\n", - " \"I-Loc\",\n", - " \"B-Own_Per\",\n", - " \"I-Own_Per\",\n", - "]\n", - "LABEL_PRIORITY = [\n", - " \"I-Subsidiary\",\n", - " \"I-Loc\",\n", - " \"I-Own_Per\",\n", - " \"B-Subsidiary\",\n", - " \"B-Loc\",\n", - " \"B-Own_Per\",\n", - " \"O\",\n", - "]\n", - "\n", - "BBOX_COLS = [\"top_left_x\", \"top_left_y\", \"bottom_right_x\", \"bottom_right_y\"]\n", - "BBOX_COLS_PDF = [\n", - " \"top_left_x_pdf\",\n", - " \"top_left_y_pdf\",\n", - " \"bottom_right_x_pdf\",\n", - " \"bottom_right_y_pdf\",\n", - "]\n", - "\n", - "# Map back and forth between id's and labels\n", - "id2label = dict(enumerate(LABELS))\n", - "label2id = {v: k for k, v in enumerate(LABELS)}\n", - "\n", - "def _is_cik_in_training_data(labeled_json_filename, tracking_df):\n", - " # TODO: for now CIK is stored as an int, update when fixed\n", - " cik = int(labeled_json_filename.split(\"/\")[-1].split(\"-\")[0])\n", - " return cik in tracking_df.CIK.unique()\n", - "\n", - "\n", - "def format_label_studio_output(\n", - " labeled_json_dir: Path,\n", - " pdfs_dir: Path,\n", - ") -> pd.DataFrame:\n", - " \"\"\"Format Label Studio output JSONs into dataframe.\"\"\"\n", - " labeled_df = pd.DataFrame()\n", - " # TODO: make this path stuff less janky?\n", - " tracking_df = 
validation_helpers.load_training_data(\"ex21_labels.csv\")\n", - " for json_filename in os.listdir(labeled_json_dir):\n", - " if not json_filename[0].isdigit() or json_filename.endswith(\".json\"):\n", - " continue\n", - " json_file_path = labeled_json_dir / json_filename\n", - " with Path.open(json_file_path) as j:\n", - " doc_dict = json.loads(j.read())\n", - "\n", - " filename = doc_dict[\"task\"][\"data\"][\"ocr\"].split(\"/\")[-1].split(\".\")[0]\n", - " # check if old local naming schema is being used\n", - " if len(filename.split(\"-\")) == 6:\n", - " filename = \"-\".join(filename.split(\"-\")[2:])\n", - " if not _is_cik_in_training_data(filename, tracking_df=tracking_df):\n", - " continue\n", - "\n", - " pdf_filename = filename + \".pdf\"\n", - " src_path = pdfs_dir / pdf_filename\n", - " extracted, pg = get_pdf_data_from_path(src_path)\n", - " txt = extracted[\"pdf_text\"]\n", - " pg_meta = extracted[\"page\"]\n", - " # normalize bboxes between 0 and 1000 for Hugging Face\n", - " txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)\n", - " # parse the output dictionary of labeled bounding boxes from Label Studio\n", - " doc_df = pd.DataFrame()\n", - " for item in doc_dict[\"result\"]:\n", - " value = item[\"value\"]\n", - " # sometimes Label Studio will fill in an empty list as a label\n", - " # when there is really no label\n", - " # TODO: do this without dict comprehension?\n", - " if (\"labels\" in value) and value[\"labels\"] == []:\n", - " value = {k: v for k, v in value.items() if k != \"labels\"}\n", - " ind = int(item[\"id\"].split(\"_\")[-1])\n", - " doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])\n", - "\n", - " # combine the bounding boxes for each word\n", - " doc_df = doc_df.groupby(level=0).first()\n", - " txt.loc[:, \"id\"] = filename\n", - " # TODO: probably want to filter out these empty Ex. 21 docs\n", - " # the doc might not have any labels in it if it was an empty Ex. 
21\n", - " if \"labels\" not in doc_df:\n", - " doc_df.loc[:, \"labels\"] = pd.Series()\n", - "\n", - " output_df = pd.concat([txt, doc_df[[\"labels\"]]], axis=1)\n", - " labeled_df = pd.concat([labeled_df, output_df])\n", - "\n", - " # fill in unlabeled words and clean up labeled dataframe\n", - " labeled_df[\"labels\"] = labeled_df[\"labels\"].fillna(\"O\")\n", - " labeled_df = labeled_df.rename(columns={\"labels\": \"ner_tag\"})\n", - " non_id_columns = [col for col in labeled_df.columns if col != \"id\"]\n", - " labeled_df = labeled_df.loc[:, [\"id\"] + non_id_columns]\n", - "\n", - " # TODO: add in sanity checks on labeled_df bounding boxes to make sure\n", - " # that no value is above 1000 or below 0\n", - "\n", - " return labeled_df\n", - "\n", - "\n", - "def get_image_dict(pdfs_dir):\n", - " \"\"\"Create a dictionary with filenames and their Ex. 21 images.\"\"\"\n", - " image_dict = {}\n", - " for pdf_filename in os.listdir(pdfs_dir):\n", - " if pdf_filename.split(\".\")[-1] != \"pdf\":\n", - " continue\n", - " pdf_file_path = pdfs_dir / pdf_filename\n", - " _, pg = get_pdf_data_from_path(pdf_file_path)\n", - " full_pg_img = render_page(pg)\n", - " filename = pdf_filename.split(\".\")[0]\n", - " image_dict[filename] = full_pg_img\n", - " return image_dict\n", - "\n", - "\n", - "def format_as_ner_annotations(\n", - " labeled_json_path: Path,\n", - " pdfs_path: Path,\n", - " gcs_folder_name: Path,\n", - ") -> list[dict]:\n", - " \"\"\"Format a Label Studio output JSONs as NER annotations.\n", - "\n", - " Formats the dataframe as named entity recognition annotations.\n", - " # TODO: say more about this format\n", - "\n", - " Returns:\n", - " ner_annotations: a list of dicts, with one dict for each doc.\n", - " \"\"\"\n", - " GCSArchive().cache_training_data(\n", - " json_cache_path=labeled_json_path,\n", - " pdf_cache_path=pdfs_path,\n", - " gcs_folder_name=gcs_folder_name\n", - " )\n", - "\n", - " labeled_df = format_label_studio_output(\n", - " 
labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path\n", - " )\n", - " # convert dataframe/dictionary into NER format\n", - " # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py\n", - " # complete dataset is a list of dicts, with one dict for each doc\n", - " doc_filenames = labeled_df[\"id\"].unique()\n", - " image_dict = get_image_dict(pdfs_dir=pdfs_path)\n", - " ner_annotations = []\n", - " for filename in doc_filenames:\n", - " annotation = {\n", - " \"id\": filename,\n", - " \"tokens\": labeled_df.groupby(\"id\")[\"text\"].apply(list).loc[filename],\n", - " \"ner_tags\": labeled_df.groupby(\"id\")[\"ner_tag\"].apply(list).loc[filename],\n", - " \"bboxes\": labeled_df.loc[labeled_df[\"id\"] == filename, :][BBOX_COLS_PDF]\n", - " .to_numpy()\n", - " .tolist(),\n", - " \"image\": image_dict[filename],\n", - " }\n", - " ner_annotations.append(annotation)\n", - "\n", - " return ner_annotations\n", - "\n", - "def _prepare_dataset(annotations, processor, label2id):\n", - " \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n", - "\n", - " def _convert_ner_tags_to_id(ner_tags, label2id):\n", - " return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n", - "\n", - " images = annotations[\"image\"]\n", - " words = annotations[\"tokens\"]\n", - " boxes = annotations[\"bboxes\"]\n", - " # Map over labels and convert to numeric id for each ner_tag\n", - " ner_tags = [\n", - " _convert_ner_tags_to_id(ner_tags, label2id)\n", - " for ner_tags in annotations[\"ner_tags\"]\n", - " ]\n", + "from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename\n", "\n", - " encoding = processor(\n", - " images,\n", - " words,\n", - " boxes=boxes,\n", - " word_labels=ner_tags,\n", - " truncation=True,\n", - " padding=\"max_length\",\n", - " )\n", - "\n", - " return encoding\n", "\n", "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n", " \"\"\"Compute metrics to train 
and evaluate the model on.\"\"\"\n", @@ -306,6 +106,43 @@ " }" ] }, + { + "cell_type": "markdown", + "id": "39f0cbeb-7895-46bd-97d1-2c74e5265e12", + "metadata": { + "tags": [] + }, + "source": [ + "#### Load training data asset\n", + "\n", + "The following cell will load training data from a dagster asset. Using the dagster asset will allow easily caching the training data which can be computationally intensive to produce. When running this notebook in dagster directly, this cell will be replaced by dagster actually materializing the asset." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "f8df608a-32b7-4795-a670-63a2e8772910", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2024-10-03 17:47:13 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n" + ] + } + ], + "source": [ + "from mozilla_sec_eia.models.sec10k import defs\n", + "\n", + "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")" + ] + }, { "cell_type": "markdown", "id": "8160263c-8f69-437c-918b-e56ad007961a", @@ -326,12 +163,120 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 6, "id": "71d205b2-e6ea-4ad0-982c-22e762269119", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", + "You should probably TRAIN this model 
on a down-stream task to be able to use it for predictions and inference.\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "fafaf3dc8cfe431b90802b61bfe0acc6", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Map: 0%| | 0/159 [00:00\n", + " \n", + " \n", + " [ 6/1000 00:02 < 10:34, 1.57 it/s, Epoch 0.04/8]\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
StepTraining LossValidation Loss

" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/10/03 17:52:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run orderly-mare-33 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/a94ac72df36447a489d576ea06a71a4a.\n", + "2024/10/03 17:52:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n", + "2024/10/03 17:52:09 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n", + "2024/10/03 17:52:10 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n" + ] + }, + { + "ename": "OutOfMemoryError", + "evalue": "CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 129.75 MiB is free. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 1.89 GiB is allocated by PyTorch, and 979.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[6], line 118\u001b[0m\n\u001b[1;32m 106\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m 107\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m 108\u001b[0m args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 113\u001b[0m compute_metrics\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mlambda\u001b[39;00m p: compute_metrics(p, metric\u001b[38;5;241m=\u001b[39mmetric, label_list\u001b[38;5;241m=\u001b[39mLABELS),\n\u001b[1;32m 114\u001b[0m )\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mlflow\u001b[38;5;241m.\u001b[39mstart_run() \u001b[38;5;28;01mas\u001b[39;00m training_run:\n\u001b[1;32m 117\u001b[0m \u001b[38;5;66;03m# Train inside mlflow run. 
Mlflow will automatically handle logging training metrcis\u001b[39;00m\n\u001b[0;32m--> 118\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# Log finetuend model with mlflow\u001b[39;00m\n\u001b[1;32m 121\u001b[0m model \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m: trainer\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m: trainer\u001b[38;5;241m.\u001b[39mtokenizer}\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1936\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1939\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1940\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1941\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1942\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1943\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:2341\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2338\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2339\u001b[0m grad_norm \u001b[38;5;241m=\u001b[39m _grad_norm\n\u001b[0;32m-> 2341\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2343\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_optimizer_step(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 2345\u001b[0m optimizer_was_run \u001b[38;5;241m=\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39moptimizer_step_was_skipped\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/accelerate/optimizer.py:172\u001b[0m, in \u001b[0;36mAcceleratedOptimizer.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accelerate_step_called \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 172\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mclosure\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator_state\u001b[38;5;241m.\u001b[39mdistributed_type \u001b[38;5;241m==\u001b[39m DistributedType\u001b[38;5;241m.\u001b[39mXLA:\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_state\u001b[38;5;241m.\u001b[39mis_xla_gradients_synced \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/lr_scheduler.py:130\u001b[0m, in \u001b[0;36mLRScheduler.__init__..patch_track_step_called..wrap_step..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 128\u001b[0m opt \u001b[38;5;241m=\u001b[39m opt_ref()\n\u001b[1;32m 129\u001b[0m opt\u001b[38;5;241m.\u001b[39m_opt_called \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m# type: ignore[union-attr]\u001b[39;00m\n\u001b[0;32m--> 130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__get__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mopt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:484\u001b[0m, in \u001b[0;36mOptimizer.profile_hook_step..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 480\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 481\u001b[0m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must return None or a tuple of (new_args, new_kwargs), but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 482\u001b[0m )\n\u001b[0;32m--> 484\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 485\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_optimizer_step_code()\n\u001b[1;32m 487\u001b[0m \u001b[38;5;66;03m# call optimizer step post hooks\u001b[39;00m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:89\u001b[0m, in \u001b[0;36m_use_grad_for_differentiable.._use_grad\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 87\u001b[0m torch\u001b[38;5;241m.\u001b[39mset_grad_enabled(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefaults[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdifferentiable\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 88\u001b[0m torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n\u001b[0;32m---> 89\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 91\u001b[0m 
torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:227\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 214\u001b[0m beta1, beta2 \u001b[38;5;241m=\u001b[39m cast(Tuple[\u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mfloat\u001b[39m], group[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbetas\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 216\u001b[0m has_complex \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_init_group(\n\u001b[1;32m 217\u001b[0m group,\n\u001b[1;32m 218\u001b[0m params_with_grad,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 224\u001b[0m state_steps,\n\u001b[1;32m 225\u001b[0m )\n\u001b[0;32m--> 227\u001b[0m \u001b[43madamw\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams_with_grad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 229\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 230\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 231\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 232\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 233\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 234\u001b[0m \u001b[43m \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 235\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 236\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 237\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 238\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mweight_decay\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 239\u001b[0m \u001b[43m \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43meps\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 240\u001b[0m \u001b[43m \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmaximize\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 241\u001b[0m \u001b[43m \u001b[49m\u001b[43mforeach\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mforeach\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 242\u001b[0m \u001b[43m \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcapturable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 243\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdifferentiable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[43mfused\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfused\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 245\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgrad_scale\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 246\u001b[0m \u001b[43m \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfound_inf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 247\u001b[0m \u001b[43m \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 248\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:161\u001b[0m, in 
\u001b[0;36m_disable_dynamo_if_unsupported..wrapper..maybe_fallback\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disabled_func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 161\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:767\u001b[0m, in \u001b[0;36madamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, has_complex, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 765\u001b[0m func \u001b[38;5;241m=\u001b[39m _single_tensor_adamw\n\u001b[0;32m--> 767\u001b[0m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 768\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 769\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 770\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 771\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 772\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 773\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 774\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 775\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 776\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 777\u001b[0m \u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 778\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweight_decay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 779\u001b[0m \u001b[43m \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 780\u001b[0m \u001b[43m \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmaximize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 781\u001b[0m \u001b[43m \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcapturable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 782\u001b[0m \u001b[43m \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdifferentiable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 783\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgrad_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 784\u001b[0m \u001b[43m \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfound_inf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 785\u001b[0m \u001b[43m \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 786\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:600\u001b[0m, in \u001b[0;36m_multi_tensor_adamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable, has_complex)\u001b[0m\n\u001b[1;32m 598\u001b[0m exp_avg_sq_sqrt \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39m_foreach_sqrt(device_max_exp_avg_sqs)\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 600\u001b[0m exp_avg_sq_sqrt \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_foreach_sqrt\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice_exp_avg_sqs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 602\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)\n\u001b[1;32m 603\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_add_(exp_avg_sq_sqrt, eps)\n", + "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 129.75 MiB is free. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 1.89 GiB is allocated by PyTorch, and 979.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" + ] + } + ], "source": [ "import mlflow\n", "from datasets import (\n", @@ -353,6 +298,11 @@ "from transformers.data.data_collator import default_data_collator\n", "\n", "from mozilla_sec_eia.library.mlflow import configure_mlflow\n", + "from mozilla_sec_eia.models.sec10k.ex_21.data.common import (\n", + " BBOX_COLS,\n", + " LABELS,\n", + " get_id_label_conversions,\n", + ")\n", "\n", "load_dotenv()\n", "\n", @@ -360,19 +310,40 @@ "configure_mlflow()\n", "mlflow.set_experiment(\"exhibit21_extraction_test\")\n", "\n", + "\n", + "def _prepare_dataset(annotations, processor, label2id):\n", + " \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n", + "\n", + " def _convert_ner_tags_to_id(ner_tags, label2id):\n", + " return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n", + "\n", + " images = annotations[\"image\"]\n", + " words = annotations[\"tokens\"]\n", + " boxes = annotations[\"bboxes\"]\n", + " # Map over labels and convert to numeric id for each ner_tag\n", + " ner_tags = [\n", + " _convert_ner_tags_to_id(ner_tags, label2id)\n", + " for ner_tags in annotations[\"ner_tags\"]\n", + " ]\n", + "\n", + " encoding = processor(\n", + " images,\n", + " words,\n", + " boxes=boxes,\n", + " word_labels=ner_tags,\n", + " truncation=True,\n", + " padding=\"max_length\",\n", + " )\n", + "\n", + " return encoding\n", + "\n", "# Only finetune if configured to do so\n", "training_run_id = None\n", "if context.op_config[\"uri\"] is None:\n", + " id2label, label2id = get_id_label_conversions(LABELS)\n", " # Change temp_dir to save training data locally for inspection\n", - " with TemporaryDirectory() as temp_dir:\n", - " ner_annotations = format_as_ner_annotations(\n", - " labeled_json_path=Path(temp_dir) / \"sec10k_filings\" / \"labeled_jsons\",\n", - " pdfs_path=Path(temp_dir) / \"sec10k_filings\" / \"pdfs\",\n", - " 
gcs_folder_name=context.op_config[\"training_set\"],\n", - " )\n", - "\n", " # Cache/prepare training data\n", - " dataset = Dataset.from_list(ner_annotations)\n", + " dataset = Dataset.from_list(ex21_training_data)\n", "\n", " # Load pretrained model\n", " model = LayoutLMv3ForTokenClassification.from_pretrained(\n", @@ -449,7 +420,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "id": "42c8e920-d671-40c2-b5db-c43611a33897", "metadata": { "tags": [] @@ -621,71 +592,12 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "id": "4d802e00-1ca4-40b3-b15b-561711a9db70", "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "ff844a110fb04ddcbe788e647651786c", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading artifacts: 0%| | 0/1 [00:00, skipping schema inference\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "004ac3503c77461f9ce7938949a660c5", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading artifacts: 0%| | 0/17 [00:00`_ for more details.\n", - " warnings.warn(\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "02516db30cd241ed97c08df920368bf8", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading artifacts: 0%| | 0/17 [00:00 pd.DataFrame:\n", - " \"\"\"Format Label Studio output JSONs into dataframe.\"\"\"\n", - " labeled_df = pd.DataFrame()\n", - " # TODO: make this path stuff less janky?\n", - " tracking_df = validation_helpers.load_training_data(\"ex21_labels.csv\")\n", - " for json_filename in os.listdir(labeled_json_dir):\n", - " if not json_filename[0].isdigit() or json_filename.endswith(\".json\"):\n", - " continue\n", - " json_file_path = labeled_json_dir / json_filename\n", - " with Path.open(json_file_path) as j:\n", - " doc_dict = json.loads(j.read())\n", - 
"\n", - " filename = doc_dict[\"task\"][\"data\"][\"ocr\"].split(\"/\")[-1].split(\".\")[0]\n", - " # check if old local naming schema is being used\n", - " if len(filename.split(\"-\")) == 6:\n", - " filename = \"-\".join(filename.split(\"-\")[2:])\n", - " if not _is_cik_in_training_data(filename, tracking_df=tracking_df):\n", - " continue\n", - "\n", - " pdf_filename = filename + \".pdf\"\n", - " src_path = pdfs_dir / pdf_filename\n", - " extracted, pg = get_pdf_data_from_path(src_path)\n", - " txt = extracted[\"pdf_text\"]\n", - " pg_meta = extracted[\"page\"]\n", - " # normalize bboxes between 0 and 1000 for Hugging Face\n", - " txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta)\n", - " # parse the output dictionary of labeled bounding boxes from Label Studio\n", - " doc_df = pd.DataFrame()\n", - " for item in doc_dict[\"result\"]:\n", - " value = item[\"value\"]\n", - " # sometimes Label Studio will fill in an empty list as a label\n", - " # when there is really no label\n", - " # TODO: do this without dict comprehension?\n", - " if (\"labels\" in value) and value[\"labels\"] == []:\n", - " value = {k: v for k, v in value.items() if k != \"labels\"}\n", - " ind = int(item[\"id\"].split(\"_\")[-1])\n", - " doc_df = pd.concat([doc_df, pd.DataFrame(value, index=[ind])])\n", - "\n", - " # combine the bounding boxes for each word\n", - " doc_df = doc_df.groupby(level=0).first()\n", - " txt.loc[:, \"id\"] = filename\n", - " # TODO: probably want to filter out these empty Ex. 21 docs\n", - " # the doc might not have any labels in it if it was an empty Ex. 
21\n", - " if \"labels\" not in doc_df:\n", - " doc_df.loc[:, \"labels\"] = pd.Series()\n", - "\n", - " output_df = pd.concat([txt, doc_df[[\"labels\"]]], axis=1)\n", - " labeled_df = pd.concat([labeled_df, output_df])\n", - "\n", - " # fill in unlabeled words and clean up labeled dataframe\n", - " labeled_df[\"labels\"] = labeled_df[\"labels\"].fillna(\"O\")\n", - " labeled_df = labeled_df.rename(columns={\"labels\": \"ner_tag\"})\n", - " non_id_columns = [col for col in labeled_df.columns if col != \"id\"]\n", - " labeled_df = labeled_df.loc[:, [\"id\"] + non_id_columns]\n", - "\n", - " # TODO: add in sanity checks on labeled_df bounding boxes to make sure\n", - " # that no value is above 1000 or below 0\n", - "\n", - " return labeled_df\n", - "\n", - "\n", - "def get_image_dict(pdfs_dir):\n", - " \"\"\"Create a dictionary with filenames and their Ex. 21 images.\"\"\"\n", - " image_dict = {}\n", - " for pdf_filename in os.listdir(pdfs_dir):\n", - " if pdf_filename.split(\".\")[-1] != \"pdf\":\n", - " continue\n", - " pdf_file_path = pdfs_dir / pdf_filename\n", - " _, pg = get_pdf_data_from_path(pdf_file_path)\n", - " full_pg_img = render_page(pg)\n", - " filename = pdf_filename.split(\".\")[0]\n", - " image_dict[filename] = full_pg_img\n", - " return image_dict\n", - "\n", - "\n", - "def format_as_ner_annotations(\n", - " labeled_json_path: Path,\n", - " pdfs_path: Path,\n", - " gcs_folder_name: Path,\n", - ") -> list[dict]:\n", - " \"\"\"Format a Label Studio output JSONs as NER annotations.\n", - "\n", - " Formats the dataframe as named entity recognition annotations.\n", - " # TODO: say more about this format\n", - "\n", - " Returns:\n", - " ner_annotations: a list of dicts, with one dict for each doc.\n", - " \"\"\"\n", - " GCSArchive().cache_training_data(\n", - " json_cache_path=labeled_json_path,\n", - " pdf_cache_path=pdfs_path,\n", - " gcs_folder_name=gcs_folder_name\n", - " )\n", - "\n", - " labeled_df = format_label_studio_output(\n", - " 
labeled_json_dir=labeled_json_path, pdfs_dir=pdfs_path\n", - " )\n", - " # convert dataframe/dictionary into NER format\n", - " # document_annotation_to_ner https://github.com/butlerlabs/docai/blob/main/docai/annotations/ner_utils.py\n", - " # complete dataset is a list of dicts, with one dict for each doc\n", - " doc_filenames = labeled_df[\"id\"].unique()\n", - " image_dict = get_image_dict(pdfs_dir=pdfs_path)\n", - " ner_annotations = []\n", - " for filename in doc_filenames:\n", - " annotation = {\n", - " \"id\": filename,\n", - " \"tokens\": labeled_df.groupby(\"id\")[\"text\"].apply(list).loc[filename],\n", - " \"ner_tags\": labeled_df.groupby(\"id\")[\"ner_tag\"].apply(list).loc[filename],\n", - " \"bboxes\": labeled_df.loc[labeled_df[\"id\"] == filename, :][BBOX_COLS_PDF]\n", - " .to_numpy()\n", - " .tolist(),\n", - " \"image\": image_dict[filename],\n", - " }\n", - " ner_annotations.append(annotation)\n", - "\n", - " return ner_annotations\n", - "\n", - "def _prepare_dataset(annotations, processor, label2id):\n", - " \"\"\"Put the dataset in its final format for training LayoutLM.\"\"\"\n", - "\n", - " def _convert_ner_tags_to_id(ner_tags, label2id):\n", - " return [int(label2id[ner_tag]) for ner_tag in ner_tags]\n", - "\n", - " images = annotations[\"image\"]\n", - " words = annotations[\"tokens\"]\n", - " boxes = annotations[\"bboxes\"]\n", - " # Map over labels and convert to numeric id for each ner_tag\n", - " ner_tags = [\n", - " _convert_ner_tags_to_id(ner_tags, label2id)\n", - " for ner_tags in annotations[\"ner_tags\"]\n", - " ]\n", - "\n", - " encoding = processor(\n", - " images,\n", - " words,\n", - " boxes=boxes,\n", - " word_labels=ner_tags,\n", - " truncation=True,\n", - " padding=\"max_length\",\n", - " )\n", - "\n", - " return encoding\n", - "\n", - "def compute_metrics(p, metric, label_list, return_entity_level_metrics=False):\n", - " \"\"\"Compute metrics to train and evaluate the model on.\"\"\"\n", - " predictions, labels = p\n", - " 
predictions = np.argmax(predictions, axis=2)\n", - "\n", - " # Remove ignored index (special tokens)\n", - " true_predictions = [\n", - " [label_list[pred] for (pred, lab) in zip(prediction, label) if lab != -100]\n", - " for prediction, label in zip(predictions, labels)\n", - " ]\n", - " true_labels = [\n", - " [label_list[lab] for (pred, lab) in zip(prediction, label) if lab != -100]\n", - " for prediction, label in zip(predictions, labels)\n", - " ]\n", - "\n", - " results = metric.compute(predictions=true_predictions, references=true_labels)\n", - " if return_entity_level_metrics:\n", - " # Unpack nested dictionaries\n", - " final_results = {}\n", - " for key, value in results.items():\n", - " if isinstance(value, dict):\n", - " for n, v in value.items():\n", - " final_results[f\"{key}_{n}\"] = v\n", - " else:\n", - " final_results[key] = value\n", - " return final_results\n", - " return {\n", - " \"precision\": results[\"overall_precision\"],\n", - " \"recall\": results[\"overall_recall\"],\n", - " \"f1\": results[\"overall_f1\"],\n", - " \"accuracy\": results[\"overall_accuracy\"],\n", - " }" - ] - }, - { - "cell_type": "markdown", - "id": "8160263c-8f69-437c-918b-e56ad007961a", - "metadata": { - "tags": [] - }, - "source": [ - "#### Finetune Model\n", - "The next cell will use the functions defined in the previous section to actually construct a huggingface dataset from labeled data and finetune the `layoutlm` model. Model finetuning will only be run if configured to do so, otherwise a pretrained version will be used from the `mlflow` tracking server.\n", - "\n", - "Model training contains several steps implemented below:\n", - "1. Use temporary path to convert filings to PDF's and stash labels\n", - "2. Use PDF's and labels to convert PDF's and labels to NER annotations\n", - "3. Construct huggingface dataset from NER annotations and split into train and test sets\n", - "4. Load pretrained model from huggingface\n", - "5. 
Finetune model on training data and evaluate on test data" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "71d205b2-e6ea-4ad0-982c-22e762269119", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - " is empty\n", - "' The Southwest Companies Nevada PriMerit Bank Federally chartered stock savings bank Paiute Pipeline Company Nevada Carson Water Company Nevada Southwest Gas Transmission Company Partnership between Southwest Gas Corporation and Utility Financial Corp. Utility Financial Corp. Nevada Southwest Gas Corporation of Arizona Nevada PRIMERIT BANK SUBSIDIARIES AT DECEMBER 31, 1993'\n", - "
is empty\n", - "' TCA Management Company.................................................... Texas Teleservice Corporation of America........................................ Texas Texas Community Antennas, Inc............................................. Texas Texas Telecable, Inc...................................................... Texas TCA Cable of Amarillo, Inc................................................ Texas Telecable Associates, Inc................................................. Texas Delta Cablevision, Inc.................................................... Arkansas Sun Valley Cablevision, Inc............................................... Idaho VPI Communications, Inc................................................... Texas AvComm Corporation........................................................ Texas Tele-Communications of Arkansas L. P......................................'\n", - "
is empty\n", - "' DOMESTIC SUBSIDIARIES International Sales & Business, Inc. California KLA-Tencor Building Corporation California KLA-Tencor Disc Corporation California KLA-Tencor International Corporation California KLA-Tencor Klinnik Corporation California KLA-Tencor Management Corporation California KLA-Tencor (Thailand Branch) Corporation California VLSI Standards, Inc. California Amray, Inc. Delaware Groff Associates, Inc. California DeviceWare, Inc. California INTERNATIONAL SUBSIDIARIES'\n", - "
is empty\n", - "' 1. Northeast Energy, LLC (100%-Owned) .................................................... Florida 2. Northeast Energy Associates, A Limited Partnership (99%-Owned) (a) .................... Massachusetts 3. North Jersey Energy Associates, A Limited Partnership (99%-Owned) (a) ................. New Jersey (a) Northeast Energy, LLC owns the remaining 1% interest. '\n", - "
is empty\n", - "' 1. ESI Tractebel Urban Renewal Corporation (100%-Owned) .................................. New Jersey '\n", - "
is empty\n", - "' IVANHOE ENERGY HOLDINGS INC. (Nevada) 100% IVANHOE ENERGY (USA) INC. (Nevada) 100% (indirect) IVANHOE ENERGY ROYALTY INC. (Nevada) 100% (indirect) IVANHOE ENERGY INTERNATIONAL VENTURES INC. (BVI) 100% Ivanhoe Energy Sweetwater Limited (Malta) 100% (Indirect) Ivanhoe Energy (Qatar) Inc. (BVI) 100% (Indirect) GTL Japan Corporation (Japan) 100% (Indirect) IVANHOE ENERGY'\n", - "
is empty\n", - "' Airgas Canada, Inc. Canada Airgas Carbonic, Inc. DE Airgas Data, LLC DE Airgas East, Inc. DE Airgas Great Lakes, Inc. DE Airgas Gulf States, Inc. DE Airgas Intermountain, Inc. CO Airgas International, Inc. VI Airgas Mid America, Inc. DE Airgas Mid South, Inc. DE Airgas Nor Pac, Inc. DE'\n", - "
is empty\n", - "' Subsidiary Name State of Formation - --------------- ------------------- American Ecology Environmental Services Corporation Texas Corporation American Ecology Holdings Corporation Delaware Corporation American Ecology Recycle Center, Inc. Delaware Corporation American Ecology Services Corporation Delaware Corporation Texas Ecologists, Inc. Texas Corporation US Ecology, Inc. California Corporation US Ecology Idaho, Inc. Delaware'\n", - "Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bae617cb831d4b2593c0fa4a874f1592", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Map: 0%| | 0/159 [00:00\n", - " \n", - " \n", - " [ 2/1000 : < :, Epoch 0.01/8]\n", - " \n", - "
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining LossValidation Loss

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/09/23 14:14:50 INFO mlflow.tracking._tracking_service.client: 🏃 View run bedecked-trout-555 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/573e64992704411c9013937d849e1504.\n", - "2024/09/23 14:14:50 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n", - "2024/09/23 14:14:51 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n", - "2024/09/23 14:14:51 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n" - ] - }, - { - "ename": "OutOfMemoryError", - "evalue": "CUDA out of memory. Tried to allocate 24.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 20.12 MiB is free. Including non-PyTorch memory, this process has 2.72 GiB memory in use. Of the allocated memory 2.53 GiB is allocated by PyTorch, and 104.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 94\u001b[0m\n\u001b[1;32m 91\u001b[0m mlflow\u001b[38;5;241m.\u001b[39mset_experiment(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mexhibit21_extraction_test\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 92\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mlflow\u001b[38;5;241m.\u001b[39mstart_run():\n\u001b[1;32m 93\u001b[0m \u001b[38;5;66;03m# Train inside mlflow run. 
Mlflow will automatically handle logging training metrcis\u001b[39;00m\n\u001b[0;32m---> 94\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 96\u001b[0m \u001b[38;5;66;03m# Log finetuend model with mlflow\u001b[39;00m\n\u001b[1;32m 97\u001b[0m mlflow\u001b[38;5;241m.\u001b[39mtransformers\u001b[38;5;241m.\u001b[39mlog_model(\n\u001b[1;32m 98\u001b[0m trainer, artifact_path\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm_extractor\u001b[39m\u001b[38;5;124m\"\u001b[39m, task\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtoken-classification\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 99\u001b[0m )\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1936\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1939\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1940\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1941\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1942\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 
1943\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:2279\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2276\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_step_begin(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 2278\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39maccumulate(model):\n\u001b[0;32m-> 2279\u001b[0m tr_loss_step \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtraining_step\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2281\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 2282\u001b[0m args\u001b[38;5;241m.\u001b[39mlogging_nan_inf_filter\n\u001b[1;32m 2283\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m is_torch_xla_available()\n\u001b[1;32m 2284\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m (torch\u001b[38;5;241m.\u001b[39misnan(tr_loss_step) \u001b[38;5;129;01mor\u001b[39;00m torch\u001b[38;5;241m.\u001b[39misinf(tr_loss_step))\n\u001b[1;32m 2285\u001b[0m ):\n\u001b[1;32m 2286\u001b[0m \u001b[38;5;66;03m# if loss is nan or inf simply add the average of previous logged losses\u001b[39;00m\n\u001b[1;32m 2287\u001b[0m tr_loss \u001b[38;5;241m+\u001b[39m\u001b[38;5;241m=\u001b[39m tr_loss \u001b[38;5;241m/\u001b[39m (\u001b[38;5;241m1\u001b[39m \u001b[38;5;241m+\u001b[39m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m-\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_globalstep_last_logged)\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:3318\u001b[0m, in \u001b[0;36mTrainer.training_step\u001b[0;34m(self, model, inputs)\u001b[0m\n\u001b[1;32m 3315\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss_mb\u001b[38;5;241m.\u001b[39mreduce_mean()\u001b[38;5;241m.\u001b[39mdetach()\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m 3317\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcompute_loss_context_manager():\n\u001b[0;32m-> 3318\u001b[0m loss \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcompute_loss\u001b[49m\u001b[43m(\u001b[49m\u001b[43mmodel\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3320\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m inputs\n\u001b[1;32m 3321\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m (\n\u001b[1;32m 3322\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 3323\u001b[0m \u001b[38;5;129;01mand\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate\u001b[38;5;241m.\u001b[39mglobal_step \u001b[38;5;241m%\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mtorch_empty_cache_steps \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m\n\u001b[1;32m 3324\u001b[0m ):\n", - "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:3363\u001b[0m, in \u001b[0;36mTrainer.compute_loss\u001b[0;34m(self, model, inputs, return_outputs)\u001b[0m\n\u001b[1;32m 3361\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 3362\u001b[0m labels \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m-> 3363\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 3364\u001b[0m \u001b[38;5;66;03m# Save past state if it exists\u001b[39;00m\n\u001b[1;32m 3365\u001b[0m \u001b[38;5;66;03m# TODO: this needs to be fixed and made cleaner later.\u001b[39;00m\n\u001b[1;32m 3366\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39margs\u001b[38;5;241m.\u001b[39mpast_index \u001b[38;5;241m>\u001b[39m\u001b[38;5;241m=\u001b[39m \u001b[38;5;241m0\u001b[39m:\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:1099\u001b[0m, in \u001b[0;36mLayoutLMv3ForTokenClassification.forward\u001b[0;34m(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, pixel_values)\u001b[0m\n\u001b[1;32m 1069\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124mr\u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 1070\u001b[0m \u001b[38;5;124;03mlabels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):\u001b[39;00m\n\u001b[1;32m 1071\u001b[0m \u001b[38;5;124;03m Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1095\u001b[0m \u001b[38;5;124;03m>>> logits = outputs.logits\u001b[39;00m\n\u001b[1;32m 1096\u001b[0m \u001b[38;5;124;03m```\"\"\"\u001b[39;00m\n\u001b[1;32m 1097\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[0;32m-> 1099\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mlayoutlmv3\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1100\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1101\u001b[0m \u001b[43m \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1102\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1103\u001b[0m \u001b[43m \u001b[49m\u001b[43mtoken_type_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtoken_type_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1104\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1105\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1106\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1107\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1108\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1109\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1110\u001b[0m \u001b[43m \u001b[49m\u001b[43mpixel_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpixel_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1111\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1112\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m input_ids \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m 1113\u001b[0m input_shape \u001b[38;5;241m=\u001b[39m input_ids\u001b[38;5;241m.\u001b[39msize()\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in 
\u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m 
_global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:975\u001b[0m, in \u001b[0;36mLayoutLMv3Model.forward\u001b[0;34m(self, input_ids, bbox, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, pixel_values, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 968\u001b[0m \u001b[38;5;66;03m# Prepare head mask if needed\u001b[39;00m\n\u001b[1;32m 969\u001b[0m \u001b[38;5;66;03m# 1.0 in head_mask indicate we keep the head\u001b[39;00m\n\u001b[1;32m 970\u001b[0m \u001b[38;5;66;03m# attention_probs has shape bsz x n_heads x N x N\u001b[39;00m\n\u001b[1;32m 971\u001b[0m \u001b[38;5;66;03m# input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]\u001b[39;00m\n\u001b[1;32m 972\u001b[0m \u001b[38;5;66;03m# and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]\u001b[39;00m\n\u001b[1;32m 973\u001b[0m head_mask \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_head_mask(head_mask, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mnum_hidden_layers)\n\u001b[0;32m--> 975\u001b[0m encoder_outputs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mencoder\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 976\u001b[0m \u001b[43m \u001b[49m\u001b[43membedding_output\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 977\u001b[0m \u001b[43m \u001b[49m\u001b[43mbbox\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfinal_bbox\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 978\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfinal_position_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 979\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mextended_attention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 980\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 981\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 982\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 983\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 984\u001b[0m \u001b[43m \u001b[49m\u001b[43mpatch_height\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpatch_height\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 985\u001b[0m \u001b[43m \u001b[49m\u001b[43mpatch_width\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpatch_width\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 986\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 988\u001b[0m sequence_output \u001b[38;5;241m=\u001b[39m encoder_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 990\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;129;01mnot\u001b[39;00m return_dict:\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m 
\u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:681\u001b[0m, in \u001b[0;36mLayoutLMv3Encoder.forward\u001b[0;34m(self, hidden_states, bbox, attention_mask, head_mask, output_attentions, output_hidden_states, return_dict, position_ids, patch_height, patch_width)\u001b[0m\n\u001b[1;32m 671\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_gradient_checkpointing_func(\n\u001b[1;32m 672\u001b[0m layer_module\u001b[38;5;241m.\u001b[39m\u001b[38;5;21m__call__\u001b[39m,\n\u001b[1;32m 673\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 678\u001b[0m rel_2d_pos,\n\u001b[1;32m 679\u001b[0m )\n\u001b[1;32m 680\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 681\u001b[0m layer_outputs \u001b[38;5;241m=\u001b[39m \u001b[43mlayer_module\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 682\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 683\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 684\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mlayer_head_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 685\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 686\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 687\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 688\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 690\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m layer_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 691\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m output_attentions:\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any 
hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:532\u001b[0m, in \u001b[0;36mLayoutLMv3Layer.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m 523\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 524\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 525\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 
530\u001b[0m rel_2d_pos\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 531\u001b[0m ):\n\u001b[0;32m--> 532\u001b[0m self_attention_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mattention\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 533\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 534\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 535\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 536\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 537\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 538\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 539\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 540\u001b[0m attention_output \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 542\u001b[0m outputs \u001b[38;5;241m=\u001b[39m self_attention_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add self attentions if we output attention weights\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: 
ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:500\u001b[0m, in \u001b[0;36mLayoutLMv3Attention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m 491\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 492\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m 493\u001b[0m hidden_states,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 498\u001b[0m rel_2d_pos\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 499\u001b[0m ):\n\u001b[0;32m--> 500\u001b[0m self_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mself\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 501\u001b[0m \u001b[43m \u001b[49m\u001b[43mhidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 502\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 503\u001b[0m \u001b[43m \u001b[49m\u001b[43mhead_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 504\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 505\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 506\u001b[0m \u001b[43m \u001b[49m\u001b[43mrel_2d_pos\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mrel_2d_pos\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 507\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 508\u001b[0m attention_output 
\u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39moutput(self_outputs[\u001b[38;5;241m0\u001b[39m], hidden_states)\n\u001b[1;32m 509\u001b[0m outputs \u001b[38;5;241m=\u001b[39m (attention_output,) \u001b[38;5;241m+\u001b[39m self_outputs[\u001b[38;5;241m1\u001b[39m:] \u001b[38;5;66;03m# add attentions if we output them\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1553\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1551\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1552\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1553\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/nn/modules/module.py:1562\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1557\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1558\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1559\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1560\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1561\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1562\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1564\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1565\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:448\u001b[0m, in \u001b[0;36mLayoutLMv3SelfAttention.forward\u001b[0;34m(self, hidden_states, attention_mask, head_mask, output_attentions, rel_pos, rel_2d_pos)\u001b[0m\n\u001b[1;32m 444\u001b[0m attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m+\u001b[39m attention_mask\n\u001b[1;32m 446\u001b[0m \u001b[38;5;66;03m# Normalize the attention scores to probabilities.\u001b[39;00m\n\u001b[1;32m 447\u001b[0m \u001b[38;5;66;03m# Use the trick of the CogView paper to stablize training\u001b[39;00m\n\u001b[0;32m--> 448\u001b[0m attention_probs \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mcogview_attention\u001b[49m\u001b[43m(\u001b[49m\u001b[43mattention_scores\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 450\u001b[0m \u001b[38;5;66;03m# This is actually dropping out entire tokens to attend to, which might\u001b[39;00m\n\u001b[1;32m 451\u001b[0m \u001b[38;5;66;03m# seem a bit unusual, but is taken from the original Transformer paper.\u001b[39;00m\n\u001b[1;32m 452\u001b[0m attention_probs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdropout(attention_probs)\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/models/layoutlmv3/modeling_layoutlmv3.py:414\u001b[0m, in \u001b[0;36mLayoutLMv3SelfAttention.cogview_attention\u001b[0;34m(self, attention_scores, alpha)\u001b[0m\n\u001b[1;32m 412\u001b[0m scaled_attention_scores \u001b[38;5;241m=\u001b[39m attention_scores \u001b[38;5;241m/\u001b[39m alpha\n\u001b[1;32m 413\u001b[0m max_value \u001b[38;5;241m=\u001b[39m scaled_attention_scores\u001b[38;5;241m.\u001b[39mamax(dim\u001b[38;5;241m=\u001b[39m(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m))\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[0;32m--> 414\u001b[0m new_attention_scores \u001b[38;5;241m=\u001b[39m \u001b[43m(\u001b[49m\u001b[43mscaled_attention_scores\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m-\u001b[39;49m\u001b[43m \u001b[49m\u001b[43mmax_value\u001b[49m\u001b[43m)\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43m \u001b[49m\u001b[43malpha\u001b[49m\n\u001b[1;32m 415\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m nn\u001b[38;5;241m.\u001b[39mSoftmax(dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m)(new_attention_scores)\n", - "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 24.00 MiB. 
GPU 0 has a total capacity of 3.93 GiB of which 20.12 MiB is free. Including non-PyTorch memory, this process has 2.72 GiB memory in use. Of the allocated memory 2.53 GiB is allocated by PyTorch, and 104.91 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" - ] - } - ], - "source": [ - "import mlflow\n", - "from datasets import (\n", - " Array2D,\n", - " Array3D,\n", - " Dataset,\n", - " Features,\n", - " Sequence,\n", - " Value,\n", - " load_metric,\n", - ")\n", - "from dotenv import load_dotenv\n", - "from transformers import (\n", - " AutoProcessor,\n", - " LayoutLMv3ForTokenClassification,\n", - " Trainer,\n", - " TrainingArguments,\n", - ")\n", - "from transformers.data.data_collator import default_data_collator\n", - "\n", - "from mozilla_sec_eia.library.mlflow import configure_mlflow\n", - "\n", - "load_dotenv()\n", - "\n", - "\n", - "configure_mlflow()\n", - "\n", - "# Only finetune if configured to do so\n", - "if context.op_config[\"train_model\"]:\n", - " # Change temp_dir to save training data locally for inspection\n", - " with TemporaryDirectory() as temp_dir:\n", - " ner_annotations = format_as_ner_annotations(\n", - " labeled_json_path=Path(temp_dir) / \"sec10k_filings\" / \"labeled_jsons\",\n", - " pdfs_path=Path(temp_dir) / \"sec10k_filings\" / \"pdfs\",\n", - " gcs_folder_name=\"labeledv0.2/\",\n", - " )\n", - "\n", - " # Cache/prepare training data\n", - " dataset = Dataset.from_list(ner_annotations)\n", - "\n", - " # Load pretrained model\n", - " model = LayoutLMv3ForTokenClassification.from_pretrained(\n", - " \"microsoft/layoutlmv3-base\", id2label=id2label, label2id=label2id\n", - " )\n", - " processor = AutoProcessor.from_pretrained(\n", - " \"microsoft/layoutlmv3-base\", apply_ocr=False\n", - " )\n", - "\n", 
- " # Prepare our train & eval dataset\n", - " column_names = dataset.column_names\n", - " features = Features(\n", - " {\n", - " \"pixel_values\": Array3D(dtype=\"float32\", shape=(3, 224, 224)),\n", - " \"input_ids\": Sequence(feature=Value(dtype=\"int64\")),\n", - " \"attention_mask\": Sequence(Value(dtype=\"int64\")),\n", - " \"bbox\": Array2D(dtype=\"int64\", shape=(512, 4)),\n", - " \"labels\": Sequence(feature=Value(dtype=\"int64\")),\n", - " }\n", - " )\n", - " dataset = dataset.map(\n", - " lambda annotations: _prepare_dataset(annotations, processor, label2id),\n", - " batched=True,\n", - " remove_columns=column_names,\n", - " features=features,\n", - " )\n", - " dataset.set_format(\"torch\")\n", - " split_dataset = dataset.train_test_split(test_size=0.2)\n", - " train_dataset, eval_dataset = split_dataset[\"train\"], split_dataset[\"test\"]\n", - "\n", - " # Initialize our Trainer\n", - " metric = load_metric(\"seqeval\")\n", - " training_args = TrainingArguments(\n", - " max_steps=1000,\n", - " per_device_train_batch_size=1,\n", - " per_device_eval_batch_size=1,\n", - " learning_rate=1e-5,\n", - " evaluation_strategy=\"steps\",\n", - " eval_steps=100,\n", - " load_best_model_at_end=True,\n", - " metric_for_best_model=\"f1\",\n", - " output_dir=\"./layoutlm\",\n", - " )\n", - " trainer = Trainer(\n", - " model=model,\n", - " args=training_args,\n", - " train_dataset=train_dataset,\n", - " eval_dataset=eval_dataset,\n", - " tokenizer=processor,\n", - " data_collator=default_data_collator,\n", - " compute_metrics=lambda p: compute_metrics(p, metric=metric, label_list=LABELS),\n", - " )\n", - "\n", - " mlflow.set_experiment(\"exhibit21_extraction_test\")\n", - " with mlflow.start_run():\n", - " # Train inside mlflow run. 
Mlflow will automatically handle logging training metrcis\n", - " trainer.train()\n", - "\n", - " # Log finetuend model with mlflow\n", - " mlflow.transformers.log_model(\n", - " trainer, artifact_path=\"layoutlm_extractor\", task=\"token-classification\"\n", - " )" - ] - }, - { - "cell_type": "markdown", - "id": "ee9b4e20-7781-43a7-b7aa-caf0690a201e", - "metadata": {}, - "source": [ - "## Model inference\n", - "Use the finetuned model to perform inference and evaluate on labeled validation data. First create a Huggingface `Pipeline` which wraps layoutlm with some custom pre/post processing steps. " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "42c8e920-d671-40c2-b5db-c43611a33897", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import torch\n", - "from transformers import Pipeline, pipeline\n", - "from transformers.tokenization_utils_base import BatchEncoding\n", - "\n", - "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n", - " iob_to_label,\n", - ")\n", - "\n", - "\n", - "def _sort_by_label_priority(target_array):\n", - " id_priority = [label2id[label] for label in LABEL_PRIORITY]\n", - " # Create a priority map from the label priority\n", - " priority_map = {val: idx for idx, val in enumerate(id_priority)}\n", - " # Sort the target array based on the priority map\n", - " sorted_array = sorted(target_array, key=lambda x: priority_map.get(x, float(\"inf\")))\n", - " return sorted_array\n", - "\n", - "def get_flattened_mode_predictions(token_boxes_tensor, predictions_tensor):\n", - " \"\"\"Get the mode prediction for each box in an Ex. 21.\n", - "\n", - " When handling multi page documents LayoutLM uses a sliding 'frame'\n", - " with some overlap between frames. The overlap creates multiple\n", - " predictions for the same bounding boxes. Thus it's necessary to find\n", - " the mode of all the predictions for a bounding box and use that as the\n", - " single prediction for each box. 
If there are multiple mode\n", - " predictions for a bounding box, then ties are broken by setting\n", - " a priority for the labels (LABEL_PRIORITY) and choosing the highest priority\n", - " label.\n", - " \"\"\"\n", - " # Flatten the tensors\n", - " flat_token_boxes = token_boxes_tensor.view(-1, 4)\n", - " flat_predictions = predictions_tensor.view(-1)\n", - "\n", - " boxes = flat_token_boxes.numpy()\n", - " predictions = flat_predictions.numpy()\n", - "\n", - " # Find unique boxes and indices\n", - " unique_boxes, inverse_indices = np.unique(boxes, axis=0, return_inverse=True)\n", - "\n", - " # Compute the mode for each unique bounding box\n", - " # for each unique box in boxes, create a list with all predictions for that box\n", - " # get the indices in predictions where the corresponding index in boxes is\n", - " unique_box_predictions = [\n", - " predictions[np.where(inverse_indices == i)[0]] for i in range(len(unique_boxes))\n", - " ]\n", - " pred_counts = [np.bincount(arr) for arr in unique_box_predictions]\n", - " # Compute the mode of predictions for each group\n", - " # break ties by taking into account LABEL_PRIORITY\n", - " modes = np.array(\n", - " [\n", - " _sort_by_label_priority(np.where(arr == np.max(arr))[0])[0]\n", - " for arr in pred_counts\n", - " ]\n", - " )\n", - " flattened_modes = modes[inverse_indices]\n", - "\n", - " return flattened_modes\n", - "\n", - "class LayoutLMInferencePipeline(Pipeline):\n", - " \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n", - "\n", - " def __init__(self, *args, **kwargs):\n", - " \"\"\"Initialize LayoutLMInferencePipeline.\"\"\"\n", - " super().__init__(*args, **kwargs)\n", - "\n", - " def _sanitize_parameters(self, **kwargs):\n", - " preprocess_kwargs = {}\n", - " if \"maybe_arg\" in kwargs:\n", - " preprocess_kwargs[\"maybe_arg\"] = kwargs[\"maybe_arg\"]\n", - " return preprocess_kwargs, {}, {}\n", - "\n", - " def preprocess(self, doc_dict):\n", - " \"\"\"Encode and tokenize model 
inputs.\"\"\"\n", - " image = doc_dict[\"image\"]\n", - " words = doc_dict[\"tokens\"]\n", - " boxes = doc_dict[\"bboxes\"]\n", - " encoding = self.tokenizer(\n", - " image,\n", - " words,\n", - " boxes=boxes,\n", - " return_tensors=\"pt\",\n", - " truncation=True,\n", - " padding=\"max_length\",\n", - " max_length=512, # this is the maximum max_length\n", - " stride=128,\n", - " return_offsets_mapping=True,\n", - " return_overflowing_tokens=True,\n", - " )\n", - " model_inputs = {}\n", - " model_inputs[\"raw_encoding\"] = encoding.copy()\n", - " model_inputs[\"doc_dict\"] = doc_dict\n", - " model_inputs[\"offset_mapping\"] = encoding.pop(\"offset_mapping\")\n", - " model_inputs[\"sample_mapping\"] = encoding.pop(\"overflow_to_sample_mapping\")\n", - " # TODO: do we actually need to make these into ints?\n", - " encoding[\"input_ids\"] = encoding[\"input_ids\"].to(torch.int64)\n", - " encoding[\"attention_mask\"] = encoding[\"attention_mask\"].to(torch.int64)\n", - " encoding[\"bbox\"] = encoding[\"bbox\"].to(torch.int64)\n", - " encoding[\"pixel_values\"] = torch.stack(encoding[\"pixel_values\"])\n", - " model_inputs[\"encoding\"] = encoding\n", - " return model_inputs\n", - "\n", - " def _forward(self, model_inputs):\n", - " # encoding is passed as a UserDict in the model_inputs dictionary\n", - " # turn it back into a BatchEncoding\n", - " encoding = BatchEncoding(model_inputs[\"encoding\"])\n", - " if torch.cuda.is_available():\n", - " encoding.to(\"cuda\")\n", - " self.model.to(\"cuda\")\n", - " # since we're doing inference, we don't need gradient computation\n", - " with torch.no_grad():\n", - " output = self.model(**encoding)\n", - " return {\n", - " \"logits\": output.logits,\n", - " \"predictions\": output.logits.argmax(-1).squeeze().tolist(),\n", - " \"raw_encoding\": model_inputs[\"raw_encoding\"],\n", - " \"doc_dict\": model_inputs[\"doc_dict\"],\n", - " }\n", - "\n", - " def postprocess(self, all_outputs):\n", - " \"\"\"Return logits, model 
predictions, and the extracted dataframe.\"\"\"\n", - " logits = all_outputs[\"logits\"]\n", - " predictions = all_outputs[\"logits\"].argmax(-1).squeeze().tolist()\n", - " output_df = self.extract_table(all_outputs)\n", - " return logits, predictions, output_df\n", - "\n", - " def extract_table(self, all_outputs):\n", - " \"\"\"Extract a structured table from a set of inference predictions.\n", - "\n", - " This function essentially works by stacking bounding boxes and predictions\n", - " into a dataframe and going from left to right and top to bottom. Then, every\n", - " every time a new subsidiary entity is encountered, it assigns a new group or\n", - " \"row\" to that subsidiary. Next, location and ownership percentage words/labeled\n", - " entities in between these subsidiary groups are assigned to a subsidiary row/group.\n", - " Finally, this is all formatted into a dataframe with an ID column from the original\n", - " filename and a basic cleaning function normalizes strings.\n", - " \"\"\"\n", - " # TODO: when model more mature, break this into sub functions to make it\n", - " # clearer what's going on\n", - " predictions = all_outputs[\"predictions\"]\n", - " encoding = all_outputs[\"raw_encoding\"]\n", - " doc_dict = all_outputs[\"doc_dict\"]\n", - "\n", - " token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n", - " predictions_tensor = torch.tensor(predictions)\n", - " mode_predictions = get_flattened_mode_predictions(\n", - " token_boxes_tensor, predictions_tensor\n", - " )\n", - " token_boxes = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1).tolist()\n", - " predicted_labels = [\n", - " self.model.config.id2label[pred] for pred in mode_predictions\n", - " ]\n", - " simple_preds = [iob_to_label(pred).lower() for pred in predicted_labels]\n", - "\n", - " df = pd.DataFrame(data=token_boxes, columns=BBOX_COLS)\n", - " df.loc[:, \"iob_pred\"] = predicted_labels\n", - " df.loc[:, \"pred\"] = simple_preds\n", - " invalid_mask = (\n", - 
" (df[\"top_left_x\"] == 0)\n", - " & (df[\"top_left_y\"] == 0)\n", - " & (df[\"bottom_right_x\"] == 0)\n", - " & (df[\"bottom_right_y\"] == 0)\n", - " )\n", - " df = df[~invalid_mask]\n", - " # we want to get actual words on the dataframe, not just subwords that correspond to tokens\n", - " # subwords from the same word share the same bounding box coordinates\n", - " # so we merge the original words onto our dataframe on bbox coordinates\n", - " words_df = pd.DataFrame(data=doc_dict[\"bboxes\"], columns=BBOX_COLS)\n", - " words_df.loc[:, \"word\"] = doc_dict[\"tokens\"]\n", - " df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n", - " subset=BBOX_COLS + [\"pred\", \"word\"]\n", - " )\n", - " # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n", - " # should always have a B entity label. Manually override labels so this is true.\n", - " first_in_group_df = df[\n", - " (df[\"pred\"].ne(df[\"pred\"].shift())) & (df[\"pred\"] != \"other\")\n", - " ]\n", - " first_in_group_df.loc[:, \"iob_pred\"] = (\n", - " \"B\" + first_in_group_df[\"iob_pred\"].str[1:]\n", - " )\n", - " df.update(first_in_group_df)\n", - " # filter for just words that were labeled with non \"other\" entities\n", - " entities_df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n", - " entities_df = entities_df[entities_df[\"pred\"] != \"other\"]\n", - " # words are labeled with IOB format which stands for inside, outside, beginning\n", - " # merge B and I entities to form one entity group\n", - " # (i.e. 
\"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n", - " entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n", - " grouped_df = (\n", - " entities_df.groupby([\"group\", \"pred\"])[\"word\"]\n", - " .apply(\" \".join)\n", - " .reset_index()[[\"pred\", \"word\"]]\n", - " )\n", - " # assign a new row every time there's a new subsidiary\n", - " grouped_df[\"row\"] = (grouped_df[\"pred\"].str.startswith(\"subsidiary\")).cumsum()\n", - " output_df = grouped_df.pivot_table(\n", - " index=\"row\", columns=\"pred\", values=\"word\", aggfunc=lambda x: \" \".join(x)\n", - " ).reset_index()\n", - " if output_df.empty:\n", - " return output_df\n", - " output_df.loc[:, \"id\"] = doc_dict[\"id\"]\n", - " return output_df" - ] - }, - { - "cell_type": "markdown", - "id": "ea9fe887-43ca-43e2-85e3-bf5371bd165f", - "metadata": {}, - "source": [ - "Next, wrap the `LayoutLMInferencePipeline` in an `mlflow` `pyfunc` model, which handles loading the pretrained model and managing inputs/outputs." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4d802e00-1ca4-40b3-b15b-561711a9db70", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from mozilla_sec_eia.models.sec10k.entities import (\n", - " Ex21CompanyOwnership,\n", - " Sec10kExtractionMetadata,\n", - ")\n", - "from mozilla_sec_eia.models.sec10k.ex_21.inference import clean_extracted_df\n", - "\n", - "# Construct model_uri from model_version\n", - "model_uri = f\"models:/layoutlm_extractor/{context.op_config['model_version']}\"\n", - "model_info = mlflow.models.get_model_info(model_uri)\n", - "\n", - "def _get_data(dataset):\n", - " yield from dataset\n", - "\n", - "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n", - " \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n", - " def load_context(self, context):\n", - " \"\"\"Load pretrained model.\"\"\"\n", - " os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n", - " self.model_components = mlflow.transformers.load_model(\n", - " context.artifacts[\"layoutlm_extractor\"], return_type=\"components\"\n", - " )\n", - "\n", - " def predict(self, context, model_input: Dataset, params=None):\n", - " \"\"\"Use pretrained model and inference pipeline to perform inference.\"\"\"\n", - " # TODO: figure out device argument\n", - " pipe = pipeline(\n", - " \"token-classification\",\n", - " model=self.model_components[\"model\"],\n", - " tokenizer=self.model_components[\"tokenizer\"],\n", - " pipeline_class=LayoutLMInferencePipeline,\n", - " )\n", - "\n", - " logits = []\n", - " predictions = []\n", - " all_output_df = Ex21CompanyOwnership.example(size=0)\n", - " extraction_metadata = Sec10kExtractionMetadata.example(size=0)\n", - " for logit, pred, output_df in pipe(_get_data(model_input)):\n", - " logits.append(logit)\n", - " predictions.append(pred)\n", - " if not output_df.empty:\n", - " filename = get_metadata_filename(output_df[\"id\"].iloc[0])\n", - " 
extraction_metadata.loc[filename, [\"success\"]] = True\n", - " all_output_df = pd.concat([all_output_df, output_df])\n", - " all_output_df.columns.name = None\n", - " all_output_df = clean_extracted_df(all_output_df)\n", - " all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]]\n", - " all_output_df = all_output_df.reset_index(drop=True)\n", - " return extraction_metadata, all_output_df\n", - "\n", - "# Save model to local temp dir with artifacts, then reload for evaluation\n", - "with TemporaryDirectory() as tmp_dir:\n", - " mlflow.pyfunc.save_model(\n", - " path=tmp_dir,\n", - " python_model=Ex21Extractor(),\n", - " artifacts={\"model_components\": model_uri},\n", - " )\n", - " ex21_extraction_model = mlflow.pyfunc.load_model(tmp_dir)" - ] - }, - { - "cell_type": "markdown", - "id": "fee84b13-6c37-4afe-8faa-003ff149aa2d", - "metadata": {}, - "source": [ - "### Model Evaluation\n", - "Now the full extraction model can be evaluated using labeled validation data and logged to `mlflow`. The `mlflow` run used to evaluate and log the inference model will be created as a nested child run to the run used to train `layoutlm`. This setup allows multiple versions/configurations of inference to be associated with a single version of `layoutlm`, creating a clean organizational structure for testing the base model and inference logic separately." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "47c19b41-131f-4059-8f42-931237565a20", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "def clean_ex21_validation_set(validation_df: pd.DataFrame):\n", - " \"\"\"Clean Ex. 
21 validation data to match extracted format.\"\"\"\n", - " validation_df = validation_df.rename(\n", - " columns={\n", - " \"Filename\": \"id\",\n", - " \"Subsidiary\": \"subsidiary\",\n", - " \"Location of Incorporation\": \"loc\",\n", - " \"Ownership Percentage\": \"own_per\",\n", - " }\n", - " )\n", - " validation_df[\"own_per\"] = validation_df[\"own_per\"].astype(str)\n", - " validation_df[\"filename\"] = validation_df[\"id\"].apply(get_metadata_filename)\n", - " validation_df = clean_extracted_df(validation_df)\n", - " return validation_df\n", - "\n", - "# Load labeled validation set\n", - "validation_set = clean_ex21_validation_set(\n", - " validation_helpers.load_validation_data(\"ex21_labels.csv\")\n", - ")\n", - "\n", - "# Get filing metadata for filings in validation set\n", - "cloud_interface = GCSArchive()\n", - "filing_metadata = cloud_interface.get_metadata()\n", - "ex21_validation_filing_metadata = filing_metadata[\n", - " filing_metadata.index.isin(validation_set[\"filename\"].unique())\n", - "]" - ] - }, - { - "cell_type": "markdown", - "id": "eddcc912-324a-42e9-9841-3a916c6ece6b", - "metadata": {}, - "source": [ - "Next define methods evaluating model output, then run extraction and log in child run." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "from mozilla_sec_eia.models.sec10k.ex_21.inference import create_inference_dataset\n", - "\n", - "\n", - "def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):\n", - " \"\"\"Compute validation metrics for Ex. 
21 extraction.\"\"\"\n", - " shared_cols = validation_df.columns.intersection(computed_df.columns)\n", - " validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n", - " n_equal = 0\n", - " validation_filenames = validation_df[\"id\"].unique()\n", - " n_files = len(validation_filenames)\n", - " table_metrics_dict = {}\n", - " jaccard_dict = {}\n", - " incorrect_files = []\n", - " # iterate through each file and check each extracted table\n", - " for filename in validation_filenames:\n", - " extracted_table_df = computed_df[computed_df[\"id\"] == filename].reset_index(\n", - " drop=True\n", - " )\n", - " validation_table_df = validation_df[\n", - " validation_df[\"id\"] == filename\n", - " ].reset_index(drop=True)\n", - " # check if the tables are exactly equal\n", - " if extracted_table_df.equals(validation_table_df):\n", - " # TODO: strip llc and other company strings before comparison\n", - " n_equal += 1\n", - " else:\n", - " incorrect_files.append(filename)\n", - " # compute precision and recall for each column\n", - " table_metrics_dict[filename] = {}\n", - " jaccard_dict[filename] = {}\n", - " for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n", - " table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n", - " extracted_table_df, validation_table_df, value_col=col\n", - " )\n", - " table_metrics_dict[filename][f\"{col}_precision\"] = table_prec_recall[\n", - " \"precision\"\n", - " ]\n", - " table_metrics_dict[filename][f\"{col}_recall\"] = table_prec_recall[\"recall\"]\n", - " # get the jaccard similarity between columns\n", - " jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(\n", - " computed_df=extracted_table_df,\n", - " validation_df=validation_table_df,\n", - " value_col=col,\n", - " )\n", - "\n", - " jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient=\"index\").reset_index()\n", - " prec_recall_df = pd.DataFrame.from_dict(\n", - " table_metrics_dict, orient=\"index\"\n", - " 
).reset_index()\n", - "\n", - " return (\n", - " jaccard_df,\n", - " prec_recall_df,\n", - " pd.DataFrame({\"filename\": incorrect_files}),\n", - " {\n", - " \"table_accuracy\": n_equal / n_files,\n", - " \"avg_subsidiary_jaccard_sim\": jaccard_df[\"subsidiary\"].sum() / n_files,\n", - " \"avg_location_jaccard_sim\": jaccard_df[\"loc\"].sum() / n_files,\n", - " \"avg_own_per_jaccard_sim\": jaccard_df[\"own_per\"].sum() / n_files,\n", - " \"avg_subsidiary_precision\": prec_recall_df[\"subsidiary_precision\"].sum()\n", - " / n_files,\n", - " \"avg_location_precision\": prec_recall_df[\"loc_precision\"].sum() / n_files,\n", - " \"avg_own_per_precision\": prec_recall_df[\"own_per_precision\"].sum()\n", - " / n_files,\n", - " \"avg_subsidiary_recall\": prec_recall_df[\"subsidiary_recall\"].sum()\n", - " / n_files,\n", - " \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n", - " \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n", - " },\n", - " )\n", - "\n", - "\n", - "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n", - " failed_metadata, dataset = create_inference_dataset(\n", - " filings=ex21_validation_filing_metadata,\n", - " cloud_interface=cloud_interface,\n", - " has_labels=True,\n", - " )\n", - " metadata, extracted = ex21_extraction_model.predict(dataset)\n", - " metadata = pd.concat([failed_metadata, metadata])\n", - "\n", - " jaccard_df, prec_recall_df, incorrect_filenames, metrics = ex21_validation_metrics(extracted, validation_set)\n", - " mlflow.log_metrics(metrics)\n", - " mlflow.pyfunc.log_model(\"exhibit21_extractor\", python_model=ex21_extraction_model)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "45a5b13a-2276-4fb2-80dd-76e3f1184bea", - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - 
"codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/src/mozilla_sec_eia/models/sec10k/utils/pdf.py b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py index df9be07..62a1cb6 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/pdf.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/pdf.py @@ -5,6 +5,7 @@ """ import logging +import os from typing import Any import cv2 @@ -418,3 +419,17 @@ def _pil_img_from_pixmap(pix: fitz.Pixmap) -> Image.Image: img = Image.frombytes(mode, (pix.width, pix.height), pix.samples) return img + + +def get_image_dict(pdfs_dir): + """Create a dictionary with filenames and their Ex. 21 images.""" + image_dict = {} + for pdf_filename in os.listdir(pdfs_dir): + if pdf_filename.split(".")[-1] != "pdf": + continue + pdf_file_path = pdfs_dir / pdf_filename + _, pg = get_pdf_data_from_path(pdf_file_path) + full_pg_img = render_page(pg) + filename = pdf_filename.split(".")[0] + image_dict[filename] = full_pg_img + return image_dict diff --git a/tests/unit/models/sec10k/ex21_model_test.py b/tests/unit/models/sec10k/ex21_model_test.py index 0e89c1e..0b48743 100644 --- a/tests/unit/models/sec10k/ex21_model_test.py +++ b/tests/unit/models/sec10k/ex21_model_test.py @@ -11,13 +11,13 @@ pandas_compute_precision_recall, strip_down_company_names, ) -from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import ( - clean_ex21_validation_set, -) -from mozilla_sec_eia.models.sec10k.ex_21.inference import ( +from mozilla_sec_eia.models.sec10k.ex_21.data.common import ( LABELS, get_flattened_mode_predictions, ) +from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import ( + clean_ex21_validation_set, +) from mozilla_sec_eia.models.sec10k.utils.layoutlm import get_id_label_conversions From 
d6889e398b304c340239d6f52cb1cd2d7b541e78 Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 4 Oct 2024 12:42:23 -0400 Subject: [PATCH 086/161] Minor notebook fixes --- src/mozilla_sec_eia/models/sec10k/__init__.py | 4 +- .../notebooks/exhibit21_extractor.ipynb | 87 +++++++------------ 2 files changed, 35 insertions(+), 56 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 63097e9..da82dc0 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -53,7 +53,8 @@ class TrainConfig(Config): """Config for training notebook.""" - uri: str = "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor" + uri: str | None = None + # "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor" training_set: str = "labeledv0.2" @@ -67,6 +68,7 @@ class TrainConfig(Config): "ex21_failed_parsing_metadata": AssetIn(), "ex21_inference_dataset": AssetIn(), }, + save_notebook_on_failure=True, ) ex21_training_job = define_asset_job( "ex21_training", diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index c03cc4e..ca3ed24 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -14,6 +14,28 @@ "company." ] }, + { + "cell_type": "markdown", + "id": "84aab877-9d59-4ec7-bf4b-c75e216fb1d6", + "metadata": {}, + "source": [ + "## Load upstream assets and configuration\n", + "The following cell can be run interactively to set configuration and load upstream assets. When running the notebook in dagster, this cell will be replaced with assets from the dagster run and dagster run configuration.\n", + "\n", + "### Config\n", + "- `layoutlm_uri`: If `None` the notebook will finetune layoutlm using `ex21_training_data`. 
If `layoutlm_uri` points to a valid model on the mlflow tracking server, the notebook will use the pre-trained model and perform inference on the validation set, logging validation metrics to a child run nested under the mlflow run associated with the pretrained model.\n", + "\n", + "### Upstream assets\n", + "We are using dagster assets to construct training/validation data outside the notebook to allow for easy caching. These datasets are fairly compute intensive to create, so this is useful when iterating on the model using the same data.\n", + "\n", + "NOTE: The notebook will load the most recent version of these assets, so to update the training/validation data you must rerun the dagster assets with desired configuration.\n", + "\n", + "- `ex21_training_data`: Dataset containing labeled data produced in label-studio to train `layoutlm`\n", + "- `ex21_validation_set`: Labeled validation data describing expected inference output on validation filings\n", + "- `ex21_failed_parsing_metadata`: Metadata for any validation filings that couldn't be parsed (usually empty)\n", + "- `ex21_inference_dataset`: Parsed validation filings prepped for inference model" + ] + }, { "cell_type": "code", "execution_count": 1, @@ -27,9 +49,17 @@ "source": [ "import dagstermill\n", "\n", + "from mozilla_sec_eia.models.sec10k import defs\n", + "\n", "context = dagstermill.get_context(op_config={\n", - " \"uri\": None,\n", - "})" + " \"layoutlm_uri\": None,\n", + "})\n", + "\n", + "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")\n", + "\n", + "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n", + "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n", + "ex21_validation_set = defs.load_asset_value(\"ex21_validation_set\")" ] }, { @@ -106,43 +136,6 @@ " }" ] }, - { - "cell_type": "markdown", - "id": "39f0cbeb-7895-46bd-97d1-2c74e5265e12", - "metadata": { - "tags": [] - }, - "source": [ - "#### Load 
training data asset\n", - "\n", - "The following cell will load training data from a dagster asset. Using the dagster asset will allow easily caching the training data which can be computationally intensive to produce. When running this notebook in dagster directly, this cell will be replaced by dagster actually materializing the asset." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f8df608a-32b7-4795-a670-63a2e8772910", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-03 17:47:13 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n" - ] - } - ], - "source": [ - "from mozilla_sec_eia.models.sec10k import defs\n", - "\n", - "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")" - ] - }, { "cell_type": "markdown", "id": "8160263c-8f69-437c-918b-e56ad007961a", @@ -695,22 +688,6 @@ "Next, load an inference dataset containing validation data. This dataset is formatted exactly the same as those that will feed into the `Ex21Extractor` during a production run, but contain only data from the validation set. When creating inference datasets we also produce a metadata dataframe documenting any filings that couldn't be parsed/converted to a PDF. This dataframe should be empty for the validation set, but we will still load it for consistency with production runs." 
] }, - { - "cell_type": "code", - "execution_count": null, - "id": "47c19b41-131f-4059-8f42-931237565a20", - "metadata": { - "tags": [ - "parameters" - ] - }, - "outputs": [], - "source": [ - "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n", - "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n", - "ex21_validation_set = defs.load_asset_value(\"ex21_validation_set\")" - ] - }, { "cell_type": "markdown", "id": "eddcc912-324a-42e9-9841-3a916c6ece6b", From d5e013aaeba7080bf347155cc57f1cdd15a6d18f Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 4 Oct 2024 12:52:33 -0400 Subject: [PATCH 087/161] Fix import in notebook --- .../models/sec10k/notebooks/exhibit21_extractor.ipynb | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index ca3ed24..e350119 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -424,7 +424,9 @@ "from transformers import Pipeline, pipeline\n", "from transformers.tokenization_utils_base import BatchEncoding\n", "\n", - "from mozilla_sec_eia.models.sec10k.inference import get_flattened_mode_predictions\n", + "from mozilla_sec_eia.models.sec10k.ex_21.data.common import (\n", + " get_flattened_mode_predictions,\n", + ")\n", "from mozilla_sec_eia.models.sec10k.utils.layoutlm import (\n", " iob_to_label,\n", ")\n", From f9810db4f777a6550181934c8f41c72f8a4fe092 Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 4 Oct 2024 12:57:04 -0400 Subject: [PATCH 088/161] add device to pipeline --- .../models/sec10k/notebooks/exhibit21_extractor.ipynb | 1 + 1 file changed, 1 insertion(+) diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb 
b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index e350119..fa04db6 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -641,6 +641,7 @@ " model=self.model_components[\"model\"],\n", " tokenizer=self.model_components[\"tokenizer\"],\n", " pipeline_class=LayoutLMInferencePipeline,\n", + " device=torch.device(\"cuda\" if torch.cuda.is_available() else \"cpu\"),\n", " )\n", "\n", " logits = []\n", From 27608819539713a913f55788a64e81ca3daaedf5 Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 4 Oct 2024 13:07:53 -0400 Subject: [PATCH 089/161] Fix signature inference --- .../notebooks/exhibit21_extractor.ipynb | 118 +----------------- 1 file changed, 5 insertions(+), 113 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index fa04db6..d3419ad 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "48f185de-95ef-4194-9245-93f8d603d2e6", "metadata": { "tags": [ @@ -85,7 +85,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49", "metadata": { "tags": [] @@ -156,120 +156,12 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "id": "71d205b2-e6ea-4ad0-982c-22e762269119", "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Some weights of LayoutLMv3ForTokenClassification were not initialized from the model checkpoint at microsoft/layoutlmv3-base and are newly initialized: ['classifier.bias', 'classifier.weight']\n", - "You should probably TRAIN this model on a 
down-stream task to be able to use it for predictions and inference.\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "fafaf3dc8cfe431b90802b61bfe0acc6", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Map: 0%| | 0/159 [00:00\n", - " \n", - " \n", - " [ 6/1000 00:02 < 10:34, 1.57 it/s, Epoch 0.04/8]\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
StepTraining LossValidation Loss

" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2024/10/03 17:52:09 INFO mlflow.tracking._tracking_service.client: 🏃 View run orderly-mare-33 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/a94ac72df36447a489d576ea06a71a4a.\n", - "2024/10/03 17:52:09 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n", - "2024/10/03 17:52:09 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n", - "2024/10/03 17:52:10 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n" - ] - }, - { - "ename": "OutOfMemoryError", - "evalue": "CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 129.75 MiB is free. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 1.89 GiB is allocated by PyTorch, and 979.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mOutOfMemoryError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[6], line 118\u001b[0m\n\u001b[1;32m 106\u001b[0m trainer \u001b[38;5;241m=\u001b[39m Trainer(\n\u001b[1;32m 107\u001b[0m model\u001b[38;5;241m=\u001b[39mmodel,\n\u001b[1;32m 108\u001b[0m args\u001b[38;5;241m=\u001b[39mtraining_args,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 113\u001b[0m compute_metrics\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mlambda\u001b[39;00m p: compute_metrics(p, metric\u001b[38;5;241m=\u001b[39mmetric, label_list\u001b[38;5;241m=\u001b[39mLABELS),\n\u001b[1;32m 114\u001b[0m )\n\u001b[1;32m 116\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m mlflow\u001b[38;5;241m.\u001b[39mstart_run() \u001b[38;5;28;01mas\u001b[39;00m training_run:\n\u001b[1;32m 117\u001b[0m \u001b[38;5;66;03m# Train inside mlflow run. 
Mlflow will automatically handle logging training metrcis\u001b[39;00m\n\u001b[0;32m--> 118\u001b[0m \u001b[43mtrainer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mtrain\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 120\u001b[0m \u001b[38;5;66;03m# Log finetuend model with mlflow\u001b[39;00m\n\u001b[1;32m 121\u001b[0m model \u001b[38;5;241m=\u001b[39m {\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mmodel\u001b[39m\u001b[38;5;124m\"\u001b[39m: trainer\u001b[38;5;241m.\u001b[39mmodel, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mtokenizer\u001b[39m\u001b[38;5;124m\"\u001b[39m: trainer\u001b[38;5;241m.\u001b[39mtokenizer}\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:1938\u001b[0m, in \u001b[0;36mTrainer.train\u001b[0;34m(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)\u001b[0m\n\u001b[1;32m 1936\u001b[0m hf_hub_utils\u001b[38;5;241m.\u001b[39menable_progress_bars()\n\u001b[1;32m 1937\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1938\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43minner_training_loop\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1939\u001b[0m \u001b[43m \u001b[49m\u001b[43margs\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1940\u001b[0m \u001b[43m \u001b[49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresume_from_checkpoint\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1941\u001b[0m \u001b[43m \u001b[49m\u001b[43mtrial\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mtrial\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1942\u001b[0m \u001b[43m \u001b[49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mignore_keys_for_eval\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1943\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/trainer.py:2341\u001b[0m, in \u001b[0;36mTrainer._inner_training_loop\u001b[0;34m(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)\u001b[0m\n\u001b[1;32m 2338\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2339\u001b[0m grad_norm \u001b[38;5;241m=\u001b[39m _grad_norm\n\u001b[0;32m-> 2341\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2343\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcallback_handler\u001b[38;5;241m.\u001b[39mon_optimizer_step(args, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mstate, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcontrol)\n\u001b[1;32m 2345\u001b[0m optimizer_was_run \u001b[38;5;241m=\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator\u001b[38;5;241m.\u001b[39moptimizer_step_was_skipped\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/accelerate/optimizer.py:172\u001b[0m, in \u001b[0;36mAcceleratedOptimizer.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 170\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_accelerate_step_called \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n\u001b[1;32m 171\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 172\u001b[0m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43moptimizer\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mstep\u001b[49m\u001b[43m(\u001b[49m\u001b[43mclosure\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 173\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m 
\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39maccelerator_state\u001b[38;5;241m.\u001b[39mdistributed_type \u001b[38;5;241m==\u001b[39m DistributedType\u001b[38;5;241m.\u001b[39mXLA:\n\u001b[1;32m 174\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgradient_state\u001b[38;5;241m.\u001b[39mis_xla_gradients_synced \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/lr_scheduler.py:130\u001b[0m, in \u001b[0;36mLRScheduler.__init__..patch_track_step_called..wrap_step..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 128\u001b[0m opt \u001b[38;5;241m=\u001b[39m opt_ref()\n\u001b[1;32m 129\u001b[0m opt\u001b[38;5;241m.\u001b[39m_opt_called \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m \u001b[38;5;66;03m# type: ignore[union-attr]\u001b[39;00m\n\u001b[0;32m--> 130\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;21;43m__get__\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43mopt\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mopt\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[38;5;18;43m__class__\u001b[39;49m\u001b[43m)\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:484\u001b[0m, in \u001b[0;36mOptimizer.profile_hook_step..wrapper\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 479\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 480\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mRuntimeError\u001b[39;00m(\n\u001b[1;32m 481\u001b[0m 
\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mfunc\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m must return None or a tuple of (new_args, new_kwargs), but got \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mresult\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m.\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 482\u001b[0m )\n\u001b[0;32m--> 484\u001b[0m out \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 485\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_optimizer_step_code()\n\u001b[1;32m 487\u001b[0m \u001b[38;5;66;03m# call optimizer step post hooks\u001b[39;00m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:89\u001b[0m, in \u001b[0;36m_use_grad_for_differentiable.._use_grad\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 87\u001b[0m torch\u001b[38;5;241m.\u001b[39mset_grad_enabled(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdefaults[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdifferentiable\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 88\u001b[0m torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n\u001b[0;32m---> 89\u001b[0m ret \u001b[38;5;241m=\u001b[39m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 90\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[1;32m 91\u001b[0m 
torch\u001b[38;5;241m.\u001b[39m_dynamo\u001b[38;5;241m.\u001b[39mgraph_break()\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:227\u001b[0m, in \u001b[0;36mAdamW.step\u001b[0;34m(self, closure)\u001b[0m\n\u001b[1;32m 214\u001b[0m beta1, beta2 \u001b[38;5;241m=\u001b[39m cast(Tuple[\u001b[38;5;28mfloat\u001b[39m, \u001b[38;5;28mfloat\u001b[39m], group[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mbetas\u001b[39m\u001b[38;5;124m\"\u001b[39m])\n\u001b[1;32m 216\u001b[0m has_complex \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_init_group(\n\u001b[1;32m 217\u001b[0m group,\n\u001b[1;32m 218\u001b[0m params_with_grad,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 224\u001b[0m state_steps,\n\u001b[1;32m 225\u001b[0m )\n\u001b[0;32m--> 227\u001b[0m \u001b[43madamw\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 228\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams_with_grad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 229\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 230\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 231\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 232\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 233\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 234\u001b[0m \u001b[43m \u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 235\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 236\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 237\u001b[0m 
\u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlr\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 238\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mweight_decay\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 239\u001b[0m \u001b[43m \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43meps\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 240\u001b[0m \u001b[43m \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mmaximize\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 241\u001b[0m \u001b[43m \u001b[49m\u001b[43mforeach\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mforeach\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 242\u001b[0m \u001b[43m \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mcapturable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 243\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mdifferentiable\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 244\u001b[0m \u001b[43m \u001b[49m\u001b[43mfused\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgroup\u001b[49m\u001b[43m[\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfused\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 245\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mgrad_scale\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 246\u001b[0m \u001b[43m \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mgetattr\u001b[39;49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mfound_inf\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43;01mNone\u001b[39;49;00m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 247\u001b[0m \u001b[43m \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 248\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m loss\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/optimizer.py:161\u001b[0m, in 
\u001b[0;36m_disable_dynamo_if_unsupported..wrapper..maybe_fallback\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 159\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m disabled_func(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 160\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 161\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:767\u001b[0m, in \u001b[0;36madamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, foreach, capturable, differentiable, fused, grad_scale, found_inf, has_complex, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize)\u001b[0m\n\u001b[1;32m 764\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 765\u001b[0m func \u001b[38;5;241m=\u001b[39m _single_tensor_adamw\n\u001b[0;32m--> 767\u001b[0m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 768\u001b[0m \u001b[43m \u001b[49m\u001b[43mparams\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 769\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrads\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 770\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avgs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 771\u001b[0m \u001b[43m \u001b[49m\u001b[43mexp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 772\u001b[0m \u001b[43m \u001b[49m\u001b[43mmax_exp_avg_sqs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 773\u001b[0m \u001b[43m \u001b[49m\u001b[43mstate_steps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 774\u001b[0m \u001b[43m 
\u001b[49m\u001b[43mamsgrad\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mamsgrad\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 775\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta1\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta1\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 776\u001b[0m \u001b[43m \u001b[49m\u001b[43mbeta2\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mbeta2\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 777\u001b[0m \u001b[43m \u001b[49m\u001b[43mlr\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mlr\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 778\u001b[0m \u001b[43m \u001b[49m\u001b[43mweight_decay\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mweight_decay\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 779\u001b[0m \u001b[43m \u001b[49m\u001b[43meps\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meps\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 780\u001b[0m \u001b[43m \u001b[49m\u001b[43mmaximize\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmaximize\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 781\u001b[0m \u001b[43m \u001b[49m\u001b[43mcapturable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcapturable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 782\u001b[0m \u001b[43m \u001b[49m\u001b[43mdifferentiable\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mdifferentiable\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 783\u001b[0m \u001b[43m \u001b[49m\u001b[43mgrad_scale\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgrad_scale\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 784\u001b[0m \u001b[43m \u001b[49m\u001b[43mfound_inf\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mfound_inf\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 785\u001b[0m \u001b[43m \u001b[49m\u001b[43mhas_complex\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mhas_complex\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 786\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/torch/optim/adamw.py:600\u001b[0m, in \u001b[0;36m_multi_tensor_adamw\u001b[0;34m(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, grad_scale, found_inf, amsgrad, beta1, beta2, lr, weight_decay, eps, maximize, capturable, differentiable, has_complex)\u001b[0m\n\u001b[1;32m 598\u001b[0m exp_avg_sq_sqrt \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39m_foreach_sqrt(device_max_exp_avg_sqs)\n\u001b[1;32m 599\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 600\u001b[0m exp_avg_sq_sqrt \u001b[38;5;241m=\u001b[39m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_foreach_sqrt\u001b[49m\u001b[43m(\u001b[49m\u001b[43mdevice_exp_avg_sqs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 602\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_div_(exp_avg_sq_sqrt, bias_correction2_sqrt)\n\u001b[1;32m 603\u001b[0m torch\u001b[38;5;241m.\u001b[39m_foreach_add_(exp_avg_sq_sqrt, eps)\n", - "\u001b[0;31mOutOfMemoryError\u001b[0m: CUDA out of memory. Tried to allocate 148.00 MiB. GPU 0 has a total capacity of 3.93 GiB of which 129.75 MiB is free. Including non-PyTorch memory, this process has 2.94 GiB memory in use. Of the allocated memory 1.89 GiB is allocated by PyTorch, and 979.98 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. 
See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)" - ] - } - ], + "outputs": [], "source": [ "import mlflow\n", "from datasets import (\n", @@ -806,7 +698,7 @@ " \"exhibit21_extractor\",\n", " python_model=Ex21Extractor(),\n", " artifacts={\"model_components\": model_uri},\n", - " signature=infer_signature(dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n", + " signature=infer_signature(ex21_inference_dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n", " )" ] } From 1dcacfaf9a217b4b102ba1ed23da603dfb9c08bd Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 4 Oct 2024 13:21:07 -0400 Subject: [PATCH 090/161] Fix notebook dagster config --- src/mozilla_sec_eia/models/sec10k/__init__.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index da82dc0..1680f4a 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -53,8 +53,9 @@ class TrainConfig(Config): """Config for training notebook.""" - uri: str | None = None - # "runs:/c363159de2f5439c93dd972d51247370/layoutlm_extractor" + layoutlm_uri: str | None = ( + "runs:/32355367ed444dd0b07f2d1b845f62d8/layoutlm_extractor" + ) training_set: str = "labeledv0.2" From 39bb45bdd2622c0ab27237ce7d5690090c58892a Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 4 Oct 2024 13:24:00 -0400 Subject: [PATCH 091/161] Fix config param name --- .../models/sec10k/notebooks/exhibit21_extractor.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index d3419ad..3183831 100644 --- 
a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -224,7 +224,7 @@ "\n", "# Only finetune if configured to do so\n", "training_run_id = None\n", - "if context.op_config[\"uri\"] is None:\n", + "if context.op_config[\"layoutlm_uri\"] is None:\n", " id2label, label2id = get_id_label_conversions(LABELS)\n", " # Change temp_dir to save training data locally for inspection\n", " # Cache/prepare training data\n", @@ -500,7 +500,7 @@ "if training_run_id is not None:\n", " model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n", "else:\n", - " model_uri = context.op_config[\"uri\"]\n", + " model_uri = context.op_config[\"layoutlm_uri\"]\n", "\n", "model_info = mlflow.models.get_model_info(model_uri)\n", "\n", From cb83862235cfbbd1cfe40a2b78bcd636a5882efe Mon Sep 17 00:00:00 2001 From: zschira Date: Sat, 5 Oct 2024 10:03:41 -0400 Subject: [PATCH 092/161] Partition training data --- .../models/sec10k/ex_21/data/__init__.py | 24 ++++++++------ .../notebooks/exhibit21_extractor.ipynb | 24 +++++++++++--- .../exhibit21_layout_classifier.ipynb | 33 +++++++++++++++++++ 3 files changed, 66 insertions(+), 15 deletions(-) create mode 100644 src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py index da5525f..2d5eff1 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py @@ -4,7 +4,13 @@ from tempfile import TemporaryDirectory import pandas as pd -from dagster import AssetOut, Config, asset, multi_asset +from dagster import ( + AssetExecutionContext, + AssetOut, + StaticPartitionsDefinition, + asset, + multi_asset, +) from mozilla_sec_eia.library import validation_helpers @@ -15,20 +21,18 @@ from .training import format_as_ner_annotations -class 
Ex21TrainingConfig(Config): - """Configure asset to produce ex21 training data.""" - - training_set: str = "labeledv0.2" - - -@asset -def ex21_training_data(config: Ex21TrainingConfig): +@asset( + partitions_def=StaticPartitionsDefinition( + ["labeledv0.0", "labeledv0.1", "labeledv0.2"] + ) +) +def ex21_training_data(context: AssetExecutionContext): """Construct training dataset for ex 21 extraction.""" with TemporaryDirectory() as temp_dir: ner_annotations = format_as_ner_annotations( labeled_json_path=Path(temp_dir) / "sec10k_filings" / "labeled_jsons", pdfs_path=Path(temp_dir) / "sec10k_filings" / "pdfs", - gcs_folder_name=config.training_set, + gcs_folder_name=context.partition_key, ) return ner_annotations diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index 3183831..cc92a1e 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -55,7 +55,7 @@ " \"layoutlm_uri\": None,\n", "})\n", "\n", - "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")\n", + "ex21_training_data = defs.load_asset_value(\"ex21_training_data\", partition_key=\"labeledv0.2\")\n", "\n", "ex21_failed_parsing_metadata = defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n", "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n", @@ -607,6 +607,13 @@ " \"\"\"Compute validation metrics for Ex. 
21 extraction.\"\"\"\n", " shared_cols = validation_df.columns.intersection(computed_df.columns)\n", " validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n", + " # strip llc and other company name parts for the similarity comparison\n", + " computed_df[\"subsidiary\"] = validation_helpers.strip_down_company_names(\n", + " computed_df[\"subsidiary\"]\n", + " )\n", + " validation_df[\"subsidiary\"] = validation_helpers.strip_down_company_names(\n", + " validation_df[\"subsidiary\"]\n", + " )\n", " n_equal = 0\n", " validation_filenames = validation_df[\"id\"].unique()\n", " n_files = len(validation_filenames)\n", @@ -622,15 +629,22 @@ " validation_df[\"id\"] == filename\n", " ].reset_index(drop=True)\n", " # check if the tables are exactly equal\n", - " if extracted_table_df.equals(validation_table_df):\n", - " # TODO: strip llc and other company strings before comparison\n", + " if extracted_table_df[[\"subsidiary\", \"loc\", \"own_per\"]].equals(\n", + " validation_table_df[[\"subsidiary\", \"loc\", \"own_per\"]]\n", + " ):\n", " n_equal += 1\n", " else:\n", " incorrect_files.append(filename)\n", - " # compute precision and recall for each column\n", + " # compute jaccard sim + precision and recall for each column\n", " table_metrics_dict[filename] = {}\n", " jaccard_dict[filename] = {}\n", " for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n", + " extracted_table_df[col] = validation_helpers.fill_nulls_for_comparison(\n", + " extracted_table_df[col]\n", + " )\n", + " validation_table_df[col] = validation_helpers.fill_nulls_for_comparison(\n", + " validation_table_df[col]\n", + " )\n", " table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n", " extracted_table_df, validation_table_df, value_col=col\n", " )\n", @@ -669,7 +683,7 @@ " \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n", " \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n", " },\n", - " )" + " )\n" ] }, { diff 
--git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb new file mode 100644 index 0000000..1781454 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb @@ -0,0 +1,33 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "80cda90e-c2cb-4b71-b10d-cb23d7b51b3f", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From c71593c355866d74fc25b90dd72dfeaa8c43c62f Mon Sep 17 00:00:00 2001 From: zschira Date: Sat, 5 Oct 2024 10:27:23 -0400 Subject: [PATCH 093/161] Add partitions to notebook asset --- src/mozilla_sec_eia/models/sec10k/__init__.py | 2 +- .../models/sec10k/ex_21/data/__init__.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 1680f4a..9bb3557 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -56,7 +56,6 @@ class TrainConfig(Config): layoutlm_uri: str | None = ( "runs:/32355367ed444dd0b07f2d1b845f62d8/layoutlm_extractor" ) - training_set: str = "labeledv0.2" exhibit21_extractor = define_dagstermill_asset( @@ -70,6 +69,7 @@ class TrainConfig(Config): "ex21_inference_dataset": AssetIn(), }, save_notebook_on_failure=True, + partitions_def=ex_21.data.TRAINING_DATA_VERSION_PARTS, ) ex21_training_job = define_asset_job( "ex21_training", diff --git 
a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py index 2d5eff1..06860f1 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py @@ -20,12 +20,12 @@ from .inference import create_inference_dataset from .training import format_as_ner_annotations - -@asset( - partitions_def=StaticPartitionsDefinition( - ["labeledv0.0", "labeledv0.1", "labeledv0.2"] - ) +TRAINING_DATA_VERSION_PARTS = StaticPartitionsDefinition( + ["labeledv0.0", "labeledv0.1", "labeledv0.2"] ) + + +@asset(partitions_def=TRAINING_DATA_VERSION_PARTS) def ex21_training_data(context: AssetExecutionContext): """Construct training dataset for ex 21 extraction.""" with TemporaryDirectory() as temp_dir: From 4efa5152eca11edc79228a4f0a04a8c2a3c149f8 Mon Sep 17 00:00:00 2001 From: zschira Date: Sun, 6 Oct 2024 09:24:51 -0400 Subject: [PATCH 094/161] Update ex21 labels --- .../validation_data/ex21_labels.csv | 1612 +++++++++++++---- 1 file changed, 1275 insertions(+), 337 deletions(-) diff --git a/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv index 3d51f4f..006f344 100644 --- a/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv +++ b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv @@ -140,7 +140,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 61339-0001161728-17-000004,"State Energy Services, LLC",, 107815-0000107815-17-000106,"ATC Management, Inc.",Wisconsin,26.24 107815-0000107815-17-000106,American Transmission Company LLC,Wisconsin,23.04 -107815-0000107815-17-000106,Bostco LLC,Wisconsin,100 +107815-0000107815-17-000106,Bostco LLC,Wisconsin,100.0 1317577-0001193125-13-356794,"Elemental Energy, Inc.",Arizona, 1317577-0001193125-13-356794,Klondyke Construction LLC,Arizona, 1317577-0001193125-13-356794,"Pike Electric, LLC",North 
Carolina, @@ -150,61 +150,61 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 1317577-0001193125-13-356794,"Pine Valley Power, Inc.",Utah, 1317577-0001193125-13-356794,"Synergetic Design Holdings, Inc.",Delaware, 1317577-0001193125-13-356794,"UC Synergetic, Inc.",South Carolina, -40545-0000040545-04-000013,"AMERICAN SILICONES, INC.",Indiana,100 -40545-0000040545-04-000013,"BENTLY NEVADA, LLC",Delaware,100 -40545-0000040545-04-000013,CARIBE GE INTERNATIONAL ELECTRIC METERS CORP,Puerto Rico,100 -40545-0000040545-04-000013,"CARDINAL COGEN, INC.",Delaware,100 -40545-0000040545-04-000013,"DATEX-OHMEDA, INC.",Delaware,100 -40545-0000040545-04-000013,ELANO CORPORATION,Ohio,100 -40545-0000040545-04-000013,"GEAE TECHNOLOGY, INC.",Delaware,100 -40545-0000040545-04-000013,GE CGR EUROPE,France,100 -40545-0000040545-04-000013,"GE DRIVES and CONTROLS, INC.",Delaware,100 -40545-0000040545-04-000013,GE DRUCK HOLDINGS LIMITED,Delaware,100 -40545-0000040545-04-000013,"GE ELECTRIC CANADA, INC.",Canada,100 -40545-0000040545-04-000013,"GE ENERGY EUROPE, BV",Netherlands,100 -40545-0000040545-04-000013,GE ENERGY PARTS INC.,Delaware,100 -40545-0000040545-04-000013,"GE ENERGY PRODUCTS, INC.",Delaware,100 -40545-0000040545-04-000013,"GE ENERGY SERVICES, INC.",Delaware,100 -40545-0000040545-04-000013,"GE ENERGY SERVICES-DALLAS, LP",Delaware,100 -40545-0000040545-04-000013,"GE ENGINE SERVICES DISTRIBUTION, LLC.",Delaware,100 -40545-0000040545-04-000013,"GE ENGINE SERVICES, INC.",Delaware,100 -40545-0000040545-04-000013,GE FANUC AUTOMATION CORPORATION,Delaware,50 -40545-0000040545-04-000013,GE GAS TURBINES (GREENVILLE) L.L.C,Delaware,100 -40545-0000040545-04-000013,"GE HUNGARY CO., LTD",Hungary,100 -40545-0000040545-04-000013,"GE INTERLOGIX, INC.",Delaware,100 -40545-0000040545-04-000013,"GE INVESTMENT, INC.",Nevada,100 -40545-0000040545-04-000013,"GE KEPPEL ENERGY SERVICES PTE, INC.",Singapore,100 -40545-0000040545-04-000013,"GE MEDICAL GLOBAL TECHNOLOGY CO., 
LLC",Delaware,100 -40545-0000040545-04-000013,"GE MEDICAL SYSTEMS INFORMATION TECHNOLOGIES, INC.",Wisconsin,100 -40545-0000040545-04-000013,"GE MEDICAL SYSTEMS, INC.",Delaware,100 -40545-0000040545-04-000013,GE PACKAGED POWER L.P.,Delaware,100 -40545-0000040545-04-000013,"GE PETROCHEMICALS, INC.",Delaware,100 -40545-0000040545-04-000013,"GE PLASTIC FINISHING, INC.",Delaware,100 -40545-0000040545-04-000013,GE PLASTICS ESPANA ScPA,"Spain & Canary Islands, Balearic Island",100 -40545-0000040545-04-000013,GE PLASTICS PACIFIC PTE. LTD,Singapore,100 -40545-0000040545-04-000013,"GE POLYMERLAND, INC",Delaware,100 -40545-0000040545-04-000013,GE POWER SYSTEMS LICENSING INC,Delaware,100 -40545-0000040545-04-000013,"GE QUARTZ, INC.",Delaware,100 -40545-0000040545-04-000013,"GE SILICONES WV, LLC",West Virginia,100 -40545-0000040545-04-000013,"GE SUPERABRASIVES, INC.",Delaware,100 -40545-0000040545-04-000013,"GE TRANSPORTATION PARTS, LLC",Delaware,100 -40545-0000040545-04-000013,"GE TRANSPORTATION SERVICES, LLC.",Delaware,100 -40545-0000040545-04-000013,"GE TRANSPORTATION SYSTEMS GLOBAL SIGNALING, LLC.",Delaware,100 -40545-0000040545-04-000013,GEA PRODUCTS LP,Delaware,100 -40545-0000040545-04-000013,GENERAL ELECTRIC INTERNATIONAL (BENELUX) BV,Netherlands,100 -40545-0000040545-04-000013,"GENERAL ELECTRIC INTERNATIONAL, INC.",Delaware,100 -40545-0000040545-04-000013,"GRANITE SERVICES, INC.",Delaware,100 -40545-0000040545-04-000013,NATIONAL BROADCASTING COMPANY (NBC),Delaware,100 -40545-0000040545-04-000013,"NUCLEAR FUEL HOLDING CO.,INC",Delaware,100 -40545-0000040545-04-000013,NUOVO PIGNONE HOLDING S.P.A,Italy,100 -40545-0000040545-04-000013,OEC MEDICAL SYSTEMS INC,Delaware,100 -40545-0000040545-04-000013,PII LIMITED,United Kingdom & Northern Ireland,100 -40545-0000040545-04-000013,"REUTER-STOKES, INC.",Delaware,100 -40545-0000040545-04-000013,"SENSING SOLUTIONS, INC.",Delaware,100 -40545-0000040545-04-000013,"VICEROY, INC.",Delaware,100 -40545-0000040545-04-000013,"GENERAL 
ELECTRIC CAPITAL SERVICES, INC.",Delaware,100 -40545-0000040545-04-000013,General Electric Capital Corporation,New York,100 -40545-0000040545-04-000013,GE Global Insurance Holding Corporation,Missouri,100 +40545-0000040545-04-000013,"AMERICAN SILICONES, INC.",Indiana,100.0 +40545-0000040545-04-000013,"BENTLY NEVADA, LLC",Delaware,100.0 +40545-0000040545-04-000013,CARIBE GE INTERNATIONAL ELECTRIC METERS CORP,Puerto Rico,100.0 +40545-0000040545-04-000013,"CARDINAL COGEN, INC.",Delaware,100.0 +40545-0000040545-04-000013,"DATEX-OHMEDA, INC.",Delaware,100.0 +40545-0000040545-04-000013,ELANO CORPORATION,Ohio,100.0 +40545-0000040545-04-000013,"GEAE TECHNOLOGY, INC.",Delaware,100.0 +40545-0000040545-04-000013,GE CGR EUROPE,France,100.0 +40545-0000040545-04-000013,"GE DRIVES and CONTROLS, INC.",Delaware,100.0 +40545-0000040545-04-000013,GE DRUCK HOLDINGS LIMITED,Delaware,100.0 +40545-0000040545-04-000013,"GE ELECTRIC CANADA, INC.",Canada,100.0 +40545-0000040545-04-000013,"GE ENERGY EUROPE, BV",Netherlands,100.0 +40545-0000040545-04-000013,GE ENERGY PARTS INC.,Delaware,100.0 +40545-0000040545-04-000013,"GE ENERGY PRODUCTS, INC.",Delaware,100.0 +40545-0000040545-04-000013,"GE ENERGY SERVICES, INC.",Delaware,100.0 +40545-0000040545-04-000013,"GE ENERGY SERVICES-DALLAS, LP",Delaware,100.0 +40545-0000040545-04-000013,"GE ENGINE SERVICES DISTRIBUTION, LLC.",Delaware,100.0 +40545-0000040545-04-000013,"GE ENGINE SERVICES, INC.",Delaware,100.0 +40545-0000040545-04-000013,GE FANUC AUTOMATION CORPORATION,Delaware,50.0 +40545-0000040545-04-000013,GE GAS TURBINES (GREENVILLE) L.L.C,Delaware,100.0 +40545-0000040545-04-000013,"GE HUNGARY CO., LTD",Hungary,100.0 +40545-0000040545-04-000013,"GE INTERLOGIX, INC.",Delaware,100.0 +40545-0000040545-04-000013,"GE INVESTMENT, INC.",Nevada,100.0 +40545-0000040545-04-000013,"GE KEPPEL ENERGY SERVICES PTE, INC.",Singapore,100.0 +40545-0000040545-04-000013,"GE MEDICAL GLOBAL TECHNOLOGY CO., LLC",Delaware,100.0 +40545-0000040545-04-000013,"GE MEDICAL 
SYSTEMS INFORMATION TECHNOLOGIES, INC.",Wisconsin,100.0 +40545-0000040545-04-000013,"GE MEDICAL SYSTEMS, INC.",Delaware,100.0 +40545-0000040545-04-000013,GE PACKAGED POWER L.P.,Delaware,100.0 +40545-0000040545-04-000013,"GE PETROCHEMICALS, INC.",Delaware,100.0 +40545-0000040545-04-000013,"GE PLASTIC FINISHING, INC.",Delaware,100.0 +40545-0000040545-04-000013,GE PLASTICS ESPANA ScPA,"Spain & Canary Islands, Balearic Island",100.0 +40545-0000040545-04-000013,GE PLASTICS PACIFIC PTE. LTD,Singapore,100.0 +40545-0000040545-04-000013,"GE POLYMERLAND, INC",Delaware,100.0 +40545-0000040545-04-000013,GE POWER SYSTEMS LICENSING INC,Delaware,100.0 +40545-0000040545-04-000013,"GE QUARTZ, INC.",Delaware,100.0 +40545-0000040545-04-000013,"GE SILICONES WV, LLC",West Virginia,100.0 +40545-0000040545-04-000013,"GE SUPERABRASIVES, INC.",Delaware,100.0 +40545-0000040545-04-000013,"GE TRANSPORTATION PARTS, LLC",Delaware,100.0 +40545-0000040545-04-000013,"GE TRANSPORTATION SERVICES, LLC.",Delaware,100.0 +40545-0000040545-04-000013,"GE TRANSPORTATION SYSTEMS GLOBAL SIGNALING, LLC.",Delaware,100.0 +40545-0000040545-04-000013,GEA PRODUCTS LP,Delaware,100.0 +40545-0000040545-04-000013,GENERAL ELECTRIC INTERNATIONAL (BENELUX) BV,Netherlands,100.0 +40545-0000040545-04-000013,"GENERAL ELECTRIC INTERNATIONAL, INC.",Delaware,100.0 +40545-0000040545-04-000013,"GRANITE SERVICES, INC.",Delaware,100.0 +40545-0000040545-04-000013,NATIONAL BROADCASTING COMPANY (NBC),Delaware,100.0 +40545-0000040545-04-000013,"NUCLEAR FUEL HOLDING CO.,INC",Delaware,100.0 +40545-0000040545-04-000013,NUOVO PIGNONE HOLDING S.P.A,Italy,100.0 +40545-0000040545-04-000013,OEC MEDICAL SYSTEMS INC,Delaware,100.0 +40545-0000040545-04-000013,PII LIMITED,United Kingdom & Northern Ireland,100.0 +40545-0000040545-04-000013,"REUTER-STOKES, INC.",Delaware,100.0 +40545-0000040545-04-000013,"SENSING SOLUTIONS, INC.",Delaware,100.0 +40545-0000040545-04-000013,"VICEROY, INC.",Delaware,100.0 +40545-0000040545-04-000013,"GENERAL ELECTRIC 
CAPITAL SERVICES, INC.",Delaware,100.0 +40545-0000040545-04-000013,General Electric Capital Corporation,New York,100.0 +40545-0000040545-04-000013,GE Global Insurance Holding Corporation,Missouri,100.0 39547-0001047469-03-024149,"Turtle Shell, Inc. (f/k/a Snapper, Inc.)","Georgia, USA", 39547-0001047469-03-024149,Actava Financial Ltd.,Delaware, 39547-0001047469-03-024149,"Actava SHL, Inc.",Delaware, @@ -271,11 +271,6 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 49728-0001144204-11-070058,IEC Electronics Corp.-Albuquerque,New Mexico, 49728-0001144204-11-070058,"Dynamic Research and Testing Laboratories, LLC",New Mexico, 49728-0001144204-11-070058,"Southern California Braiding, Inc.",Delaware, -200155-0000021267-99-000027,"CIG Exploration, Inc",Delaware, -200155-0000021267-99-000027,CIG Field Services Company,Delaware, -200155-0000021267-99-000027,"Great Divide Gas Services, LLC",Colorado,73 -200155-0000021267-99-000027,Colorado Water Supply Company,Delaware, -200155-0000021267-99-000027,Colorado Interstate Production Company,Delaware, 315858-0000315858-19-000023,"Woodbridge Holdings, LLC",Florida, 315858-0000315858-19-000023,"BBX Capital Florida, LLC",Florida, 315858-0000315858-19-000023,"Eden Services, Inc.",Florida, @@ -382,7 +377,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 315858-0000315858-19-000023,"Hialeah Multifamily, LLC",Florida, 315858-0000315858-19-000023,"BBX Residential Victoria Park, LLC",Florida, 315858-0000315858-19-000023,"Premier Flagler, LLC",Florida, -315858-0000315858-19-000023,Banc Servicing Center LLC,Florida, +315858-0000315858-19-000023,"Banc Servicing Center, LLC",Florida, 315858-0000315858-19-000023,"Fidelity Service, LLC",Florida, 315858-0000315858-19-000023,"Fidelity Tax, LLC",Florida, 315858-0000315858-19-000023,"Heartwood 3, LLC",Florida, @@ -509,11 +504,11 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 718877-0001047469-08-007085,Treyarch Corporation,Delaware, 
718877-0001047469-08-007085,"Toys For Bob, Inc.",California, 718877-0001047469-08-007085,"Vicarious Visions, Inc.",New York, -811669-0000950123-05-002610,International Wine & Spirits Ltd.,Delaware,100 -811669-0000950123-05-002610,Ste. Michelle Wine Estates Ltd.,Washington,100 -811669-0000950123-05-002610,U.S. Smokeless Tobacco Company,Delaware,100 -811669-0000950123-05-002610,U.S. Smokeless Tobacco Manufacturing Limited Partnership,Delaware,100 -811669-0000950123-05-002610,U.S. Smokeless Tobacco Brands Inc.,Delaware,100 +811669-0000950123-05-002610,International Wine & Spirits Ltd.,Delaware,100.0 +811669-0000950123-05-002610,Ste. Michelle Wine Estates Ltd.,Washington,100.0 +811669-0000950123-05-002610,U.S. Smokeless Tobacco Company,Delaware,100.0 +811669-0000950123-05-002610,U.S. Smokeless Tobacco Manufacturing Limited Partnership,Delaware,100.0 +811669-0000950123-05-002610,U.S. Smokeless Tobacco Brands Inc.,Delaware,100.0 857501-0001065949-17-000087,Jacobs & Company,West Virginia, 857501-0001065949-17-000087,"FS Investments, Inc.",West Virginia, 857501-0001065949-17-000087,"Triangle Surety Agency, Inc.",West Virginia, @@ -626,22 +621,6 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 908255-0000908255-13-000006,BorgWarner Turbo and Emissions Systems de Mexico S.A. 
de C.V.,, 908255-0000908255-13-000006,BorgWarner (Thailand) Limited,, 908255-0000908255-13-000006,"BorgWarner (China) Research & Development Co., Ltd.",, -913614-0000930661-01-502777,Bactolac Pharmaceutical Inc.,Delaware, -913614-0000930661-01-502777,"ANI Pharmaceuticals, Inc.",Mississippi, -913614-0000930661-01-502777,NL Acquisition Company,Delaware, -923472-0000892569-97-000821,Samantha Hotel Corporation,Delaware, -923472-0000892569-97-000821,"RFS, Inc.",Tennessee, -923472-0000892569-97-000821,Doubletree Partners,Delaware, -923472-0000892569-97-000821,Doubletree Hotels Corporation,Arizona, -923472-0000892569-97-000821,"Doubletree of Phoenix, Inc.",Delaware, -923472-0000892569-97-000821,INNCO Corporation,Arizona, -923472-0000892569-97-000821,HOSCO Corporation,Arizona, -923472-0000892569-97-000821,"DT Management, Inc.",Arizona, -923472-0000892569-97-000821,"DT Real Estate, Inc.",Arizona, -923472-0000892569-97-000821,"Doubletree Hotel Systems, Inc.",Arizona, -923472-0000892569-97-000821,Harbor Hotels Corporation,Delaware, -923472-0000892569-97-000821,"DTM Burlingame, Inc.",Arizona, -923472-0000892569-97-000821,"Red Lion Hotels, Inc.",Delaware, 1484565-0001564590-20-008705,Soleno Therapeutics UK Ltd.,United Kingdom, 1484565-0001564590-20-008705,Soleno Therapeutics Europe Ltd.,Ireland, 1484565-0001564590-20-008705,"Essentialis, Inc.",Delaware, @@ -715,9 +694,9 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 354707-0000354707-19-000043,"Hawaii Electric Light Company, Inc.",Hawaii, 354707-0000354707-19-000043,"Renewable Hawaii, Inc.",Hawaii, 354707-0000354707-19-000043,Uluwehiokama Biofuels Corp.,Hawaii, -354707-0000354707-19-000043,HECO Capital Trust III,Delaware, +354707-0000354707-19-000043,HECO Capital Trust III (a statutory trust),Delaware, 354707-0000354707-19-000043,"ASB Hawaii, Inc.",Hawaii, -354707-0000354707-19-000043,"American Savings Bank, F.S.B.",, +354707-0000354707-19-000043,"American Savings Bank, F.S.B.",federally chartered, 
354707-0000354707-19-000043,"The Old Oahu Tug Service, Inc. ",Hawaii, 354707-0000354707-19-000043,"Pacific Current, LLC",Hawaii, 354707-0000354707-19-000043,"Hamakua Holdings, LLC",Hawaii, @@ -739,7 +718,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 84557-0001046861-06-000007,The Southern Connecticut Gas Company,Connecticut, 100826-0001193125-09-042636,Ameren Corporation,Missouri, 100826-0001193125-09-042636,Ameren Development Company,Missouri, -100826-0001193125-09-042636,"Enporion, Inc.",Delaware,21 +100826-0001193125-09-042636,"Enporion, Inc.",Delaware,21.0 100826-0001193125-09-042636,Missouri Central Railroad Company,Delaware, 100826-0001193125-09-042636,CIPSCO Leasing Company,Illinois, 100826-0001193125-09-042636,"Gateway Energy Systems, L.C.",Missouri,89.1 @@ -749,7 +728,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 100826-0001193125-09-042636,Coffeen and Western Railroad Company,Illinois, 100826-0001193125-09-042636,Ameren Energy Marketing Company,Illinois, 100826-0001193125-09-042636,Illinois Materials Supply Co.,Illinois, -100826-0001193125-09-042636,"Electric Energy, Inc.",Illinois,80 +100826-0001193125-09-042636,"Electric Energy, Inc.",Illinois,80.0 100826-0001193125-09-042636,Midwest Electric Power Inc.,Illinois, 100826-0001193125-09-042636,Joppa and Eastern Railroad Company,Illinois, 100826-0001193125-09-042636,"Met South, Inc.",Illinois, @@ -765,76 +744,68 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 100826-0001193125-09-042636,CLC Aircraft Leasing LLC,Delaware, 100826-0001193125-09-042636,QST Enterprises Inc.,Illinois, 100826-0001193125-09-042636,ESE Land Corporation,Illinois, -100826-0001193125-09-042636,California/Nevada Development L.L.C.,Delaware,15 +100826-0001193125-09-042636,California/Nevada Development L.L.C.,Delaware,15.0 100826-0001193125-09-042636,Energy Risk Assurance Company,Vermont, 100826-0001193125-09-042636,Missouri Energy Risk Assurance Company 
LLC,Missouri, -100826-0001193125-09-042636,"Illinois Power Company, d/b/a AmerenIP",Illinois, +100826-0001193125-09-042636,Illinois Power Company,Illinois, 100826-0001193125-09-042636,Illinois Power Securitization Limited Liability Company,Delaware, 100826-0001193125-09-042636,Illinois Power Special Purpose Trust,Delaware, 100826-0001193125-09-042636,Union Electric Company,Missouri, -100826-0001193125-09-042636,Fuelco LLC,Delaware, -81033-0000950117-06-000927,Public Service Electric and Gas Company,New Jersey,100 -81033-0000950117-06-000927,PSEG Power LLC ,Delaware,100 -81033-0000950117-06-000927,PSEG Fossil LLC,Delaware,100 -81033-0000950117-06-000927,PSEG Energy Resources & Trade LLC,Delaware,100 -81033-0000950117-06-000927,PSEG Energy Holdings L.L.C. ,New Jersey,100 -81033-0000950117-06-000927,PSEG Resources L.L.C.,New Jersey,100 -81033-0000950117-06-000927,PSEG Global L.L.C.,New Jersey,100 -81033-0000950117-06-000927,PSEG Global International Holdings LLC,Delaware,100 -4904-0000004904-09-000040,"American Electric Power Company, Inc.",New York,100 -4904-0000004904-09-000040,American Electric Power Service Corporation,New York,100 -4904-0000004904-09-000040,"AEP C&I Company, LLC",Delaware,100 -4904-0000004904-09-000040,"AEP Coal, Inc.",Nevada,100 -4904-0000004904-09-000040,"AEP Communications, Inc.",Ohio,100 -4904-0000004904-09-000040,"AEP Credit, Inc.",Delaware,100 -4904-0000004904-09-000040,AEP Generating Company,Ohio,100 -4904-0000004904-09-000040,"AEP Investments, Inc.",Ohio,100 -4904-0000004904-09-000040,AEP Nonutility Funding LLC,Delaware,100 -4904-0000004904-09-000040,"AEP Power Marketing, Inc.",Ohio,100 -4904-0000004904-09-000040,"AEP Pro Serv, Inc.",Ohio,100 -4904-0000004904-09-000040,"AEP Resources, Inc.",Ohio,100 -4904-0000004904-09-000040,"AEP T&D Services, LLC",Delaware,100 -4904-0000004904-09-000040,"AEP Transmission Holding Company, LLC",Delaware,100 -4904-0000004904-09-000040,"AEP Utilities, Inc.",Delaware,100 -4904-0000004904-09-000040,AEP Texas 
Central Company,Texas,100 -4904-0000004904-09-000040,AEP Texas Central Transition Funding LLC,Delaware,100 -4904-0000004904-09-000040,AEP Texas Central Transition Funding II LLC,Delaware,100 -4904-0000004904-09-000040,AEP Texas North Company,Texas,100 -4904-0000004904-09-000040,AEP Texas North Generation Company LLC,Delaware,100 -4904-0000004904-09-000040,"CSW Energy, Inc.",Texas,100 -4904-0000004904-09-000040,"CSW Energy Services, Inc.",Delaware,100 -4904-0000004904-09-000040,"CSW International, Inc.",Delaware,100 -4904-0000004904-09-000040,"Electric Transmission Texas, LLC",Delaware,50 -4904-0000004904-09-000040,AEP Utility Funding LLC,Delaware,100 +100826-0001193125-09-042636,Fuelco LLC,Delaware,33.3 +4904-0000004904-09-000040,"American Electric Power Company, Inc.",New York, +4904-0000004904-09-000040,American Electric Power Service Corporation,New York,100.0 +4904-0000004904-09-000040,"AEP C&I Company, LLC",Delaware,100.0 +4904-0000004904-09-000040,"AEP Coal, Inc.",Nevada,100.0 +4904-0000004904-09-000040,"AEP Communications, Inc.",Ohio,100.0 +4904-0000004904-09-000040,"AEP Credit, Inc.",Delaware,100.0 +4904-0000004904-09-000040,AEP Generating Company,Ohio,100.0 +4904-0000004904-09-000040,"AEP Investments, Inc.",Ohio,100.0 +4904-0000004904-09-000040,AEP Nonutility Funding LLC,Delaware,100.0 +4904-0000004904-09-000040,"AEP Power Marketing, Inc.",Ohio,100.0 +4904-0000004904-09-000040,"AEP Pro Serv, Inc.",Ohio,100.0 +4904-0000004904-09-000040,"AEP Resources, Inc.",Ohio,100.0 +4904-0000004904-09-000040,"AEP T&D Services, LLC",Delaware,100.0 +4904-0000004904-09-000040,"AEP Transmission Holding Company, LLC",Delaware,100.0 +4904-0000004904-09-000040,"AEP Utilities, Inc.",Delaware,100.0 +4904-0000004904-09-000040,AEP Texas Central Company,Texas,100.0 +4904-0000004904-09-000040,AEP Texas Central Transition Funding LLC,Delaware,100.0 +4904-0000004904-09-000040,AEP Texas Central Transition Funding II LLC,Delaware,100.0 +4904-0000004904-09-000040,AEP Texas North 
Company,Texas,100.0 +4904-0000004904-09-000040,AEP Texas North Generation Company LLC,Delaware,100.0 +4904-0000004904-09-000040,"CSW Energy, Inc.",Texas,100.0 +4904-0000004904-09-000040,"CSW Energy Services, Inc.",Delaware,100.0 +4904-0000004904-09-000040,"CSW International, Inc.",Delaware,100.0 +4904-0000004904-09-000040,"Electric Transmission Texas, LLC",Delaware,50.0 +4904-0000004904-09-000040,AEP Utility Funding LLC,Delaware,100.0 4904-0000004904-09-000040,Appalachian Power Company,Virginia,98.7 -4904-0000004904-09-000040,Cedar Coal Co.,West Virginia,100 -4904-0000004904-09-000040,Central Appalachian Coal Company,West Virginia,100 -4904-0000004904-09-000040,Central Coal Company,West Virginia,50 -4904-0000004904-09-000040,Southern Appalachian Coal Company,West Virginia,100 -4904-0000004904-09-000040,Columbus Southern Power Company,Ohio,100 -4904-0000004904-09-000040,"Colomet, Inc.",Ohio,100 -4904-0000004904-09-000040,Conesville Coal Preparation Company ,Ohio,100 +4904-0000004904-09-000040,Cedar Coal Co.,West Virginia,100.0 +4904-0000004904-09-000040,Central Appalachian Coal Company,West Virginia,100.0 +4904-0000004904-09-000040,Central Coal Company,West Virginia,50.0 +4904-0000004904-09-000040,Southern Appalachian Coal Company,West Virginia,100.0 +4904-0000004904-09-000040,Columbus Southern Power Company,Ohio,100.0 +4904-0000004904-09-000040,"Colomet, Inc.",Ohio,100.0 +4904-0000004904-09-000040,Conesville Coal Preparation Company ,Ohio,100.0 4904-0000004904-09-000040,Ohio Valley Electric Corporation,Ohio,4.3 -4904-0000004904-09-000040,Indiana-Kentucky Electric Corporation,Indiana,100 -4904-0000004904-09-000040,Franklin Real Estate Company,Pennsylvania,100 -4904-0000004904-09-000040,Indiana Michigan Power Company,Indiana,100 -4904-0000004904-09-000040,Blackhawk Coal Company,Utah,100 -4904-0000004904-09-000040,Price River Coal Company ,Indiana,100 -4904-0000004904-09-000040,Kentucky Power Company,Kentucky,100 -4904-0000004904-09-000040,Kingsport Power 
Company,Virginia,100 +4904-0000004904-09-000040,Indiana-Kentucky Electric Corporation,Indiana,100.0 +4904-0000004904-09-000040,Franklin Real Estate Company,Pennsylvania,100.0 +4904-0000004904-09-000040,Indiana Michigan Power Company,Indiana,100.0 +4904-0000004904-09-000040,Blackhawk Coal Company,Utah,100.0 +4904-0000004904-09-000040,Price River Coal Company ,Indiana,100.0 +4904-0000004904-09-000040,Kentucky Power Company,Kentucky,100.0 +4904-0000004904-09-000040,Kingsport Power Company,Virginia,100.0 4904-0000004904-09-000040,Ohio Power Company ,Ohio,99.4 -4904-0000004904-09-000040,Cardinal Operating Company,Ohio,50 -4904-0000004904-09-000040,Central Coal Company,West Virginia,50 +4904-0000004904-09-000040,Cardinal Operating Company,Ohio,50.0 +4904-0000004904-09-000040,Central Coal Company,West Virginia,50.0 4904-0000004904-09-000040,Ohio Valley Electric Corporation,Ohio,39.2 -4904-0000004904-09-000040,Indiana-Kentucky Electric Corporation,Indiana,100 +4904-0000004904-09-000040,Indiana-Kentucky Electric Corporation,Indiana,100.0 4904-0000004904-09-000040,"Power Tree Carbon Company, LLC",Delaware,9.2 -4904-0000004904-09-000040,Public Service Company of Oklahoma,Oklahoma,100 -4904-0000004904-09-000040,Southwestern Electric Power Company,Delaware,100 -4904-0000004904-09-000040,"Dolet Hills Lignite Company, LLC ",Delaware,100 -4904-0000004904-09-000040,Southwestern Arkansas Utilities Corporation,Arkansas,100 -4904-0000004904-09-000040,SWEPCo Capital Trust I,Delaware,100 +4904-0000004904-09-000040,Public Service Company of Oklahoma,Oklahoma,100.0 +4904-0000004904-09-000040,Southwestern Electric Power Company,Delaware,100.0 +4904-0000004904-09-000040,"Dolet Hills Lignite Company, LLC ",Delaware,100.0 +4904-0000004904-09-000040,Southwestern Arkansas Utilities Corporation,Arkansas,100.0 +4904-0000004904-09-000040,SWEPCo Capital Trust I,Delaware,100.0 4904-0000004904-09-000040,The Arklahoma Corporation,Arkansas,47.6 -4904-0000004904-09-000040,Wheeling Power Company,West 
Virginia,100 +4904-0000004904-09-000040,Wheeling Power Company,West Virginia,100.0 46207-0001104659-13-011461,"Hawaiian Electric Company, Inc.",Hawaii, 46207-0001104659-13-011461,"Maui Electric Company, Limited",Hawaii, 46207-0001104659-13-011461,"Hawaii Electric Light Company, Inc.",Hawaii, @@ -842,46 +813,11 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 46207-0001104659-13-011461,Uluwehiokama Biofuels Corp.,Hawaii, 46207-0001104659-13-011461,HECO Capital Trust III,Delaware, 46207-0001104659-13-011461,"American Savings Holdings, Inc.",Hawaii, -46207-0001104659-13-011461,"American Savings Bank, F.S.B.",, +46207-0001104659-13-011461,"American Savings Bank, F.S.B.",federally chartered, 46207-0001104659-13-011461,"HEI Properties, Inc.",Hawaii, 46207-0001104659-13-011461,Hawaiian Electric Industries Capital Trust II ,Delaware, 46207-0001104659-13-011461,Hawaiian Electric Industries Capital Trust III,Delaware, 46207-0001104659-13-011461,"The Old Oahu Tug Service, Inc.",Hawaii, -205402-0000950114-99-000043,Graybar Foreign Sales Corporation,Barbados, -205402-0000950114-99-000043,"Graybar International, Inc.",Missouri, -205402-0000950114-99-000043,"Graybar Financial Services, Inc.",Missouri, -205402-0000950114-99-000043,"Graybar Electric de Mexico, S. DE R.L. 
DE C.V.,",Mexican, -205402-0000950114-99-000043,Graybar Electric Limited,Nova Scotia, -205402-0000950114-99-000043,"Graybar Foundation, Inc",Missouri, -205402-0000950114-99-000043,"Graybar Services, Inc.",Illinois, -205402-0000950114-99-000043,"Distribution Associates, Inc.",Missouri, -205402-0000950114-99-000043,Graybar Electric (Ontario) Limited,Ontario, -205402-0000950114-99-000043,Graybar International PTE LTD,Singaporean, -205402-0000950114-99-000043,"Graybar Business Services, Inc.",Missouri, -205402-0000950114-99-000043,Graybar International de Chile Limitada,Chile, -9342-0000009342-95-000008,"Baldor of Arkansas, Inc.",Arkansas,100 -9342-0000009342-95-000008,"Baldor of Nevada, Inc.",Nevada,100 -9342-0000009342-95-000008,BEC Business Trust,Massachusetts,100 -9342-0000009342-95-000008,"Baldor of Texas, L.P.",Texas,100 -9342-0000009342-95-000008,"Baldor International, Inc.",U.S. Virgin Islands,100 -9342-0000009342-95-000008,"Carolina Capacitors, Inc.",South Carolina,100 -9342-0000009342-95-000008,"Southwestern Die Casting Co., Inc.",Arkansas,100 -9342-0000009342-95-000008,"Sweo Controls, Inc.",Washington,100 -9342-0000009342-95-000008,"Baldor Holdings, Inc.",Delaware,100 -9342-0000009342-95-000008,"Baldor de Mexico, S.A. de C.V.",Mexico,100 -9342-0000009342-95-000008,"Baldor ASR, AG",Switzerland,100 -9342-0000009342-95-000008,Baldor ASR GmbH fuer Antriebstechnik,Germany,100 -9342-0000009342-95-000008,Baldor ASR U.K. Limited,United Kingdom,100 -9342-0000009342-95-000008,Australian Baldor Pty. Limited,Australia,60 -9342-0000009342-95-000008,Baldor Electric (Far East) PTE. Ltd.,Singapore,60 -9342-0000009342-95-000008,Baldor Electric (Thailand) Ltd.,Thailand,100 -9342-0000009342-95-000008,Baldor Industrial Automation PTE. 
Ltd.,Singapore,100 -9342-0000009342-95-000008,Baldor Electric (Indonesia) Ltd.,Indonesia,100 -9342-0000009342-95-000008,Baldor of Nevada,, -9342-0000009342-95-000008,Baldor Business Trust (LP),, -9342-0000009342-95-000008,Baldor of Arkansas (GP),, -9342-0000009342-95-000008,"Baldor Holdings, Inc.",, -9342-0000009342-95-000008,Baldor Electric (Far East) PTE. Ltd.,, 9534-0000897069-05-000574,Bandag A.G,Switzerland, 9534-0000897069-05-000574,Bandag Canada Ltd.,Canada, 9534-0000897069-05-000574,Bandag Europe N.V,Belgium, @@ -897,7 +833,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 18647-0001169232-08-000603,"Phoenix Development Company, Inc.",New York, 18647-0001169232-08-000603,Central Hudson Enterprises Corporation,New York, 18647-0001169232-08-000603,"Griffith Energy Services, Inc.",New York, -20947-0001031296-06-000044,Ohio Edison Company – Incorporated in Ohio,Ohio, +20947-0001031296-06-000044,Ohio Edison Company,Ohio, 20947-0001031296-06-000044,The Cleveland Electric Illuminating Company,Ohio, 20947-0001031296-06-000044,The Toledo Edison Company,Ohio, 20947-0001031296-06-000044,Centerior Service Company,Ohio, @@ -925,94 +861,54 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 20947-0001031296-06-000044,FirstEnergy Nuclear Generation Corp.,Ohio, 34067-0001104659-06-016592,Nobelclad Europe S.A.,"Rivesaltes, France", 34067-0001104659-06-016592,Nitro Metall Aktiebolag,"Likenas, Sweden", -38725-0000038725-17-000042,Bombas Leao SA,Brazil,100 -38725-0000038725-17-000042,Cookson & Zinn (PTL) Limited ,United Kingdom,100 -38725-0000038725-17-000042,Coverco S.r.l.,Italy,100 -38725-0000038725-17-000042,FE Latin America B.V.,Netherlands,100 -38725-0000038725-17-000042,FELE C.V.,Netherlands,100 -38725-0000038725-17-000042,"Franklin Control Systems, Inc.",Oregon,100 -38725-0000038725-17-000042,Franklin Electric (Australia) Pty. 
Ltd.,Australia,100 -38725-0000038725-17-000042,Franklin Electric (Chile) Ltda,Chile,100 -38725-0000038725-17-000042,Franklin Electric (SEA) Pty. Ltd.,Singapore,100 -38725-0000038725-17-000042,Franklin Electric (South Africa) Pty. Ltd.,South Africa,100 -38725-0000038725-17-000042,"Franklin Electric (Suzhou) Co., Ltd.",China,100 -38725-0000038725-17-000042,Franklin Electric (Zambia) Ltd.,Zambia,100 -38725-0000038725-17-000042,Franklin Electric B.V.,Netherlands,100 -38725-0000038725-17-000042,Franklin Electric Botswana Pty. Ltd.,Botswana,100 -38725-0000038725-17-000042,"Franklin Electric Canada, Inc.",Canada,100 -38725-0000038725-17-000042,Franklin Electric Colombia SAS,Colombia,100 -38725-0000038725-17-000042,Franklin Electric Europa GmbH,Germany,100 -38725-0000038725-17-000042,Franklin Electric Germany Holding GmbH,Germany,100 -38725-0000038725-17-000042,Franklin Electric Holding B.V.,Netherlands,100 -38725-0000038725-17-000042,Franklin Electric India Private Ltd.,Indiana,100 -38725-0000038725-17-000042,Franklin Electric Industria de Motobombas SA,Brazil,100 -38725-0000038725-17-000042,"Franklin Electric International, Inc.",Delaware,100 -38725-0000038725-17-000042,Franklin Electric NL BV,Netherlands,100 -38725-0000038725-17-000042,Franklin Electric spol s.r.o.,Czech Republic,100 -38725-0000038725-17-000042,"Franklin Electric Subsidiaries, LLC",Indiana,100 -38725-0000038725-17-000042,"Franklin Electric Trading (Shanghai) Co., Ltd.",China,100 -38725-0000038725-17-000042,Franklin Fueling Sistemas de Combustiveis Ltda,Brazil,100 -38725-0000038725-17-000042,Franklin Fueling Systems (Beijing) Company Ltd.,China,100 -38725-0000038725-17-000042,Franklin Fueling Systems Australia Pty. 
Ltd.,Australia,100 -38725-0000038725-17-000042,Franklin Fueling Systems France SARL,France,100 -38725-0000038725-17-000042,Franklin Fueling Systems India Private Ltd.,India,100 -38725-0000038725-17-000042,Franklin Fueling Systems Ltd.,United Kingdom,100 -38725-0000038725-17-000042,Franklin Fueling Systems GmbH,Germany,100 -38725-0000038725-17-000042,"Franklin Fueling Systems, Inc.",Indiana,100 -38725-0000038725-17-000042,Impo Motor Pompa Sanayi ve Ticaret A.S.,Turkey,90 -38725-0000038725-17-000042,"Intelligent Controls, LLC",Maine,100 -38725-0000038725-17-000042,Motores Electricos Sumergibles de Mexico S. de R.L de C.V.,Mexico,100 -38725-0000038725-17-000042,Motores Franklin S.A. de C.V.,Mexico,100 -38725-0000038725-17-000042,Motori Sommersi Riavvolgibili S.r.l.,Italy,75 -38725-0000038725-17-000042,Pioneer Pump Holdings Pty.,Australia,100 -38725-0000038725-17-000042,Pioneer Pump Ltd.,United Kingdom,100 -38725-0000038725-17-000042,Pioneer Pump Pty. Ltd. ,South Africa,100 -38725-0000038725-17-000042,Pioneer Pump Solutions Ltd.,United Kingdom,100 -38725-0000038725-17-000042,"Pioneer Pump, Inc.",Texas,100 -38725-0000038725-17-000042,Pluga Pumps and Motors Private Limited,India,70 -38725-0000038725-17-000042,Servicios de MESMEX S de SRL de CV,Mexico,100 -38725-0000038725-17-000042,Franklin Electric S.r.l,Italy,100 -38725-0000038725-17-000042,Franklin Wadcorpp India Private Limited,India,65 -60549-0001047469-98-012481,Louisville Gas and Electric Company,Kentucky, -60549-0001047469-98-012481,LG&E Capital Corp.,Kentucky, -60549-0001047469-98-012481,LG&E Power Inc.,Delaware, -60549-0001047469-98-012481,LG&E Power Operations Inc.,California, -60549-0001047469-98-012481,LG&E Energy Marketing Inc.,Oklahoma, -60549-0001047469-98-012481,LG&E International Inc.,Delaware, -60549-0001047469-98-012481,Louisville Gas and Electric Company,, -60549-0001047469-98-012481,LG&E Capital Corp.,, -60549-0001047469-98-012481,LG&E Power Inc.,, -60549-0001047469-98-012481,LG&E International 
Inc.,, -60549-0001047469-98-012481,LG&E Energy Marketing Inc.,, -60549-0001047469-98-012481,LG&E Power Operations ,, -60549-0001047469-98-012481,LG&E Energy Marketing Inc.,, -61986-0000061986-99-000003,"Femco Machine Co., Inc.",Nevada, -61986-0000061986-99-000003,Kolpak Manufacturing Company,Tennessee, -61986-0000061986-99-000003,"Manitex, Inc.",Texas, -61986-0000061986-99-000003,"Manitowoc MEC, Inc.",Nevada, -61986-0000061986-99-000003,"Manitowoc Equipment Works PTE, Ltd.",Singapore, -61986-0000061986-99-000003,"Manitowoc Equipment Works, Inc.",Nevada, -61986-0000061986-99-000003,"Manitowoc Europe Holdings, Ltd.",England, -61986-0000061986-99-000003,Manitowoc Europe Limited,England, -61986-0000061986-99-000003,Manitowoc International Sales Corp. ,Barbados, -61986-0000061986-99-000003,"Manitowoc Korea Company, Ltd.",Korea, -61986-0000061986-99-000003,"Manitowoc Marine Group, Inc",Nevada, -61986-0000061986-99-000003,"Manitowoc Re-Manufacturing, Inc.",Wisconsin, -61986-0000061986-99-000003,"Manitowoc Western Company, Inc.",Wisconsin, -61986-0000061986-99-000003,North Central Crane & Excavator Sales Corp.,Nevada, -61986-0000061986-99-000003,"West Manitowoc, Inc.",Wisconsin, -61986-0000061986-99-000003,"Manitowoc CP, Inc. ",Nevada, -61986-0000061986-99-000003,"Manitowoc FP, Inc.",Nevada, -61986-0000061986-99-000003,"KMT Refrigeration, Inc.",Wisconsin, -61986-0000061986-99-000003,"Manitowoc Foodservice Group, Inc.",Nevada, -61986-0000061986-99-000003,"Manitowoc Crane Group, Inc.",Nevada, -61986-0000061986-99-000003,"Manitowoc Ice, Inc.",Wisconsin, -61986-0000061986-99-000003,"Manitowoc Cranes, Inc.",Wisconsin, -61986-0000061986-99-000003,"SerVend International, Inc.",Nevada, -61986-0000061986-99-000003,"Manitowoc Beverage Systems, Inc. 
",Nevada, -61986-0000061986-99-000003,KMT Sales Corporation,Nevada, -61986-0000061986-99-000003,SerVend Sales Corporation,Nevada, -61986-0000061986-99-000003,"USTC, Inc.",Nevada, +38725-0000038725-17-000042,Bombas Leao SA,Brazil,100.0 +38725-0000038725-17-000042,Cookson & Zinn (PTL) Limited ,United Kingdom,100.0 +38725-0000038725-17-000042,Coverco S.r.l.,Italy,100.0 +38725-0000038725-17-000042,FE Latin America B.V.,Netherlands,100.0 +38725-0000038725-17-000042,FELE C.V.,Netherlands,100.0 +38725-0000038725-17-000042,"Franklin Control Systems, Inc.",Oregon,100.0 +38725-0000038725-17-000042,Franklin Electric (Australia) Pty. Ltd.,Australia,100.0 +38725-0000038725-17-000042,Franklin Electric (Chile) Ltda,Chile,100.0 +38725-0000038725-17-000042,Franklin Electric (SEA) Pty. Ltd.,Singapore,100.0 +38725-0000038725-17-000042,Franklin Electric (South Africa) Pty. Ltd.,South Africa,100.0 +38725-0000038725-17-000042,"Franklin Electric (Suzhou) Co., Ltd.",China,100.0 +38725-0000038725-17-000042,Franklin Electric (Zambia) Ltd.,Zambia,100.0 +38725-0000038725-17-000042,Franklin Electric B.V.,Netherlands,100.0 +38725-0000038725-17-000042,Franklin Electric Botswana Pty. 
Ltd.,Botswana,100.0 +38725-0000038725-17-000042,"Franklin Electric Canada, Inc.",Canada,100.0 +38725-0000038725-17-000042,Franklin Electric Colombia SAS,Colombia,100.0 +38725-0000038725-17-000042,Franklin Electric Europa GmbH,Germany,100.0 +38725-0000038725-17-000042,Franklin Electric Germany Holding GmbH,Germany,100.0 +38725-0000038725-17-000042,Franklin Electric Holding B.V.,Netherlands,100.0 +38725-0000038725-17-000042,Franklin Electric India Private Ltd.,India,100.0 +38725-0000038725-17-000042,Franklin Electric Industria de Motobombas SA,Brazil,100.0 +38725-0000038725-17-000042,"Franklin Electric International, Inc.",Delaware,100.0 +38725-0000038725-17-000042,Franklin Electric NL BV,Netherlands,100.0 +38725-0000038725-17-000042,Franklin Electric spol s.r.o.,Czech Republic,100.0 +38725-0000038725-17-000042,"Franklin Electric Subsidiaries, LLC",Indiana,100.0 +38725-0000038725-17-000042,"Franklin Electric Trading (Shanghai) Co., Ltd.",China,100.0 +38725-0000038725-17-000042,Franklin Fueling Sistemas de Combustiveis Ltda,Brazil,100.0 +38725-0000038725-17-000042,Franklin Fueling Systems (Beijing) Company Ltd.,China,100.0 +38725-0000038725-17-000042,Franklin Fueling Systems Australia Pty. Ltd.,Australia,100.0 +38725-0000038725-17-000042,Franklin Fueling Systems France SARL,France,100.0 +38725-0000038725-17-000042,Franklin Fueling Systems India Private Ltd.,India,100.0 +38725-0000038725-17-000042,Franklin Fueling Systems Ltd.,United Kingdom,100.0 +38725-0000038725-17-000042,Franklin Fueling Systems GmbH,Germany,100.0 +38725-0000038725-17-000042,"Franklin Fueling Systems, Inc.",Indiana,100.0 +38725-0000038725-17-000042,Impo Motor Pompa Sanayi ve Ticaret A.S.,Turkey,90.0 +38725-0000038725-17-000042,"Intelligent Controls, LLC",Maine,100.0 +38725-0000038725-17-000042,Motores Electricos Sumergibles de Mexico S. de R.L de C.V.,Mexico,100.0 +38725-0000038725-17-000042,Motores Franklin S.A. 
de C.V.,Mexico,100.0 +38725-0000038725-17-000042,Motori Sommersi Riavvolgibili S.r.l.,Italy,75.0 +38725-0000038725-17-000042,Pioneer Pump Holdings Pty.,Australia,100.0 +38725-0000038725-17-000042,Pioneer Pump Ltd.,United Kingdom,100.0 +38725-0000038725-17-000042,Pioneer Pump Pty. Ltd. ,South Africa,100.0 +38725-0000038725-17-000042,Pioneer Pump Solutions Ltd.,United Kingdom,100.0 +38725-0000038725-17-000042,"Pioneer Pump, Inc.",Texas,100.0 +38725-0000038725-17-000042,Pluga Pumps and Motors Private Limited,India,70.0 +38725-0000038725-17-000042,Servicios de MESMEX S de SRL de CV,Mexico,100.0 +38725-0000038725-17-000042,Franklin Electric S.r.l,Italy,100.0 +38725-0000038725-17-000042,Franklin Wadcorpp India Private Limited,India,65.0 71675-0001046861-02-000012,Central Maine Power Company,Maine, 71675-0001046861-02-000012,Connecticut Natural Gas Corporation,Connecticut, 71675-0001046861-02-000012,"Energy East Enterprises, Inc.",Maine, @@ -1045,25 +941,18 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 77227-0001031296-09-000008,"GPU Power, Inc.",Delaware, 77227-0001031296-09-000008,FirstEnergy Foundation,Ohio, 77227-0001031296-09-000008,FirstEnergy Fiber Holdings Corp.,Delaware, -78778-0000078778-97-000019,Piper Jaffray Inc.,Delaware,100 -78778-0000078778-97-000019,Piper Jaffray International Inc.,Delaware,100 -78778-0000078778-97-000019,Piper Capital Management Incorporated,Delaware,100 -78778-0000078778-97-000019,Piper Trust Company,Minnesota,100 -78778-0000078778-97-000019,Premier Acceptance Corporation,Delaware,100 -78778-0000078778-97-000019,Piper Realty Management Incorporated,Delaware,100 -78778-0000078778-97-000019,"Piper Jaffray Ventures, Inc. 
",Delaware,100 78890-0000078890-14-000004,The Pittston Company,Delaware, 78890-0000078890-14-000004,"Glen Allen Development, Inc.",Delaware, -78890-0000078890-14-000004,"Liberty National Development Company, LLC (32.5%)",Delaware, -78890-0000078890-14-000004,"New Liberty Residential Urban Renewal Company, LLC (17.5%)",New Jersey, +78890-0000078890-14-000004,"Liberty National Development Company, LLC",Delaware,32.5 +78890-0000078890-14-000004,"New Liberty Residential Urban Renewal Company, LLC",New Jersey,17.5 78890-0000078890-14-000004,Pittston Services Group Inc.,Virginia, 78890-0000078890-14-000004,Brink’s Holding Company,Delaware, 78890-0000078890-14-000004,"Brink’s, Incorporated (“BI”)",Delaware, 78890-0000078890-14-000004,"Brink’s Delaware, LLC",Delaware, 78890-0000078890-14-000004,Brink’s Express Company,Illinois, 78890-0000078890-14-000004,"Brink’s Global Payments, LLC",Delaware, -78890-0000078890-14-000004,Brink’s St. Lucia Ltd.,St. Lucia,26 -78890-0000078890-14-000004,Security Services (Brink’s Jordan) Company Ltd,Jordan,95 +78890-0000078890-14-000004,Brink’s St. Lucia Ltd.,St. Lucia,26.0 +78890-0000078890-14-000004,Security Services (Brink’s Jordan) Company Ltd,Jordan,95.0 78890-0000078890-14-000004,"Servicio Pan Americano de Protección S.A. de C.V. (“Serpaprosa”) (by Trust, BI is Settlor of Trust)",Mexico,99.75 78890-0000078890-14-000004,"Aeroflash Mensajeria, S.A. de C.V.",Mexico,99.75 78890-0000078890-14-000004,"Inmobiliaria, A.J., S.A. de C.V.",Mexico,99.75 @@ -1111,15 +1000,15 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 78890-0000078890-14-000004,Inversiones Petra S.A.,Chile, 78890-0000078890-14-000004,"Brink’s Chile, S.A. 
(BICV is beneficial owner)",Chile, 78890-0000078890-14-000004,Organismo Tecnico de Capacitacion Brink’s SpA,Chile, -78890-0000078890-14-000004,Brink’s de Colombia S.A.,Colombia,58 -78890-0000078890-14-000004,Domesa de Colombia S.A.,Colombia,70 -78890-0000078890-14-000004,Procesos & Canje S.A.,Colombia,58 +78890-0000078890-14-000004,Brink’s de Colombia S.A.,Colombia,58.0 +78890-0000078890-14-000004,Domesa de Colombia S.A.,Colombia,70.0 +78890-0000078890-14-000004,Procesos & Canje S.A.,Colombia,58.0 78890-0000078890-14-000004,Sistema Integrado Multiple de Pago Electronicos S.A. (“SIMPLE S.A.”),Colombia,14.5 78890-0000078890-14-000004,"Brink’s Canada Holdings, B.V. (BICV is beneficial owner)",Netherlands, 78890-0000078890-14-000004,Brink’s Canada Limited,Canada, 78890-0000078890-14-000004,"Brink’s Security Services, B.V.",Netherlands, 78890-0000078890-14-000004,"Centro Americana de Inversiones Balboa, C.A. (BICV is beneficial owner)",Panama, -78890-0000078890-14-000004,Hermes Transporte Blindados S.A.,Peru,36 +78890-0000078890-14-000004,Hermes Transporte Blindados S.A.,Peru,36.0 78890-0000078890-14-000004,"Brink’s Dutch Holdings, B.V. (BICV is beneficial owner)",Netherlands, 78890-0000078890-14-000004,"Brink’s Hellenic Holdings, B.V. (“BHH”)",Netherlands, 78890-0000078890-14-000004,"Athena Marathon Holdings, B.V. (“AMH”)",Netherlands, @@ -1131,33 +1020,33 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 78890-0000078890-14-000004,Brink’s Hermes Cash & Valuable Services S.A. 
(“Brink’s Cash & Valuable Services SA”),Greece, 78890-0000078890-14-000004,Brink's Hellas Guarding & Cash Services Joint VentureAnonymi Etairia,Greece, 78890-0000078890-14-000004,Brink’s Hermes Security Services SA (“Brink’s Security Services S.A.”),Greece, -78890-0000078890-14-000004,Brink’s Hermes Aviation Security Services S.A.,Greece,70 -78890-0000078890-14-000004,Hellenic Central Station SA - Reception & Processing Centre of Electronic Signals (“Hellenic Central Station”),Greece,10 +78890-0000078890-14-000004,Brink’s Hermes Aviation Security Services S.A.,Greece,70.0 +78890-0000078890-14-000004,Hellenic Central Station SA - Reception & Processing Centre of Electronic Signals (“Hellenic Central Station”),Greece,10.0 78890-0000078890-14-000004,"BHM Human Resources Mexico Holding, S.A. de C.V.",Mexico, 78890-0000078890-14-000004,"Servicios Administrativos Consolidados BM de Mexico, S.A. de C.V.",Mexico, 78890-0000078890-14-000004,"BM Control y Administracion de Personal, S.A. de C.V.",Mexico, 78890-0000078890-14-000004,BHM Human Resources Solutions B.V.,Netherlands, 78890-0000078890-14-000004,Brink’s Argentina S.A.,Argentina, -78890-0000078890-14-000004,Brink’s Seguridad Corporativa S.A.,Argentina,98 +78890-0000078890-14-000004,Brink’s Seguridad Corporativa S.A.,Argentina,98.0 78890-0000078890-14-000004,Brink’s India Private Limited,India, -78890-0000078890-14-000004,Brinks Mongolia LLC,Mongolia,51 -78890-0000078890-14-000004,Brink’s RUS Holding B.V.,Netherlands,70 -78890-0000078890-14-000004,Limited Liability Company Brink’s Management,Russian Federation,70 -78890-0000078890-14-000004,Limited Liability Company Brink’s Management,Russian Federation,70 -78890-0000078890-14-000004,Non Banking Credit Organization BRINKS,Russian Federation,70 -78890-0000078890-14-000004,Servicio Pan Americano de Proteccion C.A.,Venezuela,61 -78890-0000078890-14-000004,"Aeropanamericano, C.A.",Venezuela,61 -78890-0000078890-14-000004,"Aero Sky Panama, S.A.",Panama,61 
-78890-0000078890-14-000004,"Artes Graficas Avanzadas 98, C.A.",Venezuela,61 -78890-0000078890-14-000004,"Blindados de Zulia Occidente, C.A.",Venezuela,61 -78890-0000078890-14-000004,"Blindados de Oriente, S.A.",Venezuela,61 -78890-0000078890-14-000004,"Blindados Panamericanos, S.A.",Venezuela,61 -78890-0000078890-14-000004,"Blindados Centro Occidente, S.A.",Venezuela,61 -78890-0000078890-14-000004,"Documentos Mercantiles, S.A.",Venezuela,61 -78890-0000078890-14-000004,"Instituto Panamericano, C.A.",Venezuela,61 -78890-0000078890-14-000004,"Intergraficas Panama, S.A.",Panama,61 -78890-0000078890-14-000004,"Panamericana de Vigilancia, S.A. ",Venezuela,61 -78890-0000078890-14-000004,"Transportes Expresos, C.A. ",Venezuela,61 +78890-0000078890-14-000004,Brinks Mongolia LLC,Mongolia,51.0 +78890-0000078890-14-000004,Brink’s RUS Holding B.V.,Netherlands,70.0 +78890-0000078890-14-000004,Limited Liability Company Brink’s Management,Russian Federation,70.0 +78890-0000078890-14-000004,Limited Liability Company Brink’s Management,Russian Federation,70.0 +78890-0000078890-14-000004,Non Banking Credit Organization BRINKS,Russian Federation,70.0 +78890-0000078890-14-000004,Servicio Pan Americano de Proteccion C.A.,Venezuela,61.0 +78890-0000078890-14-000004,"Aeropanamericano, C.A.",Venezuela,61.0 +78890-0000078890-14-000004,"Aero Sky Panama, S.A.",Panama,61.0 +78890-0000078890-14-000004,"Artes Graficas Avanzadas 98, C.A.",Venezuela,61.0 +78890-0000078890-14-000004,"Blindados de Zulia Occidente, C.A.",Venezuela,61.0 +78890-0000078890-14-000004,"Blindados de Oriente, S.A.",Venezuela,61.0 +78890-0000078890-14-000004,"Blindados Panamericanos, S.A.",Venezuela,61.0 +78890-0000078890-14-000004,"Blindados Centro Occidente, S.A.",Venezuela,61.0 +78890-0000078890-14-000004,"Documentos Mercantiles, S.A.",Venezuela,61.0 +78890-0000078890-14-000004,"Instituto Panamericano, C.A.",Venezuela,61.0 +78890-0000078890-14-000004,"Intergraficas Panama, S.A.",Panama,61.0 
+78890-0000078890-14-000004,"Panamericana de Vigilancia, S.A. ",Venezuela,61.0 +78890-0000078890-14-000004,"Transportes Expresos, C.A. ",Venezuela,61.0 78890-0000078890-14-000004,Brink’s Panama S.A.,Panama, 78890-0000078890-14-000004,Inmobiliaria Brink’s Panama S.A.,Panama, 78890-0000078890-14-000004,Brink’s Global Services Poland Sp.zo.o.,Poland, @@ -1179,9 +1068,9 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 78890-0000078890-14-000004,Brink’s Évolution S.A.R.L.,France, 78890-0000078890-14-000004,Est Valeurs SAS,France, 78890-0000078890-14-000004,Brink’s Formation S.A.R.L.,France, -78890-0000078890-14-000004,Brink’s Madagascar S.A.,Madagascar,60 +78890-0000078890-14-000004,Brink’s Madagascar S.A.,Madagascar,60.0 78890-0000078890-14-000004,Brink’s Maroc S.A.S.,Morocco, -78890-0000078890-14-000004,Brink’s Qatar L.L.C.,Qatar,49 +78890-0000078890-14-000004,Brink’s Qatar L.L.C.,Qatar,49.0 78890-0000078890-14-000004,Brink’s Réunion S.A.R.L.,St. Denis, 78890-0000078890-14-000004,Brink’s Security Services SAS,France, 78890-0000078890-14-000004,Brink’s Teleservices SAS,France, @@ -1207,7 +1096,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 78890-0000078890-14-000004,Brink’s Diamond (Shanghai) Company Limited,China, 78890-0000078890-14-000004,Brink’s Jewellery Trading (Shanghai) Company Limited,China, 78890-0000078890-14-000004,Brink’s Security Transportation (Shanghai) Company Limited,China, -78890-0000078890-14-000004,Brink’s Global Services Korea Limited – Yunan Hoesa Brink’s Global,Korea,80 +78890-0000078890-14-000004,Brink’s Global Services Korea Limited – Yunan Hoesa Brink’s Global,Korea,80.0 78890-0000078890-14-000004,Brink’s Nederland B.V.,Netherlands, 78890-0000078890-14-000004,Brink’s Geldverwerking B.V.,Netherlands, 78890-0000078890-14-000004,Brink’s Regional Services B.V.,Netherlands, @@ -1218,12 +1107,12 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 78890-0000078890-14-000004,Redetrel – 
Rede Transacoes Eletronicas Ltda.,Brazil, 78890-0000078890-14-000004,ePago International Inc.,Panama, 78890-0000078890-14-000004,"Corporación ePago de Venezuela, C.A.",Venezuela, -78890-0000078890-14-000004,e-Pago de Colombia S.A. ,Colombia,75 +78890-0000078890-14-000004,e-Pago de Colombia S.A. ,Colombia,75.0 78890-0000078890-14-000004,Brink’s ePago S.A. de C.V.,Mexico, 78890-0000078890-14-000004,Brink’s Global Services (BGS) Botswana (Proprietary) Limited,Botswana, 78890-0000078890-14-000004,Brink’s Macau Limited,Macao, 78890-0000078890-14-000004,Brink’s Taiwan Security Limited,Taiwan, -78890-0000078890-14-000004,Brink’s (Thailand) Limited,Thailand,40 +78890-0000078890-14-000004,Brink’s (Thailand) Limited,Thailand,40.0 78890-0000078890-14-000004,Brink’s Global Technology Limited,Thailand, 78890-0000078890-14-000004,Brink’s Guvenlik Hizmetleri Anonim Sirketi,Turkey, 78890-0000078890-14-000004,Brink’s (UK) Limited,U.K., @@ -1269,7 +1158,6 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 78890-0000078890-14-000004,PMV Gold Company,Delaware, 78890-0000078890-14-000004,Pittston Mineral Ventures International Ltd.,Delaware, 78890-0000078890-14-000004,Mineral Ventures of Australia Pty Ltd,Australia, -80812-0000927016-98-004349,"ProvEnergy Investments, Ltd.",Rhode Island, 86521-0000086521-10-000019,Enova Corporation,California, 86521-0000086521-10-000019,Pacific Enterprises,California, 86521-0000086521-10-000019,Pacific Enterprises International,California, @@ -1341,26 +1229,6 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 96271-0001193125-07-042781,"TPS International Power, Inc.",Cayman Islands, 96271-0001193125-07-042781,"TPS de Ultramar, LTD",Cayman Islands, 96271-0001193125-07-042781,"TPS de Ultramar Guatemala, S.A.",Guatemalan, -99250-0000099250-00-000002,Cardinal Operating Company,Delaware,100 -99250-0000099250-00-000002,Cross Bay Operating Company,Delaware,100 -99250-0000099250-00-000002,Cumberland Operating 
Company,Delaware,100 -99250-0000099250-00-000002,Independence Operating Company,Delaware,100 -99250-0000099250-00-000002,"Marsh Resources, Inc.",Delaware,100 -99250-0000099250-00-000002,Pine Needle Operating Company,Delaware,100 -99250-0000099250-00-000002,"TGPL Enterprises, Inc.",Delaware,100 -99250-0000099250-00-000002,Transco Cross Bay Company,Delaware,100 -99250-0000099250-00-000002,TransCardinal Company,Delaware,100 -99250-0000099250-00-000002,TransCarolina LNG Company,Delaware,100 -99250-0000099250-00-000002,TransCumberland Pipeline Company,Delaware,100 -99250-0000099250-00-000002,Transco Independence Pipeline Company ,Delaware,100 -99250-0000099250-00-000002,"Delaware WGP Enterprises, Inc",Delaware,100 -99250-0000099250-00-000002,"Williams Gas Processing - Gulf Coast Company, L.P.",Delaware,99 -100122-0000941138-03-000007,Tucson Electric Power Company (TEP),Arizona, -100122-0000941138-03-000007,San Carlos Resources Inc.,Arizona, -100122-0000941138-03-000007,"Millennium Energy Holdings, Inc. (Millennium)",Arizona, -100122-0000941138-03-000007,"Advanced Energy Technologies, Inc.",Arizona, -100122-0000941138-03-000007,"Global Solar Energy, Inc.",Arizona, -100122-0000941138-03-000007,UniSource Energy Development (UED),Arizona, 103872-0001193125-13-444053,14011 So. Normandie Ave. Realty Corp.,Nevada, 103872-0001193125-13-444053,500 South Douglas Realty Corp.,Delaware, 103872-0001193125-13-444053,Arctern Consulting Private Limited (2),India, @@ -1462,7 +1330,1077 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 103872-0001193125-13-444053,"Volt Telecommunications Group, Inc.",Delaware, 103872-0001193125-13-444053,"Volt Temporary Services, Inc.",Delaware, 103872-0001193125-13-444053,"Volt Workforce Solutions, Inc.",Delaware, -320575-0001193125-07-117419,PDC MGMT. CO. (formerly SOTEX Exploration Company),Texas,100 -320575-0001193125-07-117419,"PDC Investment Corp.,",Delaware,100 -320575-0001193125-07-117419,"Pioneer Drilling Services, Ltd. 
(formerly Pioneer Drilling Co., Ltd.)",Texas,100 -320575-0001193125-07-117419,South Texas Drilling Company,Texas,100 +320575-0001193125-07-117419,PDC MGMT. CO. (formerly SOTEX Exploration Company),Texas,100.0 +320575-0001193125-07-117419,"PDC Investment Corp.,",Delaware,100.0 +320575-0001193125-07-117419,"Pioneer Drilling Services, Ltd. (formerly Pioneer Drilling Co., Ltd.)",Texas,100.0 +320575-0001193125-07-117419,South Texas Drilling Company,Texas,100.0 +3499-0000003499-08-000003,731 Commercial Holding LLC,, +3499-0000003499-08-000003,731 Commercial LLC,, +3499-0000003499-08-000003,731 Office One Holding LLC,, +3499-0000003499-08-000003,731 Office One LLC,, +3499-0000003499-08-000003,731 Office Two Holding LLC,, +3499-0000003499-08-000003,731 Office Two LLC,, +3499-0000003499-08-000003,731 Residential Holding LLC,, +3499-0000003499-08-000003,731 Residential LLC,, +3499-0000003499-08-000003,731 Restaurant LLC,, +3499-0000003499-08-000003,731 Retail One LLC,, +3499-0000003499-08-000003,"Alexander’s Department Stores of Brooklyn, Inc.",, +3499-0000003499-08-000003,"Alexander’s Department Stores of New Jersey, Inc.",, +3499-0000003499-08-000003,"Alexander’s Kings Plaza, LLC",, +3499-0000003499-08-000003,"Alexander’s of Kings, LLC",, +3499-0000003499-08-000003,Alexander’s Management LLC,, +3499-0000003499-08-000003,Alexander’s of Brooklyn II LLC,, +3499-0000003499-08-000003,"Alexander’s of Brooklyn, Inc.",, +3499-0000003499-08-000003,"Alexander’s of Flushing, Inc.",, +3499-0000003499-08-000003,"Alexander’s of Rego Park II, Inc.",, +3499-0000003499-08-000003,"Alexander’s of Rego Park III, Inc.",, +3499-0000003499-08-000003,"ALX of Paramus, LLC",, +3499-0000003499-08-000003,"Alexander’s Personnel Providers, Inc.",, +3499-0000003499-08-000003,"Alexander’s Rego Park Center, Inc.",, +3499-0000003499-08-000003,"Alexander’s Rego Shopping Center, Inc.",, +3499-0000003499-08-000003,Alexander’s Restaurant LLC,, +3499-0000003499-08-000003,"Kings Parking, LLC",, 
+3499-0000003499-08-000003,Kings Plaza Lender LLC,, +3499-0000003499-08-000003,Ownreal Inc.,, +3499-0000003499-08-000003,Rego Park Comercial LLC,, +3499-0000003499-08-000003,Rego Park Residential LLC,, +3499-0000003499-08-000003,"Sakraf Wine & Liquor Store, Inc.",, +3570-0000003570-17-000052,Caldera LNG Holdings SpA,Chile, +3570-0000003570-17-000052,Cheniere Chile SpA,Chile, +3570-0000003570-17-000052,"Cheniere CCH HoldCo I, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere CCH HoldCo II, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Corpus Christi Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Corpus Christi Pipeline, L.P.",Delaware, +3570-0000003570-17-000052,"Cheniere Corpus Christi Pipeline Stage III, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Creole Trail Pipeline, L.P.",Delaware, +3570-0000003570-17-000052,"Cheniere Energy Investments, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Energy Operating Co., Inc",Delaware, +3570-0000003570-17-000052,"Cheniere Energy Partners GP, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Energy Partners LP Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Energy Partners, L.P.",Delaware, +3570-0000003570-17-000052,"Cheniere Energy Shared Services Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Energy Shared Services, Inc.",Delaware, +3570-0000003570-17-000052,"Cheniere Field Services, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere GP Holding Company, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Ingleside Marine Terminal, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere International Investments Holdings, S.à.r.l",Luxembourg, +3570-0000003570-17-000052,"Cheniere International Investments, S.à.r.l",Luxembourg, +3570-0000003570-17-000052,"Cheniere Land Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Liquids, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere LNG Holdings GP, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere LNG O&M 
Services, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere LNG Terminals, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Marketing International HoldCo I, L.P.",Bermuda, +3570-0000003570-17-000052,"Cheniere Marketing International HoldCo II, Ltd.",Bermuda, +3570-0000003570-17-000052,"Cheniere Marketing International, LLP",United Kingdom, +3570-0000003570-17-000052,"Cheniere Marketing, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Marketing, Ltd.",United Kingdom, +3570-0000003570-17-000052,Cheniere Marketing PTE Ltd.,Singapore, +3570-0000003570-17-000052,"Cheniere Midship Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Midstream Holdings, Inc.",Delaware, +3570-0000003570-17-000052,"Cheniere Pipeline GP Interests, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Pipeline Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere San Patricio Processing Hub, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Southern Trail GP, Inc.",Delaware, +3570-0000003570-17-000052,"Cheniere SPH Pipeline, LLC",Delaware, +3570-0000003570-17-000052,"Cheniere Supply & Marketing, Inc.",Delaware, +3570-0000003570-17-000052,Concepción LNG Holding SpA,Chile, +3570-0000003570-17-000052,"Corpus Christi Liquefaction, LLC",Delaware, +3570-0000003570-17-000052,"Corpus Christi Liquefaction Stage III, LLC",Delaware, +3570-0000003570-17-000052,"Corpus Christi LNG, LLC",Delaware, +3570-0000003570-17-000052,"Corpus Christi Pipeline GP, LLC",Delaware, +3570-0000003570-17-000052,"Corpus Christi Tug Services, LLC",Delaware, +3570-0000003570-17-000052,"CQH Holdings Company, LLC",Delaware, +3570-0000003570-17-000052,"CUI I, LLC",Delaware, +3570-0000003570-17-000052,"Johnson Bayou Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Live Oak LNG Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Louisiana LNG Holdings, LLC",Delaware, +3570-0000003570-17-000052,"Nordheim Eagle Ford Gathering, LLC",Delaware, +3570-0000003570-17-000052,"Sabine Pass Liquefaction, 
LLC",Delaware, +3570-0000003570-17-000052,"Sabine Pass LNG-GP, LLC",Delaware, +3570-0000003570-17-000052,"Sabine Pass LNG-LP, LLC",Delaware, +3570-0000003570-17-000052,"Sabine Pass LNG, L.P.",Delaware, +3570-0000003570-17-000052,"Sabine Pass Tug Services, LLC",Delaware, +4127-0000004127-17-000033,"Skyworks Filter Solutions Japan Co., Ltd.",Japan, +4127-0000004127-17-000033,Skyworks Global Pte. Ltd.,Singapore, +4127-0000004127-17-000033,"Skyworks International Investments, LLC",Delaware, +4127-0000004127-17-000033,Skyworks Ireland Limited,Ireland, +4127-0000004127-17-000033,Skyworks Luxembourg S.a r.l,Luxembourg, +4127-0000004127-17-000033,Skyworks Semiconductor,France, +4127-0000004127-17-000033,Skyworks Solutions Canada Inc.,Canada, +4127-0000004127-17-000033,"Skyworks Solutions Commercial Co., Ltd. (Shenzhen)",Peoples Republic of China, +4127-0000004127-17-000033,"Skyworks Solutions Commercial Co., Ltd. (Shenzhen) - Beijing Branch",Beijing, +4127-0000004127-17-000033,"Skyworks Solutions Commercial Co., Ltd. (Shenzhen) - Shanghai Branch",Shanghai, +4127-0000004127-17-000033,"Skyworks Solutions Co, Limited",Japan, +4127-0000004127-17-000033,"Skyworks Solutions de Mexico, S de R.L. 
de C.V.",Mexico, +4127-0000004127-17-000033,Skyworks Solutions (Hong Kong) Limited,Hong Kong, +4127-0000004127-17-000033,Skyworks Solutions Ireland Limited,Ireland, +4127-0000004127-17-000033,Skyworks Solutions Korea Limited,Korea, +4127-0000004127-17-000033,Skyworks Solutions Limited,United Kingdom, +4127-0000004127-17-000033,Skyworks Solutions Oy,Finland, +4127-0000004127-17-000033,"Skyworks Solutions Worldwide, Inc.",Delaware, +4127-0000004127-17-000033,"Skyworks Solutions Worldwide, Inc., Taiwan Branch",Taiwan, +4127-0000004127-17-000033,"Skyworks Solutions Worldwide, Inc., Malaysia Branch",Malaysia, +4127-0000004127-17-000033,Advanced Analogic Technologies Incorporated,Delaware, +4127-0000004127-17-000033,"Advanced Analogic Technologies (China), Inc.",Peoples Republic of China, +4127-0000004127-17-000033,Axiom Microdevices Inc.,Delaware, +4127-0000004127-17-000033,ICWave LLC,Massachusetts, +4127-0000004127-17-000033,Isolink inc.,California, +4127-0000004127-17-000033,MEMS Solutions Inc.,Korea, +4127-0000004127-17-000033,Quantance Inc.,Delaware, +4127-0000004127-17-000033,SiGe Semiconductor Inc.,Delaware, +4127-0000004127-17-000033,SiGe Semiconductor (U.S.) Corp.,Delaware, +4127-0000004127-17-000033,SiGe Semiconductor (Europe) Limited,United Kingdom, +4127-0000004127-17-000033,"Trans-Tech, Inc.",Maryland, +4962-0001193125-10-041232,American Express Company,(USA) New York, +4962-0001193125-10-041232,56th Street AXP Campus LLC,(USA) Arizona, +4962-0001193125-10-041232,American Express Austria Bank GmbH,Austria, +4962-0001193125-10-041232,American Express Bank LLC,Russian Federation, +4962-0001193125-10-041232,American Express Bank Ltd. 
S.A,Argentina, +4962-0001193125-10-041232,American Express Banking Corp.,(USA) New York, +4962-0001193125-10-041232,"American Express Travel Related Services Company, Inc.",(USA) New York, +4962-0001193125-10-041232,American Express Bank (Mexico) S.A Institucion de Banca Multiple,Mexico, +4962-0001193125-10-041232,"American Express Bank Services, S.A. de C.V.",Mexico, +4962-0001193125-10-041232,American Express Bank FSB,United States, +4962-0001193125-10-041232,American Express Receivables Financing Corporation IV LLC,(USA) Delaware, +4962-0001193125-10-041232,American Express Business Loan Corporation,(USA) Utah, +4962-0001193125-10-041232,American Express Centurion Bank,(USA) Utah, +4962-0001193125-10-041232,American Express Receivables Financing Corporation III LLC,(USA) Delaware, +4962-0001193125-10-041232,American Express Company (Mexico) S.A. de C.V.,Mexico, +4962-0001193125-10-041232,"American Express Insurance Services, Agente de Seguros, S.A. de C.V.",Mexico, +4962-0001193125-10-041232,"American Express Servicios Profesionales, S.A. 
de C.V.",Mexico, +4962-0001193125-10-041232,American Express Credit Corporation,(USA) Delaware, +4962-0001193125-10-041232,American Express Capital Australia,Australia, +4962-0001193125-10-041232,"American Express Credit Mexico, LLC",(USA) Delaware, +4962-0001193125-10-041232,Fideicomiso Empresarial American Express No.232033,Mexico, +4962-0001193125-10-041232,American Express Euro Funding Limited Partnership,United Kingdom, +4962-0001193125-10-041232,American Express Overseas Credit Corporation Limited,Jersey, +4962-0001193125-10-041232,AEOCC Management Company Limited,Jersey, +4962-0001193125-10-041232,American Express Overseas Credit Corporation N.V.,Netherlands Antilles, +4962-0001193125-10-041232,AE Hungary Holdings Limited Liability Company,Hungary, +4962-0001193125-10-041232,American Express Canada Credit Corporation,Canada, +4962-0001193125-10-041232,American Express Canada Finance Limited,Canada, +4962-0001193125-10-041232,American Express Sterling Funding Limited Partnership,United Kingdom, +4962-0001193125-10-041232,American Express Funding (Luxembourg) S.a.r.l,Luxembourg, +4962-0001193125-10-041232,Credco Receivables Corp.,(USA) Delaware, +4962-0001193125-10-041232,"American Express Dutch Capital, LLC",(USA) Delaware, +4962-0001193125-10-041232,American Express Europe Limited,(USA) Delaware, +4962-0001193125-10-041232,Sceptre Nominees Limited,United Kingdom, +4962-0001193125-10-041232,"American Express Global Financial Services, Inc.",(USA) Delaware, +4962-0001193125-10-041232,American Express Holdings Netherlands CV,Netherlands, +4962-0001193125-10-041232,"American Express Insurance Agency of Puerto Rico, Inc.",Puerto Rico, +4962-0001193125-10-041232,"American Express International (NZ), Inc.",(USA) Delaware, +4962-0001193125-10-041232,American Express Limited,(USA) Delaware, +4962-0001193125-10-041232,Alpha Card SCRL,Belgium, +4962-0001193125-10-041232,Alpha Card Merchant Services SCRL,Belgium, +4962-0001193125-10-041232,BCC Corporate NV/SA,Belgium, 
+4962-0001193125-10-041232,American Express (Malaysia) SDN. BHD.,Malaysia, +4962-0001193125-10-041232,American Express (Thai) Co. Ltd,Thailand, +4962-0001193125-10-041232,American Express Brasil Assessoria Empresarial Ltda.,Brazil, +4962-0001193125-10-041232,American Express International (B) SDN.BHD,Brunei Darussalam, +4962-0001193125-10-041232,"American Express International Holdings, LLC",(USA) Delaware, +4962-0001193125-10-041232,American Express Argentina S.A.,Argentina, +4962-0001193125-10-041232,American Express Holdings (France) SAS,France, +4962-0001193125-10-041232,American Express France SAS,France, +4962-0001193125-10-041232,American Express Carte France SA,France, +4962-0001193125-10-041232,American Express Change SAS,France, +4962-0001193125-10-041232,American Express Paris SAS,France, +4962-0001193125-10-041232,American Express Services SA,France, +4962-0001193125-10-041232,American Express Voyages SAS,France, +4962-0001193125-10-041232,American Express Management,France, +4962-0001193125-10-041232,American Express France Finance SNC,France, +4962-0001193125-10-041232,South Pacific Credit Card Limited,New Zealand, +4962-0001193125-10-041232,Centurion Finance Limited,New Zealand, +4962-0001193125-10-041232,"American Express International, Inc",(USA) Delaware, +4962-0001193125-10-041232,AE Exposure Management Limited,Jersey, +4962-0001193125-10-041232,American Express (India) Private Limited,India, +4962-0001193125-10-041232,American Express Asia Network Consulting (Beijing) Limited Company,China, +4962-0001193125-10-041232,American Express Australia Limited,Australia, +4962-0001193125-10-041232,American Express Company AS,Norway, +4962-0001193125-10-041232,American Express Corporate Travel SA,Belgium, +4962-0001193125-10-041232,American Express Denmark A/S,Denmark, +4962-0001193125-10-041232,American Express Group Services Limited,United Kingdom, +4962-0001193125-10-041232,American Express Holding AB,Sweden, +4962-0001193125-10-041232,American Express 
Business Travel A/S,Denmark, +4962-0001193125-10-041232,American Express Business Travel AB,Sweden, +4962-0001193125-10-041232,American Express Business Travel AS,Norway, +4962-0001193125-10-041232,Forsakringsaktiebolaget Viator,Sweden, +4962-0001193125-10-041232,American Express Holdings Limited,United Kingdom, +4962-0001193125-10-041232,American Express Insurance Services Europe Limited,United Kingdom, +4962-0001193125-10-041232,American Express Services Europe Limited,United Kingdom, +4962-0001193125-10-041232,American Express Hungary Financial Services Closed Company Limited by Shares,Hungary, +4962-0001193125-10-041232,American Express Hungary Travel Services Ltd.,Hungary, +4962-0001193125-10-041232,"American Express International (Taiwan), Inc.","Taiwan, Province of China", +4962-0001193125-10-041232,American Express International SA,Greece, +4962-0001193125-10-041232,Key Tours SA,Greece, +4962-0001193125-10-041232,"American Express Japan Co., Ltd",Japan, +4962-0001193125-10-041232,American Express Locazioni Finanziarie s.r.l,Italy, +4962-0001193125-10-041232,American Express Payment Services Limited,United Kingdom, +4962-0001193125-10-041232,American Express Poland S.A.,Poland, +4962-0001193125-10-041232,American Express Reisebüro GmbH,Austria, +4962-0001193125-10-041232,American Express Services India Limited,India, +4962-0001193125-10-041232,American Express spol. s.r.o.,Czech Republic, +4962-0001193125-10-041232,American Express Swiss Holdings GmbH,Switzerland, +4962-0001193125-10-041232,Swisscard AECS AG,Switzerland, +4962-0001193125-10-041232,American Express Travel (Singapore) Pte. 
Ltd.,Singapore, +4962-0001193125-10-041232,American Express Travel Holdings (Hong Kong) Limited,Hong Kong, +4962-0001193125-10-041232,CITS American Express Air Services Ltd,China, +4962-0001193125-10-041232,CITS American Express Southern Air Services Ltd,China, +4962-0001193125-10-041232,CITS American Express Travel Services Ltd,China, +4962-0001193125-10-041232,Farrington American Express Travel Services Limited,Hong Kong, +4962-0001193125-10-041232,American Express Travel Holdings (M) Company SDN. BHD.,Malaysia, +4962-0001193125-10-041232,Mayflower American Express Travel Services SDN. BHD.,Malaysia, +4962-0001193125-10-041232,"American Express Travel Services Vostok, LLC",Russian Federation, +4962-0001193125-10-041232,ZAO “American Express International Services”,Russian Federation, +4962-0001193125-10-041232,American Express Wholesale Currency Services Pty Limited,Australia, +4962-0001193125-10-041232,Amex Broker Assicurativo s.r.l.,Italy, +4962-0001193125-10-041232,"Amex General Insurance Agency, Inc.","Taiwan, Province of China", +4962-0001193125-10-041232,"Amex Life Insurance Marketing, Inc.","Taiwan, Province of China", +4962-0001193125-10-041232,Amex Travel Holding (Japan) Ltd.,Japan, +4962-0001193125-10-041232,"American Express Nippon Travel Agency, Inc.",Japan, +4962-0001193125-10-041232,Interactive Transaction Solutions Limited,United Kingdom, +4962-0001193125-10-041232,Interactive Transactions Solutions SAS,France, +4962-0001193125-10-041232,Sociedad Internacional de Servicios de Panama S.A.,Panama, +4962-0001193125-10-041232,TransUnion Limited,Hong Kong, +4962-0001193125-10-041232,American Express Service (Thailand) Company Limited,Thailand, +4962-0001193125-10-041232,"TRS Card International, Inc.",(USA) Delaware, +4962-0001193125-10-041232,"American Express de Espana, S.A. (Sociedad Unipersonal)",Spain, +4962-0001193125-10-041232,"American Express E.F.C., S.A. 
(Sociedad Unipersonal)",Spain, +4962-0001193125-10-041232,"American Express Foreign Exchange, S.A. (Sociedad Unipersonal)",Spain, +4962-0001193125-10-041232,"American Express Viajes, S.A. (Sociedad Unipersonal)",Spain, +4962-0001193125-10-041232,American Express Barcelo Viajes SL,Spain, +4962-0001193125-10-041232,"Amex Asesores de Seguros, S.A. (Sociedad Unipersonal)",Spain, +4962-0001193125-10-041232,American Express Marketing & Development Corp.,(USA) Delaware, +4962-0001193125-10-041232,American Express Prepaid Card Management Corporation,(USA) Arizona, +4962-0001193125-10-041232,American Express Publishing Corporation,(USA) New York, +4962-0001193125-10-041232,American Express Receivables Financing Corporation II,(USA) Delaware, +4962-0001193125-10-041232,American Express Receivables Financing Corporation V LLC,(USA) Delaware, +4962-0001193125-10-041232,Amex (Middle East) B.S.C. (c),Bahrain, +4962-0001193125-10-041232,Amex Al Omania LLC,Oman, +4962-0001193125-10-041232,Amex Egypt LLC,Egypt, +4962-0001193125-10-041232,ASAL (American Express Saudi Arabia Ltd),Bahrain, +4962-0001193125-10-041232,Amex Bank of Canada,Canada, +4962-0001193125-10-041232,Amex Canada Inc.,Canada, +4962-0001193125-10-041232,Amex Card Services Company,(USA) Delaware, +4962-0001193125-10-041232,Asesorías e Inversiones American Express Chile Limitada,Chile, +4962-0001193125-10-041232,Amex Inmobiliaria Limitada,Chile, +4962-0001193125-10-041232,"Bansamex, S.A.",Spain, +4962-0001193125-10-041232,Cardmember Financial Services Limited,Jersey, +4962-0001193125-10-041232,"Cavendish Holdings, Inc.",(USA) Delaware, +4962-0001193125-10-041232,"Drillamex, Inc.",(USA) Delaware, +4962-0001193125-10-041232,FRC West Property. 
LLC,(USA) Arizona, +4962-0001193125-10-041232,"Harbor Payments, Inc.",(USA) Delaware, +4962-0001193125-10-041232,"Fiware Holdings, Inc.",(USA) Delaware, +4962-0001193125-10-041232,Harbor Payments Corporation,(USA) Georgia, +4962-0001193125-10-041232,Southern Africa Travellers Cheque Company (Pty) Ltd,South Africa, +4962-0001193125-10-041232,Swiss Bankers Prepaid Services AG,Switzerland, +4962-0001193125-10-041232,"Travel Impressions, Ltd.",(USA) Delaware, +4962-0001193125-10-041232,Travellers Cheque Associates Limited,United Kingdom, +4962-0001193125-10-041232,AMEX Assurance Company,(USA) Illinois, +4962-0001193125-10-041232,Amexco Insurance Company,(USA) Vermont, +4962-0001193125-10-041232,"National Express Company, Inc.",(USA) New York, +4962-0001193125-10-041232,"The Balcor Company Holdings, Inc.",(USA) Delaware, +4962-0001193125-10-041232,The Balcor Company,(USA) Delaware, +4962-0001193125-10-041232,"Rexport, Inc.",(USA) Delaware, +5981-0001193125-12-106666,AMVAC Chemical Corporation,California, +5981-0001193125-12-106666,"GemChem, Inc.",California, +5981-0001193125-12-106666,2110 Davie Corporation (formerly ABSCO Distributing),California, +5981-0001193125-12-106666,AMVAC Chemical UK Ltd.*,"Surrey, England", +5981-0001193125-12-106666,AMVAC Chemical GmbH,Switzerland, +5981-0001193125-12-106666,AMVAC do Brasil Representácoes Ltda,Brasil, +5981-0001193125-12-106666,"Agroservicios Amvac, SA de CV",Mexico, +5981-0001193125-12-106666,Quimica Amvac de Mexico SA de CV,Mexico, +5981-0001193125-12-106666,AMVAC de Costa Rica Sociedad Anonima,Costa Rica, +5981-0001193125-12-106666,"Environmental Mediation, Inc.",California, +5981-0001193125-12-106666,Calhart Corporation,California, +5981-0001193125-12-106666,"Manufacturers Mirror & Glass Co., Inc.",California, +5981-0001193125-12-106666,Todagco (80%)*,California, +5981-0001193125-12-106666,American Vanguard Corporation of Imperial Valley (90%)*,California, +5981-0001193125-12-106666,AMVAC Ag-Chem*,California, 
+5981-0001193125-12-106666,AMVAC Chemical Corporation-Nevada*,Nevada, +11199-0001104659-06-016718,"Bemis Company, Inc. (the “Registrant”)",Missouri, +11199-0001104659-06-016718,"Banner Packaging, Inc.",Wisconsin,100.0 +11199-0001104659-06-016718,"Bemis Clysar, Inc.",Minnesota,100.0 +11199-0001104659-06-016718,"Bemis Czech Republic, s.r.o.",Czech Republic,100.0 +11199-0001104659-06-016718,Bemis Deutschland Holdings GmbH,Germany,100.0 +11199-0001104659-06-016718,Bemis Packaging Deutschland GmbH,Germany,100.0 +11199-0001104659-06-016718,"Bemis Europe Holdings, S.A.",Belgium,100.0 +11199-0001104659-06-016718,Bemis Monceau S.A.,Belgium,100.0 +11199-0001104659-06-016718,Techy France S.A.R.L.,France,100.0 +11199-0001104659-06-016718,"Bemis Flexible Packaging de Mexico, S.A. de C.V.",Mexico,100.0 +11199-0001104659-06-016718,"Bemis Flexible Packaging Mexico Servicios, S.A. de C.V.",Mexico,100.0 +11199-0001104659-06-016718,Bemis France Holdings S.A.S.,France,100.0 +11199-0001104659-06-016718,Bemis Packaging France S.A.S.,France,100.0 +11199-0001104659-06-016718,Bemis Le Trait S.A.S.,France,100.0 +11199-0001104659-06-016718,Bemis Epernon S.A.S.,France,100.0 +11199-0001104659-06-016718,Bemis Hungary Trading Limited Liability Company,Hungary,100.0 +11199-0001104659-06-016718,Bemis Packaging Danmark ApS,Denmark,100.0 +11199-0001104659-06-016718,Bemis Packaging Italia S.r.l,Italy,100.0 +11199-0001104659-06-016718,Bemis Packaging Sverige A.B.,Sweden,100.0 +11199-0001104659-06-016718,Bemis Packaging U.K. Ltd.,United Kingdom,100.0 +11199-0001104659-06-016718,Bemis Valkeakoski Oy,Finland,100.0 +11199-0001104659-06-016718,Bolsas Bemis S.A. de C.V.,Mexico,51.0 +11199-0001104659-06-016718,Bolsas Bemis Servicios Mexico S.A. 
de C.V.,Mexico,51.0 +11199-0001104659-06-016718,"Curwood, Inc.",Delaware,100.0 +11199-0001104659-06-016718,Curwood Packaging (Canada) Limited,Canada,100.0 +11199-0001104659-06-016718,Bemis Packaging Ireland Limited,Ireland,100.0 +11199-0001104659-06-016718,Bemis Swansea Limited,United Kingdom,100.0 +11199-0001104659-06-016718,Bemis Packaging Espana sl,Spain,100.0 +11199-0001104659-06-016718,Itap Bemis Ltda.,Brazil,22.0 +11199-0001104659-06-016718,"Perfecseal, Inc.",Delaware,100.0 +11199-0001104659-06-016718,"Perfecseal Internacional de Puerto Rico, Inc.",Delaware,100.0 +11199-0001104659-06-016718,Perfecseal International Ltd.,Delaware,100.0 +11199-0001104659-06-016718,Perfecseal Limited,United Kingdom,100.0 +11199-0001104659-06-016718,Bemis Asia Pacific Sdn Bhd,Malaysia,100.0 +11199-0001104659-06-016718,"DEMF DT Holdings I, LLC",Delaware,100.0 +11199-0001104659-06-016718,Itap Bemis Ltda.,Brazil,23.0 +11199-0001104659-06-016718,Hayco Liquidation Company,Delaware,100.0 +11199-0001104659-06-016718,Bemis U.K. 
Limited,United Kingdom,50.0 +11199-0001104659-06-016718,"MacKay, Inc.",Kentucky,100.0 +11199-0001104659-06-016718,"Milprint, Inc.",Wisconsin,100.0 +11199-0001104659-06-016718,"Curwood Specialty Films – Lebanon, Inc.",Delaware,100.0 +11199-0001104659-06-016718,Misbe Participacoes Ltda.,Brazil,100.0 +11199-0001104659-06-016718,SH Participacoes S.A.,Brazil,100.0 +11199-0001104659-06-016718,DT Participacoes S.A.,Brazil,76.0 +11199-0001104659-06-016718,Dixie Toga S.A.,Brazil,92.0 +11199-0001104659-06-016718,DT Participacoes S.A.,Brazil,24.0 +11199-0001104659-06-016718,Dixie Toga S.A.,Brazil,8.0 +11199-0001104659-06-016718,American Packaging S.A.,Argentina,98.0 +11199-0001104659-06-016718,American Plast S.A.,Argentina,60.0 +11199-0001104659-06-016718,Dixie Toga International Ltd.,Cayman Islands,100.0 +11199-0001104659-06-016718,Dixie Toga Centro-Oeste Embalagens S.A.,Brazil,100.0 +11199-0001104659-06-016718,Dixie Toga Nordeste S.A.,Brazil,100.0 +11199-0001104659-06-016718,Impressora Paranaense S.A.,Brazil,100.0 +11199-0001104659-06-016718,Insit Embalagens Ltda.,Brazil,90.0 +11199-0001104659-06-016718,Itap Bemis Ltda.,Brazil,55.0 +11199-0001104659-06-016718,Itap Bemis Centro Oeste-Industria e Comércio de Embalagens Ltda.,Brazil,100.0 +11199-0001104659-06-016718,Curwood Chile Ltda.,Chile,100.0 +11199-0001104659-06-016718,Laminor S.A.,Brazil,50.0 +11199-0001104659-06-016718,M&W Toga Industria e Comércio S.A.,Brazil,60.0 +11199-0001104659-06-016718,Morgan Adhesives Company,Ohio,100.0 +11199-0001104659-06-016718,Bemis Coordination Center S.A.,Belgium,33.0 +11199-0001104659-06-016718,Bemis U.K. Limited,United Kingdom,50.0 +11199-0001104659-06-016718,MACtac U.K. 
Limited,United Kingdom,100.0 +11199-0001104659-06-016718,"Electronic Printing Products, Inc.",Ohio,100.0 +11199-0001104659-06-016718,Enterprise Software Inc.,Ohio,100.0 +11199-0001104659-06-016718,"MACtac Engineered Products, Inc.",Ohio,100.0 +29644-0001628280-16-019746,"Aerospace Filtration Systems, Inc.","Chesterfield, MO USA", +29644-0001628280-16-019746,ASHC LLC,"Minneapolis, MN USA", +29644-0001628280-16-019746,DLX Capital S.a.r.l.,"Luxembourg City, Luxembourg", +29644-0001628280-16-019746,DLX USD FIN CO. S.a.r.l.,"Luxembourg City, Luxembourg", +29644-0001628280-16-019746,"Donaldson (China) Holding Co., Ltd","Shanghai, China", +29644-0001628280-16-019746,"Donaldson (China) Trading Co., Ltd","Wuxi, China", +29644-0001628280-16-019746,Donaldson (Thailand) Ltd.,"Rayong, Thailand", +29644-0001628280-16-019746,"Donaldson (Wuxi) Filters Co., Ltd.","Wuxi, China", +29644-0001628280-16-019746,Donaldson (Xuzhou) Filters Co. Ltd.,"Xuzhou, China", +29644-0001628280-16-019746,Donaldson Australasia Pty. Ltd,"Wyong, Australia", +29644-0001628280-16-019746,"Donaldson Belgie, b.v.b.a.","Leuven, Belgium", +29644-0001628280-16-019746,"Donaldson Canada, Inc","Brockville, Ontario, Canada", +29644-0001628280-16-019746,"Donaldson Capital, Inc.","Minneapolis, MN USA", +29644-0001628280-16-019746,"Donaldson Chile, Ltd.","Santiago, Chile", +29644-0001628280-16-019746,Donaldson Columbia S.A.S.,"Bogotá, Columbia", +29644-0001628280-16-019746,Donaldson Czech Republic s.r.o,"Klasterec nad Ohri, Czech Republic", +29644-0001628280-16-019746,Donaldson do Brasil Equipamentos Industriais Ltda,"Atibaia, São Paulo, Brazil", +29644-0001628280-16-019746,"Donaldson Europe, b.v.b.a.","Leuven, Belgium", +29644-0001628280-16-019746,Donaldson Far East Ltd.,"Hong Kong, S.A.R., China", +29644-0001628280-16-019746,Donaldson Filter Components Ltd.,"Hull, United Kingdom", +29644-0001628280-16-019746,Donaldson Filtration (Asia Pacific) Pte. 
Ltd.,"Changi, Singapore", +29644-0001628280-16-019746,Donaldson Filtration (GB) Ltd.,"Leicester, United Kingdom", +29644-0001628280-16-019746,Donaldson Filtration (Malaysia) Sdn. Bhd.,"Selangor Darul Ehsan, Malaysia", +29644-0001628280-16-019746,Donaldson Filtration (Thailand) Ltd.,"Nonthaburi, Thailand", +29644-0001628280-16-019746,Donaldson Filtration Deutschland GmbH,"Haan, Germany", +29644-0001628280-16-019746,Donaldson Filtration Magyarorszag Kft,"Budapest, Hungary", +29644-0001628280-16-019746,Donaldson Filtration Norway a.s.,"Moss, Norway", +29644-0001628280-16-019746,"Donaldson Filtration Österreich, GmbH","Vienna, Austria", +29644-0001628280-16-019746,Donaldson Filtration Slovensko s.r.o.,"Bratislava, Slovakia", +29644-0001628280-16-019746,Donaldson Filtration Systems (Pty) Ltd.,"Cape Town, South Africa", +29644-0001628280-16-019746,Donaldson Filtre Sistemleri,"Istanbul, Turkey", +29644-0001628280-16-019746,"Donaldson France, s.a.s.","Paris, France", +29644-0001628280-16-019746,Donaldson Ibèrica Soluciones,"Barcelona, Spain", +29644-0001628280-16-019746,Donaldson India Filter Systems Pvt. Ltd.,"New Delhi, India", +29644-0001628280-16-019746,Donaldson Industrial CR - Konzern s.r.o.,"Kadan, Czech Republic", +29644-0001628280-16-019746,Donaldson Italia s.r.l.,"Ostiglia, Italy", +29644-0001628280-16-019746,"Donaldson Korea Co., Ltd.","Seoul, South Korea", +29644-0001628280-16-019746,Donaldson Luxembourg S.a.r.l,"Luxembourg City, Luxembourg", +29644-0001628280-16-019746,Donaldson Nederland B.V.,"Almere, Netherlands", +29644-0001628280-16-019746,Donaldson Overseas Holding S.a.r.l.,"Luxembourg City, Luxembourg", +29644-0001628280-16-019746,Donaldson Peru SAC,"Lima, Peru", +29644-0001628280-16-019746,Donaldson Polska Sp. 
z.o.o.,"Warsaw, Poland", +29644-0001628280-16-019746,Donaldson Scandinavia a.p.s.,"Hørsholm, Denmark", +29644-0001628280-16-019746,Donaldson Schweiz GmbH,"Zurich, Switzerland", +29644-0001628280-16-019746,Donaldson Taiwan Ltd.,"Taipei, Taiwan", +29644-0001628280-16-019746,Donaldson UK Holding Ltd.,"Hull, United Kingdom", +29644-0001628280-16-019746,"Donaldson, S.A. de C.V.","Aguascalientes, Mexico", +29644-0001628280-16-019746,"Donaldson, s.a.s.","Domjean, France", +29644-0001628280-16-019746,"Le Bozec Filtration et Systèmes, s.a.s.","Paris, France", +29644-0001628280-16-019746,Filtros Partmo S.A.S.,"Bogotá, Columbia", +29644-0001628280-16-019746,Nippon Donaldson Ltd.,"Tachikawa, Tokyo, Japan", +29644-0001628280-16-019746,"Northern Technical, L.L.C.","Abu Dhabi, United Arab Emirates", +29644-0001628280-16-019746,P.T. Donaldson Filtration Indonesia,"Jakarta, Indonesia", +29644-0001628280-16-019746,"Prestadora de Servicios Aguascalientes, S. de R.L. de C.V.","Aguascalientes, Mexico", +29644-0001628280-16-019746,Ultrafilter s.a.s.,"Vigny, France", +29644-0001628280-16-019746,Advanced Filtration Systems Inc.,"Champaign, IL USA", +29644-0001628280-16-019746,AFSI Europe s.r.o.,"Most, Czech Republic", +29644-0001628280-16-019746,"IFIL.USA, L.L.C.","Harrisonville, MO USA", +29644-0001628280-16-019746,P.T. Panata Jaya Mandiri,"Jakarta, Indonesia", +29644-0001628280-16-019746,Rashed Al-Rashed & Sons - Donaldson Company Ltd.,"Dammam, Saudi Arabia", +38723-0000038723-09-000029,"Franklin Securities, Inc.",Georgia, +38723-0000038723-09-000029,Frandisco Property and Casualty Insurance Company,Georgia, +38723-0000038723-09-000029,Frandisco Life Insurance Company,Georgia, +38723-0000038723-09-000029,"T&T, Inc.",Georgia, +320340-0000950123-10-027168,ChemFree Corporation,Georgia, +320340-0000950123-10-027168,"CoreCard Software, Inc.",Delaware, +320340-0000950123-10-027168,CoreCard SRL,Romania, +320340-0000950123-10-027168,ISC Software Pvt. 
Ltd.,India, +716646-0000950135-06-004150,Clinical Data BV,The Netherlands, +716646-0000950135-06-004150,Clinical Data Incorporated,Massachusetts, +716646-0000950135-06-004150,"Clinical Data Sales & Service, Inc.",Delaware, +716646-0000950135-06-004150,Electa Lab s.r.l.,Italy, +716646-0000950135-06-004150,"Genaissance Pharmaceuticals, Inc.",Delaware, +716646-0000950135-06-004150,Genome Express S.A.,France, +716646-0000950135-06-004150,"GPSI Acquisition, Inc.",Delaware, +716646-0000950135-06-004150,"Icoria, Inc.",Delaware, +716646-0000950135-06-004150,"Lark Technologies, Inc.",Delaware, +716646-0000950135-06-004150,NovaChem BV,The Netherlands, +716646-0000950135-06-004150,Spectronetics NV,"Curaçao, Netherlands Antilles", +716646-0000950135-06-004150,Vital Scientific NV,The Netherlands, +716646-0000950135-06-004150,Vital Diagnostics Pty. Ltd.,Australia, +716646-0000950135-06-004150,Vital Diagnostics Ltd,New Zealand, +719402-0001193125-14-113892,"First Bank, Inc.",Virginia, +719402-0001193125-14-113892,"First Bank Financial Services, Inc.",Virginia, +719402-0001193125-14-113892,"Shen-Valley Land Holdings, LLC",Virginia, +719402-0001193125-14-113892,First National (VA) Statutory Trust II,Delaware, +719402-0001193125-14-113892,First National (VA) Statutory Trust III,Delaware, +749660-0001193125-12-104800,"Xoft, Inc.",Delaware, +56679-0001193125-16-634657,Korn Ferry International S.A.,Argentina, +56679-0001193125-16-634657,Korn Ferry Futurestep Argentina S.R.L.,Argentina, +56679-0001193125-16-634657,Korn/Ferry International Pty Limited,Australia, +56679-0001193125-16-634657,Futurestep (Australia) Pty Ltd,Australia, +56679-0001193125-16-634657,Korn/Ferry International GmbH,Austria, +56679-0001193125-16-634657,Korn/Ferry International Futurestep (Belgium) BVBA,Belgium, +56679-0001193125-16-634657,"Personnel Decisions International Belgium, BVBA",Belgium, +56679-0001193125-16-634657,Korn/Ferry International Consultoria Ltda.,Brazil, +56679-0001193125-16-634657,"Korn/Ferry 
Canada, Inc.",Canada, +56679-0001193125-16-634657,Korn/Ferry International Futurestep (Canada) Inc.,Canada, +56679-0001193125-16-634657,Korn/Ferry International S.A.,Chile, +56679-0001193125-16-634657,Korn/Ferry International Human Capital Consulting (Beijing) Limited,"Beijing, China", +56679-0001193125-16-634657,Guangzhou Korn/Ferry Human Capital Company Ltd.,"Guangzhou, China", +56679-0001193125-16-634657,"Korn/Ferry (Shanghai) Human Capital Consulting Co., Ltd.","Shanghai, China", +56679-0001193125-16-634657,PuDe Management Consulting Co. Ltd.,"Shanghai, China", +56679-0001193125-16-634657,Futurestep (Shanghai) Talent Consulting Company Limited,China, +56679-0001193125-16-634657,Korn/Ferry International — Colombia,Colombia, +56679-0001193125-16-634657,Korn/Ferry International A/S,Denmark, +56679-0001193125-16-634657,Korn/Ferry International SAS,France, +56679-0001193125-16-634657,Korn/Ferry International Futurestep (France) SARL,France, +56679-0001193125-16-634657,Personnel Decisions International France SAS,France, +56679-0001193125-16-634657,Korn/Ferry International GmbH,Germany, +56679-0001193125-16-634657,Futurestep Germany GmbH,Germany, +56679-0001193125-16-634657,Korn/Ferry International SA,Greece, +56679-0001193125-16-634657,Korn/Ferry International (H.K.) Limited,Hong Kong, +56679-0001193125-16-634657,Futurestep (Hong Kong) Ltd.,Hong Kong, +56679-0001193125-16-634657,Korn/Ferry International Budapest Personnel Consulting and Service Ltd.,Hungary, +56679-0001193125-16-634657,"PDI Hungary, Kft.",Hungary, +56679-0001193125-16-634657,Korn/Ferry International Private Limited,India, +56679-0001193125-16-634657,Futurestep Recruitment Services Private Limited.,India, +56679-0001193125-16-634657,Personnel Decisions International India Pvt. Limited,India, +56679-0001193125-16-634657,PT. 
Korn/Ferry International,Indonesia, +56679-0001193125-16-634657,Korn/Ferry International S.R.L.,Italy, +56679-0001193125-16-634657,Futurestep (Italia) S.r.l.,Italy, +56679-0001193125-16-634657,Nihon Korn/Ferry International K.K.,Japan, +56679-0001193125-16-634657,Futurestep (Japan) K.K.,Japan, +56679-0001193125-16-634657,Korn Ferry Consulting — Japan,Japan, +56679-0001193125-16-634657,Korn/Ferry International (Korea) Limited,Korea, +56679-0001193125-16-634657,Agensi Pekerjaan Futurestep Worldwide (M) Sdn. Bhd.,Malaysia, +56679-0001193125-16-634657,Korn/Ferry International (M) Sdn. Bhd.,Malaysia, +56679-0001193125-16-634657,Korn/Ferry Investment India Limited (Mauritius OCB),Mauritius, +56679-0001193125-16-634657,Korn/Ferry Mexico S.C.,Mexico, +56679-0001193125-16-634657,Korn Ferry International B.V.,Netherlands, +56679-0001193125-16-634657,Korn/Ferry International Futurestep (Holdings) B.V.,Netherlands, +56679-0001193125-16-634657,Korn Ferry International NZ Limited,New Zealand, +56679-0001193125-16-634657,Futurestep (New Zealand) Ltd.,New Zealand, +56679-0001193125-16-634657,Korn/Ferry International A/S,Norway, +56679-0001193125-16-634657,Korn/Ferry International — Peru S.A.,Peru, +56679-0001193125-16-634657,Korn/Ferry International Sp.z.o.o.,Poland, +56679-0001193125-16-634657,Korn/Ferry International Futurestep (POLSKA) Sp.z.o.o.,Poland, +56679-0001193125-16-634657,Korn/Ferry International Pte. Ltd.,Singapore, +56679-0001193125-16-634657,Futurestep (Singapore) Pte Limited,Singapore, +56679-0001193125-16-634657,"PDI Slovensko, sro",Slovakia, +56679-0001193125-16-634657,Korn/Ferry International S.A.,Spain, +56679-0001193125-16-634657,"Futurestep (Espana), S.L.",Spain, +56679-0001193125-16-634657,Korn/Ferry International AB,Sweden, +56679-0001193125-16-634657,Personnel Decisions International Scandinavia A.B.,Sweden, +56679-0001193125-16-634657,Korn-Ferry (Schweiz) AG,Switzerland, +56679-0001193125-16-634657,Korn/Ferry International (Taiwan) Co. 
Limited,Taiwan, +56679-0001193125-16-634657,Korn/Ferry International Musavirlik Limited Sirketi,Turkey, +56679-0001193125-16-634657,Futurestep (UK) Limited,United Kingdom, +56679-0001193125-16-634657,Korn/Ferry International Limited,United Kingdom, +56679-0001193125-16-634657,KFI (UK) Limited,United Kingdom, +56679-0001193125-16-634657,The Whitehead Mann Partnership LLP,United Kingdom, +56679-0001193125-16-634657,Whitehead Mann Limited,United Kingdom, +56679-0001193125-16-634657,"Personnel Decisions International, Europe Limited",United Kingdom, +56679-0001193125-16-634657,Personnel Decisions International UK Ltd,United Kingdom, +56679-0001193125-16-634657,Korn Ferry Global Holdings (UK) Limited,United Kingdom, +56679-0001193125-16-634657,Korn Ferry GH1 Limited,United Kingdom, +56679-0001193125-16-634657,"Pivot Learning, Limited",United Kingdom, +56679-0001193125-16-634657,Continental American Management Corp.,"United States, California", +56679-0001193125-16-634657,Korn/Ferry International Holding India,"United States, California", +56679-0001193125-16-634657,"Korn/Ferry International Futurestep, Inc.","United States, Delaware", +56679-0001193125-16-634657,Korn/Ferry International Futurestep (Holdings) Inc.,"United States, Delaware", +56679-0001193125-16-634657,"Korn/Ferry International Worldwide, Inc.","United States, Delaware", +56679-0001193125-16-634657,"K/FI Canada Holdings, LLC","United States, Delaware", +56679-0001193125-16-634657,Korn Ferry Leadership Consulting Corporation,"United States, Delaware", +56679-0001193125-16-634657,"Ninth House, Inc.","United States, Delaware", +56679-0001193125-16-634657,"Korn Ferry Global Holdings, Inc.","United States, Delaware", +56679-0001193125-16-634657,Personnel Decisions International Greater China Corporation,"United States, Minnesota", +56679-0001193125-16-634657,Personnel Decisions International Singapore Corporation,"United States, Minnesota", +56679-0001193125-16-634657,"Sensa Solutions, Inc.","United States, 
Virginia", +56679-0001193125-16-634657,"Korn/Ferry International Consultores Asociados, C.A.",Venezuela, +56679-0001193125-16-634657,"Hay Group Holdings, Inc.","United States, Delaware", +56679-0001193125-16-634657,"Hay Group International, Inc.","United States, Delaware", +56679-0001193125-16-634657,"Hay Group, Inc.","United States, Delaware", +56679-0001193125-16-634657,"Hay Group Management, Inc.","United States, Delaware", +56679-0001193125-16-634657,Hay Group Limited,Canada, +56679-0001193125-16-634657,Hay Group N.V./S.A.,Belgium, +56679-0001193125-16-634657,Hay Group Czech s.r.o.,Czech Republic, +56679-0001193125-16-634657,Hay Group Oy,Finland, +56679-0001193125-16-634657,Hay Group S.A.,France, +56679-0001193125-16-634657,Hay France S.A.,France, +56679-0001193125-16-634657,Hay Group GmbH,Germany, +56679-0001193125-16-634657,Hay Group S.A.,Greece, +56679-0001193125-16-634657,Hay Group Management Consultants Ltd.,Hungary, +56679-0001193125-16-634657,Hay Group (Ireland) Limited,Ireland, +56679-0001193125-16-634657,Hay Management Consultants Ireland Ltd.,Ireland, +56679-0001193125-16-634657,Hay Group S.r.l.,Italy, +56679-0001193125-16-634657,Hay Group UAB,Lithuania, +56679-0001193125-16-634657,HG (Luxembourg) S.a.r.l.,Luxembourg, +56679-0001193125-16-634657,Talent Q International Ltd.,Malta, +56679-0001193125-16-634657,Talent Q Distribution Ltd.,Malta, +56679-0001193125-16-634657,Hay Group B.V.,Netherlands, +56679-0001193125-16-634657,Hay Group Investment Holding B.V.,Netherlands, +56679-0001193125-16-634657,Hay Management International B.V.,Netherlands, +56679-0001193125-16-634657,Hay Group Partners Holding B.V.,Netherlands, +56679-0001193125-16-634657,Hay Group AS,Norway, +56679-0001193125-16-634657,Hay Group Sp.Z o.o,Poland, +56679-0001193125-16-634657,Hay Group S.A.,Portugal, +56679-0001193125-16-634657,Hay Group LLC,Qatar, +56679-0001193125-16-634657,Hay Group Management Consultants SRL,Romania, +56679-0001193125-16-634657,OOO Hay Group (Hay Group 
Ltd.),Russia, +56679-0001193125-16-634657,Hay Group Saudi Arabia Ltd.,Saudi Arabia, +56679-0001193125-16-634657,Hay Group s.r.o.,Slovakia, +56679-0001193125-16-634657,Hay Group South Africa (Pty) Ltd.,South Africa, +56679-0001193125-16-634657,Hay Group S.A.,Spain, +56679-0001193125-16-634657,Hay Group AB,Sweden, +56679-0001193125-16-634657,Hay Group Danismanlik Limited Sirketi,Turkey, +56679-0001193125-16-634657,Hay Group LLC,Ukraine, +56679-0001193125-16-634657,The Hay Group Management Limited,United Kingdom, +56679-0001193125-16-634657,Hay Group UK Holdings Limited,United Kingdom, +56679-0001193125-16-634657,Hay Group Intermediary Limited,United Kingdom, +56679-0001193125-16-634657,Talent Q Services Limited,United Kingdom, +56679-0001193125-16-634657,Talent Q Limited,United Kingdom, +56679-0001193125-16-634657,Hay Group Pty. Limited,Australia, +56679-0001193125-16-634657,"Hay Group Co., (Shanghai) Ltd.",China, +56679-0001193125-16-634657,Hay Group Limited,Hong Kong, +56679-0001193125-16-634657,Hay Group Asia Limited,Hong Kong, +56679-0001193125-16-634657,Hay Consultants India Private Ltd.,India, +56679-0001193125-16-634657,Talent Q India Private Ltd.,India, +56679-0001193125-16-634657,PT Hay Group,Indonesia, +56679-0001193125-16-634657,"Hay Group (Japan), Ltd.",Japan, +56679-0001193125-16-634657,Hay Group Sdn. 
Bhd.,Malaysia, +56679-0001193125-16-634657,Hay Group Limited,New Zealand, +56679-0001193125-16-634657,Hay Group Pte Ltd.,Singapore, +56679-0001193125-16-634657,Hay Group Ltd.,South Korea, +56679-0001193125-16-634657,Hay Group Limited,Thailand, +56679-0001193125-16-634657,Hay Group Consulting Limited Liability,Vietnam, +56679-0001193125-16-634657,Hay Argentina S.A.,Argentina, +56679-0001193125-16-634657,Hay do Brasil Consultores Ltda.,Brazil, +56679-0001193125-16-634657,Hay GroupLimitada,Chile, +56679-0001193125-16-634657,Hay Group Ltda,Colombia, +56679-0001193125-16-634657,"Hay Group, S.R.L.",Costa Rica, +56679-0001193125-16-634657,Hay Financial Corporation N.V.,Curacao, +56679-0001193125-16-634657,Hay Group S.A. de C.V.,Mexico, +56679-0001193125-16-634657,Hay Group S.A.,Peru, +56679-0001193125-16-634657,"Hay Group Venezuela, S.A.",Venezuela, +56679-0001193125-16-634657,Hay Management Consultants Limited,Bermuda, +56679-0001193125-16-634657,HG (Bermuda) Holding Limited,Bermuda, +56679-0001193125-16-634657,Korn Ferry GP Ventures LLC,"United States, Delaware", +56679-0001193125-16-634657,Korn Ferry Global Ventures LP,United Kingdom, +56679-0001193125-16-634657,Korn/Ferry International Futurestep (the Netherlands) BV,Netherlands, +75829-0001206774-11-002167,Medsep Corporation,Delaware, +75829-0001206774-11-002167,Pall Acquisition LLC,Delaware, +75829-0001206774-11-002167,Pall Aeropower Corporation,Delaware, +75829-0001206774-11-002167,"Pall Biomedical, Inc.",Delaware, +75829-0001206774-11-002167,Pall Industrial Membranes LLC,Delaware, +75829-0001206774-11-002167,"Pall Life Sciences Puerto Rico, LLC",Puerto Rico, +75829-0001206774-11-002167,"Pall – PASS US, LLC",Delaware, +75829-0001206774-11-002167,Russell Associates Inc.,Maryland, +75829-0001206774-11-002167,"Gelman Sciences, Inc.",Michigan, +75829-0001206774-11-002167,Pall Austria Filter GesmbH,Austria, +75829-0001206774-11-002167,Pall (Canada) Limited,Canada, +75829-0001206774-11-002167,Pall Do Brasil,Brazil, 
+75829-0001206774-11-002167,Pall Europe Limited (a),England, +75829-0001206774-11-002167,Pall France S.A.S.,France, +75829-0001206774-11-002167,Pall Deutschland Beteiligungs GmbH,Germany, +75829-0001206774-11-002167,Pall Deutschland Holding GmbH & Co. KG Partnership (c),Germany, +75829-0001206774-11-002167,Pall Italia S.R.L.,Italy, +75829-0001206774-11-002167,Pall Manufacturing UK Limited,England, +75829-0001206774-11-002167,Gelman Ireland Ltd.,Ireland, +75829-0001206774-11-002167,Pall Netherlands B.V. (a),The Netherlands, +75829-0001206774-11-002167,PLLN C.V. Partnership (b),The Netherlands, +75829-0001206774-11-002167,Pall Norge AS,Norway, +75829-0001206774-11-002167,Pall Espana S.A.U.,Spain, +75829-0001206774-11-002167,Pall Norden AB,Sweden, +75829-0001206774-11-002167,Pall (Schweiz) A.G.,Switzerland, +75829-0001206774-11-002167,Argentaurum A.G.,Switzerland, +75829-0001206774-11-002167,Pall Asia International Ltd.,Hong Kong, +75829-0001206774-11-002167,Pall India Private Ltd.,India, +75829-0001206774-11-002167,PT Pall Filtration Indonesia,Indonesia, +75829-0001206774-11-002167,Nihon Pall Ltd.,Japan, +75829-0001206774-11-002167,Pall New Zealand Limited,New Zealand, +75829-0001206774-11-002167,Pall Filtration Pte. Ltd.,Singapore, +75829-0001206774-11-002167,Pall Singapore Taiwan Branch Holding Company Pte. 
Ltd.,Singapore, +75829-0001206774-11-002167,Pall Korea Ltd.,South Korea, +75829-0001206774-11-002167,Pall Corporation Filtration and Separations (Thailand) Ltd.,Thailand, +75829-0001206774-11-002167,Pall Australia Pty LTD,Australia, +89800-0000089800-18-000004,"Acquire Sourcing, LLC",DE, +89800-0000089800-18-000004,"Comex North America, Inc.",DE, +89800-0000089800-18-000004,Contract Transportation Systems Co.,DE, +89800-0000089800-18-000004,CTS National Corporation,DE, +89800-0000089800-18-000004,Omega Specialty Products & Services LLC,OH, +89800-0000089800-18-000004,"Plasti-Kote Co., Inc.",OH, +89800-0000089800-18-000004,"Sherwin-Williams Realty Holdings, Inc.",IL, +89800-0000089800-18-000004,SWIMC LLC,DE, +89800-0000089800-18-000004,The Sherwin-Williams Acceptance Corporation,NV, +89800-0000089800-18-000004,The Sherwin-Williams Headquarters Company,OH, +89800-0000089800-18-000004,The Sherwin-Williams Manufacturing Company,OH, +89800-0000089800-18-000004,The Sherwin-Williams US Licensing Company,DE, +89800-0000089800-18-000004,"Valspar Specialty Paints, LLC",DE, +89800-0000089800-18-000004,"Compania Sherwin-Williams, S.A. 
de C.V.",Mexico, +89800-0000089800-18-000004,Deep Pride Limited,Ireland, +89800-0000089800-18-000004,Dongguan Lilly Paint Industries Ltd,China, +89800-0000089800-18-000004,EPS B.V.,Netherlands, +89800-0000089800-18-000004,EPS Polidrox Industria e Comercio de Resinas Ltda,Brazil, +89800-0000089800-18-000004,"EPS (Shanghai) Trading Co., Ltd.",China, +89800-0000089800-18-000004,Geocel Limited,UK, +89800-0000089800-18-000004,Guangdong Valspar Paints Manufacturing Co Ltd.,China, +89800-0000089800-18-000004,Guangdong Yuegang Dadi Paints Company Limited,China, +89800-0000089800-18-000004,Guardsman Australia Pty Limited,Australia, +89800-0000089800-18-000004,Guardsman Industries Limited,UK, +89800-0000089800-18-000004,Invercolor Bologna Srl,Italy, +89800-0000089800-18-000004,Invercolor Ltd,UK, +89800-0000089800-18-000004,Invercolor Roma Srl,Italy, +89800-0000089800-18-000004,Invercolor Torino Srl,Italy, +89800-0000089800-18-000004,Invercolor Toscana Srl,Italy, +89800-0000089800-18-000004,Inver East Med S.A.,Greece, +89800-0000089800-18-000004,Inver France SAS,France, +89800-0000089800-18-000004,Inver GmbH,Germany, +89800-0000089800-18-000004,Inver Industrial Coating SRL,Romania, +89800-0000089800-18-000004,Inver Polska Spóika Z O.O,Poland, +89800-0000089800-18-000004,Inver Spa,Italy, +89800-0000089800-18-000004,Isocoat Tintas e Vernizes Ltda,Brazil, +89800-0000089800-18-000004,Isva Vernici Srl,Italy, +89800-0000089800-18-000004,"Jiangsu Pulanna Coating Co., Ltd.",China, +89800-0000089800-18-000004,Oy Sherwin-Williams Finland Ab,Finland, +89800-0000089800-18-000004,Pinturas Condor S.A.,Ecuador, +89800-0000089800-18-000004,Pinturas Industriales S.A.,Uruguay, +89800-0000089800-18-000004,Plasti-kote Limited,UK, +89800-0000089800-18-000004,"Productos Quimicos y Pinturas, S.A. 
de C.V.",Mexico, +89800-0000089800-18-000004,PT Sherwin-Williams Indonesia,Indonesia, +89800-0000089800-18-000004,PT Valspar Indonesia,Indonesia, +89800-0000089800-18-000004,Quest Automotive Products UK Limited,UK, +89800-0000089800-18-000004,"Quetzal Pinturas, S.A. de C.V.",Mexico, +89800-0000089800-18-000004,Resin Surfaces Limited,UK, +89800-0000089800-18-000004,Ronseal (Ireland) Limited,Ireland, +89800-0000089800-18-000004,Sherwin-Williams Argentina I.y C.S.A.,Argentina, +89800-0000089800-18-000004,Sherwin-Williams Aruba VBA,Aruba, +89800-0000089800-18-000004,Sherwin-Williams (Australia) Pty. Ltd.,AU, +89800-0000089800-18-000004,Sherwin-Williams Automotive Mexico S.de R.L.de C.V.,Mexico, +89800-0000089800-18-000004,Sherwin-Williams Balkan S.R.L.,Romania, +89800-0000089800-18-000004,Sherwin-Williams Bel,Belarus, +89800-0000089800-18-000004,Sherwin-Williams (Belize) Limited,Belize, +89800-0000089800-18-000004,Sherwin-Williams Benelux NV,Belgium, +89800-0000089800-18-000004,Sherwin-Williams Canada Inc.,Canada, +89800-0000089800-18-000004,Sherwin-Williams (Caribbean) N.V.,Curacao, +89800-0000089800-18-000004,Sherwin-Williams Cayman Islands Limited,Grand Cayman, +89800-0000089800-18-000004,Sherwin-Williams Chile S.A.,Chile, +89800-0000089800-18-000004,Sherwin-Williams Coatings India Private Limited,India, +89800-0000089800-18-000004,Sherwin-Williams Coatings S.a r.l.,Luxembourg, +89800-0000089800-18-000004,Sherwin Williams Colombia S.A.S.,Columbia, +89800-0000089800-18-000004,Sherwin-Williams Czech Republic spol. 
s r.o,Czech Republic, +89800-0000089800-18-000004,Sherwin-Williams Denmark A/S,Denmark, +89800-0000089800-18-000004,Sherwin-Williams Deutschland GmbH,Germany, +89800-0000089800-18-000004,Sherwin-Williams Diversified Brands (Australia) Pty Ltd,Australia, +89800-0000089800-18-000004,Sherwin-Williams Diversified Brands Limited,UK, +89800-0000089800-18-000004,Sherwin-Williams do Brasil Industria e Comercio Ltda.,Brazil, +89800-0000089800-18-000004,Sherwin-Williams France Finishes SAS,France, +89800-0000089800-18-000004,Sherwin-Williams (Ireland) Limited,Ireland, +89800-0000089800-18-000004,Sherwin-Williams Italy S.r.l.,Italy, +89800-0000089800-18-000004,Sherwin-Williams Luxembourg Investment Management Company S.a r.l.,Luxembourg, +89800-0000089800-18-000004,Sherwin-Williams (Malaysia) Sdn. Bhd.,Malaysia, +89800-0000089800-18-000004,Sherwin-Williams (Nantong) Company Limited,China, +89800-0000089800-18-000004,Sherwin-Williams Norway AS,Norway, +89800-0000089800-18-000004,Sherwin-Williams Paints Limited Liability Company,Russia, +89800-0000089800-18-000004,Sherwin-Williams Peru S.R.L.,Peru, +89800-0000089800-18-000004,Sherwin-Williams Pinturas de Venezuela S.A.,Venezuela, +89800-0000089800-18-000004,Sherwin-Williams Poland Sp. z o.o,Poland, +89800-0000089800-18-000004,Sherwin-Williams Protective & Marine Coatings,UK, +89800-0000089800-18-000004,Sherwin-Williams (S) Pte. Ltd.,Singapore, +89800-0000089800-18-000004,Sherwin-Williams Services (Malaysia) Sdn. 
Bhd.,Malaysia, +89800-0000089800-18-000004,Sherwin-Williams (Shanghai) Limited,China, +89800-0000089800-18-000004,"Sherwin-Williams (South China) Co., Ltd.",China, +89800-0000089800-18-000004,Sherwin-Williams Spain Coatings S.L.,Spain, +89800-0000089800-18-000004,Sherwin-Williams Sweden AB,Sweden, +89800-0000089800-18-000004,"Sherwin-Williams (Thailand) Co., Ltd.",Thailand, +89800-0000089800-18-000004,Sherwin-Williams Uruguay S.A.,Uruguay, +89800-0000089800-18-000004,Sherwin-Williams (Vietnam) Limited,Vietnam, +89800-0000089800-18-000004,Sherwin-Williams (West Indies) Limited,Jamaica, +89800-0000089800-18-000004,Spanyc Paints Joint Stock Company,Vietnam, +89800-0000089800-18-000004,SWIPCO – Sherwin Williams do Brasil Propriedade,Brazil, +89800-0000089800-18-000004,Syntema I Vaggeryd AB,Sweden, +89800-0000089800-18-000004,"Taiwan Valspar Co., Ltd.",Taiwan, +89800-0000089800-18-000004,The Valspar (Asia) Corporation Limited,Hong Kong, +89800-0000089800-18-000004,The Valspar (Australia) Corporation Pty. Ltd.,Australia, +89800-0000089800-18-000004,The Valspar Corporation Limitada,Brazil, +89800-0000089800-18-000004,The Valspar (Finland) Corporation Oy,Finland, +89800-0000089800-18-000004,The Valspar (France) Corporation S.A.S.,France, +89800-0000089800-18-000004,The Valspar (France) Research Corporation SAS,France, +89800-0000089800-18-000004,The Valspar (Germany) GmbH,Germany, +89800-0000089800-18-000004,The Valspar (Malaysia) Corporation Sdn Bhd,Malaysia, +89800-0000089800-18-000004,The Valspar (Nantes) Corporation S.A.S.,France, +89800-0000089800-18-000004,The Valspar (Singapore) Corporation Pte. 
Ltd,Singapore, +89800-0000089800-18-000004,The Valspar (South Africa) Corporation (Pty) Ltd,South Africa, +89800-0000089800-18-000004,The Valspar (Spain) Corporation S.R.L.,Spain, +89800-0000089800-18-000004,The Valspar (Switzerland) Corporation AG,Switzerland, +89800-0000089800-18-000004,The Valspar (Thailand) Corporation Ltd.,Thailand, +89800-0000089800-18-000004,The Valspar (UK) Corporation Limited,UK, +89800-0000089800-18-000004,The Valspar (Vietnam) Corporation Ltd.,Vietnam, +89800-0000089800-18-000004,TOB Becker Acroma Ukraine,Ukraine, +89800-0000089800-18-000004,UAB Sherwin-Williams Baltic,Lithuania, +89800-0000089800-18-000004,"Valspar Aries Coatings, S. de R.L. de C.V.",Mexico, +89800-0000089800-18-000004,Valspar Automotive Australia Pty Limited,Australia, +89800-0000089800-18-000004,Valspar Automotive (UK) Corporation Limited,UK, +89800-0000089800-18-000004,Valspar B.V.,Netherlands, +89800-0000089800-18-000004,"Valspar Coatings (Guangdong) Co., Ltd.",China, +89800-0000089800-18-000004,Valspar Coatings (Shanghai) Co. Ltd.,China, +89800-0000089800-18-000004,"Valspar Coatings (Tianjin) Co., Ltd",China, +89800-0000089800-18-000004,Valspar D.o.o Beograd,Serbia, +89800-0000089800-18-000004,Valspar Inc.,Canada, +89800-0000089800-18-000004,Valspar (India) Coatings Corporation Private Limited,India, +89800-0000089800-18-000004,Valspar Industries GmbH,Germany, +89800-0000089800-18-000004,Valspar Industries (Ireland) Ltd.,Ireland, +89800-0000089800-18-000004,Valspar Industries (Italy) S.r.l.,Italy, +89800-0000089800-18-000004,Valspar LLC,Russia, +89800-0000089800-18-000004,"Valspar Mexicana, S.A. 
de C.V.",Mexico, +89800-0000089800-18-000004,Valspar Paint (Australia) Pty Ltd,Australia, +89800-0000089800-18-000004,Valspar Paint (NZ) Limited,New Zealand, +89800-0000089800-18-000004,Valspar Powder Coatings Limited,UK, +89800-0000089800-18-000004,Valspar Rock Company Limited,Japan, +89800-0000089800-18-000004,"Valspar (Shanghai) Management Co., Ltd.",China, +89800-0000089800-18-000004,Vantaco Oy,Finland, +89800-0000089800-18-000004,Valspar (Uruguay) Corporation S.A.,Uruguay, +89800-0000089800-18-000004,Valspar (WPC) Pty Ltd,Australia, +89800-0000089800-18-000004,ZAO Sherwin-Williams,Russia, +799233-0000799233-13-000013,"Heartland Express, Inc.",NV, +799233-0000799233-13-000013,"A&M Express, Inc.",TN, +799233-0000799233-13-000013,"Heartland Express, Inc. of Iowa",IA, +799233-0000799233-13-000013,"Heartland Express Maintenance Services, Inc.",NV, +799233-0000799233-13-000013,"Heartland Express Services, Inc.",NV, +804328-0001234452-15-000271,"Qualcomm Technologies, Inc.",Delaware, +804328-0001234452-15-000271,Qualcomm Global Trading Pte. Ltd.,Singapore, +804328-0001234452-15-000271,Qualcomm CDMA Technologies Asia-Pacific Pte. Ltd.,Singapore, +804328-0001234452-15-000271,Qualcomm Asia Pacific Pte. Ltd.,Singapore, +804328-0001234452-15-000271,"Qualcomm Atheros, Inc.",Delaware, +804328-0001234452-15-000271,"Qualcomm Technologies International, Ltd.",United Kingdom, +821127-0000821127-11-000003,Borel Private Bank & Trust Company,California, +821127-0000821127-11-000003,Boston Private Bank & Trust Company,Massachusetts, +821127-0000821127-11-000003,Charter Private Bank,Washington, +821127-0000821127-11-000003,First Private Bank & Trust,California, +821127-0000821127-11-000003,"Anchor Capital Holdings, LLC",Delaware, +821127-0000821127-11-000003,"Bingham, Osborn, & Scarborough, LLC",California, +821127-0000821127-11-000003,"Dalton, Greiner, Hartman, Maher & Co. 
LLC",Delaware, +821127-0000821127-11-000003,"KLS Professional Advisors Group, LLC",Delaware, +821127-0000821127-11-000003,"Davidson Trust Company, LLC",Pennsylvania, +869495-0001213900-18-002720,Deep Well Oil & Gas (Alberta) Ltd.,"Alberta, Canada", +869495-0001213900-18-002720,Northern Alberta Oil Ltd.,"Alberta, Canada", +860546-0001104659-07-015618,"COPT Aerotech, LLC",, +860546-0001104659-07-015618,"COPT Interquest, LLC",, +860546-0001104659-07-015618,"COPT Interquest III, LLC",, +860546-0001104659-07-015618,"COPT Interquest IV, LLC",, +860546-0001104659-07-015618,"COPT Newport, LLC",, +860546-0001104659-07-015618,"COPT Newport C, LLC",, +860546-0001104659-07-015618,"COPT Newport D, LLC",, +860546-0001104659-07-015618,"COPT Northcreek, LLC",, +860546-0001104659-07-015618,"COPT Patriot Park at Galley, LLC",, +860546-0001104659-07-015618,"COPT Patriot Park I, LLC",, +860546-0001104659-07-015618,"COPT Patriot Park II, LLC",, +860546-0001104659-07-015618,"Patriot Park, LLC",, +860546-0001104659-07-015618,"Airport Square Holdings VI and VII, LLC",, +860546-0001104659-07-015618,"Blue Bell Investment Company, LP",, +860546-0001104659-07-015618,"COPT Acquisitions, Inc.",, +860546-0001104659-07-015618,"COPT Colgate General, LLC",, +860546-0001104659-07-015618,"COPT Concourse, LLC",, +860546-0001104659-07-015618,"COPT Gateway, LP",, +860546-0001104659-07-015618,"COPT Gateway Commerce, LLC",, +860546-0001104659-07-015618,"Corporate Gateway, LP",, +860546-0001104659-07-015618,"Corporate Office Properties, LP",, +860546-0001104659-07-015618,"Corporate Office Properties Holdings, Inc.",, +860546-0001104659-07-015618,"Crown Point, L.L.C.",, +860546-0001104659-07-015618,"Delaware Airport III, LLC",, +860546-0001104659-07-015618,"Delaware Airport VIII, LLC",, +860546-0001104659-07-015618,"Delaware Airport IX, LLC",, +860546-0001104659-07-015618,"Great Mills I, L.L.C.",, +860546-0001104659-07-015618,"Great Mills II, L.L.C.",, +860546-0001104659-07-015618,"Great Mills III, 
L.L.C.",, +860546-0001104659-07-015618,"Great Mills IV, L.L.C.",, +860546-0001104659-07-015618,"Great Mills V, L.L.C.",, +860546-0001104659-07-015618,"Harrisburg Corporate Gateway Partners, LP",, +860546-0001104659-07-015618,"Opportunity Invest Ventures, LLC",, +860546-0001104659-07-015618,"Sterling York, LLC",, +860546-0001104659-07-015618,"South Brunswick Investors, LP",, +860546-0001104659-07-015618,"11800 Tech Road, LLC",, +860546-0001104659-07-015618,"Aerotech Manager, LLC",, +860546-0001104659-07-015618,"Airport Square, LLC",, +860546-0001104659-07-015618,"Airport Square II, LLC",, +860546-0001104659-07-015618,"Airport Square IV, LLC",, +860546-0001104659-07-015618,"Airport Square V, LLC",, +860546-0001104659-07-015618,"Airport Square X, LLC",, +860546-0001104659-07-015618,"Airport Square XI, LLC",, +860546-0001104659-07-015618,"Airport Square XIII, LLC",, +860546-0001104659-07-015618,"Airport Square XIV, LLC",, +860546-0001104659-07-015618,"Airport Square XV, LLC",, +860546-0001104659-07-015618,"Airport Square XIX, LLC",, +860546-0001104659-07-015618,"Airport Square XX, LLC",, +860546-0001104659-07-015618,"Airport Square XX Parking, LLC",, +860546-0001104659-07-015618,"Airport Square XXI, LLC",, +860546-0001104659-07-015618,"Airport Square XXII, LLC",, +860546-0001104659-07-015618,"Airport Square Partners, LLC",, +860546-0001104659-07-015618,"Airport Square Storms, LLC",, +860546-0001104659-07-015618,"Ambassador Center, LLC",, +860546-0001104659-07-015618,"ASI, LLC",, +860546-0001104659-07-015618,"Atrium Building, LLC",, +860546-0001104659-07-015618,"Brown’s Wharf, LLC",, +860546-0001104659-07-015618,Centerpointe Limited Partnership,, +860546-0001104659-07-015618,"Clarks Hundred, LLC",, +860546-0001104659-07-015618,"Columbia Gateway S-28, LLC",, +860546-0001104659-07-015618,"Commons Office Research, LLC",, +860546-0001104659-07-015618,"Commons Office 6-B, LLC",, +860546-0001104659-07-015618,"Concourse 1304, LLC",, +860546-0001104659-07-015618,"COPT Arundel 
Preserve, LLC",, +860546-0001104659-07-015618,"COPT Baltimore County I, LLC",, +860546-0001104659-07-015618,"COPT Baltimore County II, LLC",, +860546-0001104659-07-015618,"COPT Development & Construction Services, LLC",, +860546-0001104659-07-015618,COPT Environmental Systems LLC,, +860546-0001104659-07-015618,"COPT Gate 63, LLC",, +860546-0001104659-07-015618,"COPT Gate 6700-6708-6724, LLC",, +860546-0001104659-07-015618,"COPT General, LLC",, +860546-0001104659-07-015618,"COPT Hunt Valley GP, LLC",, +860546-0001104659-07-015618,"COPT Montpelier, LLC",, +860546-0001104659-07-015618,"COPT Opportunity Invest I, LLC",, +860546-0001104659-07-015618,"COPT Property Management Services, LLC",, +860546-0001104659-07-015618,"COPT Renovation, LLC",, +860546-0001104659-07-015618,"COPT Riverwood, LLC",, +860546-0001104659-07-015618,"COPT T-11, LLC",, +860546-0001104659-07-015618,"COPT-FD Indian Head, LLC",, +860546-0001104659-07-015618,"Corporate Development Services, LLC",, +860546-0001104659-07-015618,"Corporate Gatespring, LLC",, +860546-0001104659-07-015618,"Corporate Gatespring II, LLC",, +860546-0001104659-07-015618,"Corporate Office Management, Inc.",, +860546-0001104659-07-015618,"Corporate Office Services, LLC",, +860546-0001104659-07-015618,"Corporate Paragon, LLC",, +860546-0001104659-07-015618,"Corporate Property, LLC",, +860546-0001104659-07-015618,"Cornucopia Holdings, LLC",, +860546-0001104659-07-015618,"Cornucopia Holdings II, LLC",, +860546-0001104659-07-015618,"Enterprise Campus Developer, LLC",, +860546-0001104659-07-015618,"Fourth Exploration, L.L.C.",, +860546-0001104659-07-015618,"Fifth Exploration, L.L.C.",, +860546-0001104659-07-015618,"Ft. Ritchie I, LLC",, +860546-0001104659-07-015618,"Ft. Ritchie II, LLC",, +860546-0001104659-07-015618,"Ft. Ritchie III, LLC",, +860546-0001104659-07-015618,"Ft. Ritchie IV, LLC",, +860546-0001104659-07-015618,"Ft. 
Ritchie Holding, LLC",, +860546-0001104659-07-015618,"Gateway 44, LLC",, +860546-0001104659-07-015618,"Gateway 67, LLC",, +860546-0001104659-07-015618,"Gateway 70, LLC",, +860546-0001104659-07-015618,"Gateway 70 Holdings, LLC",, +860546-0001104659-07-015618,"Gateway Crossing 95, LLC",, +860546-0001104659-07-015618,"Governors Court, LLC",, +860546-0001104659-07-015618,"Governors Court 21, LLC",, +860546-0001104659-07-015618,"Honeyland 108, LLC",, +860546-0001104659-07-015618,Hunt Valley 75 Limited Partnership,, +860546-0001104659-07-015618,"Jolly COPT I, LLC",, +860546-0001104659-07-015618,"Jolly COPT II, LLC",, +860546-0001104659-07-015618,"M Square NOAA, LLC",, +860546-0001104659-07-015618,"MOR Forbes, LLC",, +860546-0001104659-07-015618,"MOR Forbes 2, LLC",, +860546-0001104659-07-015618,"NBP One, LLC",, +860546-0001104659-07-015618,"NBP Huff & Puff, LLC",, +860546-0001104659-07-015618,"NBP Lot 3-A, LLC",, +860546-0001104659-07-015618,"NBP Retail, LLC",, +860546-0001104659-07-015618,"NBP 131-133-141, LLC",, +860546-0001104659-07-015618,"NBP 132, LLC",, +860546-0001104659-07-015618,"NBP 134, LLC",, +860546-0001104659-07-015618,"NBP 135, LLC",, +860546-0001104659-07-015618,"NBP 140, LLC",, +860546-0001104659-07-015618,"NBP 191, LLC",, +860546-0001104659-07-015618,"NBP 201, LLC",, +860546-0001104659-07-015618,"NBP 201 Holdings, LLC",, +860546-0001104659-07-015618,"NBP 211, LLC",, +860546-0001104659-07-015618,"NBP 211 Holdings, LLC",, +860546-0001104659-07-015618,"NBP 220, LLC",, +860546-0001104659-07-015618,"NBP 220 Holdings, LLC",, +860546-0001104659-07-015618,"NBP 221, LLC",, +860546-0001104659-07-015618,"NBP 302, LLC",, +860546-0001104659-07-015618,"NBP 304, LLC",, +860546-0001104659-07-015618,"NBP 306, LLC",, +860546-0001104659-07-015618,"NBP 318, LLC",, +860546-0001104659-07-015618,"NBP 320, LLC",, +860546-0001104659-07-015618,"NBP 322, LLC",, +860546-0001104659-07-015618,"Northcreek Manager, LLC",, +860546-0001104659-07-015618,"Pecan Court, L.L.C.",, 
+860546-0001104659-07-015618,"Red Cedar Building, LLC",, +860546-0001104659-07-015618,"RIVA Trustee, LLC",, +860546-0001104659-07-015618,"Rockville Corporate Center, LLC",, +860546-0001104659-07-015618,Rutherford 2 Limited Partnership,, +860546-0001104659-07-015618,"Tech Park I, LLC",, +860546-0001104659-07-015618,"Tech Park II, LLC",, +860546-0001104659-07-015618,"Tech Park IV, LLC",, +860546-0001104659-07-015618,"Third Exploration, L.L.C.",, +860546-0001104659-07-015618,"67 Financing, LLC",, +860546-0001104659-07-015618,"110 Thomas Johnson, LLC",, +860546-0001104659-07-015618,"134, LLC",, +860546-0001104659-07-015618,201 International Associates Limited Partnership,, +860546-0001104659-07-015618,"226 Schilling Circle, LLC",, +860546-0001104659-07-015618,"230 Schilling Circle, LLC",, +860546-0001104659-07-015618,"304 Sentinel, LLC",, +860546-0001104659-07-015618,"800 International, LLC",, +860546-0001104659-07-015618,"849 International, LLC",, +860546-0001104659-07-015618,"881 Elkridge Landing, LLC",, +860546-0001104659-07-015618,"900 International, LLC",, +860546-0001104659-07-015618,"930 International, LLC",, +860546-0001104659-07-015618,"999 Corporate, LLC",, +860546-0001104659-07-015618,"1099 Winterson, LLC",, +860546-0001104659-07-015618,"1190 Winterson, LLC",, +860546-0001104659-07-015618,"1199 Winterson, LLC",, +860546-0001104659-07-015618,"1460 Dorsey Road, LLC",, +860546-0001104659-07-015618,2500 Riva Trust,, +860546-0001104659-07-015618,"2691 Technology, LLC",, +860546-0001104659-07-015618,"2900 Lord Baltimore Drive, LLC",, +860546-0001104659-07-015618,"6700 Alexander Bell, LLC",, +860546-0001104659-07-015618,"6711 Gateway, LLC",, +860546-0001104659-07-015618,"6711 Gateway Funding, LLC",, +860546-0001104659-07-015618,"6721 Gateway, LLC",, +860546-0001104659-07-015618,"6731 Gateway, LLC",, +860546-0001104659-07-015618,"6741 Gateway, LLC",, +860546-0001104659-07-015618,"6940 CGD, LLC",, +860546-0001104659-07-015618,"7000 CG, LLC",, 
+860546-0001104659-07-015618,"7000 Honeys, LLC",, +860546-0001104659-07-015618,"7015 Albert Einstein Drive, LLC",, +860546-0001104659-07-015618,"7130 Columbia Gateway, LLC",, +860546-0001104659-07-015618,"7200 Riverwood, LLC",, +860546-0001104659-07-015618,"7210 Ambassador Road, LLC",, +860546-0001104659-07-015618,"7240 Parkway Drive Enterprises, LLC",, +860546-0001104659-07-015618,"7253 Ambassador Road, LLC",, +860546-0001104659-07-015618,"7318 Parkway Drive Enterprises, LLC",, +860546-0001104659-07-015618,"7320 Parkway Drive Enterprises, LLC",, +860546-0001104659-07-015618,"7320 PD, LLC",, +860546-0001104659-07-015618,"7321 Parkway Drive Enterprises, LLC",, +860546-0001104659-07-015618,"7468 Candlewood Road, LLC",, +860546-0001104659-07-015618,"8621 RFD, LLC",, +860546-0001104659-07-015618,"8661 RFD, LLC",, +860546-0001104659-07-015618,"9690 Deereco Road, LLC",, +860546-0001104659-07-015618,"11011 McCormick Road, LLC",, +860546-0001104659-07-015618,"11101 McCormick Road, LLC",, +860546-0001104659-07-015618,"COPT Princeton South, LLC",, +860546-0001104659-07-015618,"Cuaba Associates, L.L.C.",, +860546-0001104659-07-015618,"68 Culver, LLC",, +860546-0001104659-07-015618,Route 46 Partners,, +860546-0001104659-07-015618,"Route 46 Partners, L.L.C.",, +860546-0001104659-07-015618,"Bolivar Associates, LLC",, +860546-0001104659-07-015618,"Colgatedrive Associates, L.P.",, +860546-0001104659-07-015618,"COPT Pennlyn, L.P.",, +860546-0001104659-07-015618,"COPT San Antonio General, LLC",, +860546-0001104659-07-015618,"COPT San Antonio, LP",, +860546-0001104659-07-015618,"COPT Chantilly, LLC",, +860546-0001104659-07-015618,"COPT Chantilly II, LLC",, +860546-0001104659-07-015618,"COPT Dahlgren, LLC",, +860546-0001104659-07-015618,"COPT Dahlgren I, LLC",, +860546-0001104659-07-015618,"COPT Dahlgren II, LLC",, +860546-0001104659-07-015618,"COPT Dahlgren IV, LLC",, +860546-0001104659-07-015618,"COPT Dahlgren Land, LLC",, +860546-0001104659-07-015618,"COPT Greens I, LLC",, 
+860546-0001104659-07-015618,"COPT Greens II, LLC",, +860546-0001104659-07-015618,"COPT Greens III, LLC",, +860546-0001104659-07-015618,"COPT Park Meadow, LLC",, +860546-0001104659-07-015618,"COPT Parkstone, LLC",, +860546-0001104659-07-015618,"COPT Richmond I, LLC",, +860546-0001104659-07-015618,"COPT Ridgeview I, LLC",, +860546-0001104659-07-015618,"COPT Ridgeview II & III, LLC",, +860546-0001104659-07-015618,"COPT Southwest VA, LLC",, +860546-0001104659-07-015618,"COPT Stonecroft, LLC",, +860546-0001104659-07-015618,"COPT Sunrise, LLC",, +860546-0001104659-07-015618,"COPT Waterview I, LLC",, +860546-0001104659-07-015618,"COPT Waterview III, LLC",, +860546-0001104659-07-015618,"TRC Pinnacle Towers, L.L.C.",, +860546-0001104659-07-015618,"2900 Towerview Road, LLC",, +875622-0001140361-17-012337,Advance Biofactures Corp.,New York, +891014-0000891014-11-000007,APP China Specialty Minerals Pte Ltd.,Singapore, +891014-0000891014-11-000007,ASMAS Agir Sanayi Malzemeleri Imal ve Tic. A.S.,Turkey, +891014-0000891014-11-000007,Barretts Minerals Inc.,Delaware, +891014-0000891014-11-000007,ComSource Trading Ltd.,Delaware, +891014-0000891014-11-000007,Gold Lun Chemicals (Zhenjiang).,China, +891014-0000891014-11-000007,"Gold Sheng Chemicals (Zhenjiang) Co., Ltd.",China, +891014-0000891014-11-000007,"Gold Zuan Chemicals (Suzhou) Co., Ltd.",China, +891014-0000891014-11-000007,"Hi-Tech Specialty Minerals Company, Limited",Thailand, +891014-0000891014-11-000007,Minerals Technologies do Brasil Comercio é Industria de Minerais Ltda.,Brazil, +891014-0000891014-11-000007,Minerals Technologies Europe N.V.,Belgium, +891014-0000891014-11-000007,Minerals Technologies Holdings Inc.,Delaware, +891014-0000891014-11-000007,Minerals Technologies Holdings Ltd.,United Kingdom, +891014-0000891014-11-000007,Minerals Technologies India Private Limited,India, +891014-0000891014-11-000007,"Minerals Technologies Mexico Holdings, S. de R. L. 
de C.V.",Mexico, +891014-0000891014-11-000007,Minerals Technologies South Africa (Pty) Ltd.,South Africa, +891014-0000891014-11-000007,Mintech Canada Inc.,Canada, +891014-0000891014-11-000007,Mintech Japan K.K.,Japan, +891014-0000891014-11-000007,Minteq Australia Pty Ltd.,Australia, +891014-0000891014-11-000007,Minteq B.V.,The Netherlands, +891014-0000891014-11-000007,Minteq Europe Limited.,Ireland, +891014-0000891014-11-000007,Minteq International GmbH,Germany, +891014-0000891014-11-000007,Minteq International Inc.,Delaware, +891014-0000891014-11-000007,"Minteq International (Suzhou) Co., Ltd.",China, +891014-0000891014-11-000007,Minteq Italiana S.p.A.,Italy, +891014-0000891014-11-000007,Minteq Korea Inc.,Korea, +891014-0000891014-11-000007,Minteq Kosovo LLC.,Kosovo, +891014-0000891014-11-000007,Minteq Magnesite Limited,Ireland, +891014-0000891014-11-000007,"Minteq Metallurgical Materials (Suzhou) Co., Ltd.",China, +891014-0000891014-11-000007,Minteq Shapes and Services Inc.,Delaware, +891014-0000891014-11-000007,Minteq UK Limited.,United Kingdom, +891014-0000891014-11-000007,MTI Bermuda L.P.,Bermuda, +891014-0000891014-11-000007,MTI Holdings GmbH,Germany, +891014-0000891014-11-000007,MTI Holding Singapore Pte. Ltd.,Singapore, +891014-0000891014-11-000007,MTI Holdco I LLC,Delaware, +891014-0000891014-11-000007,MTI Holdco II LLC,Delaware, +891014-0000891014-11-000007,MTI Netherlands B.V.,Netherlands, +891014-0000891014-11-000007,MTX Finance Inc.,Delaware, +891014-0000891014-11-000007,MTX Finance Ireland,Ireland, +891014-0000891014-11-000007,Performance Minerals Netherlands C.V.,Netherlands, +891014-0000891014-11-000007,PT Sinar Mas Specialty Minerals,Indonesia, +891014-0000891014-11-000007,"Rijnstaal U.S.A., Inc.",Pennsylvania, +891014-0000891014-11-000007,SMI NewQuest India Private Limited SMI Poland Sp. 
z o.o.,India, +891014-0000891014-11-000007,Specialty Minerals Benelux,Poland, +891014-0000891014-11-000007,Specialty Minerals FMT K.K.,Belgium, +891014-0000891014-11-000007,Specialty Minerals France s.p.a.s.,Japan, +891014-0000891014-11-000007,Specialty Minerals GmbH,France, +891014-0000891014-11-000007,Specialty Minerals Inc.,Germany, +891014-0000891014-11-000007,Specialty Minerals India Holding Inc.,Delaware, +891014-0000891014-11-000007,Specialty Minerals International Inc.,Delaware, +891014-0000891014-11-000007,Specialty Minerals Malaysia Sdn. Bhd.,Malaysia, +891014-0000891014-11-000007,Specialty Minerals (Michigan) Inc.,Michigan, +891014-0000891014-11-000007,Specialty Minerals Mississippi Inc.,Delaware, +891014-0000891014-11-000007,Specialty Minerals Nordic Oy Ab,Finland, +891014-0000891014-11-000007,"Specialty Minerals (Portugal) Especialidades Minerais, S.A.",Portugal, +891014-0000891014-11-000007,Specialty Minerals S.A. de C.V.,Mexico, +891014-0000891014-11-000007,Specialty Minerals Servicios S. de R.L. de C.V.,Mexico, +891014-0000891014-11-000007,"Specialty Minerals Slovakia, spol. sr.o.",Slovakia, +891014-0000891014-11-000007,Specialty Minerals South Africa (Pty) Limited,South Africa, +891014-0000891014-11-000007,Specialty Minerals (Thailand) Limited,Thailand, +891014-0000891014-11-000007,Specialty Minerals UK Limited,United Kingdom, +891014-0000891014-11-000007,"Tecnologias Minerales de Mexico, S.A. de C.V.",Mexico, +891014-0000891014-11-000007,Yangpu Gold Hongda Chemicals Co. 
Ltd.,China, From 581b2e31129e73ca040948ff4ea6cd6b7a7dd01a Mon Sep 17 00:00:00 2001 From: zschira Date: Sun, 6 Oct 2024 10:13:19 -0400 Subject: [PATCH 095/161] Use run name for specifying training runs --- src/mozilla_sec_eia/models/sec10k/__init__.py | 5 ++--- .../sec10k/notebooks/exhibit21_extractor.ipynb | 18 ++++++++++-------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 9bb3557..a063f50 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -53,9 +53,8 @@ class TrainConfig(Config): """Config for training notebook.""" - layoutlm_uri: str | None = ( - "runs:/32355367ed444dd0b07f2d1b845f62d8/layoutlm_extractor" - ) + #: mlflow run name used to train layoutlm model + layoutlm_training_run: str | None = "layoutlm-labeledv0.2" exhibit21_extractor = define_dagstermill_asset( diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index cc92a1e..69893da 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -52,7 +52,7 @@ "from mozilla_sec_eia.models.sec10k import defs\n", "\n", "context = dagstermill.get_context(op_config={\n", - " \"layoutlm_uri\": None,\n", + " \"layoutlm_training_run\": None,\n", "})\n", "\n", "ex21_training_data = defs.load_asset_value(\"ex21_training_data\", partition_key=\"labeledv0.2\")\n", @@ -222,9 +222,15 @@ "\n", " return encoding\n", "\n", + "if (run_name := context.op_config[\"layoutlm_training_run\"]) is not None:\n", + " filter_string = f\"attributes.run_name = '{run_name}'\"\n", + " run = mlflow.search_runs(filter_string=filter_string, output_format=\"list\")[0]\n", + " training_run_id = run.info.run_id\n", + "else:\n", + " training_run_id = None\n", + "\n", 
"# Only finetune if configured to do so\n", - "training_run_id = None\n", - "if context.op_config[\"layoutlm_uri\"] is None:\n", + "if training_run_id is None:\n", " id2label, label2id = get_id_label_conversions(LABELS)\n", " # Change temp_dir to save training data locally for inspection\n", " # Cache/prepare training data\n", @@ -497,11 +503,7 @@ ")\n", "\n", "# If a model was trained in this notebook, use it. Otherwise, use\n", - "if training_run_id is not None:\n", - " model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n", - "else:\n", - " model_uri = context.op_config[\"layoutlm_uri\"]\n", - "\n", + "model_uri = f\"runs:/{training_run_id}/layoutlm_extractor\"\n", "model_info = mlflow.models.get_model_info(model_uri)\n", "\n", "def _get_data(dataset):\n", From c67a1bef016b398bafd4ad3ee5e3540e1eadc3dc Mon Sep 17 00:00:00 2001 From: zschira Date: Sun, 6 Oct 2024 13:40:40 -0400 Subject: [PATCH 096/161] Rework how notebook is configured --- src/mozilla_sec_eia/models/sec10k/__init__.py | 10 +--------- .../models/sec10k/ex_21/data/__init__.py | 19 ++++++++++++------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index a063f50..33dd5b8 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -50,17 +50,10 @@ ) -class TrainConfig(Config): - """Config for training notebook.""" - - #: mlflow run name used to train layoutlm model - layoutlm_training_run: str | None = "layoutlm-labeledv0.2" - - exhibit21_extractor = define_dagstermill_asset( name="exhibit21_extractor", notebook_path=file_relative_path(__file__, "notebooks/exhibit21_extractor.ipynb"), - config_schema=TrainConfig.to_config_schema(), + config_schema=ex_21.data.Ex21TrainConfig.to_config_schema(), ins={ "ex21_training_data": AssetIn(), "ex21_validation_set": AssetIn(), @@ -68,7 +61,6 @@ class TrainConfig(Config): 
"ex21_inference_dataset": AssetIn(), }, save_notebook_on_failure=True, - partitions_def=ex_21.data.TRAINING_DATA_VERSION_PARTS, ) ex21_training_job = define_asset_job( "ex21_training", diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py index 06860f1..afa1f7a 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py @@ -7,7 +7,7 @@ from dagster import ( AssetExecutionContext, AssetOut, - StaticPartitionsDefinition, + Config, asset, multi_asset, ) @@ -20,19 +20,24 @@ from .inference import create_inference_dataset from .training import format_as_ner_annotations -TRAINING_DATA_VERSION_PARTS = StaticPartitionsDefinition( - ["labeledv0.0", "labeledv0.1", "labeledv0.2"] -) +class Ex21TrainConfig(Config): + """Config for training notebook.""" + + #: mlflow run name used to train layoutlm model + layoutlm_training_run: str | None = "layoutlm-labeledv0.2" + #: training data version (doesn't matter if using pretrained model) + training_data_version: str | None = "v0.2" -@asset(partitions_def=TRAINING_DATA_VERSION_PARTS) -def ex21_training_data(context: AssetExecutionContext): + +@asset +def ex21_training_data(config: Ex21TrainConfig): """Construct training dataset for ex 21 extraction.""" with TemporaryDirectory() as temp_dir: ner_annotations = format_as_ner_annotations( labeled_json_path=Path(temp_dir) / "sec10k_filings" / "labeled_jsons", pdfs_path=Path(temp_dir) / "sec10k_filings" / "pdfs", - gcs_folder_name=context.partition_key, + gcs_folder_name=f"labeled{config.training_data_version}", ) return ner_annotations From b8a5b247d4637fb6fd3a1f4e7c4358077a594f3c Mon Sep 17 00:00:00 2001 From: zschira Date: Sun, 6 Oct 2024 15:00:46 -0400 Subject: [PATCH 097/161] Finetune configuration --- src/mozilla_sec_eia/models/sec10k/__init__.py | 1 - src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py | 2 +- 
.../models/sec10k/notebooks/exhibit21_extractor.ipynb | 2 +- 3 files changed, 2 insertions(+), 3 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 33dd5b8..8087fb8 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -2,7 +2,6 @@ from dagster import ( AssetIn, - Config, Definitions, define_asset_job, file_relative_path, diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py index afa1f7a..2ae0b1e 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py @@ -27,7 +27,7 @@ class Ex21TrainConfig(Config): #: mlflow run name used to train layoutlm model layoutlm_training_run: str | None = "layoutlm-labeledv0.2" #: training data version (doesn't matter if using pretrained model) - training_data_version: str | None = "v0.2" + training_data_version: str = "v0.2" @asset diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index 69893da..f989b4e 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -52,7 +52,7 @@ "from mozilla_sec_eia.models.sec10k import defs\n", "\n", "context = dagstermill.get_context(op_config={\n", - " \"layoutlm_training_run\": None,\n", + " \"layoutlm_training_run\": \"layoutlm-labeledv0.2\",\n", "})\n", "\n", "ex21_training_data = defs.load_asset_value(\"ex21_training_data\", partition_key=\"labeledv0.2\")\n", From 45d5cf8f9b280bda8c9ae80f103f98bcb01ca4c3 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 7 Oct 2024 13:11:06 -0400 Subject: [PATCH 098/161] separate inference dataset creation from model prediction --- .../library/mlflow/mlflow_io_managers.py | 2 +- 
src/mozilla_sec_eia/library/model_jobs.py | 15 +++++-- src/mozilla_sec_eia/models/sec10k/__init__.py | 8 ++-- .../models/sec10k/ex_21/__init__.py | 44 ++++++++++++++++--- .../models/sec10k/ex_21/data/__init__.py | 1 - .../notebooks/exhibit21_extractor.ipynb | 42 ++++++++++++++++-- 6 files changed, 93 insertions(+), 19 deletions(-) diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py index 94468f5..abc2d1c 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py @@ -46,7 +46,7 @@ def load_input(self, context: InputContext): if model_uri is None: model_uri = f"models:/{context.name}" - mlflow.pyfunc.load_model( + return mlflow.pyfunc.load_model( model_uri, dst_path=cache_path, ) diff --git a/src/mozilla_sec_eia/library/model_jobs.py b/src/mozilla_sec_eia/library/model_jobs.py index 87f6d15..e3c9801 100644 --- a/src/mozilla_sec_eia/library/model_jobs.py +++ b/src/mozilla_sec_eia/library/model_jobs.py @@ -25,6 +25,7 @@ def create_production_model_job( job_name: str, assets: list[AssetsDefinition], concurrency_limit: int | None = None, + tag_concurrency_limits: list[dict] | None = None, **kwargs, ) -> JobDefinition: """Construct a dagster job and supply Definitions with assets and resources.""" @@ -39,10 +40,16 @@ def create_production_model_job( } }, } - if concurrency_limit is not None: - config["execution"] = { - "config": {"multiprocess": {"max_concurrent": concurrency_limit}} - } + if (concurrency_limit is not None) or (tag_concurrency_limits is not None): + config["execution"] = {"config": {"multiprocess": {}}} + if concurrency_limit is not None: + config["execution"]["config"]["multiprocess"][ + "max_concurrent" + ] = concurrency_limit + else: + config["execution"]["config"]["multiprocess"][ + "tag_concurrency_limits" + ] = tag_concurrency_limits return define_asset_job( job_name, diff --git 
a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 8087fb8..f0bb091 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -45,7 +45,9 @@ ex21_production_job = model_jobs.create_production_model_job( "ex21_extraction", ex_21.production_assets, - concurrency_limit=4, + tag_concurrency_limits=[ + {"key": "model", "value": "exhibit21_extractor", "limit": 2}, + ], ) @@ -85,10 +87,10 @@ "mlflow_interface": mlflow_interface_resource, "layoutlm_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource, - uri="runs:/b959cfa0ba3c4b91a0f8fe158cd0109f/exhibit21_extractor", + uri="runs:/d603f8e219da4fd39f3c2f8d7d3bcb40/exhibit21_extractor", ), "pandas_parquet_io_manager": PandasParquetIOManager( - base_path=UPath("gs://sec10k-outputs") + base_path=UPath("gs://sec10k-outputs/v2") ), "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), } diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 574074d..b5bc167 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -1,6 +1,7 @@ """Module for working with exhibit 21 data.""" import logging +import traceback import pandas as pd from dagster import ( @@ -10,6 +11,7 @@ graph_multi_asset, op, ) +from mlflow.pyfunc import PyFuncModel from ..entities import ( Ex21CompanyOwnership, @@ -18,7 +20,8 @@ sec10k_extract_metadata_type, ) from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions -from .inference import extract_filings +from ..utils.cloud import GCSArchive +from .data.inference import create_inference_dataset logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -29,13 +32,32 @@ "extracted": Out(dagster_type=ex21_extract_type), }, ins={"exhibit21_extractor": In(input_manager_key="layoutlm_io_manager")}, + 
tags={"model": "exhibit21_extractor"}, ) def extract_filing_chunk( - filings: pd.DataFrame, + parsed_chunk: tuple[pd.DataFrame, pd.DataFrame], exhibit21_extractor, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Extract a set of filings and return results.""" - return extract_filings(filings, exhibit21_extractor) + failed_parsing_metadata, inference_dataset = parsed_chunk + extracted = Ex21CompanyOwnership.example(size=0) + try: + if not inference_dataset.empty: + metadata, extracted = exhibit21_extractor.predict(inference_dataset) + metadata = pd.concat([failed_parsing_metadata, metadata]) + else: + metadata = failed_parsing_metadata + except Exception as e: + logger.warning(traceback.format_exc()) + logger.warning(f"Error while extracting filings: {inference_dataset['id']}") + metadata = pd.DataFrame( + { + "filename": inference_dataset["id"], + "success": [False] * len(inference_dataset), + "notes": [str(e)] * len(inference_dataset), + } + ).set_index("filename") + return metadata, extracted @op( @@ -65,6 +87,17 @@ def collect_extracted_chunks( ) +@op +def create_dataset( + cloud_interface: GCSArchive, filings: pd.DataFrame +) -> tuple[pd.DataFrame, pd.DataFrame]: + """Construct inference dataset from filing chunk.""" + return create_inference_dataset( + filing_metadata=filings, + cloud_interface=cloud_interface, + ) + + @graph_multi_asset( outs={ "ex21_extraction_metadata": AssetOut( @@ -81,9 +114,8 @@ def ex21_extract( ): """Extract ownership info from exhibit 21 docs.""" filing_chunks = chunk_filings(sec10k_filing_metadata) - metadata_chunks, extracted_chunks = filing_chunks.map( - lambda filings: extract_filing_chunk(filings) - ) + parsed_chunks = filing_chunks.map(create_dataset) + metadata_chunks, extracted_chunks = parsed_chunks.map(extract_filing_chunk) metadata, extracted = collect_extracted_chunks( metadata_chunks.collect(), extracted_chunks.collect() ) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py 
b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py index 2ae0b1e..6c5e8aa 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py @@ -5,7 +5,6 @@ import pandas as pd from dagster import ( - AssetExecutionContext, AssetOut, Config, asset, diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index f989b4e..7fc14b5 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -38,14 +38,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "48f185de-95ef-4194-9245-93f8d603d2e6", "metadata": { "tags": [ "parameters" ] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. 
If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2024-10-06 15:23:43 -0400 - dagster - DEBUG - system - Loading 11 partitions...\n", + "2024-10-06 15:23:43 -0400 - dagster - DEBUG - system - Loading partition l from /home/zach/catalyst/workspace/storage/ex21_training_data/l using PickledObjectFilesystemIOManager...\n" + ] + }, + { + "ename": "NotADirectoryError", + "evalue": "[Errno 20] Not a directory: '/home/zach/catalyst/workspace/storage/ex21_training_data/l'", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNotADirectoryError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 9\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmozilla_sec_eia\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msec10k\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m defs\n\u001b[1;32m 5\u001b[0m context \u001b[38;5;241m=\u001b[39m dagstermill\u001b[38;5;241m.\u001b[39mget_context(op_config\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm_training_run\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm-labeledv0.2\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m })\n\u001b[0;32m----> 9\u001b[0m ex21_training_data \u001b[38;5;241m=\u001b[39m \u001b[43mdefs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mex21_training_data\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlabeledv0.2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m ex21_failed_parsing_metadata \u001b[38;5;241m=\u001b[39m defs\u001b[38;5;241m.\u001b[39mload_asset_value(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mex21_failed_parsing_metadata\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 12\u001b[0m ex21_inference_dataset \u001b[38;5;241m=\u001b[39m defs\u001b[38;5;241m.\u001b[39mload_asset_value(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mex21_inference_dataset\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/definitions/definitions_class.py:519\u001b[0m, in \u001b[0;36mDefinitions.load_asset_value\u001b[0;34m(self, asset_key, python_type, instance, partition_key, metadata)\u001b[0m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;129m@public\u001b[39m\n\u001b[1;32m 491\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_asset_value\u001b[39m(\n\u001b[1;32m 492\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 498\u001b[0m metadata: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 499\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[1;32m 500\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load the contents of an asset as a Python object.\u001b[39;00m\n\u001b[1;32m 501\u001b[0m \n\u001b[1;32m 502\u001b[0m \u001b[38;5;124;03m Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;124;03m The contents of an asset as a Python object.\u001b[39;00m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;124;03m 
\"\"\"\u001b[39;00m\n\u001b[0;32m--> 519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_repository_def\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 520\u001b[0m \u001b[43m \u001b[49m\u001b[43masset_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masset_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 521\u001b[0m \u001b[43m \u001b[49m\u001b[43mpython_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 522\u001b[0m \u001b[43m \u001b[49m\u001b[43minstance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 525\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/definitions/repository_definition/repository_definition.py:350\u001b[0m, in \u001b[0;36mRepositoryDefinition.load_asset_value\u001b[0;34m(self, asset_key, python_type, instance, partition_key, metadata, resource_config)\u001b[0m\n\u001b[1;32m 346\u001b[0m normalized_assets_defs_by_key \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 347\u001b[0m k: ad \u001b[38;5;28;01mfor\u001b[39;00m ad \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39masset_graph\u001b[38;5;241m.\u001b[39massets_defs \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m ad\u001b[38;5;241m.\u001b[39mkeys\n\u001b[1;32m 348\u001b[0m }\n\u001b[1;32m 
349\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m AssetValueLoader(normalized_assets_defs_by_key, instance\u001b[38;5;241m=\u001b[39minstance) \u001b[38;5;28;01mas\u001b[39;00m loader:\n\u001b[0;32m--> 350\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[43m \u001b[49m\u001b[43masset_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 352\u001b[0m \u001b[43m \u001b[49m\u001b[43mpython_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 353\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 354\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 355\u001b[0m \u001b[43m \u001b[49m\u001b[43mresource_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresource_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 356\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/decorator_utils.py:203\u001b[0m, in \u001b[0;36m_wrap_with_pre_call_fn..wrapped_with_pre_call_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m condition \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m condition(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 202\u001b[0m pre_call_fn()\n\u001b[0;32m--> 203\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/asset_value_loader.py:169\u001b[0m, in \u001b[0;36mAssetValueLoader.load_asset_value\u001b[0;34m(self, asset_key, python_type, partition_key, input_definition_metadata, resource_config, metadata)\u001b[0m\n\u001b[1;32m 139\u001b[0m io_manager_config \u001b[38;5;241m=\u001b[39m get_mapped_resource_config(\n\u001b[1;32m 140\u001b[0m {io_manager_key: io_manager_def}, io_resource_config\n\u001b[1;32m 141\u001b[0m )\n\u001b[1;32m 143\u001b[0m input_context \u001b[38;5;241m=\u001b[39m build_input_context(\n\u001b[1;32m 144\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 145\u001b[0m asset_key\u001b[38;5;241m=\u001b[39masset_key,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 166\u001b[0m ),\n\u001b[1;32m 167\u001b[0m )\n\u001b[0;32m--> 169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_context\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:412\u001b[0m, in \u001b[0;36mUPathIOManager.load_input\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 411\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 412\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_partitions\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n", + "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:396\u001b[0m, in \u001b[0;36mUPathIOManager._load_partitions\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m 393\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_partitions\u001b[39m(\u001b[38;5;28mself\u001b[39m, context: InputContext) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dict[\u001b[38;5;28mstr\u001b[39m, Any]:\n\u001b[1;32m 394\u001b[0m \u001b[38;5;66;03m# load multiple partitions\u001b[39;00m\n\u001b[1;32m 395\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39miscoroutinefunction(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mload_from_path):\n\u001b[0;32m--> 396\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_partitions\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# load_from_path returns a coroutine, so we need to await the results\u001b[39;00m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mload_partitions_async(context)\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:81\u001b[0m, in \u001b[0;36mUPathIOManager.load_partitions\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m 78\u001b[0m objs \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m partition_key \u001b[38;5;129;01min\u001b[39;00m context\u001b[38;5;241m.\u001b[39masset_partition_keys:\n\u001b[0;32m---> 81\u001b[0m obj \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_partition_from_path\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 82\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 83\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mbackcompat_paths\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m: \u001b[38;5;66;03m# in case some partitions were skipped\u001b[39;00m\n\u001b[1;32m 88\u001b[0m objs[partition_key] \u001b[38;5;241m=\u001b[39m obj\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:307\u001b[0m, in \u001b[0;36mUPathIOManager._load_partition_from_path\u001b[0;34m(self, context, partition_key, path, backcompat_path)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 306\u001b[0m context\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_loading_input_partition_log_message(path, partition_key))\n\u001b[0;32m--> 307\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_from_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n\u001b[1;32m 309\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/fs_io_manager.py:283\u001b[0m, in \u001b[0;36mPickledObjectFilesystemIOManager.load_from_path\u001b[0;34m(self, context, path)\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_from_path\u001b[39m(\u001b[38;5;28mself\u001b[39m, context: InputContext, path: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUPath\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 283\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pickle\u001b[38;5;241m.\u001b[39mload(file)\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/upath/implementations/local.py:134\u001b[0m, in \u001b[0;36mPosixUPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline, **fsspec_kwargs)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m(LocalPath, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mopen(\n\u001b[1;32m 126\u001b[0m mode\u001b[38;5;241m=\u001b[39mmode,\n\u001b[1;32m 127\u001b[0m buffering\u001b[38;5;241m=\u001b[39mbuffering,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 131\u001b[0m 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfsspec_kwargs,\n\u001b[1;32m 132\u001b[0m )\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 134\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPosixPath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffering\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m)\u001b[49m\n", + "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/pathlib.py:1044\u001b[0m, in \u001b[0;36mPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline)\u001b[0m\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1043\u001b[0m encoding \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mtext_encoding(encoding)\n\u001b[0;32m-> 1044\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m io\u001b[38;5;241m.\u001b[39mopen(\u001b[38;5;28mself\u001b[39m, mode, buffering, encoding, errors, newline)\n", + "\u001b[0;31mNotADirectoryError\u001b[0m: [Errno 20] Not a directory: '/home/zach/catalyst/workspace/storage/ex21_training_data/l'" + ] + } + ], "source": [ "import dagstermill\n", "\n", @@ -55,7 +87,7 @@ " \"layoutlm_training_run\": \"layoutlm-labeledv0.2\",\n", "})\n", "\n", - "ex21_training_data = defs.load_asset_value(\"ex21_training_data\", partition_key=\"labeledv0.2\")\n", + "ex21_training_data = defs.load_asset_value(\"ex21_training_data\")\n", "\n", "ex21_failed_parsing_metadata = 
defs.load_asset_value(\"ex21_failed_parsing_metadata\")\n", "ex21_inference_dataset = defs.load_asset_value(\"ex21_inference_dataset\")\n", @@ -715,7 +747,9 @@ " python_model=Ex21Extractor(),\n", " artifacts={\"model_components\": model_uri},\n", " signature=infer_signature(ex21_inference_dataset, extracted), # NOTE: model returns a second dataframe with metadata, but mlflow only supports one in signature\n", - " )" + " )\n", + " mlflow.log_table(extracted, \"extracted_data.json\")\n", + " mlflow.log_table(metadata, \"extraction_metadata.json\")" ] } ], From 3e15b1f1a107463575bf6a2bb1751fbd61667159 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 7 Oct 2024 14:08:52 -0400 Subject: [PATCH 099/161] Remove deprecated inference module --- .../models/sec10k/ex_21/__init__.py | 2 +- .../models/sec10k/ex_21/inference.py | 40 ------------------- 2 files changed, 1 insertion(+), 41 deletions(-) delete mode 100644 src/mozilla_sec_eia/models/sec10k/ex_21/inference.py diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index b5bc167..34f6c3a 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -36,7 +36,7 @@ ) def extract_filing_chunk( parsed_chunk: tuple[pd.DataFrame, pd.DataFrame], - exhibit21_extractor, + exhibit21_extractor: PyFuncModel, ) -> tuple[pd.DataFrame, pd.DataFrame]: """Extract a set of filings and return results.""" failed_parsing_metadata, inference_dataset = parsed_chunk diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py deleted file mode 100644 index 6f517a3..0000000 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/inference.py +++ /dev/null @@ -1,40 +0,0 @@ -"""Module for formatting inputs and performing inference with a fine-tuned LayoutLM model.""" - -import logging -import traceback - -import pandas as pd -from mlflow.pyfunc import PyFuncModel - -from 
..entities import Ex21CompanyOwnership -from ..utils.cloud import GCSArchive -from .data.inference import create_inference_dataset - -logger = logging.getLogger(f"catalystcoop.{__name__}") - - -def extract_filings( - filings: pd.DataFrame, - layoutlm: PyFuncModel, -) -> tuple[pd.DataFrame, pd.DataFrame]: - """Create huggingface dataset from filings and perform extraction.""" - try: - failed_metadata, dataset = create_inference_dataset( - filing_metadata=filings, - cloud_interface=GCSArchive(), - has_labels=False, - ) - metadata, extracted = layoutlm.predict(dataset) - metadata = pd.concat([failed_metadata, metadata]) - except Exception as e: - logger.warning(traceback.format_exc()) - logger.warning(f"Error while extracting filings: {filings.index}") - metadata = pd.DataFrame( - { - "filename": filings.index, - "success": [False] * len(filings), - "notes": [str(e)] * len(filings), - } - ).set_index("filename") - extracted = Ex21CompanyOwnership.example(size=0) - return metadata, extracted From 60a1260f8bda16847df9cc48b2d070fd342fa9cc Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 8 Oct 2024 16:22:11 -0400 Subject: [PATCH 100/161] Add notebook for training ex21 classifier --- src/mozilla_sec_eia/models/sec10k/__init__.py | 28 +- .../models/sec10k/ex_21/data/__init__.py | 45 +++- .../exhibit21_layout_classifier.ipynb | 249 +++++++++++++++++- .../validation_data/ex21_layout_histogram.csv | 110 ++++++++ 4 files changed, 426 insertions(+), 6 deletions(-) create mode 100644 src/mozilla_sec_eia/package_data/validation_data/ex21_layout_histogram.csv diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index f0bb091..79e181d 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -28,7 +28,7 @@ basic_10k_assets = load_assets_from_modules([basic_10k]) ex21_assets = load_assets_from_package_module(ex_21) -ex21_training_data_assets = 
load_assets_from_modules([ex_21.data]) +ex21_data_assets = load_assets_from_modules([ex_21.data]) shared_assets = load_assets_from_modules([extract]) basic_10k_production_job = model_jobs.create_production_model_job( @@ -65,7 +65,26 @@ ) ex21_training_job = define_asset_job( "ex21_training", - selection=[exhibit21_extractor] + ex21_training_data_assets, + selection=[exhibit21_extractor] + ex_21.data.ex21_extraction_training_assets, + executor_def=in_process_executor, +) + + +exhibit21_layout_classifier = define_dagstermill_asset( + name="exhibit21_layout_classifier", + notebook_path=file_relative_path( + __file__, "notebooks/exhibit21_layout_classifier.ipynb" + ), + config_schema=ex_21.data.Ex21TrainConfig.to_config_schema(), + ins={ + "ex21_layout_labels": AssetIn(), + "ex21_layout_classifier_training_dataset": AssetIn(), + }, + save_notebook_on_failure=True, +) +ex21_layout_classifier_training_job = define_asset_job( + "ex21_layout_classifier_training", + selection=[exhibit21_layout_classifier] + ex_21.data.ex21_layout_classifier_assets, executor_def=in_process_executor, ) @@ -74,13 +93,14 @@ assets=basic_10k_assets + ex21_assets + shared_assets - + [exhibit21_extractor] - + ex21_training_data_assets, + + [exhibit21_extractor, exhibit21_layout_classifier] + + ex21_data_assets, jobs=[ basic_10k_production_job, basic_10k_validation_job, ex21_production_job, ex21_training_job, + ex21_layout_classifier_training_job, ], resources={ "cloud_interface": cloud_interface_resource, diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py index 6c5e8aa..abdd4e5 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/__init__.py @@ -43,7 +43,7 @@ def ex21_training_data(config: Ex21TrainConfig): @asset(dagster_type=ex21_extract_type) def ex21_validation_set() -> pd.DataFrame: - """Return dataframe containing basic 10k validation data.""" + 
"""Return dataframe containing ex 21 validation data.""" return clean_ex21_validation_set( validation_helpers.load_validation_data("ex21_labels.csv") ) @@ -78,3 +78,46 @@ def ex21_inference_dataset( filing_metadata=ex21_validation_filing_metadata, cloud_interface=cloud_interface, ) + + +@asset +def ex21_layout_labels() -> pd.DataFrame: + """Return dataframe with labels describing layout of validation filings.""" + return validation_helpers.load_validation_data("ex21_layout_histogram.csv") + + +@asset +def ex21_layout_classifier_filing_metadata( + cloud_interface: GCSArchive, + ex21_layout_labels: pd.DataFrame, +) -> pd.DataFrame: + """Get sec 10k filing metadata from validation set.""" + filing_metadata = cloud_interface.get_metadata() + return filing_metadata[filing_metadata.index.isin(ex21_layout_labels["filename"])] + + +@asset +def ex21_layout_classifier_training_dataset( + cloud_interface: GCSArchive, + ex21_layout_classifier_filing_metadata: pd.DataFrame, +) -> pd.DataFrame: + """Construct inference dataset for ex 21 extraction.""" + _, dataset = create_inference_dataset( + filing_metadata=ex21_layout_classifier_filing_metadata, + cloud_interface=cloud_interface, + ) + return dataset + + +ex21_extraction_training_assets = [ + ex21_training_data, + ex21_validation_set, + ex21_validation_filing_metadata, + ex21_inference_dataset, +] + +ex21_layout_classifier_assets = [ + ex21_layout_labels, + ex21_layout_classifier_filing_metadata, + ex21_layout_classifier_training_dataset, +] diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb index 1781454..584832c 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb @@ -1,9 +1,256 @@ { "cells": [ + { + "cell_type": "markdown", + "id": "618936ab-bda1-4b46-8ee5-dbdfc0090562", + "metadata": 
{}, + "source": [ + "## Exhibit 21 layout classifier\n", + "Some EX21 filings are formatted as a paragraph of text rather than a structured table. Given that the extraction model is trained/designed to work with a table layout, it tends to perform poorly on these filings. In this notebook we will develop a classifier model to detect these filings, so we can filter them out, and potentially develop a dedicated model to handle them." + ] + }, + { + "cell_type": "markdown", + "id": "a22bfc9d-9487-43ec-b0b7-d5bb6e17f994", + "metadata": {}, + "source": [ + "### Load labeled layouts from upstream asset" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "b4963648-2aac-46a7-9778-8808c1e5eeb2", + "metadata": { + "tags": [ + "parameters" + ] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2024-10-08 13:55:05 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_labels using PickledObjectFilesystemIOManager...\n", + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. 
If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2024-10-08 13:55:05 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_classifier_training_dataset using PickledObjectFilesystemIOManager...\n" + ] + } + ], + "source": [ + "from mozilla_sec_eia.models.sec10k import defs\n", + "\n", + "ex21_layout_labels = defs.load_asset_value(\"ex21_layout_labels\")\n", + "ex21_layout_classifier_training_dataset = defs.load_asset_value(\"ex21_layout_classifier_training_dataset\")" + ] + }, + { + "cell_type": "markdown", + "id": "3e72a132-a87b-4827-aef0-0898e72317ca", + "metadata": {}, + "source": [ + "### Implement method to construct feature dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "ee4ed368-7d01-4cb8-952f-f7941900d669", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "\n", + "from mozilla_sec_eia.models.sec10k.ex_21.data.common import BBOX_COLS_PDF\n", + "\n", + "\n", + "def calculate_features(record):\n", + " \"\"\"Compute features from bounding boxes in inference dataset.\"\"\"\n", + " df = pd.DataFrame(record[\"bboxes\"], columns=BBOX_COLS_PDF)\n", + " features = {}\n", + " features[\"n_bboxes\"] = len(df)\n", + "\n", + " # block density wasn't a very useful feature, maybe rework?\n", + " # Calculate the bounding box density of the area of the page with text\n", + " # x_width = df[\"bottom_right_x_pdf\"].max() - df[\"top_left_x_pdf\"].min()\n", + " # y_height = df[\"bottom_right_y_pdf\"].max() - df[\"top_left_y_pdf\"].min()\n", + " # text_area = x_width * y_height\n", + " # features[\"block_density\"] = features[\"n_bboxes\"] / text_area\n", + "\n", + " # Calculate average y-distance between bounding boxes for a given document\n", + " df = df.sort_values(by=[\"top_left_y_pdf\", \"top_left_x_pdf\"])\n", + " y_diffs = df[\"top_left_y_pdf\"].diff().dropna()\n", + " 
features[\"avg_y_distance\"] = y_diffs.mean()\n", + " features[\"std_y_distance\"] = y_diffs.std()\n", + "\n", + " # Calculate x-distance to assess horizontal alignment\n", + " x_diffs = df.groupby(\"top_left_y_pdf\")[\"top_left_x_pdf\"].apply(lambda x: x.diff().dropna())\n", + " features[\"avg_x_distance\"] = x_diffs.mean()\n", + " features[\"std_x_distance\"] = x_diffs.std()\n", + "\n", + " # Define a small threshold to group bounding boxes that are on the same line\n", + " y_threshold = 0.1\n", + " df.loc[:, \"line_group\"] = (df[\"top_left_y_pdf\"].diff().fillna(0).abs() > y_threshold).cumsum()\n", + " boxes_per_line = df.groupby(\"line_group\").size()\n", + " features[\"median_boxes_per_line\"] = boxes_per_line.median()\n", + " return pd.Series(features)" + ] + }, + { + "cell_type": "markdown", + "id": "44f87fd0-82ad-4564-8476-c0ddd78e1527", + "metadata": {}, + "source": [ + "### Create training/test sets" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "f71e2dfc-552d-49e7-b23d-267c2158efe2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "X = ex21_layout_classifier_training_dataset.sort_values(by=[\"id\"]).apply(calculate_features, axis=1)\n", + "y = np.where(ex21_layout_labels.sort_values(by=[\"filename\"])[\"layout\"] == \"Paragraph\", 1, 0)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)" + ] + }, + { + "cell_type": "markdown", + "id": "de130cf4-cd52-4dde-8582-145566a0b1f3", + "metadata": {}, + "source": [ + "### Create mlflow model to wrap classifier" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "id": "08bf5f11-af80-4c65-a005-2a2de49c30b5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import mlflow\n", + "\n", + "\n", + "class Ex21LayoutClassifier(mlflow.pyfunc.PythonModel):\n", + " \"\"\"Wrap sklearn classifier in mlflow pyfunc 
model.\"\"\"\n", + "\n", + " def load_context(self, context):\n", + " \"\"\"Load sklearn model.\"\"\"\n", + " self.model = mlflow.sklearn.load_model(context.artifacts[\"layout_classifier\"])\n", + "\n", + " def predict(self, context, model_input: pd.DataFrame):\n", + " \"\"\"Create feature matrix from inference dataset and use trained model for prediction.\"\"\"\n", + " features_df = model_input.apply(calculate_features, axis=1)\n", + " return self.model.predict(features_df)" + ] + }, + { + "cell_type": "markdown", + "id": "c3a2c6a1-cdc1-4fd0-a1ca-a5d5cc34d139", + "metadata": {}, + "source": [ + "### Train and log model" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "55d2194e-82a8-4d1e-8318-a8c893dc29de", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/10/08 16:10:39 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.\n", + "2024/10/08 16:10:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.\n", + "2024/10/08 16:10:40 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", + "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", + "\n", + "Increase the number of iterations (max_iter) or scale the data as shown in:\n", + " https://scikit-learn.org/stable/modules/preprocessing.html\n", + "Please also refer to the documentation for alternative solver options:\n", + " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", + " n_iter_i = _check_optimize_result(\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. 
See `Handling Integers With Missing Values `_ for more details.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "dd07e390d13f4f6692ae96288ffb1dbb", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/5 [00:00 Date: Tue, 8 Oct 2024 17:53:53 -0400 Subject: [PATCH 101/161] Pull in model updates --- src/mozilla_sec_eia/models/sec10k/__init__.py | 4 + src/mozilla_sec_eia/models/sec10k/entities.py | 11 ++ .../models/sec10k/ex_21/__init__.py | 41 ++++++- .../notebooks/exhibit21_extractor.ipynb | 104 +++++++++++------- .../validation_data/ex21_labels.csv | 48 +++++--- 5 files changed, 152 insertions(+), 56 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 79e181d..2ecf3c2 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -109,6 +109,10 @@ mlflow_interface=mlflow_interface_resource, uri="runs:/d603f8e219da4fd39f3c2f8d7d3bcb40/exhibit21_extractor", ), + "ex21_classifier_io_manager": MlflowPyfuncModelIOManager( + mlflow_interface=mlflow_interface_resource, + uri="runs:/08802dbf347c4cd5b66751c11328a06f/exhibit21_layout_classifier", + ), "pandas_parquet_io_manager": PandasParquetIOManager( base_path=UPath("gs://sec10k-outputs/v2") ), diff --git a/src/mozilla_sec_eia/models/sec10k/entities.py b/src/mozilla_sec_eia/models/sec10k/entities.py index b0f6869..2ee5b23 100644 --- a/src/mozilla_sec_eia/models/sec10k/entities.py +++ b/src/mozilla_sec_eia/models/sec10k/entities.py @@ -53,6 +53,17 @@ class Sec10kExtractionMetadata(pa.DataFrameModel): ) +class Ex21Layout(pa.DataFrameModel): + """Define table structure for ex21 layout classification.""" + + filename: Index[str] = pa.Field(description="Name of extracted filing.") + paragraph: Series[bool] = pa.Field( + description="Indicates whether ex21 is formatted as a paragraph or not.", + 
coerce=True, + ) + + ex21_extract_type = pandera_schema_to_dagster_type(Ex21CompanyOwnership) basic_10k_extract_type = pandera_schema_to_dagster_type(Basic10kCompanyInfo) sec10k_extract_metadata_type = pandera_schema_to_dagster_type(Sec10kExtractionMetadata) +ex21_layout_type = pandera_schema_to_dagster_type(Ex21Layout) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 34f6c3a..9a70b74 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -15,8 +15,10 @@ from ..entities import ( Ex21CompanyOwnership, + Ex21Layout, Sec10kExtractionMetadata, ex21_extract_type, + ex21_layout_type, sec10k_extract_metadata_type, ) from ..extract import chunk_filings, sec10k_filing_metadata, year_quarter_partitions @@ -60,6 +62,28 @@ def extract_filing_chunk( return metadata, extracted +@op( + out={"layout": Out(dagster_type=ex21_layout_type)}, + ins={ + "exhibit21_layout_classifier": In( + input_manager_key="ex21_classifier_io_manager" + ) + }, +) +def classify_chunk_layouts( + parsed_chunk: tuple[pd.DataFrame, pd.DataFrame], + exhibit21_layout_classifier: PyFuncModel, +) -> pd.DataFrame: + """Classify each filing in a parsed chunk as paragraph-formatted or not.""" + _, inference_dataset = parsed_chunk + return pd.DataFrame( + { + "filename": inference_dataset["id"], + "paragraph": exhibit21_layout_classifier.predict(inference_dataset), + } + ).set_index("filename") + + @op( out={ "metadata": Out( @@ -70,20 +94,27 @@ def extract_filing_chunk( io_manager_key="pandas_parquet_io_manager", dagster_type=ex21_extract_type, ), + "layout": Out( + io_manager_key="pandas_parquet_io_manager", + dagster_type=ex21_layout_type, + ), } ) def collect_extracted_chunks( metadata_dfs: list[pd.DataFrame], extracted_dfs: list[pd.DataFrame], -) -> tuple[pd.DataFrame, pd.DataFrame]: + layout_dfs: list[pd.DataFrame], +) -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]: """Collect chunks
of extracted filings.""" metadata_dfs = [df for df in metadata_dfs if not df.empty] extracted_dfs = [df for df in extracted_dfs if not df.empty] metadata_df = pd.concat(metadata_dfs) extracted_df = pd.concat(extracted_dfs) + layout_df = pd.concat(layout_dfs) return ( Sec10kExtractionMetadata.validate(metadata_df), Ex21CompanyOwnership.validate(extracted_df), + Ex21Layout.validate(layout_df), ) @@ -106,6 +137,7 @@ def create_dataset( "ex21_company_ownership_info": AssetOut( io_manager_key="pandas_parquet_io_manager" ), + "ex21_layout": AssetOut(io_manager_key="pandas_parquet_io_manager"), }, partitions_def=year_quarter_partitions, ) @@ -115,12 +147,11 @@ def ex21_extract( """Extract ownership info from exhibit 21 docs.""" filing_chunks = chunk_filings(sec10k_filing_metadata) parsed_chunks = filing_chunks.map(create_dataset) + layout_chunks = parsed_chunks.map(classify_chunk_layouts) metadata_chunks, extracted_chunks = parsed_chunks.map(extract_filing_chunk) - metadata, extracted = collect_extracted_chunks( - metadata_chunks.collect(), extracted_chunks.collect() + return collect_extracted_chunks( + metadata_chunks.collect(), extracted_chunks.collect(), layout_chunks.collect() ) - return metadata, extracted - production_assets = [sec10k_filing_metadata, ex21_extract] diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index 7fc14b5..e155387 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -38,46 +38,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "48f185de-95ef-4194-9245-93f8d603d2e6", "metadata": { "tags": [ "parameters" ] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace.
Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-06 15:23:43 -0400 - dagster - DEBUG - system - Loading 11 partitions...\n", - "2024-10-06 15:23:43 -0400 - dagster - DEBUG - system - Loading partition l from /home/zach/catalyst/workspace/storage/ex21_training_data/l using PickledObjectFilesystemIOManager...\n" - ] - }, - { - "ename": "NotADirectoryError", - "evalue": "[Errno 20] Not a directory: '/home/zach/catalyst/workspace/storage/ex21_training_data/l'", - "output_type": "error", - "traceback": [ - "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", - "\u001b[0;31mNotADirectoryError\u001b[0m Traceback (most recent call last)", - "Cell \u001b[0;32mIn[1], line 9\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmozilla_sec_eia\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msec10k\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m defs\n\u001b[1;32m 5\u001b[0m context \u001b[38;5;241m=\u001b[39m dagstermill\u001b[38;5;241m.\u001b[39mget_context(op_config\u001b[38;5;241m=\u001b[39m{\n\u001b[1;32m 6\u001b[0m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm_training_run\u001b[39m\u001b[38;5;124m\"\u001b[39m: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mlayoutlm-labeledv0.2\u001b[39m\u001b[38;5;124m\"\u001b[39m,\n\u001b[1;32m 7\u001b[0m })\n\u001b[0;32m----> 9\u001b[0m ex21_training_data \u001b[38;5;241m=\u001b[39m \u001b[43mdefs\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mex21_training_data\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mlabeledv0.2\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n\u001b[1;32m 11\u001b[0m ex21_failed_parsing_metadata \u001b[38;5;241m=\u001b[39m defs\u001b[38;5;241m.\u001b[39mload_asset_value(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mex21_failed_parsing_metadata\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 12\u001b[0m ex21_inference_dataset \u001b[38;5;241m=\u001b[39m defs\u001b[38;5;241m.\u001b[39mload_asset_value(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mex21_inference_dataset\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/definitions/definitions_class.py:519\u001b[0m, in \u001b[0;36mDefinitions.load_asset_value\u001b[0;34m(self, asset_key, python_type, instance, partition_key, metadata)\u001b[0m\n\u001b[1;32m 490\u001b[0m \u001b[38;5;129m@public\u001b[39m\n\u001b[1;32m 491\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_asset_value\u001b[39m(\n\u001b[1;32m 492\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 498\u001b[0m metadata: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, Any]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 499\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m \u001b[38;5;28mobject\u001b[39m:\n\u001b[1;32m 500\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"Load the contents of an asset as a Python object.\u001b[39;00m\n\u001b[1;32m 501\u001b[0m \n\u001b[1;32m 502\u001b[0m \u001b[38;5;124;03m Invokes `load_input` on the :py:class:`IOManager` associated with the asset.\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 517\u001b[0m \u001b[38;5;124;03m The contents of an asset as a Python object.\u001b[39;00m\n\u001b[1;32m 518\u001b[0m \u001b[38;5;124;03m 
\"\"\"\u001b[39;00m\n\u001b[0;32m--> 519\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget_repository_def\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 520\u001b[0m \u001b[43m \u001b[49m\u001b[43masset_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43masset_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 521\u001b[0m \u001b[43m \u001b[49m\u001b[43mpython_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 522\u001b[0m \u001b[43m \u001b[49m\u001b[43minstance\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minstance\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 523\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 524\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 525\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/definitions/repository_definition/repository_definition.py:350\u001b[0m, in \u001b[0;36mRepositoryDefinition.load_asset_value\u001b[0;34m(self, asset_key, python_type, instance, partition_key, metadata, resource_config)\u001b[0m\n\u001b[1;32m 346\u001b[0m normalized_assets_defs_by_key \u001b[38;5;241m=\u001b[39m {\n\u001b[1;32m 347\u001b[0m k: ad \u001b[38;5;28;01mfor\u001b[39;00m ad \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39masset_graph\u001b[38;5;241m.\u001b[39massets_defs \u001b[38;5;28;01mfor\u001b[39;00m k \u001b[38;5;129;01min\u001b[39;00m ad\u001b[38;5;241m.\u001b[39mkeys\n\u001b[1;32m 348\u001b[0m }\n\u001b[1;32m 
349\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m AssetValueLoader(normalized_assets_defs_by_key, instance\u001b[38;5;241m=\u001b[39minstance) \u001b[38;5;28;01mas\u001b[39;00m loader:\n\u001b[0;32m--> 350\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mloader\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_asset_value\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 351\u001b[0m \u001b[43m \u001b[49m\u001b[43masset_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 352\u001b[0m \u001b[43m \u001b[49m\u001b[43mpython_type\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpython_type\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 353\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 354\u001b[0m \u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 355\u001b[0m \u001b[43m \u001b[49m\u001b[43mresource_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mresource_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 356\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/decorator_utils.py:203\u001b[0m, in \u001b[0;36m_wrap_with_pre_call_fn..wrapped_with_pre_call_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 201\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m condition \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;129;01mor\u001b[39;00m condition(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m 202\u001b[0m pre_call_fn()\n\u001b[0;32m--> 203\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfn\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/asset_value_loader.py:169\u001b[0m, in \u001b[0;36mAssetValueLoader.load_asset_value\u001b[0;34m(self, asset_key, python_type, partition_key, input_definition_metadata, resource_config, metadata)\u001b[0m\n\u001b[1;32m 139\u001b[0m io_manager_config \u001b[38;5;241m=\u001b[39m get_mapped_resource_config(\n\u001b[1;32m 140\u001b[0m {io_manager_key: io_manager_def}, io_resource_config\n\u001b[1;32m 141\u001b[0m )\n\u001b[1;32m 143\u001b[0m input_context \u001b[38;5;241m=\u001b[39m build_input_context(\n\u001b[1;32m 144\u001b[0m name\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 145\u001b[0m asset_key\u001b[38;5;241m=\u001b[39masset_key,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 166\u001b[0m ),\n\u001b[1;32m 167\u001b[0m )\n\u001b[0;32m--> 169\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mio_manager\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_input\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_context\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:412\u001b[0m, in \u001b[0;36mUPathIOManager.load_input\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m 410\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 411\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 412\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_partitions\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n", - "File 
\u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:396\u001b[0m, in \u001b[0;36mUPathIOManager._load_partitions\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m 393\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_load_partitions\u001b[39m(\u001b[38;5;28mself\u001b[39m, context: InputContext) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dict[\u001b[38;5;28mstr\u001b[39m, Any]:\n\u001b[1;32m 394\u001b[0m \u001b[38;5;66;03m# load multiple partitions\u001b[39;00m\n\u001b[1;32m 395\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m inspect\u001b[38;5;241m.\u001b[39miscoroutinefunction(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mload_from_path):\n\u001b[0;32m--> 396\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_partitions\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 397\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 398\u001b[0m \u001b[38;5;66;03m# load_from_path returns a coroutine, so we need to await the results\u001b[39;00m\n\u001b[1;32m 399\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mload_partitions_async(context)\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:81\u001b[0m, in \u001b[0;36mUPathIOManager.load_partitions\u001b[0;34m(self, context)\u001b[0m\n\u001b[1;32m 78\u001b[0m objs \u001b[38;5;241m=\u001b[39m {}\n\u001b[1;32m 80\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m partition_key \u001b[38;5;129;01min\u001b[39;00m context\u001b[38;5;241m.\u001b[39masset_partition_keys:\n\u001b[0;32m---> 81\u001b[0m obj \u001b[38;5;241m=\u001b[39m 
\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_load_partition_from_path\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 82\u001b[0m \u001b[43m \u001b[49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 83\u001b[0m \u001b[43m \u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 84\u001b[0m \u001b[43m \u001b[49m\u001b[43mpaths\u001b[49m\u001b[43m[\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m]\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 85\u001b[0m \u001b[43m \u001b[49m\u001b[43mbackcompat_paths\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mget\u001b[49m\u001b[43m(\u001b[49m\u001b[43mpartition_key\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 86\u001b[0m \u001b[43m \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 87\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m obj \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m: \u001b[38;5;66;03m# in case some partitions were skipped\u001b[39;00m\n\u001b[1;32m 88\u001b[0m objs[partition_key] \u001b[38;5;241m=\u001b[39m obj\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/upath_io_manager.py:307\u001b[0m, in \u001b[0;36mUPathIOManager._load_partition_from_path\u001b[0;34m(self, context, partition_key, path, backcompat_path)\u001b[0m\n\u001b[1;32m 305\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 306\u001b[0m context\u001b[38;5;241m.\u001b[39mlog\u001b[38;5;241m.\u001b[39mdebug(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mget_loading_input_partition_log_message(path, partition_key))\n\u001b[0;32m--> 307\u001b[0m obj \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mload_from_path\u001b[49m\u001b[43m(\u001b[49m\u001b[43mcontext\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mcontext\u001b[49m\u001b[43m,\u001b[49m\u001b[43m 
\u001b[49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 308\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m obj\n\u001b[1;32m 309\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m \u001b[38;5;167;01mFileNotFoundError\u001b[39;00m \u001b[38;5;28;01mas\u001b[39;00m e:\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/dagster/_core/storage/fs_io_manager.py:283\u001b[0m, in \u001b[0;36mPickledObjectFilesystemIOManager.load_from_path\u001b[0;34m(self, context, path)\u001b[0m\n\u001b[1;32m 282\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mload_from_path\u001b[39m(\u001b[38;5;28mself\u001b[39m, context: InputContext, path: \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mUPath\u001b[39m\u001b[38;5;124m\"\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Any:\n\u001b[0;32m--> 283\u001b[0m \u001b[38;5;28;01mwith\u001b[39;00m \u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mrb\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m \u001b[38;5;28;01mas\u001b[39;00m file:\n\u001b[1;32m 284\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m pickle\u001b[38;5;241m.\u001b[39mload(file)\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/upath/implementations/local.py:134\u001b[0m, in \u001b[0;36mPosixUPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline, **fsspec_kwargs)\u001b[0m\n\u001b[1;32m 125\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28msuper\u001b[39m(LocalPath, \u001b[38;5;28mself\u001b[39m)\u001b[38;5;241m.\u001b[39mopen(\n\u001b[1;32m 126\u001b[0m mode\u001b[38;5;241m=\u001b[39mmode,\n\u001b[1;32m 127\u001b[0m buffering\u001b[38;5;241m=\u001b[39mbuffering,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 131\u001b[0m 
\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mfsspec_kwargs,\n\u001b[1;32m 132\u001b[0m )\n\u001b[1;32m 133\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 134\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mPosixPath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mopen\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmode\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mbuffering\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mencoding\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43merrors\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mnewline\u001b[49m\u001b[43m)\u001b[49m\n", - "File \u001b[0;32m~/mambaforge/envs/mozilla-sec-eia/lib/python3.11/pathlib.py:1044\u001b[0m, in \u001b[0;36mPath.open\u001b[0;34m(self, mode, buffering, encoding, errors, newline)\u001b[0m\n\u001b[1;32m 1042\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mb\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;129;01min\u001b[39;00m mode:\n\u001b[1;32m 1043\u001b[0m encoding \u001b[38;5;241m=\u001b[39m io\u001b[38;5;241m.\u001b[39mtext_encoding(encoding)\n\u001b[0;32m-> 1044\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m io\u001b[38;5;241m.\u001b[39mopen(\u001b[38;5;28mself\u001b[39m, mode, buffering, encoding, errors, newline)\n", - "\u001b[0;31mNotADirectoryError\u001b[0m: [Errno 20] Not a directory: '/home/zach/catalyst/workspace/storage/ex21_training_data/l'" - ] - } - ], + "outputs": [], "source": [ "import dagstermill\n", "\n", @@ -362,6 +330,50 @@ ")\n", "\n", "\n", + "def separate_entities_by_row(df):\n", + " \"\"\"Separate entities that span multiple rows and should be distinct.\n", + "\n", + " Sometimes LayoutLM groups multiple entities that span multiple rows\n", + " into one entity. 
This function makes an attempt to break these out\n", + " into multiple entities, by taking the average distance between rows\n", + " and separating a grouped entity if the distance between y values\n", + " is at least the mean of the floored row-to-row y spacings.\n", + " \"\"\"\n", + " threshold = 1.0\n", + " for entity in [\"subsidiary\", \"loc\", \"own_per\"]:\n", + " entity_df = df[df[\"pred\"] == entity]\n", + " entity_df[\"line_group\"] = entity_df[\"top_left_y\"].transform(\n", + " lambda y: (y // threshold).astype(int)\n", + " )\n", + " # Get the unique y-values for each line (group) per file\n", + " line_positions = (\n", + " entity_df.groupby([\"line_group\"])[\"top_left_y\"].mean().reset_index()\n", + " )\n", + " # Calculate the difference between adjacent y-values (i.e., distance between lines)\n", + " line_positions[\"y_diff\"] = line_positions[\"top_left_y\"].diff()\n", + " # Filter out NaN values and take the mean of the valid distances\n", + " y_diffs = line_positions[\"y_diff\"].dropna()\n", + " avg_y_diff = y_diffs.apply(np.floor).mean()\n", + " # if an I labeled entity is at least avg_y_diff from its previous box then make it a B entity\n", + " entity_df[\"prev_y\"] = entity_df[\"top_left_y\"].shift(1)\n", + " entity_df[\"prev_iob\"] = entity_df[\"iob_pred\"].shift(1)\n", + "\n", + " # If the current prediction is an I label\n", + " # and y distance exceeds the average y difference\n", + " # update to a B label and make it the start of a new entity\n", + " entity_df[\"iob_pred\"] = np.where(\n", + " (entity_df[\"iob_pred\"].str[0] == \"I\")\n", + " & ((entity_df[\"top_left_y\"] - entity_df[\"prev_y\"]) >= avg_y_diff),\n", + " \"B\" + entity_df[\"iob_pred\"].str[1:], # Update to 'B'\n", + " entity_df[\"iob_pred\"], # Keep as is\n", + " )\n", + "\n", + " # Drop temporary columns\n", + " entity_df = entity_df.drop(columns=[\"prev_y\", \"prev_iob\"])\n", + " df.update(entity_df, overwrite=True)\n", + "\n", + " return df\n", + "\n", "class
LayoutLMInferencePipeline(Pipeline):\n", " \"\"\"Pipeline for performing inference with fine-tuned LayoutLM.\"\"\"\n", "\n", @@ -485,8 +497,10 @@ " )\n", " df.update(first_in_group_df)\n", " # filter for just words that were labeled with non \"other\" entities\n", - " entities_df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n", - " entities_df = entities_df[entities_df[\"pred\"] != \"other\"]\n", + " entities_df = df[df[\"pred\"] != \"other\"]\n", + " # boxes that have the same group label but are on different rows\n", + " # should be updated to have two different B labels\n", + " entities_df = separate_entities_by_row(entities_df)\n", " # words are labeled with IOB format which stands for inside, outside, beginning\n", " # merge B and I entities to form one entity group\n", " # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n", @@ -541,6 +555,21 @@ "def _get_data(dataset):\n", " yield from dataset\n", "\n", + "def _fill_known_nulls(df):\n", + " \"\"\"Fill known nulls in location and own per column.\n", + "\n", + " Fill with known values from rows with same subsidiary.\n", + " \"\"\"\n", + " if \"own_per\" in df:\n", + " df[\"own_per\"] = df.groupby([\"id\", \"subsidiary\"])[\"own_per\"].transform(\n", + " lambda group: group.ffill()\n", + " )\n", + " if \"loc\" in df:\n", + " df[\"loc\"] = df.groupby([\"id\", \"subsidiary\"])[\"loc\"].transform(\n", + " lambda group: group.ffill()\n", + " )\n", + " return df\n", + "\n", "class Ex21Extractor(mlflow.pyfunc.PythonModel):\n", " \"\"\"Create an mlflow pyfunc model to perform full EX21 extraction.\"\"\"\n", " def load_context(self, context):\n", @@ -583,7 +612,8 @@ " all_output_df = pd.concat([all_output_df, output_df])\n", " all_output_df.columns.name = None\n", " all_output_df = clean_extracted_df(all_output_df)\n", - " all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]]\n", + " all_output_df = _fill_known_nulls(all_output_df)\n", + " 
all_output_df = all_output_df[[\"id\", \"subsidiary\", \"loc\", \"own_per\"]].drop_duplicates()\n", " all_output_df = all_output_df.reset_index(drop=True)\n", " return extraction_metadata, all_output_df\n", "\n", diff --git a/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv index 006f344..b5dc6aa 100644 --- a/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv +++ b/src/mozilla_sec_eia/package_data/validation_data/ex21_labels.csv @@ -515,7 +515,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 857501-0001065949-17-000087,First Surety Corporation,West Virginia, 857501-0001065949-17-000087,"Crystal Mountain Water, Inc.",Arkansas, 874501-0000874501-15-000013,Ambac Assurance Corporation,Wisconsin, -874501-0000874501-15-000013,Ambac Assurance UK Limited,United Kingdom Insurance Company, +874501-0000874501-15-000013,Ambac Assurance UK Limited,United Kingdom, 874501-0000874501-15-000013,Ambac Capital Corporation,Delaware, 874501-0000874501-15-000013,"Ambac Capital Funding, Inc.",Delaware, 874501-0000874501-15-000013,"Ambac Credit Products, LLC",Delaware, @@ -751,7 +751,7 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 100826-0001193125-09-042636,Illinois Power Securitization Limited Liability Company,Delaware, 100826-0001193125-09-042636,Illinois Power Special Purpose Trust,Delaware, 100826-0001193125-09-042636,Union Electric Company,Missouri, -100826-0001193125-09-042636,Fuelco LLC,Delaware,33.3 +100826-0001193125-09-042636,Fuelco LLC,Delaware,33.33 4904-0000004904-09-000040,"American Electric Power Company, Inc.",New York, 4904-0000004904-09-000040,American Electric Power Service Corporation,New York,100.0 4904-0000004904-09-000040,"AEP C&I Company, LLC",Delaware,100.0 @@ -1443,13 +1443,13 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 4127-0000004127-17-000033,"Skyworks Solutions Worldwide, Inc., Malaysia 
Branch",Malaysia, 4127-0000004127-17-000033,Advanced Analogic Technologies Incorporated,Delaware, 4127-0000004127-17-000033,"Advanced Analogic Technologies (China), Inc.",Peoples Republic of China, -4127-0000004127-17-000033,Axiom Microdevices Inc.,Delaware, -4127-0000004127-17-000033,ICWave LLC,Massachusetts, -4127-0000004127-17-000033,Isolink inc.,California, -4127-0000004127-17-000033,MEMS Solutions Inc.,Korea, -4127-0000004127-17-000033,Quantance Inc.,Delaware, -4127-0000004127-17-000033,SiGe Semiconductor Inc.,Delaware, -4127-0000004127-17-000033,SiGe Semiconductor (U.S.) Corp.,Delaware, +4127-0000004127-17-000033,"Axiom Microdevices, Inc.",Delaware, +4127-0000004127-17-000033,"ICWave, LLC",Massachusetts, +4127-0000004127-17-000033,"Isolink, Inc.",California, +4127-0000004127-17-000033,"MEMS Solutions, Inc.",Korea, +4127-0000004127-17-000033,"Quantance, Inc.",Delaware, +4127-0000004127-17-000033,"SiGe Semiconductor, Inc.",Delaware, +4127-0000004127-17-000033,"SiGe Semiconductor (U.S.), Corp.",Delaware, 4127-0000004127-17-000033,SiGe Semiconductor (Europe) Limited,United Kingdom, 4127-0000004127-17-000033,"Trans-Tech, Inc.",Maryland, 4962-0001193125-10-041232,American Express Company,(USA) New York, @@ -1627,8 +1627,8 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 11199-0001104659-06-016718,"Bemis Europe Holdings, S.A.",Belgium,100.0 11199-0001104659-06-016718,Bemis Monceau S.A.,Belgium,100.0 11199-0001104659-06-016718,Techy France S.A.R.L.,France,100.0 -11199-0001104659-06-016718,"Bemis Flexible Packaging de Mexico, S.A. de C.V.",Mexico,100.0 -11199-0001104659-06-016718,"Bemis Flexible Packaging Mexico Servicios, S.A. de C.V.",Mexico,100.0 +11199-0001104659-06-016718,"Bemis Flexible Packaging de Mexico, S.A. de C.V.",Mexico,87.0 +11199-0001104659-06-016718,"Bemis Flexible Packaging Mexico Servicios, S.A. 
de C.V.",Mexico,86.0 11199-0001104659-06-016718,Bemis France Holdings S.A.S.,France,100.0 11199-0001104659-06-016718,Bemis Packaging France S.A.S.,France,100.0 11199-0001104659-06-016718,Bemis Le Trait S.A.S.,France,100.0 @@ -1684,6 +1684,26 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 11199-0001104659-06-016718,"Electronic Printing Products, Inc.",Ohio,100.0 11199-0001104659-06-016718,Enterprise Software Inc.,Ohio,100.0 11199-0001104659-06-016718,"MACtac Engineered Products, Inc.",Ohio,100.0 +11199-0001104659-06-016718,MACtac Europe S.A.,Belgium,89.0 +11199-0001104659-06-016718,Bemis Coordination Center S.A.,Belgium,67.0 +11199-0001104659-06-016718,Bemis Polska Sp. z o.o.,Poland,100.0 +11199-0001104659-06-016718,MACtac Asia-Pacific Self-Adhesive Products Pte Ltd.,Singapore,100.0 +11199-0001104659-06-016718,MACtac Deutschland GmbH,Germany,100.0 +11199-0001104659-06-016718,MACtac France E.U.R.L.,France,100.0 +11199-0001104659-06-016718,Multi-Fix N.V.,Belgium,100.0 +11199-0001104659-06-016718,MACtac Scandinavia A.B.,Sweden,100.0 +11199-0001104659-06-016718,MACtac Canada Limited/Limitee,Canada,100.0 +11199-0001104659-06-016718,MACtac Europe S.A.,Belgium,11.0 +11199-0001104659-06-016718,MACtac A.G.,Switzerland,100.0 +11199-0001104659-06-016718,"MACtac Mexico, S.A. de C.V.",Mexico,51.0 +11199-0001104659-06-016718,"MACtac Mexico Servicios, S.A. 
de C.V.",Mexico,51.0 +11199-0001104659-06-016718,"Morgan Adhesives America do Sul, Ltda.",Brazil,100.0 +11199-0001104659-06-016718,Paramount Packaging Corporation,Delaware,100.0 +11199-0001104659-06-016718,Bemis Elsham Limited,United Kingdom,100.0 +11199-0001104659-06-016718,"Bemis Shelbyville, Inc.",Tennessee,100.0 +11199-0001104659-06-016718,"Bemis Longview, Inc.",Texas,100.0 +11199-0001104659-06-016718,"PPC Royalty, Inc.",Delaware,100.0 +11199-0001104659-06-016718,"Pervel Industries, Inc.",Delaware,100.0 29644-0001628280-16-019746,"Aerospace Filtration Systems, Inc.","Chesterfield, MO USA", 29644-0001628280-16-019746,ASHC LLC,"Minneapolis, MN USA", 29644-0001628280-16-019746,DLX Capital S.a.r.l.,"Luxembourg City, Luxembourg", @@ -1937,15 +1957,15 @@ Filename,Subsidiary,Location of Incorporation,Ownership Percentage 75829-0001206774-11-002167,Pall Austria Filter GesmbH,Austria, 75829-0001206774-11-002167,Pall (Canada) Limited,Canada, 75829-0001206774-11-002167,Pall Do Brasil,Brazil, -75829-0001206774-11-002167,Pall Europe Limited (a),England, +75829-0001206774-11-002167,Pall Europe Limited,England, 75829-0001206774-11-002167,Pall France S.A.S.,France, 75829-0001206774-11-002167,Pall Deutschland Beteiligungs GmbH,Germany, 75829-0001206774-11-002167,Pall Deutschland Holding GmbH & Co. KG Partnership (c),Germany, 75829-0001206774-11-002167,Pall Italia S.R.L.,Italy, 75829-0001206774-11-002167,Pall Manufacturing UK Limited,England, 75829-0001206774-11-002167,Gelman Ireland Ltd.,Ireland, -75829-0001206774-11-002167,Pall Netherlands B.V. (a),The Netherlands, -75829-0001206774-11-002167,PLLN C.V. Partnership (b),The Netherlands, +75829-0001206774-11-002167,Pall Netherlands B.V.,The Netherlands, +75829-0001206774-11-002167,PLLN C.V. 
Partnership,The Netherlands, 75829-0001206774-11-002167,Pall Norge AS,Norway, 75829-0001206774-11-002167,Pall Espana S.A.U.,Spain, 75829-0001206774-11-002167,Pall Norden AB,Sweden, From 4d29037e502f6b8e88635b17fd12e82aa44810ab Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 8 Oct 2024 18:28:57 -0400 Subject: [PATCH 102/161] Update classifier model --- src/mozilla_sec_eia/models/sec10k/__init__.py | 4 +- .../exhibit21_layout_classifier.ipynb | 127 +++++++++++++----- 2 files changed, 94 insertions(+), 37 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 2ecf3c2..a1bd6c3 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -107,11 +107,11 @@ "mlflow_interface": mlflow_interface_resource, "layoutlm_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource, - uri="runs:/d603f8e219da4fd39f3c2f8d7d3bcb40/exhibit21_extractor", + uri="runs:/582fcebbd4cf4d8b8a8f995406ddc560/exhibit21_extractor", ), "ex21_classifier_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource, - uri="runs:/08802dbf347c4cd5b66751c11328a06f/exhibit21_layout_classifier", + uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier", ), "pandas_parquet_io_manager": PandasParquetIOManager( base_path=UPath("gs://sec10k-outputs/v2") diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb index 584832c..8315fc1 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 1, "id": "b4963648-2aac-46a7-9778-8808c1e5eeb2", "metadata": { "tags": [ @@ -32,9 +32,9 @@ "output_type": "stream", 
"text": [ "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-08 13:55:05 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_labels using PickledObjectFilesystemIOManager...\n", + "2024-10-08 18:11:22 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_labels using PickledObjectFilesystemIOManager...\n", "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-08 13:55:05 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_classifier_training_dataset using PickledObjectFilesystemIOManager...\n" + "2024-10-08 18:11:22 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_layout_classifier_training_dataset using PickledObjectFilesystemIOManager...\n" ] } ], @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 2, "id": "ee4ed368-7d01-4cb8-952f-f7941900d669", "metadata": { "tags": [] @@ -109,7 +109,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 3, "id": "f71e2dfc-552d-49e7-b23d-267c2158efe2", "metadata": { "tags": [] @@ -118,8 +118,10 @@ "source": [ "import numpy as np\n", "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", "\n", "X = ex21_layout_classifier_training_dataset.sort_values(by=[\"id\"]).apply(calculate_features, axis=1)\n", + "X = 
StandardScaler().fit_transform(X)\n", "y = np.where(ex21_layout_labels.sort_values(by=[\"filename\"])[\"layout\"] == \"Paragraph\", 1, 0)\n", "X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=16)" ] @@ -134,7 +136,7 @@ }, { "cell_type": "code", - "execution_count": 61, + "execution_count": 4, "id": "08bf5f11-af80-4c65-a005-2a2de49c30b5", "metadata": { "tags": [] @@ -154,7 +156,8 @@ " def predict(self, context, model_input: pd.DataFrame):\n", " \"\"\"Create feature matrix from inference dataset and use trained model for prediction.\"\"\"\n", " features_df = model_input.apply(calculate_features, axis=1)\n", - " return self.model.predict(features_df)" + " scaled_features = StandardScaler().fit_transform(features_df)\n", + " return self.model.predict(scaled_features)" ] }, { @@ -167,7 +170,7 @@ }, { "cell_type": "code", - "execution_count": 66, + "execution_count": 6, "id": "55d2194e-82a8-4d1e-8318-a8c893dc29de", "metadata": { "tags": [] @@ -177,17 +180,9 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024/10/08 16:10:39 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.\n", - "2024/10/08 16:10:39 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.\n", - "2024/10/08 16:10:40 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n", - "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/sklearn/linear_model/_logistic.py:469: ConvergenceWarning: lbfgs failed to converge (status=1):\n", - "STOP: TOTAL NO. 
of ITERATIONS REACHED LIMIT.\n", - "\n", - "Increase the number of iterations (max_iter) or scale the data as shown in:\n", - " https://scikit-learn.org/stable/modules/preprocessing.html\n", - "Please also refer to the documentation for alternative solver options:\n", - " https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression\n", - " n_iter_i = _check_optimize_result(\n", + "2024/10/08 18:23:23 WARNING mlflow.utils.autologging_utils: MLflow sklearn autologging is known to be compatible with 0.24.1 <= scikit-learn <= 1.5.1, but the installed version is 1.5.2. If you encounter errors during autologging, try upgrading / downgrading scikit-learn to a compatible version, or try upgrading MLflow.\n", + "2024/10/08 18:23:23 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.\n", + "2024/10/08 18:23:24 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n", "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. 
See `Handling Integers With Missing Values `_ for more details.\n", " warnings.warn(\n" ] @@ -195,7 +190,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "dd07e390d13f4f6692ae96288ffb1dbb", + "model_id": "0fd59c52d9cd47548fa31d3edf451082", "version_major": 2, "version_minor": 0 }, @@ -210,17 +205,73 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024/10/08 16:11:30 INFO mlflow.tracking._tracking_service.client: 🏃 View run languid-shrimp-450 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15/runs/08802dbf347c4cd5b66751c11328a06f.\n", - "2024/10/08 16:11:30 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15.\n", - "2024/10/08 16:11:30 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n", - "2024/10/08 16:11:31 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n" + "2024/10/08 18:24:13 INFO mlflow.tracking._tracking_service.client: 🏃 View run LogisticRegression at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15/runs/5f5d526e1e16442983679d6035599df2.\n", + "2024/10/08 18:24:13 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/15.\n", + "2024/10/08 18:24:14 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n", + "2024/10/08 18:24:14 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n", + "2024/10/08 18:24:15 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n", + "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/mlflow/types/utils.py:407: UserWarning: Hint: Inferred schema contains integer column(s). Integer columns in Python cannot represent missing values. 
If your input data contains missing values at inference time, it will be encoded as floats and will cause a schema enforcement error. The best way to avoid this problem is to infer the model schema based on a realistic data sample (training dataset) that includes missing values. Alternatively, you can declare integer columns as doubles (float64) whenever these columns may have missing values. See `Handling Integers With Missing Values `_ for more details.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1c936d95469b42cdaec2a510caac0e97", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/5 [00:00`_ for more details.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "beff1b6195844fdfa6d30048f4164f17", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/5 [00:00 Date: Wed, 9 Oct 2024 13:04:20 -0400 Subject: [PATCH 103/161] Fix set on copy pandas issue --- src/mozilla_sec_eia/models/sec10k/__init__.py | 2 +- .../notebooks/exhibit21_extractor.ipynb | 72 ++++++++++--------- 2 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index a1bd6c3..985d739 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -111,7 +111,7 @@ ), "ex21_classifier_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource, - uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier", + uri="runs:/1d84be1656864f82b7b990a64fd113e3/exhibit21_layout_classifier", ), "pandas_parquet_io_manager": PandasParquetIOManager( base_path=UPath("gs://sec10k-outputs/v2") diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb 
b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index e155387..1b77029 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -330,7 +330,7 @@ ")\n", "\n", "\n", - "def separate_entities_by_row(df):\n", + "def separate_entities_by_row(entity_df):\n", " \"\"\"Separate entities that span multiple rows and should be distinct.\n", "\n", " Sometimes LayoutLM groups multiple entities that span multiple rows\n", @@ -340,39 +340,34 @@ " is greater than the third quantile of y value spacing.\n", " \"\"\"\n", " threshold = 1.0\n", - " for entity in [\"subsidiary\", \"loc\", \"own_per\"]:\n", - " entity_df = df[df[\"pred\"] == entity]\n", - " entity_df[\"line_group\"] = entity_df[\"top_left_y\"].transform(\n", - " lambda y: (y // threshold).astype(int)\n", - " )\n", - " # Get the unique y-values for each line (group) per file\n", - " line_positions = (\n", - " entity_df.groupby([\"line_group\"])[\"top_left_y\"].mean().reset_index()\n", - " )\n", - " # Calculate the difference between adjacent y-values (i.e., distance between lines)\n", - " line_positions[\"y_diff\"] = line_positions[\"top_left_y\"].diff()\n", - " # Filter out NaN values and take the mean of the valid distances\n", - " y_diffs = line_positions[\"y_diff\"].dropna()\n", - " avg_y_diff = y_diffs.apply(np.floor).mean()\n", - " # if an I labeled entity is more than avg_y_diff from it's previoius box then make it a B entity\n", - " entity_df[\"prev_y\"] = entity_df[\"top_left_y\"].shift(1)\n", - " entity_df[\"prev_iob\"] = entity_df[\"iob_pred\"].shift(1)\n", - "\n", - " # If the current prediction is an I label\n", - " # and y distance exceeds the average y difference\n", - " # update to a B label and make it the start of a new entity\n", - " entity_df[\"iob_pred\"] = np.where(\n", - " (entity_df[\"iob_pred\"].str[0] == \"I\")\n", - " & ((entity_df[\"top_left_y\"] - 
entity_df[\"prev_y\"]) >= avg_y_diff),\n", - " \"B\" + entity_df[\"iob_pred\"].str[1:], # Update to 'B'\n", - " entity_df[\"iob_pred\"], # Keep as is\n", - " )\n", - "\n", - " # Drop temporary columns\n", - " entity_df = entity_df.drop(columns=[\"prev_y\", \"prev_iob\"])\n", - " df.update(entity_df, overwrite=True)\n", + " entity_df.loc[:, \"line_group\"] = entity_df.loc[:, \"top_left_y\"].transform(\n", + " lambda y: (y // threshold).astype(int)\n", + " )\n", + " # Get the unique y-values for each line (group) per file\n", + " line_positions = (\n", + " entity_df.groupby([\"line_group\"])[\"top_left_y\"].mean().reset_index()\n", + " )\n", + " # Calculate the difference between adjacent y-values (i.e., distance between lines)\n", + " line_positions.loc[:, \"y_diff\"] = line_positions.loc[:, \"top_left_y\"].diff()\n", + " # Filter out NaN values and take the mean of the valid distances\n", + " y_diffs = line_positions[\"y_diff\"].dropna()\n", + " avg_y_diff = y_diffs.apply(np.floor).mean()\n", + " # if an I labeled entity is more than avg_y_diff from it's previoius box then make it a B entity\n", + " entity_df.loc[:, \"prev_y\"] = entity_df.loc[:, \"top_left_y\"].shift(1)\n", + " entity_df.loc[:, \"prev_iob\"] = entity_df.loc[:, \"iob_pred\"].shift(1)\n", + "\n", + " # If the current prediction is an I label\n", + " # and y distance exceeds the average y difference\n", + " # update to a B label and make it the start of a new entity\n", + " entity_df.loc[:, \"iob_pred\"] = np.where(\n", + " (entity_df[\"iob_pred\"].str[0] == \"I\")\n", + " & ((entity_df[\"top_left_y\"] - entity_df[\"prev_y\"]) >= avg_y_diff),\n", + " \"B\" + entity_df[\"iob_pred\"].str[1:], # Update to 'B'\n", + " entity_df[\"iob_pred\"], # Keep as is\n", + " )\n", "\n", - " return df\n", + " # Drop temporary columns\n", + " return entity_df.drop(columns=[\"prev_y\", \"prev_iob\"])\n", "\n", "class LayoutLMInferencePipeline(Pipeline):\n", " \"\"\"Pipeline for performing inference with fine-tuned 
LayoutLM.\"\"\"\n", @@ -500,8 +495,7 @@ " entities_df = df[df[\"pred\"] != \"other\"]\n", " # boxes that have the same group label but are on different rows\n", " # should be updated to have two different B labels\n", - " entities_df = separate_entities_by_row(entities_df)\n", - " # words are labeled with IOB format which stands for inside, outside, beginning\n", + " entities_df = entities_df.groupby([\"pred\"], as_index=False).apply(separate_entities_by_row).reset_index(level=0).sort_index()\n", " # merge B and I entities to form one entity group\n", " # (i.e. \"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n", " entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n", @@ -781,6 +775,14 @@ " mlflow.log_table(extracted, \"extracted_data.json\")\n", " mlflow.log_table(metadata, \"extraction_metadata.json\")" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "d11e2a7b-ec74-4930-b331-144a8584c72f", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { From 52e358063073d8d4e4d2dba712ceec0040b2c515 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 9 Oct 2024 14:08:36 -0400 Subject: [PATCH 104/161] Fix model uri's --- src/mozilla_sec_eia/models/sec10k/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 985d739..94c643b 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -107,11 +107,11 @@ "mlflow_interface": mlflow_interface_resource, "layoutlm_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource, - uri="runs:/582fcebbd4cf4d8b8a8f995406ddc560/exhibit21_extractor", + uri="runs:/1d84be1656864f82b7b990a64fd113e3/exhibit21_extractor", ), "ex21_classifier_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource, - 
uri="runs:/1d84be1656864f82b7b990a64fd113e3/exhibit21_layout_classifier", + uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier", ), "pandas_parquet_io_manager": PandasParquetIOManager( base_path=UPath("gs://sec10k-outputs/v2") From b709053f885b68cefd10d4d6bfe5108c15060640 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 9 Oct 2024 16:36:22 -0400 Subject: [PATCH 105/161] Fix indices in extraction model --- src/mozilla_sec_eia/models/sec10k/__init__.py | 2 +- .../notebooks/exhibit21_extractor.ipynb | 464 +++++++++++++++++- 2 files changed, 454 insertions(+), 12 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 94c643b..7d502c1 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -107,7 +107,7 @@ "mlflow_interface": mlflow_interface_resource, "layoutlm_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource, - uri="runs:/1d84be1656864f82b7b990a64fd113e3/exhibit21_extractor", + uri="runs:/426dd1b67cbd4677b6fa22b6b9d9173a/exhibit21_extractor", ), "ex21_classifier_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource, diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index 1b77029..d136a25 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -38,14 +38,29 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "48f185de-95ef-4194-9245-93f8d603d2e6", "metadata": { "tags": [ "parameters" ] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. 
Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2024-10-09 15:25:02 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n", + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2024-10-09 15:25:03 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_failed_parsing_metadata using PickledObjectFilesystemIOManager...\n", + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2024-10-09 15:25:03 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_inference_dataset using PickledObjectFilesystemIOManager...\n", + "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. 
If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", + "2024-10-09 15:25:04 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_validation_set using PickledObjectFilesystemIOManager...\n" + ] + } + ], "source": [ "import dagstermill\n", "\n", @@ -85,7 +100,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "9372b908-d9b9-4d18-a5bf-d332648b3e49", "metadata": { "tags": [] @@ -156,7 +171,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "71d205b2-e6ea-4ad0-982c-22e762269119", "metadata": { "tags": [] @@ -311,7 +326,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 8, "id": "42c8e920-d671-40c2-b5db-c43611a33897", "metadata": { "tags": [] @@ -495,7 +510,9 @@ " entities_df = df[df[\"pred\"] != \"other\"]\n", " # boxes that have the same group label but are on different rows\n", " # should be updated to have two different B labels\n", - " entities_df = entities_df.groupby([\"pred\"], as_index=False).apply(separate_entities_by_row).reset_index(level=0).sort_index()\n", + "\n", + " entities_df = entities_df.groupby(\"pred\").apply(separate_entities_by_row, include_groups=False)\n", + " entities_df = entities_df.reset_index(\"pred\").sort_index()\n", " # merge B and I entities to form one entity group\n", " # (i.e. 
\"B-Subsidiary\" and \"I-Subsidiary\" become just \"subsidiary\"), assign a group ID\n", " entities_df[\"group\"] = (entities_df[\"iob_pred\"].str.startswith(\"B-\")).cumsum()\n", @@ -525,12 +542,71 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 9, "id": "4d802e00-1ca4-40b3-b15b-561711a9db70", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3145df6c447a4f958ac86b7a84c9f52d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/1 [00:00, skipping schema inference\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bae0a0244e4141449874b48f750bd443", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/17 [00:00`_ for more details.\n", + " warnings.warn(\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "21da151ecd6d4a9187bf77b40c7a8aed", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Downloading artifacts: 0%| | 0/17 [00:00 Date: Wed, 9 Oct 2024 17:43:47 -0400 Subject: [PATCH 106/161] Fix typo --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 9a70b74..4e2029c 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -110,7 +110,7 @@ def collect_extracted_chunks( extracted_dfs = [df for df in extracted_dfs if not df.empty] metadata_df = pd.concat(metadata_dfs) extracted_df = pd.concat(extracted_dfs) - layout_df = (pd.concat(layout_dfs),) + layout_df = pd.concat(layout_dfs) return ( Sec10kExtractionMetadata.validate(metadata_df), Ex21CompanyOwnership.validate(extracted_df), From 
e6b29ffcfae7c037b5554927d087a42b0bfc80a7 Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 10 Oct 2024 16:32:33 -0400 Subject: [PATCH 107/161] Add asset factory for loading models --- .../library/mlflow/__init__.py | 19 +++++++++ .../library/mlflow/mlflow_io_managers.py | 24 +++++------ src/mozilla_sec_eia/models/sec10k/__init__.py | 15 +++---- .../models/sec10k/ex_21/__init__.py | 41 +++++++++++++------ 4 files changed, 64 insertions(+), 35 deletions(-) diff --git a/src/mozilla_sec_eia/library/mlflow/__init__.py b/src/mozilla_sec_eia/library/mlflow/__init__.py index 17a765d..9376d33 100644 --- a/src/mozilla_sec_eia/library/mlflow/__init__.py +++ b/src/mozilla_sec_eia/library/mlflow/__init__.py @@ -1,5 +1,8 @@ """Implement tooling to interface with mlflow experiment tracking.""" +from dagster import Config, asset +from pydantic import create_model + from .mlflow_io_managers import ( MlflowBaseIOManager, MlflowMetricsIOManager, @@ -13,6 +16,22 @@ ) +def pyfunc_model_asset_factory(name: str, mlflow_run_uri: str): + """Create asset for loading a model logged to mlflow.""" + PyfuncConfig = create_model( # NOQA: N806 + f"PyfuncConfig{name}", mlflow_run_uri=(str, mlflow_run_uri), __base__=Config + ) + + @asset( + name=name, + io_manager_key="pyfunc_model_io_manager", + ) + def _model_asset(config: PyfuncConfig): + return config.mlflow_run_uri + + return _model_asset + + def get_mlflow_io_manager( key: str, mlflow_interface: MlflowInterface | None = None, diff --git a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py index abc2d1c..fffb424 100644 --- a/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py +++ b/src/mozilla_sec_eia/library/mlflow/mlflow_io_managers.py @@ -31,25 +31,25 @@ class MlflowPyfuncModelIOManager(MlflowBaseIOManager): uri: str | None = None - def handle_output(self, context, obj): - """Outputs not implemented.""" - raise NotImplementedError("Logging models not supported by io 
manager.") + def handle_output(self, context: OutputContext, model_uri: str): + """Takes model uri as a string and caches the model locally for future use.""" + cache_path = self.mlflow_interface.dagster_home_path / "model_cache" + cache_path.mkdir(exist_ok=True, parents=True) + + logger.info(f"Caching {context.name} model at {cache_path}") + mlflow.pyfunc.load_model( + model_uri, + dst_path=cache_path, + ) def load_input(self, context: InputContext): """Load pyfunc model with mlflow server.""" cache_path = ( self.mlflow_interface.dagster_home_path / "model_cache" / context.name ) - cache_path.mkdir(exist_ok=True, parents=True) - - model_uri = self.uri - if model_uri is None: - model_uri = f"models:/{context.name}" + logger.info(f"Loading {context.name} model from {cache_path}") - return mlflow.pyfunc.load_model( - model_uri, - dst_path=cache_path, - ) + return mlflow.pyfunc.load_model(cache_path) class MlflowPandasArtifactIOManager(MlflowBaseIOManager): diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 7d502c1..b01d58f 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -52,7 +52,7 @@ exhibit21_extractor = define_dagstermill_asset( - name="exhibit21_extractor", + name="train_exhibit21_extractor", notebook_path=file_relative_path(__file__, "notebooks/exhibit21_extractor.ipynb"), config_schema=ex_21.data.Ex21TrainConfig.to_config_schema(), ins={ @@ -71,7 +71,7 @@ exhibit21_layout_classifier = define_dagstermill_asset( - name="exhibit21_layout_classifier", + name="train_exhibit21_layout_classifier", notebook_path=file_relative_path( __file__, "notebooks/exhibit21_layout_classifier.ipynb" ), @@ -105,17 +105,12 @@ resources={ "cloud_interface": cloud_interface_resource, "mlflow_interface": mlflow_interface_resource, - "layoutlm_io_manager": MlflowPyfuncModelIOManager( - mlflow_interface=mlflow_interface_resource, - 
uri="runs:/426dd1b67cbd4677b6fa22b6b9d9173a/exhibit21_extractor", - ), - "ex21_classifier_io_manager": MlflowPyfuncModelIOManager( - mlflow_interface=mlflow_interface_resource, - uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier", - ), "pandas_parquet_io_manager": PandasParquetIOManager( base_path=UPath("gs://sec10k-outputs/v2") ), + "pyfunc_model_io_manager": MlflowPyfuncModelIOManager( + mlflow_interface=mlflow_interface_resource + ), "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), } | mlflow_train_test_io_managers, diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 4e2029c..5a9abc9 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -6,13 +6,14 @@ import pandas as pd from dagster import ( AssetOut, - In, Out, graph_multi_asset, op, ) from mlflow.pyfunc import PyFuncModel +from mozilla_sec_eia.library.mlflow import pyfunc_model_asset_factory + from ..entities import ( Ex21CompanyOwnership, Ex21Layout, @@ -33,7 +34,6 @@ "metadata": Out(dagster_type=sec10k_extract_metadata_type), "extracted": Out(dagster_type=ex21_extract_type), }, - ins={"exhibit21_extractor": In(input_manager_key="layoutlm_io_manager")}, tags={"model": "exhibit21_extractor"}, ) def extract_filing_chunk( @@ -62,14 +62,7 @@ def extract_filing_chunk( return metadata, extracted -@op( - out={"layout": Out(dagster_type=ex21_layout_type)}, - ins={ - "exhibit21_layout_classifier": In( - input_manager_key="ex21_classifier_io_manager" - ) - }, -) +@op(out={"layout": Out(dagster_type=ex21_layout_type)}) def classify_chunk_layouts( parsed_chunk: tuple[pd.DataFrame, pd.DataFrame], exhibit21_layout_classifier: PyFuncModel, @@ -129,6 +122,17 @@ def create_dataset( ) +exhibit21_extractor = pyfunc_model_asset_factory( + name="exhibit21_extractor", + 
mlflow_run_uri="runs:/426dd1b67cbd4677b6fa22b6b9d9173a/exhibit21_extractor", +) + +exhibit21_layout_classifier = pyfunc_model_asset_factory( + name="exhibit21_layout_classifier", + mlflow_run_uri="runs:/cbdd906766b2427c93e9c957be6ea9c8/exhibit21_layout_classifier", +) + + @graph_multi_asset( outs={ "ex21_extraction_metadata": AssetOut( @@ -143,15 +147,26 @@ def create_dataset( ) def ex21_extract( sec10k_filing_metadata: pd.DataFrame, + exhibit21_extractor: PyFuncModel, + exhibit21_layout_classifier: PyFuncModel, ): """Extract ownership info from exhibit 21 docs.""" filing_chunks = chunk_filings(sec10k_filing_metadata) parsed_chunks = filing_chunks.map(create_dataset) - layout_chunks = parsed_chunks.map(classify_chunk_layouts) - metadata_chunks, extracted_chunks = parsed_chunks.map(extract_filing_chunk) + layout_chunks = parsed_chunks.map( + lambda chunk: classify_chunk_layouts(chunk, exhibit21_layout_classifier) + ) + metadata_chunks, extracted_chunks = parsed_chunks.map( + lambda chunk: extract_filing_chunk(chunk, exhibit21_extractor) + ) return collect_extracted_chunks( metadata_chunks.collect(), extracted_chunks.collect(), layout_chunks.collect() ) -production_assets = [sec10k_filing_metadata, ex21_extract] +production_assets = [ + sec10k_filing_metadata, + ex21_extract, + exhibit21_extractor, + exhibit21_layout_classifier, +] From 3d11777b2c7c6a991d45b6b4cca94f1407a15b47 Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 10 Oct 2024 17:50:18 -0400 Subject: [PATCH 108/161] Catch layout classification NaN exception --- .../models/sec10k/ex_21/__init__.py | 22 ++++++++++++++----- 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 5a9abc9..2e6a13b 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -69,12 +69,22 @@ def classify_chunk_layouts( ) -> pd.DataFrame: """Extract 
a set of filings and return results.""" _, inference_dataset = parsed_chunk - return pd.DataFrame( - { - "filename": inference_dataset["id"], - "paragraph": exhibit21_layout_classifier.predict(inference_dataset), - } - ).set_index("filename") + try: + df = pd.DataFrame( + { + "filename": inference_dataset["id"], + "paragraph": exhibit21_layout_classifier.predict(inference_dataset), + } + ).set_index("filename") + except ValueError: + df = pd.DataFrame( + { + "filename": inference_dataset["id"], + "paragraph": [None] * len(inference_dataset), + } + ).set_index("filename") + + return df @op( From df5fe0d8db54162cb054aee94e80be0e85c34261 Mon Sep 17 00:00:00 2001 From: zschira Date: Thu, 10 Oct 2024 18:27:17 -0400 Subject: [PATCH 109/161] Use GCS pickle io-manager --- pyproject.toml | 1 + src/mozilla_sec_eia/models/sec10k/__init__.py | 6 ++++++ 2 files changed, 7 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 8a66d85..9e2e66e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,6 +24,7 @@ dependencies = [ "dagster-mlflow", "dagster-webserver", "dagster-pandera", + "dagster-gcp", "dagstermill", "datasets>=2.1,<3", # Access Hugging Face datasets "seqeval>=1.2,<2", # Sequence labeling evaluation diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index b01d58f..cd91cc4 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -9,6 +9,7 @@ load_assets_from_modules, load_assets_from_package_module, ) +from dagster_gcp.gcs import GCSPickleIOManager, GCSResource from dagstermill import ( ConfigurableLocalOutputNotebookIOManager, define_dagstermill_asset, @@ -112,6 +113,11 @@ mlflow_interface=mlflow_interface_resource ), "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), + "io_manager": GCSPickleIOManager( + gcs_bucket="sec10k-outputs", + gcs_prefix="dagster_storage", + 
gcs=GCSResource(project="catalyst-cooperative-mozilla"), + ), } | mlflow_train_test_io_managers, ) From d6c41a2891c9cd56cdb610c5c7e70a38d4ab906a Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 11 Oct 2024 11:26:13 -0400 Subject: [PATCH 110/161] Switch gcs pickle io manager to upath based --- .../library/generic_io_managers.py | 18 ++++++++++++++++++ src/mozilla_sec_eia/models/sec10k/__init__.py | 14 +++++++------- .../models/sec10k/ex_21/__init__.py | 8 +++++++- 3 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/mozilla_sec_eia/library/generic_io_managers.py b/src/mozilla_sec_eia/library/generic_io_managers.py index e85aa68..7d25198 100644 --- a/src/mozilla_sec_eia/library/generic_io_managers.py +++ b/src/mozilla_sec_eia/library/generic_io_managers.py @@ -1,5 +1,7 @@ """Implement useful generic io-managers.""" +import pickle + import pandas as pd from dagster import InputContext, OutputContext, UPathIOManager from upath import UPath @@ -19,3 +21,19 @@ def load_from_path(self, context: InputContext, path: UPath) -> pd.DataFrame: """Read parquet.""" with path.open("rb") as file: return pd.read_parquet(file) + + +class PickleUPathIOManager(UPathIOManager): + """Read and write pandas dataframes as parquet files on local or remote filesystem.""" + + extension: str = ".pickle" + + def dump_to_path(self, context: OutputContext, obj: pd.DataFrame, path: UPath): + """Write parquet.""" + with path.open("wb") as file: + pickle.dump(obj, file) + + def load_from_path(self, context: InputContext, path: UPath) -> pd.DataFrame: + """Read parquet.""" + with path.open("rb") as file: + return pickle.load(file) # noqa: S301 diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index cd91cc4..6a33b78 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -9,7 +9,6 @@ load_assets_from_modules, load_assets_from_package_module, ) -from dagster_gcp.gcs import 
GCSPickleIOManager, GCSResource from dagstermill import ( ConfigurableLocalOutputNotebookIOManager, define_dagstermill_asset, @@ -17,7 +16,10 @@ from upath import UPath from mozilla_sec_eia.library import model_jobs -from mozilla_sec_eia.library.generic_io_managers import PandasParquetIOManager +from mozilla_sec_eia.library.generic_io_managers import ( + PandasParquetIOManager, + PickleUPathIOManager, +) from mozilla_sec_eia.library.mlflow import ( MlflowPyfuncModelIOManager, mlflow_interface_resource, @@ -109,15 +111,13 @@ "pandas_parquet_io_manager": PandasParquetIOManager( base_path=UPath("gs://sec10k-outputs/v2") ), + "pickle_gcs_io_manager": PickleUPathIOManager( + base_path=UPath("gs://sec10k-outputs/dagster_storage") + ), "pyfunc_model_io_manager": MlflowPyfuncModelIOManager( mlflow_interface=mlflow_interface_resource ), "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), - "io_manager": GCSPickleIOManager( - gcs_bucket="sec10k-outputs", - gcs_prefix="dagster_storage", - gcs=GCSResource(project="catalyst-cooperative-mozilla"), - ), } | mlflow_train_test_io_managers, ) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 2e6a13b..3558cd9 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -121,7 +121,13 @@ def collect_extracted_chunks( ) -@op +@op( + out={ + "dataset": Out( + io_manager_key="pickle_gcs_io_manager", + ), + } +) def create_dataset( cloud_interface: GCSArchive, filings: pd.DataFrame ) -> tuple[pd.DataFrame, pd.DataFrame]: From ddd22639bbd6f4aebe8e602e31e508c1cfd83e9e Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 11 Oct 2024 12:05:31 -0400 Subject: [PATCH 111/161] Remove duplicate logger --- pyproject.toml | 1 - src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py | 3 --- 2 files changed, 4 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 9e2e66e..8a66d85 
100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,6 @@ dependencies = [ "dagster-mlflow", "dagster-webserver", "dagster-pandera", - "dagster-gcp", "dagstermill", "datasets>=2.1,<3", # Access Hugging Face datasets "seqeval>=1.2,<2", # Sequence labeling evaluation diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py index 56a7d9b..25db2d9 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py @@ -14,9 +14,6 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -logger = logging.getLogger(f"catalystcoop.{__name__}") - - def format_unlabeled_pdf_dataframe(pdfs_dir: Path): """Read and format PDFs into a dataframe (without labels).""" inference_df = pd.DataFrame() From 93bffcbea41ee945b8993b86c55275a8d5d5367f Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 11 Oct 2024 12:30:09 -0400 Subject: [PATCH 112/161] Fix config warnings --- pyproject.toml | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8a66d85..1398915 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -157,7 +157,7 @@ doctest_optionflags = [ [tool.ruff] exclude = ["notebooks/*"] -select = [ +lint.select = [ "A", # flake8-builtins # "ARG", # unused arguments # "B", # flake8-bugbear @@ -185,7 +185,7 @@ select = [ "UP", # pyupgrade (use modern python syntax) "W", # pycodestyle warnings ] -ignore = [ +lint.ignore = [ "D401", # Require imperative mood in docstrings. "D417", "E501", # Overlong lines. @@ -205,26 +205,26 @@ target-version = "py311" line-length = 88 # Don't automatically concatenate strings -- sometimes we forget a comma! 
-unfixable = ["ISC"] +lint.unfixable = ["ISC"] -[tool.ruff.per-file-ignores] +[tool.ruff.lint.per-file-ignores] "__init__.py" = ["F401"] # Ignore unused imports "tests/*" = ["D"] -[tool.ruff.pep8-naming] +[tool.ruff.lint.pep8-naming] # Allow Pydantic's `@validator` decorator to trigger class method treatment. classmethod-decorators = ["pydantic.validator", "pydantic.root_validator"] -[tool.ruff.isort] +[tool.ruff.lint.isort] known-first-party = ["pudl"] -[tool.ruff.pydocstyle] +[tool.ruff.lint.pydocstyle] convention = "google" -[tool.ruff.mccabe] +[tool.ruff.lint.mccabe] max-complexity = 10 -[tool.ruff.flake8-quotes] +[tool.ruff.lint.flake8-quotes] docstring-quotes = "double" inline-quotes = "double" multiline-quotes = "double" From d717caa72d821a846a961f4c3017e25c8bdb72cf Mon Sep 17 00:00:00 2001 From: zschira Date: Fri, 11 Oct 2024 12:46:20 -0400 Subject: [PATCH 113/161] Test pin sphinx --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1398915..e38d6b8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -92,7 +92,7 @@ dev = [ docs = [ "doc8>=1,<2", # Ensures clean documentation formatting "furo>=2022.4.7", - "sphinx>=6,<9", # The default Python documentation engine + "sphinx>=6,<8.1", # The default Python documentation engine "sphinx-autoapi>=2,<4", # Generates documentation from docstrings "sphinx-issues>=1.2,<5", # Allows references to GitHub issues From 09cd18974b69c3eb58e3ca0c7ab8b468fca48764 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Sun, 13 Oct 2024 12:59:23 -0700 Subject: [PATCH 114/161] add splink and model to environment --- environment.yml | 4 +++- pyproject.toml | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/environment.yml b/environment.yml index 4a985d5..73986fe 100644 --- a/environment.yml +++ b/environment.yml @@ -19,11 +19,13 @@ dependencies: # Jupyter packages: - jupyterlab>=3.2,<4 - - nbconvert>=6,<7 # Used to clear notebook outputs in pre-commit 
hooks + - nbconvert>=7 # Used to clear notebook outputs in pre-commit hooks # These are not normal Python packages available on PyPI - nodejs # Useful for Jupyter and prettier pre-commit hook + - catalystcoop.pudl>=2023.2.5,<=2024.8.0 + # Use pip to install the package defined by this repo for development: - pip: - --editable ./[dev,docs,tests,types] diff --git a/pyproject.toml b/pyproject.toml index e38d6b8..c38ae7e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ dependencies = [ "pydantic-settings>=2", "python-bidi<0.7.0", "pymupdf", # Convert PDF to image + "splink>=4,<5", "sqlalchemy>=2,<3", "timm>0.9,<2", # dependency for Hugging Face computer vision models "torch>=2.2,<3", From 15be127ab15e67db68c50bd9cbb8a8f2272a7382 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 14 Oct 2024 11:37:05 -0400 Subject: [PATCH 115/161] Catch errors while normalizing bounding boxes --- .../models/sec10k/ex_21/data/inference.py | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py index 25db2d9..c84c6db 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py @@ -7,6 +7,7 @@ import pandas as pd +from ...entities import Sec10kExtractionMetadata from ...utils.cloud import GCSArchive from ...utils.pdf import get_image_dict, get_pdf_data_from_path from .common import BBOX_COLS_PDF, format_label_studio_output, normalize_bboxes @@ -17,6 +18,7 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): """Read and format PDFs into a dataframe (without labels).""" inference_df = pd.DataFrame() + failed_format_metadata = Sec10kExtractionMetadata.example(0) for pdf_filename in os.listdir(pdfs_dir): if not pdf_filename.endswith(".pdf"): continue @@ -26,9 +28,16 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): txt = extracted["pdf_text"] pg_meta = 
extracted["page"] # normalize bboxes between 0 and 1000 for Hugging Face - txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta) - txt.loc[:, "id"] = filename - inference_df = pd.concat([inference_df, txt]) + try: + txt = normalize_bboxes(txt_df=txt, pg_meta_df=pg_meta) + txt.loc[:, "id"] = filename + inference_df = pd.concat([inference_df, txt]) + except KeyError: + logger.warning(f"Failed to normalize bounding boxes for filing: {filename}") + failed_format_metadata.loc[filename, ["success", "notes"]] = [ + False, + "Failed to normalize bounding boxes", + ] return inference_df From 4117d0ae161811d03e800e6a0098b707b43702cc Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 14 Oct 2024 11:40:01 -0400 Subject: [PATCH 116/161] Fix call to pandera example --- src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py index c84c6db..def88fd 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py @@ -18,7 +18,7 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): """Read and format PDFs into a dataframe (without labels).""" inference_df = pd.DataFrame() - failed_format_metadata = Sec10kExtractionMetadata.example(0) + failed_format_metadata = Sec10kExtractionMetadata.example(size=0) for pdf_filename in os.listdir(pdfs_dir): if not pdf_filename.endswith(".pdf"): continue From 8c8dd602f586e85cd44c6ddb9f998f32879195e0 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 14 Oct 2024 11:52:10 -0400 Subject: [PATCH 117/161] Fix handle failures in converting to pdf --- .../models/sec10k/ex_21/data/inference.py | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py index 
def88fd..e2498d8 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py @@ -38,7 +38,7 @@ def format_unlabeled_pdf_dataframe(pdfs_dir: Path): False, "Failed to normalize bounding boxes", ] - return inference_df + return inference_df, failed_format_metadata def _cache_pdfs( @@ -100,8 +100,18 @@ def create_inference_dataset( labeled_json_dir=labeled_json_dir, pdfs_dir=pdfs_dir ) else: - inference_df = format_unlabeled_pdf_dataframe(pdfs_dir=pdfs_dir) + inference_df, failed_format_metadata = format_unlabeled_pdf_dataframe( + pdfs_dir=pdfs_dir + ) + extraction_metadata = pd.concat( + [extraction_metadata, failed_format_metadata] + ) image_dict = get_image_dict(pdfs_dir) + image_dict = { + filename: image + for filename, image in image_dict.items() + if filename not in extraction_metadata + } annotations = [] for filename, image in image_dict.items(): From ff821b56b964338f3152d3971a67c19f0eb2ade6 Mon Sep 17 00:00:00 2001 From: zschira Date: Mon, 14 Oct 2024 11:58:08 -0400 Subject: [PATCH 118/161] Actually fix handle failures in converting to pdf --- src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py index e2498d8..8e6b661 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/inference.py @@ -110,7 +110,7 @@ def create_inference_dataset( image_dict = { filename: image for filename, image in image_dict.items() - if filename not in extraction_metadata + if filename not in extraction_metadata.index } annotations = [] From a8eb359b0f156c8344c2a9896db2e125ecb6864b Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 16 Oct 2024 12:55:50 -0400 Subject: [PATCH 119/161] Add model documentation to sec10k readme --- 
src/mozilla_sec_eia/models/sec10k/README.rst | 56 ++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/src/mozilla_sec_eia/models/sec10k/README.rst b/src/mozilla_sec_eia/models/sec10k/README.rst index ffecf28..c5f94f4 100644 --- a/src/mozilla_sec_eia/models/sec10k/README.rst +++ b/src/mozilla_sec_eia/models/sec10k/README.rst @@ -3,6 +3,62 @@ sec10k: Extracting company ownership data from sec10k documents This repo contains exploratory development for an SEC-EIA linkage. +Models +------ +Basic 10k +^^^^^^^^^ +The extraction model for basic 10k company information is very simple and requires no +training. This model is implemented as a simple rules based parser that finds key-value +pairs containing company information, which is embedded in a header for all 10k filings. + +Exhibit 21 +^^^^^^^^^^ +Exhibit 21 extraction is much more complicated and requires pretrained models that are +cached with our mlflow tracking server. Currently, there are 2 models which are +implemented in the ``notebooks/`` directory. These notebooks use +`Dagstermill `_ +so they can be run interactively like any normal Jupyter Notebook, or run in a Dagster +job. + +Extraction +"""""""""" +The primary extraction model is implemented in the ``notebooks/exhibit21_extractor.ipynb``. +This model is based on +`layoutlm `_ with custom inference logic +to construct a table of ownership information from an exhibit 21 document. Both the +layoutlm model and the inference model are logged separately with mlflow. This +separation between the models allows for testing minor modifications to the inference +portion with the same pretrained layoutlm model. + +There are currently two configuration parameters that are used by the extraction model +notebook: + +* ``layoutlm_training_run``: This should be an existing mlflow run name, which was used + to train layoutlm, and has a logged model associated with it. 
If ``None`` layoutlm + will be trained when the notebook is run, and the new training run will be used for + inference and validation. +* ``training_data_version``: This should point to a GCS folder containing training + data to use with layoutlm. If ``layoutlm_training_run`` is set, then this parameter + doesn't matter, as layoutlm will not be re-trained when the notebook is executed. + +The notebook also depends on several upstream dagster assets, which produce training and +validation datasets. Using upstream assets allows these datasets, which are relatively +expensive to produce, to be easily cached and reused while iterating on the model. +These upstream assets need to be produced before the notebook can be run. They should +also be re-materialized if you want to modify the training or validation data, otherwise +the notebook can be re-run as many times as desired with existing data. + +Layout Classification +""""""""""""""""""""" +The second model is a classifier, which labels filings as either having a 'paragraph' +layout or not. This is done because the extraction model performs poorly on documents +formatted as paragraphs rather than tables. For now we will likely just filter out these +results, but we could also develop a separate extraction model which handles these +documents better. + +This model also depends on upstream assets to produce training data, which will need +to be produced before running the notebook. 
+ Usage ----- From dc160ac21641fcd8e74e654b1d1ab97abb6f8d4f Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 16 Oct 2024 17:16:33 -0400 Subject: [PATCH 120/161] Fix ex 21 validation integration test --- .../sec10k/ex_21/ex21_validation_helpers.py | 84 +++++++++ .../notebooks/exhibit21_extractor.ipynb | 165 +++--------------- .../integration/models/sec10k/extract_test.py | 37 ++-- 3 files changed, 136 insertions(+), 150 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py b/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py index fca7168..0b530a9 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/ex21_validation_helpers.py @@ -3,9 +3,93 @@ import numpy as np import pandas as pd +from mozilla_sec_eia.library import validation_helpers from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename +def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame): + """Compute validation metrics for Ex. 
21 extraction.""" + shared_cols = validation_df.columns.intersection(computed_df.columns) + validation_df = validation_df.astype(computed_df[shared_cols].dtypes) + # strip llc and other company name parts for the similarity comparison + computed_df["subsidiary"] = validation_helpers.strip_down_company_names( + computed_df["subsidiary"] + ) + validation_df["subsidiary"] = validation_helpers.strip_down_company_names( + validation_df["subsidiary"] + ) + n_equal = 0 + validation_filenames = validation_df["id"].unique() + n_files = len(validation_filenames) + table_metrics_dict = {} + jaccard_dict = {} + incorrect_files = [] + # iterate through each file and check each extracted table + for filename in validation_filenames: + extracted_table_df = computed_df[computed_df["id"] == filename].reset_index( + drop=True + ) + validation_table_df = validation_df[ + validation_df["id"] == filename + ].reset_index(drop=True) + # check if the tables are exactly equal + if extracted_table_df[["subsidiary", "loc", "own_per"]].equals( + validation_table_df[["subsidiary", "loc", "own_per"]] + ): + n_equal += 1 + else: + incorrect_files.append(filename) + # compute jaccard sim + precision and recall for each column + table_metrics_dict[filename] = {} + jaccard_dict[filename] = {} + for col in ["subsidiary", "loc", "own_per"]: + extracted_table_df[col] = validation_helpers.fill_nulls_for_comparison( + extracted_table_df[col] + ) + validation_table_df[col] = validation_helpers.fill_nulls_for_comparison( + validation_table_df[col] + ) + table_prec_recall = validation_helpers.pandas_compute_precision_recall( + extracted_table_df, validation_table_df, value_col=col + ) + table_metrics_dict[filename][f"{col}_precision"] = table_prec_recall[ + "precision" + ] + table_metrics_dict[filename][f"{col}_recall"] = table_prec_recall["recall"] + # get the jaccard similarity between columns + jaccard_dict[filename][col] = validation_helpers.jaccard_similarity( + computed_df=extracted_table_df, + 
validation_df=validation_table_df, + value_col=col, + ) + + jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient="index").reset_index() + prec_recall_df = pd.DataFrame.from_dict( + table_metrics_dict, orient="index" + ).reset_index() + + return ( + jaccard_df, + prec_recall_df, + pd.DataFrame({"filename": incorrect_files}), + { + "table_accuracy": n_equal / n_files, + "avg_subsidiary_jaccard_sim": jaccard_df["subsidiary"].sum() / n_files, + "avg_location_jaccard_sim": jaccard_df["loc"].sum() / n_files, + "avg_own_per_jaccard_sim": jaccard_df["own_per"].sum() / n_files, + "avg_subsidiary_precision": prec_recall_df["subsidiary_precision"].sum() + / n_files, + "avg_location_precision": prec_recall_df["loc_precision"].sum() / n_files, + "avg_own_per_precision": prec_recall_df["own_per_precision"].sum() + / n_files, + "avg_subsidiary_recall": prec_recall_df["subsidiary_recall"].sum() + / n_files, + "avg_location_recall": prec_recall_df["loc_recall"].sum() / n_files, + "avg_own_per_recall": prec_recall_df["own_per_recall"].sum() / n_files, + }, + ) + + def clean_extracted_df(extracted_df): """Perform basic cleaning on a dataframe extracted from an Ex. 21.""" if extracted_df.empty: diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index d136a25..53e16c8 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -51,13 +51,13 @@ "output_type": "stream", "text": [ "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. 
If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-09 15:25:02 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n", + "2024-10-16 17:11:06 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n", "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-09 15:25:03 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_failed_parsing_metadata using PickledObjectFilesystemIOManager...\n", + "2024-10-16 17:11:12 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_failed_parsing_metadata using PickledObjectFilesystemIOManager...\n", "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-09 15:25:03 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_inference_dataset using PickledObjectFilesystemIOManager...\n", + "2024-10-16 17:11:12 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_inference_dataset using PickledObjectFilesystemIOManager...\n", "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. 
If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-09 15:25:04 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_validation_set using PickledObjectFilesystemIOManager...\n" + "2024-10-16 17:11:15 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_validation_set using PickledObjectFilesystemIOManager...\n" ] } ], @@ -113,7 +113,6 @@ "import numpy as np\n", "import pandas as pd\n", "\n", - "from mozilla_sec_eia.library import validation_helpers\n", "from mozilla_sec_eia.models.sec10k.utils.cloud import get_metadata_filename\n", "\n", "\n", @@ -326,7 +325,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 4, "id": "42c8e920-d671-40c2-b5db-c43611a33897", "metadata": { "tags": [] @@ -542,7 +541,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 5, "id": "4d802e00-1ca4-40b3-b15b-561711a9db70", "metadata": { "tags": [] @@ -551,7 +550,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "3145df6c447a4f958ac86b7a84c9f52d", + "model_id": "d0779d02915a4503b0cd92d3df38cf88", "version_major": 2, "version_minor": 0 }, @@ -566,13 +565,13 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024/10/09 15:26:25 INFO mlflow.types.utils: Unsupported type hint: , skipping schema inference\n" + "2024/10/16 17:11:20 INFO mlflow.types.utils: Unsupported type hint: , skipping schema inference\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "bae0a0244e4141449874b48f750bd443", + "model_id": "601bb4ae91dd4a218fe5be047f4829d0", "version_major": 2, "version_minor": 0 }, @@ -587,15 +586,15 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024/10/09 15:26:54 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present 
in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n", - "2024/10/09 15:26:54 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev298+g6f9d34a.d20240923) contains a local version label (+g6f9d34a.d20240923). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev298' without the local version label to make it installable from PyPI. To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n", - "2024/10/09 15:26:54 WARNING mlflow.transformers.model_io: Could not specify device parameter for this pipeline type.Falling back to loading the model with the default device.\n" + "2024/10/16 17:11:51 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n", + "2024/10/16 17:11:51 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev353+gdf5fe0d.d20241011) contains a local version label (+gdf5fe0d.d20241011). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev353' without the local version label to make it installable from PyPI. 
To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n", + "2024/10/16 17:11:51 WARNING mlflow.transformers.model_io: Could not specify device parameter for this pipeline type.Falling back to loading the model with the default device.\n" ] }, { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "a0efe85e59d7401092b6bc7eed6d0bb5", + "model_id": "68b8d5cef3a94294b243b6f0c3e8ee5f", "version_major": 2, "version_minor": 0 }, @@ -710,120 +709,10 @@ }, { "cell_type": "markdown", - "id": "0bd74bdc-bb63-4ad2-82ec-3dfcf93a6121", - "metadata": {}, - "source": [ - "#### Load validation data\n", - "Next, load an inference dataset containing validation data. This dataset is formatted exactly the same as those that will feed into the `Ex21Extractor` during a production run, but contain only data from the validation set. When creating inference datasets we also produce a metadata dataframe documenting any filings that couldn't be parsed/converted to a PDF. This dataframe should be empty for the validation set, but we will still load it for consistency with production runs." - ] - }, - { - "cell_type": "markdown", - "id": "eddcc912-324a-42e9-9841-3a916c6ece6b", - "metadata": {}, - "source": [ - "Next define method method for computing validation metrics. The metrics computed above for training are looking at bounding boxes output by `layoutlm` and pertain to one word at a time. These metrics will look at an entire table produced the inference pipeline and compare to the validation data. " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f79bd14d-5156-4f34-9a50-e9c813b822cf", + "id": "1dee550f-7b06-4091-a65e-71c6b23a5bea", "metadata": { "tags": [] }, - "outputs": [], - "source": [ - "from mlflow.models import infer_signature\n", - "\n", - "\n", - "def ex21_validation_metrics(computed_df: pd.DataFrame, validation_df: pd.DataFrame):\n", - " \"\"\"Compute validation metrics for Ex. 
21 extraction.\"\"\"\n", - " shared_cols = validation_df.columns.intersection(computed_df.columns)\n", - " validation_df = validation_df.astype(computed_df[shared_cols].dtypes)\n", - " # strip llc and other company name parts for the similarity comparison\n", - " computed_df[\"subsidiary\"] = validation_helpers.strip_down_company_names(\n", - " computed_df[\"subsidiary\"]\n", - " )\n", - " validation_df[\"subsidiary\"] = validation_helpers.strip_down_company_names(\n", - " validation_df[\"subsidiary\"]\n", - " )\n", - " n_equal = 0\n", - " validation_filenames = validation_df[\"id\"].unique()\n", - " n_files = len(validation_filenames)\n", - " table_metrics_dict = {}\n", - " jaccard_dict = {}\n", - " incorrect_files = []\n", - " # iterate through each file and check each extracted table\n", - " for filename in validation_filenames:\n", - " extracted_table_df = computed_df[computed_df[\"id\"] == filename].reset_index(\n", - " drop=True\n", - " )\n", - " validation_table_df = validation_df[\n", - " validation_df[\"id\"] == filename\n", - " ].reset_index(drop=True)\n", - " # check if the tables are exactly equal\n", - " if extracted_table_df[[\"subsidiary\", \"loc\", \"own_per\"]].equals(\n", - " validation_table_df[[\"subsidiary\", \"loc\", \"own_per\"]]\n", - " ):\n", - " n_equal += 1\n", - " else:\n", - " incorrect_files.append(filename)\n", - " # compute jaccard sim + precision and recall for each column\n", - " table_metrics_dict[filename] = {}\n", - " jaccard_dict[filename] = {}\n", - " for col in [\"subsidiary\", \"loc\", \"own_per\"]:\n", - " extracted_table_df[col] = validation_helpers.fill_nulls_for_comparison(\n", - " extracted_table_df[col]\n", - " )\n", - " validation_table_df[col] = validation_helpers.fill_nulls_for_comparison(\n", - " validation_table_df[col]\n", - " )\n", - " table_prec_recall = validation_helpers.pandas_compute_precision_recall(\n", - " extracted_table_df, validation_table_df, value_col=col\n", - " )\n", - " 
table_metrics_dict[filename][f\"{col}_precision\"] = table_prec_recall[\n", - " \"precision\"\n", - " ]\n", - " table_metrics_dict[filename][f\"{col}_recall\"] = table_prec_recall[\"recall\"]\n", - " # get the jaccard similarity between columns\n", - " jaccard_dict[filename][col] = validation_helpers.jaccard_similarity(\n", - " computed_df=extracted_table_df,\n", - " validation_df=validation_table_df,\n", - " value_col=col,\n", - " )\n", - "\n", - " jaccard_df = pd.DataFrame.from_dict(jaccard_dict, orient=\"index\").reset_index()\n", - " prec_recall_df = pd.DataFrame.from_dict(\n", - " table_metrics_dict, orient=\"index\"\n", - " ).reset_index()\n", - "\n", - " return (\n", - " jaccard_df,\n", - " prec_recall_df,\n", - " pd.DataFrame({\"filename\": incorrect_files}),\n", - " {\n", - " \"table_accuracy\": n_equal / n_files,\n", - " \"avg_subsidiary_jaccard_sim\": jaccard_df[\"subsidiary\"].sum() / n_files,\n", - " \"avg_location_jaccard_sim\": jaccard_df[\"loc\"].sum() / n_files,\n", - " \"avg_own_per_jaccard_sim\": jaccard_df[\"own_per\"].sum() / n_files,\n", - " \"avg_subsidiary_precision\": prec_recall_df[\"subsidiary_precision\"].sum()\n", - " / n_files,\n", - " \"avg_location_precision\": prec_recall_df[\"loc_precision\"].sum() / n_files,\n", - " \"avg_own_per_precision\": prec_recall_df[\"own_per_precision\"].sum()\n", - " / n_files,\n", - " \"avg_subsidiary_recall\": prec_recall_df[\"subsidiary_recall\"].sum()\n", - " / n_files,\n", - " \"avg_location_recall\": prec_recall_df[\"loc_recall\"].sum() / n_files,\n", - " \"avg_own_per_recall\": prec_recall_df[\"own_per_recall\"].sum() / n_files,\n", - " },\n", - " )\n" - ] - }, - { - "cell_type": "markdown", - "id": "1dee550f-7b06-4091-a65e-71c6b23a5bea", - "metadata": {}, "source": [ "#### Validate model\n", "Finally, run the full model on the validation set and log metrics to mlflow. The logged metrics/model will appear in a nested run below the training run used for the current version of the model." 
@@ -831,15 +720,17 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": null, "id": "dfb56470-8527-424c-a9e5-4135e55fde4d", - "metadata": {}, + "metadata": { + "tags": [] + }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ - "2024/10/09 15:26:56 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n", + "2024/10/16 17:11:53 INFO mlflow.system_metrics.system_metrics_monitor: Started monitoring system metrics.\n", "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", " warnings.warn(\n", "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", @@ -960,7 +851,7 @@ " warnings.warn(\n", "/home/zach/mambaforge/envs/mozilla-sec-eia/lib/python3.11/site-packages/transformers/modeling_utils.py:1101: FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n", " warnings.warn(\n", - "/tmp/ipykernel_168606/2514174394.py:29: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + "/tmp/ipykernel_48762/2514174394.py:29: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " lambda group: group.ffill()\n", "/home/zach/catalyst/mozilla-sec-eia/src/mozilla_sec_eia/library/validation_helpers.py:46: FutureWarning: The behavior of array concatenation with empty entries is deprecated. In a future version, this will no longer exclude empty items when determining the result dtype. To retain the old behavior, exclude the empty entries before the concat operation.\n", " padded_compute_set = pd.concat(\n", @@ -1177,7 +1068,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "21da151ecd6d4a9187bf77b40c7a8aed", + "model_id": "db36592620c244479123275dfc464648", "version_major": 2, "version_minor": 0 }, @@ -1192,16 +1083,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "2024/10/09 15:28:01 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n", - "2024/10/09 15:28:01 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev298+g6f9d34a.d20240923) contains a local version label (+g6f9d34a.d20240923). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev298' without the local version label to make it installable from PyPI. 
To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n", - "2024/10/09 15:35:17 INFO mlflow.tracking._tracking_service.client: 🏃 View run unleashed-snake-419 at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13/runs/426dd1b67cbd4677b6fa22b6b9d9173a.\n", - "2024/10/09 15:35:17 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://mlflow-ned2up6sra-uc.a.run.app/#/experiments/13.\n", - "2024/10/09 15:35:17 INFO mlflow.system_metrics.system_metrics_monitor: Stopping system metrics monitoring...\n", - "2024/10/09 15:35:17 INFO mlflow.system_metrics.system_metrics_monitor: Successfully terminated system metrics monitoring!\n" + "2024/10/16 17:12:58 WARNING mlflow.utils.requirements_utils: The following packages were not found in the public PyPI package index as of 2024-08-29; if these packages are not present in the public PyPI index, you must install them manually before loading your model: {'catalystcoop-mozilla-sec-eia'}\n", + "2024/10/16 17:12:58 WARNING mlflow.utils.requirements_utils: Found catalystcoop-mozilla-sec-eia version (0.1.dev353+gdf5fe0d.d20241011) contains a local version label (+gdf5fe0d.d20241011). MLflow logged a pip requirement for this package as 'catalystcoop-mozilla-sec-eia==0.1.dev353' without the local version label to make it installable from PyPI. 
To specify pip requirements containing local version labels, please use `conda_env` or `pip_requirements`.\n" ] } ], "source": [ + "from mlflow.models import infer_signature\n", + "\n", + "from mozilla_sec_eia.models.sec10k.ex_21.ex21_validation_helpers import (\n", + " ex21_validation_metrics,\n", + ")\n", + "\n", "with mlflow.start_run(parent_run_id=model_info.run_id, nested=True):\n", " metadata, extracted = ex21_extraction_model.predict(ex21_inference_dataset.copy())\n", " metadata = pd.concat([ex21_failed_parsing_metadata, metadata])\n", diff --git a/tests/integration/models/sec10k/extract_test.py b/tests/integration/models/sec10k/extract_test.py index 87636f1..38a795a 100644 --- a/tests/integration/models/sec10k/extract_test.py +++ b/tests/integration/models/sec10k/extract_test.py @@ -2,9 +2,10 @@ import logging import os -import unittest import dotenv +import mlflow +from dagster import materialize_to_memory from mozilla_sec_eia.library.mlflow import configure_mlflow from mozilla_sec_eia.library.mlflow.mlflow_resource import get_most_recent_run @@ -41,20 +42,28 @@ def test_ex21_validation( os.getenv("MLFLOW_TRACKING_URI"), os.getenv("GCS_PROJECT"), ) - pretrained_model = sec10k.utils.layoutlm._load_pretrained_layoutlm( - cache_path=tmp_path + + # Load validation data + result = materialize_to_memory( + [ + sec10k.ex_21.data.ex21_validation_set, + sec10k.ex_21.data.ex21_validation_filing_metadata, + sec10k.ex_21.data.ex21_inference_dataset, + ], + resources={"cloud_interface": sec10k.utils.GCSArchive()}, + ) + ex21_inference_dataset = result.output_for_node( + "ex21_inference_dataset", output_name="ex21_inference_dataset" ) + ex21_validation_set = result.output_for_node("ex21_validation_set") - with unittest.mock.patch( - "mozilla_sec_eia.models.sec10k.utils.layoutlm._load_pretrained_layoutlm", - new=lambda cache_path, version: pretrained_model, - ): - set_test_mlflow_env_vars_factory() - result = sec10k.defs.get_job_def( - "ex21_extraction_validation" - 
).execute_in_process() + # Load latest version of pretrained model + pretrained_model = mlflow.pyfunc.load_model("models:/exhibit21_extractor/latest") + _, extracted = pretrained_model.predict(ex21_inference_dataset) - run = get_most_recent_run("ex21_extraction_validation", result.run_id) + _, _, _, metrics = sec10k.ex_21.ex21_validation_helpers.ex21_validation_metrics( + extracted, ex21_validation_set + ) - assert run.data.metrics["avg_subsidiary_jaccard_sim"] > 0.85 - assert run.data.metrics["avg_location_jaccard_sim"] > 0.83 + assert metrics["avg_subsidiary_jaccard_sim"] > 0.85 + assert metrics["avg_location_jaccard_sim"] > 0.83 From 10b24a92cc092e64c4c8631b357d34ec49565680 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 16 Oct 2024 18:33:07 -0400 Subject: [PATCH 121/161] Improve classifier error handling --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 3558cd9..367c2e2 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -76,7 +76,7 @@ def classify_chunk_layouts( "paragraph": exhibit21_layout_classifier.predict(inference_dataset), } ).set_index("filename") - except ValueError: + except ValueError | KeyError: df = pd.DataFrame( { "filename": inference_dataset["id"], From ad549794b7357da8d3d089e9c025b55f0302030a Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 16 Oct 2024 18:38:59 -0400 Subject: [PATCH 122/161] Fully broaden classifier errors --- src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py index 367c2e2..5b7ac07 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py +++ 
b/src/mozilla_sec_eia/models/sec10k/ex_21/__init__.py @@ -69,6 +69,9 @@ def classify_chunk_layouts( ) -> pd.DataFrame: """Extract a set of filings and return results.""" _, inference_dataset = parsed_chunk + if inference_dataset.empty: + return Ex21Layout.example(size=0) + try: df = pd.DataFrame( { @@ -76,7 +79,8 @@ def classify_chunk_layouts( "paragraph": exhibit21_layout_classifier.predict(inference_dataset), } ).set_index("filename") - except ValueError | KeyError: + except Exception: + logger.warning(traceback.format_exc()) df = pd.DataFrame( { "filename": inference_dataset["id"], From 672e123a84070454ef23a307f5b56c6058486be2 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 16 Oct 2024 19:23:32 -0400 Subject: [PATCH 123/161] add more docs on running the notebooks --- src/mozilla_sec_eia/models/sec10k/README.rst | 26 +++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec10k/README.rst b/src/mozilla_sec_eia/models/sec10k/README.rst index c5f94f4..13afc39 100644 --- a/src/mozilla_sec_eia/models/sec10k/README.rst +++ b/src/mozilla_sec_eia/models/sec10k/README.rst @@ -56,9 +56,33 @@ formatted as paragraphs rather than tables. For now we will likely just filter o results, but we could also develop a separate extraction model which handles these documents better. -This model also depends on upstream assets to produce training data, which will need +This model is located in ``notebooks/exhibit21_layout_classifier.ipynb``, and it also +depends on upstream assets to produce training data, which will need to be produced before running the notebook. +Training the Models +""""""""""""""""""" +The models are trained by running the notebooks. This can be done either interactively +like a normal notebook or through dagster directly. + +Whether running interactively or with dagster, you will first need to produce the +upstream data assets: + +1. Launch dagster from the repo root with the ``dagster dev`` command +2. 
Locate the training Job in question using the webui +3. Select the upstream assets by holding down the shift key and clicking on each + asset excluding the notebook asset +4. Click ``Materialize all`` in the UI + +Once this is complete, you can simply launch ``Jupyter`` and run the notebooks +interactively as you would any other notebook. The first cell loads the upstream +assets and sets configuration. You can modify the configuration directly in the +notebook as normal. + +To run the notebook in dagster, you simply execute it like any other normal asset. +You can first set configuration in the dagster launchpad if desired, and when it +completes executing, you can click on the asset to view the fully rendered notebook. + Usage ----- From b95b2fb9d6664e929be7d2ee93c2b5722ea46ebe Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Mon, 21 Oct 2024 16:43:31 -0700 Subject: [PATCH 124/161] add splink notebooks and preprocessing functions --- environment.yml | 9 +- notebooks/16-kl-splink-ex21-filer-link.ipynb | 3507 +++++++++++++++++ .../17-kl-paragraph-layout-metrics.ipynb | 687 ++++ notebooks/18-kl-splink-sec-eia.ipynb | 3326 ++++++++++++++++ pyproject.toml | 12 +- .../models/sec_eia_record_linkage/__init__.py | 1 + .../sec_eia_record_linkage/preprocessing.py | 288 ++ 7 files changed, 7822 insertions(+), 8 deletions(-) create mode 100644 notebooks/16-kl-splink-ex21-filer-link.ipynb create mode 100644 notebooks/17-kl-paragraph-layout-metrics.ipynb create mode 100644 notebooks/18-kl-splink-sec-eia.ipynb create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py diff --git a/environment.yml b/environment.yml index 73986fe..3ad1cd4 100644 --- a/environment.yml +++ b/environment.yml @@ -5,7 +5,7 @@ channels: dependencies: # Packages required for setting up the environment - pip>=21,<24 - - python>=3.10,<3.12 + - python>=3.10,<=3.12 - setuptools>=66,<69 # Packages 
specified in setup.py that need or benefit from binary conda packages @@ -24,8 +24,11 @@ dependencies: # These are not normal Python packages available on PyPI - nodejs # Useful for Jupyter and prettier pre-commit hook - - catalystcoop.pudl>=2023.2.5,<=2024.8.0 + - dask>=2024 + - gdal # Use pip to install the package defined by this repo for development: - pip: - - --editable ./[dev,docs,tests,types] + - git+https://github.com/catalyst-cooperative/pudl.git@main + # - -e /Users/katielamb/CatalystCoop/pudl[dev,docs,tests,types] + - --editable ./[dev,docs,tests,types] diff --git a/notebooks/16-kl-splink-ex21-filer-link.ipynb b/notebooks/16-kl-splink-ex21-filer-link.ipynb new file mode 100644 index 0000000..2e656d3 --- /dev/null +++ b/notebooks/16-kl-splink-ex21-filer-link.ipynb @@ -0,0 +1,3507 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "c535d97b-5dfa-4298-87f5-55c56c4c82ed", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e1222c94-36cd-4bae-95fb-089e5411e490", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import numpy as np\n", + "import pandas as pd\n", + "from upath import UPath\n", + "\n", + "# from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive\n", + "# from pudl.analysis.record_linkage import name_cleaner\n", + "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df" + ] + }, + { + "cell_type": "markdown", + "id": "16cd6122-4cb9-42aa-8be1-84c997a34e96", + "metadata": {}, + "source": [ + "# Read in Inputs" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "67da3bf4-abbd-40c2-850b-1c73953625c8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_eia_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet\")" + ] + }, + { + 
"cell_type": "code", + "execution_count": 4, + "id": "28bdfdfd-beeb-4097-b4d3-b58a7c30f64d", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia_df = raw_eia_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ee54bb48-cbe4-4261-9545-d4b2bdcb731e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "mergers_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "8e69b4ba-8e7b-4d17-bc8c-a06f059f6015", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_eia861_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "ce60f760-5b94-4889-92c5-ac0ed5cd6d82", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "missing_utils = raw_eia861_df[~raw_eia861_df.utility_id_eia.isin(raw_eia_df.utility_id_eia.unique())].utility_id_eia.unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "a3ef2365-e459-44b3-94b0-77020cd606f2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "harvested_df = pd.concat([\n", + " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", + " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", + " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", + " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", + "])" + ] + }, + { + "cell_type": "code", + 
"execution_count": 9, + "id": "59fd9d69-b700-43ec-bb7a-f99eea1e0ec9", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia861_df = raw_eia861_df.merge(harvested_df, on=[\"report_date\", \"utility_id_eia\"], how=\"left\").drop_duplicates(subset=[\"report_date\", \"utility_id_eia\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "a47d17c1-0df1-412f-9687-3d540266f005", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "mergers_df = mergers_df[mergers_df[\"new_parent\"].notna()]\n", + "eia861_df = eia861_df.merge(mergers_df[[\"report_date\", \"new_parent\", \"merge_address\", \"merge_city\", \"merge_state\"]], \n", + " how=\"left\", \n", + " left_on=[\"report_date\", \"utility_name_eia\"],\n", + " right_on=[\"report_date\", \"new_parent\"]\n", + " )\n", + "eia861_df = eia861_df.rename(columns={\"merge_address\": \"street_address\", \"merge_city\": \"city\"})\n", + "eia861_df = eia861_df.groupby([\"report_date\", \"utility_id_eia\"]).first().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "fa6515b1-5012-4ec0-af96-f9fda11a9c5d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "

\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
report_dateutility_id_eiastateutility_name_eianew_parentstreet_addresscitymerge_state
299332009-01-0117698LASouthwestern Electric Power CoSouthwestern Electric Power Co1 Riverside PlazaColumbusOH
332582010-01-0117698ARSouthwestern Electric Power CoSouthwestern Electric Power Co1 Riverside PlazaColumbusOH
490012015-01-0111788IAConsumers EnergyConsumers EnergyOne Enrgy PlazaJacksonMI
568532017-01-0119157IAMiEnergy CooperativeMiEnergy Cooperative31110 Cooperative WayRushfordMN
708202021-01-0140165AZDixie Escalante R E A, IncDixie Escalante R E A, Inc495 N 3200 WFlowellUT
\n", + "
" + ], + "text/plain": [ + " report_date utility_id_eia state utility_name_eia new_parent street_address city merge_state\n", + "29933 2009-01-01 17698 LA Southwestern Electric Power Co Southwestern Electric Power Co 1 Riverside Plaza Columbus OH\n", + "33258 2010-01-01 17698 AR Southwestern Electric Power Co Southwestern Electric Power Co 1 Riverside Plaza Columbus OH\n", + "49001 2015-01-01 11788 IA Consumers Energy Consumers Energy One Enrgy Plaza Jackson MI\n", + "56853 2017-01-01 19157 IA MiEnergy Cooperative MiEnergy Cooperative 31110 Cooperative Way Rushford MN\n", + "70820 2021-01-01 40165 AZ Dixie Escalante R E A, Inc Dixie Escalante R E A, Inc 495 N 3200 W Flowell UT" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eia861_df[(eia861_df.state != eia861_df.merge_state) & (eia861_df.merge_state.notna())]" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8ff7b788-5fef-4e88-94ff-89b25619aed8", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia861_df[\"state\"] = eia861_df[\"state\"].where(eia861_df[\"merge_state\"].isnull(), eia861_df[\"merge_state\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "17885342-b464-4f4d-ac75-b7be4d4ec7cc", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia861_df = eia861_df.drop(columns=[\"new_parent\", \"merge_state\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "fb71f68d-92da-468b-b8a5-02f5ba4b4459", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia_df = pd.concat([eia_df, eia861_df])" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "85402523-e28a-4410-b933-eb71572b9a00", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia_df = eia_df.drop_duplicates(subset=[\"utility_id_eia\", \"report_date\"], keep=\"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": 
"94e824d6-dd6a-47db-9447-3363e8d14fe0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# not sure at what point this stops being a datetime\n", + "eia_df[\"report_date\"] = eia_df[\"report_date\"].astype(\"datetime64[ns]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "56857668-ecd5-4c62-9286-e50c334750c5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# there are nulls from non harvested 861 utilities\n", + "eia_df = eia_df.dropna(subset=\"utility_name_eia\")" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "c29d0b75-759f-445c-adac-b2a6baf1fd0e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "# for now try just training on 2023\n", + "raw_sec_df = pd.concat([pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet\"),\n", + " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet\"),\n", + " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet\"),\n", + " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet\"),\n", + " ]\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "dbf3b15c-3a5a-4b74-a929-71aec18750a1", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n", + "raw_sec_df.columns.name = None" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a8ec4fad-c92f-4cfc-a3d2-409a72a2df1e", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": "8e7a642d-7718-4101-b851-f1f4ee07180e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_ex21_df = pd.DataFrame()\n", + "for file in ex21_path.iterdir():\n", + " if 
file.name.split(\".\")[-1] == \"parquet\":\n", + " report_year = file.name[:4]\n", + " # for now just train with 2023\n", + " if report_year != \"2023\":\n", + " continue\n", + " year_quarter_df = pd.read_parquet(ex21_path / file.name)\n", + " year_quarter_df.loc[:, \"report_year\"] = report_year\n", + " year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n", + " raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "7daad7a6-c590-4324-9e31-2bb5c9fa4d6c", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
utility_id_eiautility_id_pudlutility_name_eiareport_datestreet_addresscitystatezip_codeplants_reported_ownerplants_reported_operator...contact_lastnamecontact_titlephone_numberphone_extensioncontact_firstname_2contact_lastname_2contact_title_2phone_number_2phone_extension_2data_maturity
336629216386.0Desert Willow Energy Storage2023-01-01100 Bayview CircleNewport BeachCANoneNoneNone...NoneNoneNoneNoneNoneNoneNoneNoneNoneprovisional
356629116385.0Portage Solar Plant2023-01-01N8917PortageWI53901NoneNone...NoneNoneNoneNoneNoneNoneNoneNoneNoneprovisional
376629016384.0NSF Energy One LLC2023-01-011241 University AveRochesterNY14607NoneNone...NoneNoneNoneNoneNoneNoneNoneNoneNoneprovisional
\n", + "

3 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " utility_id_eia utility_id_pudl utility_name_eia report_date \\\n", + "33 66292 16386.0 Desert Willow Energy Storage 2023-01-01 \n", + "35 66291 16385.0 Portage Solar Plant 2023-01-01 \n", + "37 66290 16384.0 NSF Energy One LLC 2023-01-01 \n", + "\n", + " street_address city state zip_code plants_reported_owner \\\n", + "33 100 Bayview Circle Newport Beach CA None None \n", + "35 N8917 Portage WI 53901 None \n", + "37 1241 University Ave Rochester NY 14607 None \n", + "\n", + " plants_reported_operator ... contact_lastname contact_title phone_number \\\n", + "33 None ... None None None \n", + "35 None ... None None None \n", + "37 None ... None None None \n", + "\n", + " phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 \\\n", + "33 None None None None \n", + "35 None None None None \n", + "37 None None None None \n", + "\n", + " phone_number_2 phone_extension_2 data_maturity \n", + "33 None None provisional \n", + "35 None None provisional \n", + "37 None None provisional \n", + "\n", + "[3 rows x 27 columns]" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eia_df[(eia_df.street_address.notnull())].head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "30c02757-45c0-403c-aa38-7422d3549a2b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia_subset = eia_df[eia_df.report_date == \"2020-01-01\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "1c0365a3-51d2-455b-8863-bc4dc22572f9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
utility_id_eiautility_id_pudlutility_name_eiareport_datestreet_addresscitystatezip_codeplants_reported_ownerplants_reported_operator...contact_lastnamecontact_titlephone_numberphone_extensioncontact_firstname_2contact_lastname_2contact_title_2phone_number_2phone_extension_2data_maturity
71566541690.0Duke Energy Corp2010-01-01P O Box 1006CharlotteNC28202NoneNone...NoneNoneNoneNoneNoneNoneNoneNoneNonefinal
71568541690.0Duke Energy Corp2008-01-01NoneCharlotteNC28201NoneNone...NoneNoneNoneNoneNoneNoneNoneNoneNonefinal
71569541690.0Duke Energy Corp2007-01-01NoneCharlotteNC28201NoneNone...AshcraftSr. Engineering TechnologistNoneNoneRobertMc MurryDir Carolinas Integrated ResouNoneNonefinal
71570541690.0Duke Energy Corp2006-01-01NoneCharlotteNC28201NoneNone...DuckworthPlanning Engineer704-382-4327382StevenJesterDirector, Rate Admn & Cust Inq704-382-4887Nonefinal
71571541690.0Duke Energy Corp2005-01-01NoneCharlotteNC28201NoneNone...DuckworthPlanning Engineer704-382-4327382StevenJesterDirector, Rate Admn & Cust Inq704-382-4887Nonefinal
71572541690.0Duke Energy Corp2004-01-01NoneCharlotteNC28201NoneNone...DuckworthPlanning Engineer704-382-43270StevenJesterDirector, Rate Admn & Cust Inq704-382-4887Nonefinal
71573541690.0Duke Energy Corp2003-01-01NoneCharlotteNC28201NoneNone...DuckworthProcess LeaderNone0StevenJesterNoneNoneNonefinal
71574541690.0Duke Energy Corp2002-01-01NoneCharlotteNC28201NoneNone...Scott HenryProcess LeaderNone0NoneNoneMgr Reg Policy $ ResNoneNonefinal
71575541690.0Duke Energy Corp2001-01-01NoneCharlotteNC28201NoneNone...R S HenryNoneNone0NoneNoneMgr Operating Plann & AnalysisNoneNonefinal
\n", + "

9 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " utility_id_eia utility_id_pudl utility_name_eia report_date \\\n", + "71566 5416 90.0 Duke Energy Corp 2010-01-01 \n", + "71568 5416 90.0 Duke Energy Corp 2008-01-01 \n", + "71569 5416 90.0 Duke Energy Corp 2007-01-01 \n", + "71570 5416 90.0 Duke Energy Corp 2006-01-01 \n", + "71571 5416 90.0 Duke Energy Corp 2005-01-01 \n", + "71572 5416 90.0 Duke Energy Corp 2004-01-01 \n", + "71573 5416 90.0 Duke Energy Corp 2003-01-01 \n", + "71574 5416 90.0 Duke Energy Corp 2002-01-01 \n", + "71575 5416 90.0 Duke Energy Corp 2001-01-01 \n", + "\n", + " street_address city state zip_code plants_reported_owner \\\n", + "71566 P O Box 1006 Charlotte NC 28202 None \n", + "71568 None Charlotte NC 28201 None \n", + "71569 None Charlotte NC 28201 None \n", + "71570 None Charlotte NC 28201 None \n", + "71571 None Charlotte NC 28201 None \n", + "71572 None Charlotte NC 28201 None \n", + "71573 None Charlotte NC 28201 None \n", + "71574 None Charlotte NC 28201 None \n", + "71575 None Charlotte NC 28201 None \n", + "\n", + " plants_reported_operator ... contact_lastname \\\n", + "71566 None ... None \n", + "71568 None ... None \n", + "71569 None ... Ashcraft \n", + "71570 None ... Duckworth \n", + "71571 None ... Duckworth \n", + "71572 None ... Duckworth \n", + "71573 None ... Duckworth \n", + "71574 None ... Scott Henry \n", + "71575 None ... R S Henry \n", + "\n", + " contact_title phone_number phone_extension \\\n", + "71566 None None None \n", + "71568 None None None \n", + "71569 Sr. 
Engineering Technologist None None \n", + "71570 Planning Engineer 704-382-4327 382 \n", + "71571 Planning Engineer 704-382-4327 382 \n", + "71572 Planning Engineer 704-382-4327 0 \n", + "71573 Process Leader None 0 \n", + "71574 Process Leader None 0 \n", + "71575 None None 0 \n", + "\n", + " contact_firstname_2 contact_lastname_2 contact_title_2 \\\n", + "71566 None None None \n", + "71568 None None None \n", + "71569 Robert Mc Murry Dir Carolinas Integrated Resou \n", + "71570 Steven Jester Director, Rate Admn & Cust Inq \n", + "71571 Steven Jester Director, Rate Admn & Cust Inq \n", + "71572 Steven Jester Director, Rate Admn & Cust Inq \n", + "71573 Steven Jester None \n", + "71574 None None Mgr Reg Policy $ Res \n", + "71575 None None Mgr Operating Plann & Analysis \n", + "\n", + " phone_number_2 phone_extension_2 data_maturity \n", + "71566 None None final \n", + "71568 None None final \n", + "71569 None None final \n", + "71570 704-382-4887 None final \n", + "71571 704-382-4887 None final \n", + "71572 704-382-4887 None final \n", + "71573 None None final \n", + "71574 None None final \n", + "71575 None None final \n", + "\n", + "[9 rows x 27 columns]" + ] + }, + "execution_count": 58, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eia_df[(eia_df.utility_name_eia.str.contains(\"Duke Energy Corp\")) & (eia_df.state == \"NC\")].drop_duplicates()" + ] + }, + { + "cell_type": "markdown", + "id": "f3d5db08-3c42-4715-9f0d-4d02674b828a", + "metadata": {}, + "source": [ + "# Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "39706c77-90db-4f49-8011-47a9777a88b6", + "metadata": {}, + "outputs": [], + "source": [ + "sec_df = prepare_sec10k_basic_info_df(raw_sec_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "98d4f59e-d61f-4a24-84bc-6caa0d761e07", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_df = prepare_ex21_df(raw_ex21_df)" + ] + }, + { + "cell_type": "code", + 
"execution_count": 84, + "id": "11caf325-8530-430d-a3d2-a54043447021", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 84, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sec_df has filename as unique ID\n", + "sec_df.filename.is_unique" + ] + }, + { + "cell_type": "markdown", + "id": "ceed053b-f6ae-4aad-8b12-b2083ba8e236", + "metadata": {}, + "source": [ + "Note: not removing paragraph layout docs, but maybe should" + ] + }, + { + "cell_type": "markdown", + "id": "1bb694c9-cfbd-4e2f-b69c-9996a588d2d2", + "metadata": { + "tags": [] + }, + "source": [ + "# Match Ex. 21 Subsidiaries to a SEC filer" + ] + }, + { + "cell_type": "markdown", + "id": "01d3a5e1-ad17-4266-b2ef-358f246749db", + "metadata": { + "tags": [] + }, + "source": [ + "## Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 85, + "id": "4df63893-8a18-4b00-9b16-d036108bd567", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
statestate_of_incorporation
1nyde
2nyde
5camd
6gade
7njde
.........
8265nyde
8266txde
8267nyoh
8268txde
8269ctde
\n", + "

5051 rows × 2 columns

\n", + "
" + ], + "text/plain": [ + " state state_of_incorporation\n", + "1 ny de\n", + "2 ny de\n", + "5 ca md\n", + "6 ga de\n", + "7 nj de\n", + "... ... ...\n", + "8265 ny de\n", + "8266 tx de\n", + "8267 ny oh\n", + "8268 tx de\n", + "8269 ct de\n", + "\n", + "[5051 rows x 2 columns]" + ] + }, + "execution_count": 85, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_df[(sec_df[\"state\"] != sec_df[\"state_of_incorporation\"]) & (~sec_df[\"state_of_incorporation\"].isnull())][[\"state\", \"state_of_incorporation\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 86, + "id": "24890018-8efb-445f-ad91-ca316edccbe8", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "83f859df-1764-4e97-addc-0064bdcb31b7", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "loc_of_incorporation\n", + "False 6359\n", + "True 748\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 87, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_match_df[\"loc_of_incorporation\"].isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "id": "e9d0828f-0ad8-41ea-a449-ddd274a888d0", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ex21_match_df = ex21_df.copy()" + ] + }, + { + "cell_type": "markdown", + "id": "ef3f01c7-c21e-4755-ac99-4ea01f359c43", + "metadata": {}, + "source": [ + "Remove clearly \"invalid\" strings and fill nulls" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "4ca07927-185d-4bc6-978a-e8788a8f77b3", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "company_name\n", + "rush truck center 120\n", + "encompass health rehabilitation hospital 79\n", + "rush peterbilt truck center 57\n", + "branch 52\n", + "sci funeral services, llc iowa limited liability company 33\n", 
+ "partnership limited partnership 32\n", + "alderwoods group, llc de limited liability company 27\n", + "encompass health rehabilitation hospital of 26\n", + "u haul co. of 26\n", + "at&t 25\n", + "corporation 21\n", + "amh portfolio management 20\n", + "rush bus center 20\n", + "limited partnership limited partnership 18\n", + "rapy limited partnership 15\n", + "rush isuzu trucks 15\n", + "colgate palmolive limited 14\n", + "ecolab limited 11\n", + "rush truck centres 11\n", + "johnson and johnson limited 11\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 89, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_match_df.company_name.value_counts().head(20)" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "8a4839e5-a2e5-4098-826a-4d340cdde638", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ex21_match_df = ex21_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]\n", + "sec_match_df = sec_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" + ] + }, + { + "cell_type": "markdown", + "id": "c294372b-159c-4c90-a031-61c34532b965", + "metadata": {}, + "source": [ + "## Exploratory Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "c9dbc620-ed49-4a8e-9d02-6b6f2e0a14cf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from splink.exploratory import completeness_chart, profile_columns\n", + "from splink import DuckDBAPI\n", + "\n", + "db_api = DuckDBAPI()" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "id": "422ca098-e4e7-4284-8b04-74e976e36023", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "match_cols = [\"report_year\", \"company_name\", \"loc_of_incorporation\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "id": "232b5718-c1ed-4e63-8384-b4acf33210d3", + 
"metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# sometimes this will show up as 100% non null in loc_of_incorporation, not sure why\n", + "completeness_chart(ex21_match_df[match_cols], db_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "id": "520a9b86", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completeness_chart(sec_match_df[match_cols], db_api=db_api)" + ] + }, + { + "cell_type": "markdown", + "id": "6b6b20bc-cd22-42cc-b24d-8d581a311ca8", + "metadata": {}, + "source": [ + "There is strong skew in the location of incorporation field with around 40-50% of the values being Delaware in both datasets. We therefore want to use `term_frequency_adjustments` in our linkage model." + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "id": "a5c26016-2c59-4335-bd39-8b2e7ea91840", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 95, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(ex21_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "id": "2a57f717-140f-434d-8998-983b8bf38ac5", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "1f258250-97c1-4f19-b535-cb91ff9e0ea9", + "metadata": { + "tags": [] + }, + "source": [ + "## Blocking\n", + "\n", + "The subsidiary-to-filer match might be feasible without blocking, but we probably want a blocking rule. \n", + "\n", + "TODO: can we block on the nearest 5 report years instead of requiring an exact report year match?" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "id": "fb6d143b-5201-4b31-849c-97db80781ade", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "from splink import block_on\n", + "from splink.blocking_analysis import count_comparisons_from_blocking_rule, n_largest_blocks" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "id": "22766c9f-7371-483f-82b0-015549a84357", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "br = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "60937a9c-dff6-4d68-808f-81b8228fc9f6", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "{'number_of_comparisons_generated_pre_filter_conditions': 2069828,\n", + " 'number_of_comparisons_to_be_scored_post_filter_conditions': 2069828,\n", + " 'filter_conditions_identified': '',\n", + " 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 3) = SUBSTRING(r.company_name_mphone, 1, 3)',\n", + " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" + ] + }, + 
"execution_count": 104, + 
"metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# br0 = block_on(\"report_year\", \"report_year\")\n", + "# br1 = \"jaccard(l.company_name, r.company_name) < .1\"\n", + "# br2 = block_on(\"company_name\", \"company_name\")\n", + "\n", + "counts = count_comparisons_from_blocking_rule(\n", + " table_or_tables=[sec_match_df, ex21_match_df],\n", + " blocking_rule=br,\n", + " link_type=\"link_only\",\n", + " unique_id_column_name='record_id',\n", + " db_api=db_api,\n", + ")\n", + "\n", + "counts" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "id": "67717313-2c17-4b6b-b984-8f7bc955c678", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
key_0key_1count_lcount_rblock_count
02023STR68129788196
12023INT62127579050
22023KRN60129077400
\n", + "
" + ], + "text/plain": [ + " key_0 key_1 count_l count_r block_count\n", + "0 2023 STR 68 1297 88196\n", + "1 2023 INT 62 1275 79050\n", + "2 2023 KRN 60 1290 77400" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = n_largest_blocks(\n", + " table_or_tables=[sec_match_df, ex21_match_df],\n", + " blocking_rule=br,\n", + " link_type=\"link_only\",\n", + " db_api=db_api,\n", + " n_largest=3\n", + ")\n", + "\n", + "result.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "id": "6fe6fb99-f5fd-4538-a8bc-c9dd41f4ff9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from splink.blocking_analysis import (\n", + " cumulative_comparisons_to_be_scored_from_blocking_rules_chart,\n", + ")\n", + "\n", + "blocking_rules_for_analysis = [\n", + " # block_on(\"substr(l.company_name_mphone,1,3)\", \"substr(r.company_name_mphone,1,3)\"),\n", + " \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"\n", + "]\n", + "\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=[sec_match_df, ex21_match_df],\n", + " blocking_rules=blocking_rules_for_analysis,\n", + " db_api=db_api,\n", + " unique_id_column_name='record_id',\n", + " link_type=\"link_only\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "b553f3fb-0661-46ab-b43c-f5fcba608a09", + "metadata": {}, + "source": [ + "## Create Model\n", + "\n", + "Should we deduplicate the Ex. 21 data first, and then link it to the SEC filers?" 
+ ] + }, + { + "cell_type": "code", + "execution_count": 108, + "id": "1f12d114-22fd-4f12-a0be-6a62500e80d5", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import splink.comparison_library as cl\n", + "from splink import Linker, SettingsCreator" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "bb13b160-b554-45d6-a575-5fa2de061350", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'NameComparison' of \"company_name\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n", + " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n", + " - 'Jaro-Winkler distance of company_name >= 0.92' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.92\n", + " - 'Jaro-Winkler distance of company_name >= 0.88' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.88\n", + " - 'Jaro-Winkler distance of company_name >= 0.7' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.7\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "company_name_comparison = cl.NameComparison(\n", + " \"company_name\",\n", + " # dmeta_col_name=\"company_name_mphone\" # this was breaking it for some reason\n", + ")\n", + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "id": "7d2697d3-efdb-4be4-8911-18b457f5bab4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'JaroWinklerAtThresholds' of \"loc_of_incorporation\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'loc_of_incorporation is NULL' 
with SQL rule: \"loc_of_incorporation_l\" IS NULL OR \"loc_of_incorporation_r\" IS NULL\n", + " - 'Exact match on loc_of_incorporation' with SQL rule: \"loc_of_incorporation_l\" = \"loc_of_incorporation_r\"\n", + " - 'Jaro-Winkler distance of loc_of_incorporation >= 0.9' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.9\n", + " - 'Jaro-Winkler distance of loc_of_incorporation >= 0.7' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.7\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "# try with Levenshtein too\n", + "location_comparison = cl.JaroWinklerAtThresholds(\n", + " \"loc_of_incorporation\",\n", + ")\n", + "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "id": "92c1ad6b-4516-4ab4-90eb-394669c4a02b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "settings = SettingsCreator(\n", + " link_type=\"link_only\",\n", + " unique_id_column_name=\"record_id\",\n", + " comparisons=[\n", + " company_name_comparison,\n", + " location_comparison.configure(term_frequency_adjustments=True)\n", + " ],\n", + " blocking_rules_to_generate_predictions=[\n", + " br\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker([sec_match_df, ex21_match_df], settings, db_api=DuckDBAPI())" + ] + }, + { + "cell_type": "markdown", + "id": "2f293657-b40c-4539-8abd-8524d11c39c0", + "metadata": {}, + "source": [ + "Estimate probability two random records match" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "id": "e9eb59b9-49cc-45b7-8ffa-b8f7e5372608", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f8061ccbd73c426daa2d35dbf68e55fb", + "version_major": 2, + "version_minor": 0 + }, + 
"text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 0.000689.\n", + "This means that amongst all possible pairwise record comparisons, one in 1,452.36 are expected to match. With 1,365,709,548 total possible comparisons, we expect a total of around 940,336.47 matching pairs\n" + ] + } + ], + "source": [ + "deterministic_rules = [\n", + " block_on(\"company_name_mphone\", \"company_name_mphone\"),\n", + " \"jaccard(r.company_name, l.company_name) >= .9 and l.loc_of_incorporation = r.loc_of_incorporation\",\n", + " \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .8\",\n", + " # \"substr(l.company_name_mphone,1,5) = substr(r.company_name_mphone,1,5) and l.loc_of_incorporation = r.loc_of_incorporation\"\n", + "]\n", + "\n", + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.85)" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "id": "5117653e-e72b-4c13-b923-d1228b39d357", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "e732ac0702e4459b82b86d2de5c9d9fc", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n", + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - company_name (no m values are trained).\n", + " - loc_of_incorporation (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e7)" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "id": "8b089a0d-4c91-4b4d-9806-ed83c9bd59b9", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"company_name_mphone\" = r.\"company_name_mphone\") AND (l.\"company_name_mphone\" = r.\"company_name_mphone\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - company_name\n", + " - loc_of_incorporation\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + "\n", + "Iteration 1: Largest change in params was -0.213 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`\n", + "Iteration 2: Largest change in params was 0.243 in the m_probability of loc_of_incorporation, level `All other comparisons`\n", + "Iteration 3: Largest change in params was 0.0314 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.88`\n", + "Iteration 4: Largest change in params was 0.0052 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 5: Largest change in params was 0.0087 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 6: Largest change in params was 0.0133 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 7: Largest change in params was 0.0188 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 
8: Largest change in params was 0.0246 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 9: Largest change in params was 0.0297 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 10: Largest change in params was 0.0332 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 11: Largest change in params was 0.0346 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 12: Largest change in params was 0.0336 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 13: Largest change in params was 0.0306 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 14: Largest change in params was 0.0264 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 15: Largest change in params was 0.0218 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 16: Largest change in params was 0.0173 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 17: Largest change in params was 0.0134 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 18: Largest change in params was 0.0102 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 19: Largest change in params was 0.00758 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 20: Largest change in params was 0.00559 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 21: Largest change in params was 0.00409 in the m_probability of 
company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 22: Largest change in params was 0.00298 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 23: Largest change in params was 0.00216 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 24: Largest change in params was 0.00156 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 25: Largest change in params was 0.00112 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "\n", + "EM converged after 25 iterations\n", + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"company_name_mphone\", \"company_name_mphone\")\n", + "training_session_fname_sname = (\n", + " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "id": "88e058bc-800d-4da4-92aa-6ddb7377b4bf", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "id": "673a4776-1de1-46ce-a411-f7fd1668d54f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 116, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.m_u_parameters_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ebf9e326-38f1-4d78-b302-15867cda1009", + "metadata": {}, + "outputs": [], + "source": [ + "settings = linker.misc.save_model_to_json(\n", + " \"../sec_ex21_model_settings/2023_model.json\", overwrite=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "a14055d2-6761-4906-8555-35c92553a0e9", + "metadata": {}, + "source": [ + "Log model in MLFlow." + ] + }, + { + "cell_type": "markdown", + "id": "dfe4feca-e694-4ec6-a5b0-11382c740516", + "metadata": {}, + "source": [ + "## Make predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "id": "72ff6575-68e3-4256-8253-85eb2564501f", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Blocking time: 0.37 seconds\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "d550d84b328c4d3082bd7cf5d03b803b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Predict time: 78.84 seconds\n" + ] + } + ], + "source": [ + "df_predictions = linker.inference.predict(threshold_match_probability=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "24e14675-11cf-4c46-a592-7733326113d2", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "preds_df = df_predictions.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 123, + "id": "3d733c2a-7004-4ce8-8d3f-25ed1e720c36", + "metadata": { + "tags": [] + }, + "outputs": [ + { + 
"data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_lcompany_name_rgamma_company_nametf_company_name_ltf_company_name_rbf_company_namebf_tf_adj_company_nameloc_of_incorporation_lloc_of_incorporation_rgamma_loc_of_incorporationtf_loc_of_incorporation_ltf_loc_of_incorporation_rbf_loc_of_incorporationbf_tf_adj_loc_of_incorporationreport_year_lreport_year_rcompany_name_mphone_lcompany_name_mphone_r
53740.0089140.501545__splink__input_table_0__splink__input_table_169167681manitowoc co incorporatedmanitowoc crane companies, llc mcg10.0000050.00000512.5343191.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023MNTWK K INKRPRTTMNTWK KRN KMPNS LK MKK
14520.0089140.501545__splink__input_table_0__splink__input_table_139951003schneider national, incorporated33.schneider logistics, incorporated10.0000050.00000512.5343191.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023SXNTR NXNL INKRPRTTSXNTR LJSTKS INKRPRTT
41850.0089140.501545__splink__input_table_0__splink__input_table_14856819wisconsin electric power companywisconsin energy capital corporation10.0000100.00000512.5343191.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023WSKNSN ELKTRK PWR KMPNWSKNSN ENRJ KPTL KRPRXN
39070.0089140.501545__splink__input_table_0__splink__input_table_118361390orion energy systems, incorporatedwilson funeral home, incorporated10.0000050.00000512.5343191.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023ORN ENRJ SSTMS INKRPRTTWLSN FNRL HM INKRPRTT
14260.0089140.501545__splink__input_table_0__splink__input_table_139951010schneider national, incorporated40.schneider resources, incorporated10.0000050.00000512.5343191.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023SXNTR NXNL INKRPRTTSXNTR RSRSS INKRPRTT
...........................................................................
467213.2322660.999896__splink__input_table_0__splink__input_table_165684608wesbanco incorporatedwesbanco, incorporated30.0000050.00000535295.4377531.000000west virginiawest virginia30.0012070.0012072.32178170.42967220232023WSBNK INKRPRTTWSBNK INKRPRTT
182913.2570620.999898__splink__input_table_0__splink__input_table_14974974berkshire hathaway energy companyberkshire hathaway energy company40.0000100.000010695779.2731160.053272iowaiowa30.0012460.0012462.32178165.10374520232023BRKXR H0W ENRJ KMPNBRKXR H0W ENRJ KMPN
645813.5508730.999917__splink__input_table_0__splink__input_table_13842749shiftpixy, incorporatedshiftpixy labs, incorporated30.0000050.00000535295.4377531.000000wyomingwyoming30.0009680.0009682.32178212.54735020232023XFTPKS INKRPRTTXFTPKS LBS INKRPRTT
133013.6214740.999921__splink__input_table_0__splink__input_table_14088476securetech innovations, incorporatedsecuretech innovations, incorporated40.0000100.000010695779.2731160.053272wyomingwyoming30.0009680.0009682.32178212.54735020232023SKRTX INFXNS INKRPRTTSKRTX INFXNS INKRPRTT
618614.2064360.999947__splink__input_table_0__splink__input_table_181162004southwestern public service companysouthwestern public service company40.0000100.000010695779.2731160.053272new mexiconew mexico30.0006450.0006452.32178318.82102420232023S0WSTRN PBLK SRFS KMPNS0WSTRN PBLK SRFS KMPN
\n", + "

7540 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name tf_company_name_l tf_company_name_r bf_company_name bf_tf_adj_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation report_year_l report_year_r company_name_mphone_l company_name_mphone_r\n", + "5374 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 6916 7681 manitowoc co incorporated manitowoc crane companies, llc mcg 1 0.000005 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 MNTWK K INKRPRTT MNTWK KRN KMPNS LK MKK\n", + "1452 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 3995 1003 schneider national, incorporated 33.schneider logistics, incorporated 1 0.000005 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 SXNTR NXNL INKRPRTT SXNTR LJSTKS INKRPRTT\n", + "4185 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 485 6819 wisconsin electric power company wisconsin energy capital corporation 1 0.000010 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 WSKNSN ELKTRK PWR KMPN WSKNSN ENRJ KPTL KRPRXN\n", + "3907 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 1836 1390 orion energy systems, incorporated wilson funeral home, incorporated 1 0.000005 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 ORN ENRJ SSTMS INKRPRTT WLSN FNRL HM INKRPRTT\n", + "1426 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 3995 1010 schneider national, incorporated 40.schneider resources, incorporated 1 0.000005 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 SXNTR NXNL 
INKRPRTT SXNTR RSRSS INKRPRTT\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "4672 13.232266 0.999896 __splink__input_table_0 __splink__input_table_1 6568 4608 wesbanco incorporated wesbanco, incorporated 3 0.000005 0.000005 35295.437753 1.000000 west virginia west virginia 3 0.001207 0.001207 2.32178 170.429672 2023 2023 WSBNK INKRPRTT WSBNK INKRPRTT\n", + "1829 13.257062 0.999898 __splink__input_table_0 __splink__input_table_1 497 4974 berkshire hathaway energy company berkshire hathaway energy company 4 0.000010 0.000010 695779.273116 0.053272 iowa iowa 3 0.001246 0.001246 2.32178 165.103745 2023 2023 BRKXR H0W ENRJ KMPN BRKXR H0W ENRJ KMPN\n", + "6458 13.550873 0.999917 __splink__input_table_0 __splink__input_table_1 3842 749 shiftpixy, incorporated shiftpixy labs, incorporated 3 0.000005 0.000005 35295.437753 1.000000 wyoming wyoming 3 0.000968 0.000968 2.32178 212.547350 2023 2023 XFTPKS INKRPRTT XFTPKS LBS INKRPRTT\n", + "1330 13.621474 0.999921 __splink__input_table_0 __splink__input_table_1 4088 476 securetech innovations, incorporated securetech innovations, incorporated 4 0.000010 0.000010 695779.273116 0.053272 wyoming wyoming 3 0.000968 0.000968 2.32178 212.547350 2023 2023 SKRTX INFXNS INKRPRTT SKRTX INFXNS INKRPRTT\n", + "6186 14.206436 0.999947 __splink__input_table_0 __splink__input_table_1 8116 2004 southwestern public service company southwestern public service company 4 0.000010 0.000010 695779.273116 0.053272 new mexico new mexico 3 0.000645 0.000645 2.32178 318.821024 2023 2023 S0WSTRN PBLK SRFS KMPN S0WSTRN PBLK SRFS KMPN\n", + "\n", + "[7540 rows x 24 columns]" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df.sort_values(by=\"match_probability\")" + ] + }, + { + "cell_type": "code", + "execution_count": 238, + "id": "255272b6-a5c4-4ab8-bebc-d13e77655938", + "metadata": { + "tags": [] + }, + "outputs": [ + 
{ + "data": { + "text/plain": [ + "Index(['match_weight', 'match_probability', 'source_dataset_l',\n", + " 'source_dataset_r', 'record_id_l', 'record_id_r', 'company_name_l',\n", + " 'company_name_r', 'gamma_company_name', 'tf_company_name_l',\n", + " 'tf_company_name_r', 'bf_company_name', 'bf_tf_adj_company_name',\n", + " 'loc_of_incorporation_l', 'loc_of_incorporation_r',\n", + " 'gamma_loc_of_incorporation', 'tf_loc_of_incorporation_l',\n", + " 'tf_loc_of_incorporation_r', 'bf_loc_of_incorporation',\n", + " 'bf_tf_adj_loc_of_incorporation', 'company_name_mphone_l',\n", + " 'company_name_mphone_r', 'report_year_l', 'report_year_r'],\n", + " dtype='object')" + ] + }, + "execution_count": 238, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 249, + "id": "8e658c36-7b6f-480f-9d74-37af9510ebe2", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_probabilitycompany_name_lcompany_name_rloc_of_incorporation_lloc_of_incorporation_rcompany_name_mphone_lcompany_name_mphone_r
1500.996128santander drive auto receivables trust 2018-1santander drive auto receivables trustdelawaredelawareSNTNTR TRF AT RSFBLS TRSTSNTNTR TRF AT RSFBLS TRST
1510.996128santander drive auto receivables trust 2018-5santander drive auto receivables trustdelawaredelawareSNTNTR TRF AT RSFBLS TRSTSNTNTR TRF AT RSFBLS TRST
1520.996128santander drive auto receivables trust 2018-3santander drive auto receivables trustdelawaredelawareSNTNTR TRF AT RSFBLS TRSTSNTNTR TRF AT RSFBLS TRST
1530.996128santander drive auto receivables trust 2016-1santander drive auto receivables trustdelawaredelawareSNTNTR TRF AT RSFBLS TRSTSNTNTR TRF AT RSFBLS TRST
1540.573277constellation pharmaceuticals incconstellation connect, llcdelawaredelawareKNSTLXN FRMSTKLS INKKNSTLXN KNKT LK
1620.959568consolidated communications holdings, inc.consolidated communications ofdelawareillinoisKNSLTTT KMNKXNS HLTNKS INKKNSLTTT KMNKXNS OF
1630.959568consolidated communications holdings, inc.consolidated communications ofdelawaremissouriKNSLTTT KMNKXNS HLTNKS INKKNSLTTT KMNKXNS OF
1640.959568consolidated communications holdings, inc.consolidated communications ofdelawaremaineKNSLTTT KMNKXNS HLTNKS INKKNSLTTT KMNKXNS OF
1650.959568consolidated communications holdings, inc.consolidated communications ofdelawarekansasKNSLTTT KMNKXNS HLTNKS INKKNSLTTT KMNKXNS OF
1660.959568consolidated communications holdings, inc.consolidated communications ofdelawareminnesotaKNSLTTT KMNKXNS HLTNKS INKKNSLTTT KMNKXNS OF
1670.959568consolidated communications holdings, inc.consolidated communications ofdelawarecentralKNSLTTT KMNKXNS HLTNKS INKKNSLTTT KMNKXNS OF
1680.959568consolidated communications holdings, inc.consolidated communications ofdelawarefloridaKNSLTTT KMNKXNS HLTNKS INKKNSLTTT KMNKXNS OF
1690.959568consolidated communications holdings, inc.consolidated communications ofdelawarecaliforniaKNSLTTT KMNKXNS HLTNKS INKKNSLTTT KMNKXNS OF
1740.573277duke energy corpduke energy one, incdelawaredelawareTK ENRJ KRPTK ENRJ ON INK
1770.573277verus international, inc.emcor international, incdelawaredelawareFRS INTRNXNL INKEMKR INTRNXNL INK
1780.573277verus international, inc.emcor international, incdelawaredelawareFRS INTRNXNL INKEMKR INTRNXNL INK
1790.714594green plains inc.green plains superior llc fka superioriowaiowaKRN PLNS INKKRN PLNS SPRR LK FK SPRR
1830.996128duke energy corpduke energy group, llcdelawaredelawareTK ENRJ KRPTK ENRJ KRP LK
1950.884993green stream holdings inc.western gas wyoming, l.l.cwyomingwyomingKRN STRM HLTNKS INKWSTRN KS YMNK LLK
1960.884993green stream holdings inc.western gas wyoming, l.l.cwyomingwyomingKRN STRM HLTNKS INKWSTRN KS YMNK LLK
1970.992184fortress biotech, inc.fortress biotech, china, incdelawareNoneFRTRS BTX INKFRTRS BTX XN INK
1990.996128duke energy corpduke energy china corpdelawaredelawareTK ENRJ KRPTK ENRJ XN KRP
2000.573277duke energy corpduke energy corporate services, incdelawaredelawareTK ENRJ KRPTK ENRJ KRPRT SRFSS INK
2030.573277apollo global management, inc.apollo belenos management llcdelawaredelawareAPL KLBL MNJMNT INKAPL BLNS MNJMNT LK
2040.573277apollo global management, inc.apollo belenos management llcdelawaredelawareAPL KLBL MNJMNT INKAPL BLNS MNJMNT LK
2060.981099columbia property trust, inc.columbia courtyard, incmarylandmarylandKLMB PRPRT TRST INKKLMB KRTYRT INK
2080.573277duke energy corpduke energy beckjord, llcdelawaredelawareTK ENRJ KRPTK ENRJ BKJRT LK
2090.573277duke energy corpduke energy beckjord storage llcdelawaredelawareTK ENRJ KRPTK ENRJ BKJRT STRJ LK
2100.573277duke energy corpduke energy acp, llcdelawaredelawareTK ENRJ KRPTK ENRJ AKP LK
2130.981099spirit realty capital, inc.spirit reit, incmarylandmarylandSPRT RLT KPTL INKSPRT RT INK
2150.573277apollo global management, inc.apollo na management ii, llcdelawaredelawareAPL KLBL MNJMNT INKAPL N MNJMNT LK
2160.573277apollo global management, inc.apollo na management ii, llcdelawaredelawareAPL KLBL MNJMNT INKAPL N MNJMNT LK
2250.992184fortress biotech, inc.fortress biotech, china, incdelawareNoneFRTRS BTX INKFRTRS BTX XN INK
2260.573277green brick partners, inc.green brick mortgage, llcdelawaredelawareKRN BRK PRTNRS INKKRN BRK MRTKJ LK
2270.573277duke energy corpduke energy beckjord storage llcdelawaredelawareTK ENRJ KRPTK ENRJ BKJRT STRJ LK
2280.959568green plains inc.green plains madison llciowadelawareKRN PLNS INKKRN PLNS MTSN LK
2420.959568great lakes dredge & dock corpgreat lakes dredge & dock do brasil ltdadelawarebrazilKRT LKS TRJ TK KRPKRT LKS TRJ TK T BRSL LTT
2430.573277great lakes dredge & dock corpgreat lakes dredge & dock environmental, incdelawaredelawareKRT LKS TRJ TK KRPKRT LKS TRJ TK ENFRNMNTL INK
2440.996128great lakes dredge & dock corpgreat lakes dredge & dock company, llcdelawaredelawareKRT LKS TRJ TK KRPKRT LKS TRJ TK KMPN LK
2510.573277blackstone group incblackstone pb ii l.l.cdelawaredelawareBLKSTN KRP INKBLKSTN PB LLK
2520.573277blackstone group incblackstone pb i l.l.cdelawaredelawareBLKSTN KRP INKBLKSTN PB I LLK
2540.573277duke energy corpduke energy acp, llcdelawaredelawareTK ENRJ KRPTK ENRJ AKP LK
2550.573277duke energy corpduke energy shoreham, llcdelawaredelawareTK ENRJ KRPTK ENRJ XRHM LK
2560.573277duke energy corpduke energy sam, llcdelawaredelawareTK ENRJ KRPTK ENRJ SM LK
2570.573277blackstone group incblackstone obs l.l.cdelawaredelawareBLKSTN KRP INKBLKSTN OBS LLK
2640.992184freightcar america, inc.freightcar america leasing, llcdelawareNoneFRTKR AMRK INKFRTKR AMRK LSNK LK
2650.992184freightcar america, inc.freightcar america leasing, llcdelawareNoneFRTKR AMRK INKFRTKR AMRK LSNK LK
2660.959568qurate retail, inc.qurate retail group, incenglewooddeKRT RTL INKKRT RTL KRP INK
2670.884993green stream holdings inc.western gas wyoming, l.l.cwyomingwyomingKRN STRM HLTNKS INKWSTRN KS YMNK LLK
2680.884993green stream holdings inc.western gas wyoming, l.l.cwyomingwyomingKRN STRM HLTNKS INKWSTRN KS YMNK LLK
\n", + "
" + ], + "text/plain": [ + " match_probability company_name_l company_name_r loc_of_incorporation_l loc_of_incorporation_r company_name_mphone_l company_name_mphone_r\n", + "150 0.996128 santander drive auto receivables trust 2018-1 santander drive auto receivables trust delaware delaware SNTNTR TRF AT RSFBLS TRST SNTNTR TRF AT RSFBLS TRST\n", + "151 0.996128 santander drive auto receivables trust 2018-5 santander drive auto receivables trust delaware delaware SNTNTR TRF AT RSFBLS TRST SNTNTR TRF AT RSFBLS TRST\n", + "152 0.996128 santander drive auto receivables trust 2018-3 santander drive auto receivables trust delaware delaware SNTNTR TRF AT RSFBLS TRST SNTNTR TRF AT RSFBLS TRST\n", + "153 0.996128 santander drive auto receivables trust 2016-1 santander drive auto receivables trust delaware delaware SNTNTR TRF AT RSFBLS TRST SNTNTR TRF AT RSFBLS TRST\n", + "154 0.573277 constellation pharmaceuticals inc constellation connect, llc delaware delaware KNSTLXN FRMSTKLS INK KNSTLXN KNKT LK\n", + "162 0.959568 consolidated communications holdings, inc. consolidated communications of delaware illinois KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", + "163 0.959568 consolidated communications holdings, inc. consolidated communications of delaware missouri KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", + "164 0.959568 consolidated communications holdings, inc. consolidated communications of delaware maine KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", + "165 0.959568 consolidated communications holdings, inc. consolidated communications of delaware kansas KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", + "166 0.959568 consolidated communications holdings, inc. consolidated communications of delaware minnesota KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", + "167 0.959568 consolidated communications holdings, inc. 
consolidated communications of delaware central KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", + "168 0.959568 consolidated communications holdings, inc. consolidated communications of delaware florida KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", + "169 0.959568 consolidated communications holdings, inc. consolidated communications of delaware california KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", + "174 0.573277 duke energy corp duke energy one, inc delaware delaware TK ENRJ KRP TK ENRJ ON INK\n", + "177 0.573277 verus international, inc. emcor international, inc delaware delaware FRS INTRNXNL INK EMKR INTRNXNL INK\n", + "178 0.573277 verus international, inc. emcor international, inc delaware delaware FRS INTRNXNL INK EMKR INTRNXNL INK\n", + "179 0.714594 green plains inc. green plains superior llc fka superior iowa iowa KRN PLNS INK KRN PLNS SPRR LK FK SPRR\n", + "183 0.996128 duke energy corp duke energy group, llc delaware delaware TK ENRJ KRP TK ENRJ KRP LK\n", + "195 0.884993 green stream holdings inc. western gas wyoming, l.l.c wyoming wyoming KRN STRM HLTNKS INK WSTRN KS YMNK LLK\n", + "196 0.884993 green stream holdings inc. western gas wyoming, l.l.c wyoming wyoming KRN STRM HLTNKS INK WSTRN KS YMNK LLK\n", + "197 0.992184 fortress biotech, inc. fortress biotech, china, inc delaware None FRTRS BTX INK FRTRS BTX XN INK\n", + "199 0.996128 duke energy corp duke energy china corp delaware delaware TK ENRJ KRP TK ENRJ XN KRP\n", + "200 0.573277 duke energy corp duke energy corporate services, inc delaware delaware TK ENRJ KRP TK ENRJ KRPRT SRFSS INK\n", + "203 0.573277 apollo global management, inc. apollo belenos management llc delaware delaware APL KLBL MNJMNT INK APL BLNS MNJMNT LK\n", + "204 0.573277 apollo global management, inc. apollo belenos management llc delaware delaware APL KLBL MNJMNT INK APL BLNS MNJMNT LK\n", + "206 0.981099 columbia property trust, inc. 
columbia courtyard, inc maryland maryland KLMB PRPRT TRST INK KLMB KRTYRT INK\n", + "208 0.573277 duke energy corp duke energy beckjord, llc delaware delaware TK ENRJ KRP TK ENRJ BKJRT LK\n", + "209 0.573277 duke energy corp duke energy beckjord storage llc delaware delaware TK ENRJ KRP TK ENRJ BKJRT STRJ LK\n", + "210 0.573277 duke energy corp duke energy acp, llc delaware delaware TK ENRJ KRP TK ENRJ AKP LK\n", + "213 0.981099 spirit realty capital, inc. spirit reit, inc maryland maryland SPRT RLT KPTL INK SPRT RT INK\n", + "215 0.573277 apollo global management, inc. apollo na management ii, llc delaware delaware APL KLBL MNJMNT INK APL N MNJMNT LK\n", + "216 0.573277 apollo global management, inc. apollo na management ii, llc delaware delaware APL KLBL MNJMNT INK APL N MNJMNT LK\n", + "225 0.992184 fortress biotech, inc. fortress biotech, china, inc delaware None FRTRS BTX INK FRTRS BTX XN INK\n", + "226 0.573277 green brick partners, inc. green brick mortgage, llc delaware delaware KRN BRK PRTNRS INK KRN BRK MRTKJ LK\n", + "227 0.573277 duke energy corp duke energy beckjord storage llc delaware delaware TK ENRJ KRP TK ENRJ BKJRT STRJ LK\n", + "228 0.959568 green plains inc. 
green plains madison llc iowa delaware KRN PLNS INK KRN PLNS MTSN LK\n", + "242 0.959568 great lakes dredge & dock corp great lakes dredge & dock do brasil ltda delaware brazil KRT LKS TRJ TK KRP KRT LKS TRJ TK T BRSL LTT\n", + "243 0.573277 great lakes dredge & dock corp great lakes dredge & dock environmental, inc delaware delaware KRT LKS TRJ TK KRP KRT LKS TRJ TK ENFRNMNTL INK\n", + "244 0.996128 great lakes dredge & dock corp great lakes dredge & dock company, llc delaware delaware KRT LKS TRJ TK KRP KRT LKS TRJ TK KMPN LK\n", + "251 0.573277 blackstone group inc blackstone pb ii l.l.c delaware delaware BLKSTN KRP INK BLKSTN PB LLK\n", + "252 0.573277 blackstone group inc blackstone pb i l.l.c delaware delaware BLKSTN KRP INK BLKSTN PB I LLK\n", + "254 0.573277 duke energy corp duke energy acp, llc delaware delaware TK ENRJ KRP TK ENRJ AKP LK\n", + "255 0.573277 duke energy corp duke energy shoreham, llc delaware delaware TK ENRJ KRP TK ENRJ XRHM LK\n", + "256 0.573277 duke energy corp duke energy sam, llc delaware delaware TK ENRJ KRP TK ENRJ SM LK\n", + "257 0.573277 blackstone group inc blackstone obs l.l.c delaware delaware BLKSTN KRP INK BLKSTN OBS LLK\n", + "264 0.992184 freightcar america, inc. freightcar america leasing, llc delaware None FRTKR AMRK INK FRTKR AMRK LSNK LK\n", + "265 0.992184 freightcar america, inc. freightcar america leasing, llc delaware None FRTKR AMRK INK FRTKR AMRK LSNK LK\n", + "266 0.959568 qurate retail, inc. qurate retail group, inc englewood de KRT RTL INK KRT RTL KRP INK\n", + "267 0.884993 green stream holdings inc. western gas wyoming, l.l.c wyoming wyoming KRN STRM HLTNKS INK WSTRN KS YMNK LLK\n", + "268 0.884993 green stream holdings inc. 
western gas wyoming, l.l.c wyoming wyoming KRN STRM HLTNKS INK WSTRN KS YMNK LLK" + ] + }, + "execution_count": 249, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df[preds_df.match_probability >= .5][[\"match_probability\", \"company_name_l\", \"company_name_r\", \"loc_of_incorporation_l\", \"loc_of_incorporation_r\", \"company_name_mphone_l\", \"company_name_mphone_r\"]].iloc[150:200]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mozilla_sec_eia", + "language": "python", + "name": "mozilla_sec_eia" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/17-kl-paragraph-layout-metrics.ipynb b/notebooks/17-kl-paragraph-layout-metrics.ipynb new file mode 100644 index 0000000..f7c3a8d --- /dev/null +++ b/notebooks/17-kl-paragraph-layout-metrics.ipynb @@ -0,0 +1,687 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "748b07d1-61ac-43b8-bff9-9f660626da1b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "bb513a3e-31f7-49da-895b-e3ed4f52efd4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import os\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "\n", + "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "29c9b2e0-7f2f-4ab7-9972-f1ed30ff196a", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "archive = GCSArchive()\n", + "md = archive.get_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": 
"1608bf1e-d6cf-4e3a-8f69-0e62744d0dfd", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
cikcompany_nameform_typedate_filedexhibit_21_versionyear_quarter
filename
edgar/data/17206/0000017206-94-000007.txt17206CAPITAL HOLDING CORP10-K/A1993-12-22None1993q4
edgar/data/29082/0000950131-94-000021.txt29082DISNEY WALT CO10-K1993-12-22211993q4
edgar/data/32377/0000032377-94-000001.txt32377ELIZABETHTOWN GAS CO10-K1993-12-13211993q4
edgar/data/353944/0000353944-94-000005.txt353944INTERNATIONAL GAME TECHNOLOGY10-K1993-12-23211993q4
edgar/data/60512/0000060512-94-000006.txt60512LOUISIANA LAND & EXPLORATION CO10-K/A1993-10-07None1993q4
.....................
edgar/data/932021/0001493152-23-046428.txt932021GLOBAL TECHNOLOGIES LTD10-K2023-12-2921.12023q4
edgar/data/933974/0001558370-23-019262.txt933974Azenta, Inc.10-K2023-11-2121.02023q4
edgar/data/935419/0001628280-23-041580.txt935419RCI HOSPITALITY HOLDINGS, INC.10-K2023-12-1421.12023q4
edgar/data/936395/0000936395-23-000044.txt936395CIENA CORP10-K2023-12-1521.12023q4
edgar/data/936528/0000936528-23-000207.txt936528WAFD INC10-K2023-11-17None2023q4
\n", + "

290379 rows × 6 columns

\n", + "
" + ], + "text/plain": [ + " cik \\\n", + "filename \n", + "edgar/data/17206/0000017206-94-000007.txt 17206 \n", + "edgar/data/29082/0000950131-94-000021.txt 29082 \n", + "edgar/data/32377/0000032377-94-000001.txt 32377 \n", + "edgar/data/353944/0000353944-94-000005.txt 353944 \n", + "edgar/data/60512/0000060512-94-000006.txt 60512 \n", + "... ... \n", + "edgar/data/932021/0001493152-23-046428.txt 932021 \n", + "edgar/data/933974/0001558370-23-019262.txt 933974 \n", + "edgar/data/935419/0001628280-23-041580.txt 935419 \n", + "edgar/data/936395/0000936395-23-000044.txt 936395 \n", + "edgar/data/936528/0000936528-23-000207.txt 936528 \n", + "\n", + " company_name \\\n", + "filename \n", + "edgar/data/17206/0000017206-94-000007.txt CAPITAL HOLDING CORP \n", + "edgar/data/29082/0000950131-94-000021.txt DISNEY WALT CO \n", + "edgar/data/32377/0000032377-94-000001.txt ELIZABETHTOWN GAS CO \n", + "edgar/data/353944/0000353944-94-000005.txt INTERNATIONAL GAME TECHNOLOGY \n", + "edgar/data/60512/0000060512-94-000006.txt LOUISIANA LAND & EXPLORATION CO \n", + "... ... \n", + "edgar/data/932021/0001493152-23-046428.txt GLOBAL TECHNOLOGIES LTD \n", + "edgar/data/933974/0001558370-23-019262.txt Azenta, Inc. \n", + "edgar/data/935419/0001628280-23-041580.txt RCI HOSPITALITY HOLDINGS, INC. \n", + "edgar/data/936395/0000936395-23-000044.txt CIENA CORP \n", + "edgar/data/936528/0000936528-23-000207.txt WAFD INC \n", + "\n", + " form_type date_filed \\\n", + "filename \n", + "edgar/data/17206/0000017206-94-000007.txt 10-K/A 1993-12-22 \n", + "edgar/data/29082/0000950131-94-000021.txt 10-K 1993-12-22 \n", + "edgar/data/32377/0000032377-94-000001.txt 10-K 1993-12-13 \n", + "edgar/data/353944/0000353944-94-000005.txt 10-K 1993-12-23 \n", + "edgar/data/60512/0000060512-94-000006.txt 10-K/A 1993-10-07 \n", + "... ... ... 
\n", + "edgar/data/932021/0001493152-23-046428.txt 10-K 2023-12-29 \n", + "edgar/data/933974/0001558370-23-019262.txt 10-K 2023-11-21 \n", + "edgar/data/935419/0001628280-23-041580.txt 10-K 2023-12-14 \n", + "edgar/data/936395/0000936395-23-000044.txt 10-K 2023-12-15 \n", + "edgar/data/936528/0000936528-23-000207.txt 10-K 2023-11-17 \n", + "\n", + " exhibit_21_version year_quarter \n", + "filename \n", + "edgar/data/17206/0000017206-94-000007.txt None 1993q4 \n", + "edgar/data/29082/0000950131-94-000021.txt 21 1993q4 \n", + "edgar/data/32377/0000032377-94-000001.txt 21 1993q4 \n", + "edgar/data/353944/0000353944-94-000005.txt 21 1993q4 \n", + "edgar/data/60512/0000060512-94-000006.txt None 1993q4 \n", + "... ... ... \n", + "edgar/data/932021/0001493152-23-046428.txt 21.1 2023q4 \n", + "edgar/data/933974/0001558370-23-019262.txt 21.0 2023q4 \n", + "edgar/data/935419/0001628280-23-041580.txt 21.1 2023q4 \n", + "edgar/data/936395/0000936395-23-000044.txt 21.1 2023q4 \n", + "edgar/data/936528/0000936528-23-000207.txt None 2023q4 \n", + "\n", + "[290379 rows x 6 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "bb94754e-3765-43f2-a5e1-8b55a4021da4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df = pd.DataFrame()\n", + "dir_name = Path(\"paragraph_layout_md\")\n", + "for filename in os.listdir(dir_name):\n", + " if filename.split(\".\")[-1] != \"parquet\":\n", + " continue\n", + " yq_df = pd.read_parquet(dir_name / filename)\n", + " df = pd.concat([df, yq_df])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "52828dfa-a951-4bc5-88a1-f8c2dca2628b", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paragraph
1011174-0001193125-10-030674False
1010612-0000950123-10-019499False
1003410-0001193125-10-046549True
1011308-0000921895-10-000357True
1009672-0000950123-10-018301True
......
898293-0000950144-04-010550False
894490-0001193125-04-212822False
930803-0000950136-04-004585False
893430-0001193125-04-212647False
920354-0000950135-04-005647True
\n", + "

98712 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " paragraph\n", + "1011174-0001193125-10-030674 False\n", + "1010612-0000950123-10-019499 False\n", + "1003410-0001193125-10-046549 True\n", + "1011308-0000921895-10-000357 True\n", + "1009672-0000950123-10-018301 True\n", + "... ...\n", + "898293-0000950144-04-010550 False\n", + "894490-0001193125-04-212822 False\n", + "930803-0000950136-04-004585 False\n", + "893430-0001193125-04-212647 False\n", + "920354-0000950135-04-005647 True\n", + "\n", + "[98712 rows x 1 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "94b2ecbc-1e08-4b3a-835f-a10327f88298", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "df.loc[:, \"full_filename\"] = \"edgar/data/\" + df.index.str.replace('-', '/', n=1) + \".txt\"" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "b9c56e81-3e98-44bf-8c70-256ce1d58d80", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "md[\"date_filed\"] = md[\"date_filed\"].astype(\"datetime64[ns]\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "d60efebc-72ff-41e8-b765-8edcadbe185e", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
paragraphfull_filename
1011174-0001193125-10-030674Falseedgar/data/1011174/0001193125-10-030674.txt
1010612-0000950123-10-019499Falseedgar/data/1010612/0000950123-10-019499.txt
\n", + "
" + ], + "text/plain": [ + " paragraph \\\n", + "1011174-0001193125-10-030674 False \n", + "1010612-0000950123-10-019499 False \n", + "\n", + " full_filename \n", + "1011174-0001193125-10-030674 edgar/data/1011174/0001193125-10-030674.txt \n", + "1010612-0000950123-10-019499 edgar/data/1010612/0000950123-10-019499.txt " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "0f6d512f-b07a-4204-b3cf-69e08848ef2d", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.27785882162249775" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what percentage of files are paragraph layout?\n", + "md_merged = md.reset_index().merge(df, left_on=\"filename\", right_on=\"full_filename\", how=\"left\", validate=\"1:1\")\n", + "md_merged = md_merged.dropna(subset=\"paragraph\")\n", + "len(md_merged[md_merged.paragraph])/len(md_merged)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "67e63df0-ca52-4eef-b6aa-a1715f1ab081", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
filenamecikcompany_nameform_typedate_filedexhibit_21_versionyear_quarterparagraphfull_filename
6edgar/data/100240/0000950144-94-000787.txt100240TURNER BROADCASTING SYSTEM INC10-K1994-03-31211994q1Falseedgar/data/100240/0000950144-94-000787.txt
11edgar/data/100885/0000100885-94-000006.txt100885UNION PACIFIC CORP10-K1994-03-29211994q1Falseedgar/data/100885/0000100885-94-000006.txt
\n", + "
" + ], + "text/plain": [ + " filename cik \\\n", + "6 edgar/data/100240/0000950144-94-000787.txt 100240 \n", + "11 edgar/data/100885/0000100885-94-000006.txt 100885 \n", + "\n", + " company_name form_type date_filed exhibit_21_version \\\n", + "6 TURNER BROADCASTING SYSTEM INC 10-K 1994-03-31 21 \n", + "11 UNION PACIFIC CORP 10-K 1994-03-29 21 \n", + "\n", + " year_quarter paragraph full_filename \n", + "6 1994q1 False edgar/data/100240/0000950144-94-000787.txt \n", + "11 1994q1 False edgar/data/100885/0000100885-94-000006.txt " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "md_merged.head(2)" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "1e11faef-853b-48f2-9eb0-af7f8715cd41", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0.10292571287189956" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# what percentage of CIKs are only covered by paragraph layout docs\n", + "# get the set of unique CIKs in md_merged\n", + "all_ciks = set(md_merged.cik)\n", + "# remove the paragraph layout docs\n", + "no_paragraph_ciks = set(md_merged[md_merged[\"paragraph\"] == False].cik)\n", + "# get the set of CIKs that are in the full set but not the paragraph removed set\n", + "only_paragraph_ciks = all_ciks - no_paragraph_ciks\n", + "# divide that number by the total number of CIKs\n", + "len(only_paragraph_ciks)/len(all_ciks)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "6062d722-b1c7-4589-975e-7fe8cef65a40", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "data": { + "text/plain": [ + "1664" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(only_paragraph_ciks)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3b1f6ab8-e3be-48c2-9ecb-346425af3777", + "metadata": {}, + 
"outputs": [], + "source": [ + "# what percentage of CIK and year-quarter coverage do we get if we exclude all paragraph filings" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mozilla_sec_eia", + "language": "python", + "name": "mozilla_sec_eia" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb new file mode 100644 index 0000000..111ae84 --- /dev/null +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -0,0 +1,3326 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "9029518c-ea19-4055-a938-36a5ea1804d8", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": "code", + "execution_count": 262, + "id": "1107fe42-197c-4fea-9c48-06d08699af0b", + "metadata": {}, + "outputs": [], + "source": [ + "import json\n", + "import os\n", + "from pathlib import Path\n", + "\n", + "import pandas as pd\n", + "from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n", + "from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks\n", + "import splink.comparison_library as cl\n", + "from splink.exploratory import completeness_chart, profile_columns\n", + "from upath import UPath\n", + "\n", + "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df" + ] + }, + { + "cell_type": "markdown", + "id": "9b8224d4-7596-45b7-bfb5-028f29a96f3d", + "metadata": {}, + "source": [ + "# Inputs\n", + "\n", + "Questions:\n", + "* What's the best way to dagsterize this to get EIA data from 
PUDL?" + ] + }, + { + "cell_type": "markdown", + "id": "fb6b3f3f-8c30-4810-90dd-75cfbeecc4e0", + "metadata": {}, + "source": [ + "### EIA" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "4ab5594d-7d1f-425d-80e1-92c30be73011", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_eia_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "2edc29d4-6c85-4b31-aae6-0de38c846e44", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "mergers_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "eaa37762-9f94-4927-9341-0ab09be3c8ab", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_eia861_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet\")" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "id": "3fb7895f-10c5-4450-96f9-77b36471b53e", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia_df = raw_eia_df.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "06c76b82-1aad-47b2-aecc-6225a286cc40", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "harvested_df = pd.concat([\n", + " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", + " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", + " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", + " 
pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", + "])" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "d95acde9-1640-4c26-a5d1-c50b6666ccf4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia861_df = raw_eia861_df.merge(harvested_df, on=[\"report_date\", \"utility_id_eia\"], how=\"left\").drop_duplicates(subset=[\"report_date\", \"utility_id_eia\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "3b7484de-bbc7-47ba-b408-a1af1183018c", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "mergers_df = mergers_df[mergers_df[\"new_parent\"].notna()]\n", + "eia861_df = eia861_df.merge(mergers_df[[\"report_date\", \"new_parent\", \"merge_address\", \"merge_city\", \"merge_state\"]], \n", + " how=\"left\", \n", + " left_on=[\"report_date\", \"utility_name_eia\"],\n", + " right_on=[\"report_date\", \"new_parent\"]\n", + " )\n", + "eia861_df = eia861_df.rename(columns={\"merge_address\": \"street_address\", \"merge_city\": \"city\"})\n", + "eia861_df = eia861_df.groupby([\"report_date\", \"utility_id_eia\"]).first().reset_index()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "d3d39fc0-130f-4bbd-9cc9-bbaf58808109", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "eia861_df[\"state\"] = eia861_df[\"state\"].where(eia861_df[\"merge_state\"].isnull(), eia861_df[\"merge_state\"])\n", + "eia861_df = eia861_df.drop(columns=[\"new_parent\", \"merge_state\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "id": "04b6b682-91f4-49e2-9f74-2861548d1dd4", + "metadata": {}, + "outputs": [], + "source": [ + "eia_df = pd.concat([eia_df, eia861_df])\n", + "eia_df = eia_df.drop_duplicates(subset=[\"utility_id_eia\", \"report_date\"], keep=\"first\")\n", + "# not sure at what point this stops being a datetime\n", + "eia_df[\"report_date\"] 
= eia_df[\"report_date\"].astype(\"datetime64[ns]\")\n", + "# there are nulls from non harvested 861 utilities\n", + "eia_df = eia_df.dropna(subset=\"utility_name_eia\")" + ] + }, + { + "cell_type": "markdown", + "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec", + "metadata": {}, + "source": [ + "### SEC 10K Basic Info" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "d4e950a6-ee6c-414c-b5b9-52a4175bf0b7", + "metadata": {}, + "outputs": [], + "source": [ + "sec_path = UPath(\"gs://sec10k-outputs/v2/basic_10k_company_info\")" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "14eb7f24-7f7b-43aa-a0df-85e888e43821", + "metadata": {}, + "outputs": [], + "source": [ + "raw_sec_df = pd.DataFrame()\n", + "for file in sec_path.iterdir():\n", + " if file.name.split(\".\")[-1] == \"parquet\":\n", + " raw_sec_df = pd.concat([raw_sec_df, pd.read_parquet(sec_path / file.name)])" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "23da5ca1-bd04-44d4-b252-7b114d6d553f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
value
filenamefiler_countblockblock_countkey
edgar/data/100240/0000950144-94-000787.txt0company_data0company_conformed_nameturner broadcasting system inc
central_index_key0000100240
standard_industrial_classification4833
irs_number580950695
state_of_incorporationga
..................
edgar/data/936528/0000936528-23-000207.txt0former_company0date_of_name_change20230928
1former_conformed_namewafd inc
date_of_name_change20230927
2former_conformed_namewashington federal inc
date_of_name_change19950206
\n", + "

7980908 rows × 1 columns

\n", + "
" + ], + "text/plain": [ + " value\n", + "filename filer_count block block_count key \n", + "edgar/data/100240/0000950144-94-000787.txt 0 company_data 0 company_conformed_name turner broadcasting system inc\n", + " central_index_key 0000100240\n", + " standard_industrial_classification 4833\n", + " irs_number 580950695\n", + " state_of_incorporation ga\n", + "... ...\n", + "edgar/data/936528/0000936528-23-000207.txt 0 former_company 0 date_of_name_change 20230928\n", + " 1 former_conformed_name wafd inc\n", + " date_of_name_change 20230927\n", + " 2 former_conformed_name washington federal inc\n", + " date_of_name_change 19950206\n", + "\n", + "[7980908 rows x 1 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_sec_df" + ] + }, + { + "cell_type": "code", + "execution_count": 87, + "id": "1be3364e-9887-42b2-b303-0a24e8681acf", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n", + "raw_sec_df.columns.name = None" + ] + }, + { + "cell_type": "markdown", + "id": "3bac9280-1183-4aba-b78f-84bcf37ef1e2", + "metadata": {}, + "source": [ + "### Ex. 
21" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "id": "611da616-45ef-40ae-bc06-8bfbc871274d", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "id": "1d6272f2-b6f3-4497-9251-cbeedf794a0b", + "metadata": {}, + "outputs": [], + "source": [ + "raw_ex21_df = pd.DataFrame()\n", + "for file in ex21_path.iterdir():\n", + " if file.name.split(\".\")[-1] == \"parquet\":\n", + " year_quarter_df = pd.read_parquet(ex21_path / file.name)\n", + " report_year = file.name[:4]\n", + " year_quarter_df.loc[:, \"report_year\"] = report_year\n", + " year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n", + " raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])" + ] + }, + { + "cell_type": "markdown", + "id": "b636d438-ed71-426c-8c2a-9e550fe99958", + "metadata": {}, + "source": [ + "# Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 153, + "id": "f6f76c8b-ffbf-4e2b-870b-57f1260ba522", + "metadata": {}, + "outputs": [], + "source": [ + "# cleaning on both sides\n", + "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 157, + "id": "84e26751-663b-45a5-bb4d-fbfbbdca447e", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:189: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " df = df.fillna(np.nan)\n" + ] + } + ], + "source": [ + "ex21_clean_df = prepare_ex21_df(raw_ex21_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 224, + "id": "24defbd5-ccfe-4844-ab87-3adb1b4df2d9", + "metadata": {}, + "outputs": [], + "source": [ + "eia_clean_df = prepare_eia_df(eia_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 228, + "id": "a284b2c9-8edf-4b3f-ab08-5b2cff65ed19", + "metadata": {}, + "outputs": [], + "source": [ + "SHARED_COLS = [\n", + " \"record_id\",\n", + " \"report_date\",\n", + " \"report_year\",\n", + " \"company_name\",\n", + " \"street_address\",\n", + " \"street_address_2\",\n", + " \"city\",\n", + " \"state\", # could use state of incorporation from SEC\n", + " \"zip_code\",\n", + " \"phone_number\",\n", + " \"company_name_mphone\"\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e90de0d3-3220-4869-80a3-fc7dd381d393", + "metadata": {}, + "outputs": [], + "source": [ + "# strip legal terms and then make a list column from company name\n", + "# use this for blocking and comnparison levels\n", + "eia_match_df[\"company_name_mphone_list\"] = eia_match_df[\"company_name_mphone\"].str.split()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "460c5bd5-f2e2-45c3-86c3-ac203bd053d0", + "metadata": {}, + "outputs": [], + "source": [ + "# create list column for address information as well" + ] + }, + { + "cell_type": "code", + "execution_count": 158, + "id": "c3bdc160-1939-4f34-914f-ecb0b5fdb5ac", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
report_datereport_yearcompany_namestreet_addressstreet_address_2citystatezip_codephone_number
02000-03-302000meta group incorporated208 harbor drNaNstamfordct06912-00612039736700
12001-04-022001meta group incorporated208 harbor drNaNstamfordct06912-00612039736700
22002-04-012002meta group incorporated208 harbor drNaNstamfordct06912-00612039736700
\n", + "
" + ], + "text/plain": [ + " report_date report_year company_name street_address street_address_2 city state zip_code phone_number\n", + "0 2000-03-30 2000 meta group incorporated 208 harbor dr NaN stamford ct 06912-0061 2039736700\n", + "1 2001-04-02 2001 meta group incorporated 208 harbor dr NaN stamford ct 06912-0061 2039736700\n", + "2 2002-04-01 2002 meta group incorporated 208 harbor dr NaN stamford ct 06912-0061 2039736700" + ] + }, + "execution_count": 158, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_clean_df[SHARED_COLS].head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 159, + "id": "9d73fdac-8d97-4030-9772-79ac058b0d33", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
report_datereport_yearcompany_namestreet_addressstreet_address_2citystatezip_codephone_number
332023-01-012023desert willow energy storage100 bayview circleNaNnewport beachcaNaNNaN
352023-01-012023portage solar plantn8917NaNportagewi53901NaN
372023-01-012023nsf energy one limited liability company1241 university aveNaNrochesterny14607NaN
\n", + "
" + ], + "text/plain": [ + " report_date report_year company_name street_address street_address_2 city state zip_code phone_number\n", + "33 2023-01-01 2023 desert willow energy storage 100 bayview circle NaN newport beach ca NaN NaN\n", + "35 2023-01-01 2023 portage solar plant n8917 NaN portage wi 53901 NaN\n", + "37 2023-01-01 2023 nsf energy one limited liability company 1241 university ave NaN rochester ny 14607 NaN" + ] + }, + "execution_count": 159, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "eia_clean_df[~eia_match_df.street_address.isnull()][SHARED_COLS].head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 160, + "id": "db2b1e13-824e-4c86-8065-fc99e9a1186c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_ididcompany_name_rawloc_of_incorporationown_perreport_yearcompany_namecompany_name_mphone
0014060-0000916131-94-000015brenton bank and trust companyiowaNaN1994brenton bank and trust companyBRNTN BNK ANT TRST KMPN
1114060-0000916131-94-000015adeliowaNaN1994adelATL
2214060-0000916131-94-000015brenton savings bank, fsb united statesames, iowaNaN1994brenton savings bank, fsb united statesBRNTN SFNKS BNK FSB UNTT STTS
\n", + "
" + ], + "text/plain": [ + " record_id id company_name_raw loc_of_incorporation own_per report_year company_name company_name_mphone\n", + "0 0 14060-0000916131-94-000015 brenton bank and trust company iowa NaN 1994 brenton bank and trust company BRNTN BNK ANT TRST KMPN\n", + "1 1 14060-0000916131-94-000015 adel iowa NaN 1994 adel ATL\n", + "2 2 14060-0000916131-94-000015 brenton savings bank, fsb united states ames, iowa NaN 1994 brenton savings bank, fsb united states BRNTN SFNKS BNK FSB UNTT STTS" + ] + }, + "execution_count": 160, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_clean_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 229, + "id": "4ea7c80a-5b5b-4a07-bca0-b6ed1e78dce9", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['record_id',\n", + " 'report_date',\n", + " 'report_year',\n", + " 'company_name',\n", + " 'street_address',\n", + " 'street_address_2',\n", + " 'city',\n", + " 'state',\n", + " 'zip_code',\n", + " 'phone_number',\n", + " 'company_name_mphone']" + ] + }, + "execution_count": 229, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "SHARED_COLS" + ] + }, + { + "cell_type": "code", + "execution_count": 231, + "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27", + "metadata": {}, + "outputs": [], + "source": [ + "eia_match_df = eia_clean_df[SHARED_COLS]" + ] + }, + { + "cell_type": "code", + "execution_count": 232, + "id": "2b8b6313-abf0-4233-8bad-43b8b9cc1e0b", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_clean_df[SHARED_COLS]" + ] + }, + { + "cell_type": "markdown", + "id": "9a04c196-e926-4502-82ee-c27352352591", + "metadata": { + "jp-MarkdownHeadingCollapsed": true, + "tags": [] + }, + "source": [ + "# Link in Ex. 
21 records" + ] + }, + { + "cell_type": "code", + "execution_count": 165, + "id": "c1500344-ff7f-450e-90dd-1105d8e7c637", + "metadata": {}, + "outputs": [], + "source": [ + "# run the Ex.21 to SEC model\n", + "filepath = Path(\"../sec_ex21_model_settings/2023_model.json\")\n", + "with open(filepath, 'r') as file:\n", + " sec_ex21_settings = json.load(file)" + ] + }, + { + "cell_type": "code", + "execution_count": 192, + "id": "172ea84f-a0b7-4e9c-b746-322a47663171", + "metadata": {}, + "outputs": [], + "source": [ + "sec_test_df = sec_match_df[sec_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 193, + "id": "3f8ba4ee-b1e7-4e05-982e-43d8e446eea9", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_test_df = ex21_match_df[ex21_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 194, + "id": "2c715d7a-3d6d-4970-8ae3-5a6e1a12e937", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "14125" + ] + }, + "execution_count": 194, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(sec_test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 195, + "id": "ec13db12-3664-4e00-aa83-7c372039b230", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "233101" + ] + }, + "execution_count": 195, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(ex21_test_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 196, + "id": "d2fcc1da-4435-4b17-8be7-cb34a6917522", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_idreport_yearcompany_nameloc_of_incorporationcompany_name_mphone
23232016nicholas financial incorporatedfloridaNXLS FNNXL INKRPRTT
24242017nicholas financial incorporatedfloridaNXLS FNNXL INKRPRTT
68682016sandisk corporationdelawareSNTSK KRPRXN
\n", + "
" + ], + "text/plain": [ + " record_id report_year company_name loc_of_incorporation company_name_mphone\n", + "23 23 2016 nicholas financial incorporated florida NXLS FNNXL INKRPRTT\n", + "24 24 2017 nicholas financial incorporated florida NXLS FNNXL INKRPRTT\n", + "68 68 2016 sandisk corporation delaware SNTSK KRPRXN" + ] + }, + "execution_count": 196, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_test_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 197, + "id": "e24e2c8f-1124-4e87-b77d-55fca14a7d3c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_idreport_yearcompany_nameloc_of_incorporationcompany_name_mphone
283274602016capstone turbine singapore pte., limitedsingaporeKPSTN TRBN SNKPR PT LMTT
283274712016capstone turbine international, incorporateddelawareKPSTN TRBN INTRNXNL INKRPRTT
283274822016capstone turbine financial services, limited l...delawareKPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN
\n", + "
" + ], + "text/plain": [ + " record_id report_year company_name loc_of_incorporation company_name_mphone\n", + "2832746 0 2016 capstone turbine singapore pte., limited singapore KPSTN TRBN SNKPR PT LMTT\n", + "2832747 1 2016 capstone turbine international, incorporated delaware KPSTN TRBN INTRNXNL INKRPRTT\n", + "2832748 2 2016 capstone turbine financial services, limited l... delaware KPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN" + ] + }, + "execution_count": 197, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_test_df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 198, + "id": "c531657f-5a0a-4ff5-b680-c6a1806feb75", + "metadata": {}, + "outputs": [], + "source": [ + "# can we just load this linker and make predictions? what happens with blocking?\n", + "sec_ex21_linker = Linker([sec_test_df, ex21_test_df], sec_ex21_settings, db_api=DuckDBAPI())" + ] + }, + { + "cell_type": "code", + "execution_count": 199, + "id": "14b239db-a816-428c-a132-dca0ed0998c4", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Blocking time: 0.44 seconds\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "661a74c00c7e41f59787cad30a26ec78", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Predict time: 115.79 seconds\n" + ] + } + ], + "source": [ + "sec_ex21_preds = sec_ex21_linker.inference.predict(threshold_match_probability=0.6)" + ] + }, + { + "cell_type": "code", + "execution_count": 200, + "id": "08167db9-9d9c-4b09-a839-847f85842324", + "metadata": {}, + "outputs": [], + "source": [ + "sec_ex21_preds_df = sec_ex21_preds.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 201, + 
"id": "3f349a0a-269a-4f34-95e8-54a8c96c57f8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_lcompany_name_rgamma_company_nametf_company_name_ltf_company_name_rbf_company_namebf_tf_adj_company_nameloc_of_incorporation_lloc_of_incorporation_rgamma_loc_of_incorporationtf_loc_of_incorporation_ltf_loc_of_incorporation_rbf_loc_of_incorporationbf_tf_adj_loc_of_incorporationcompany_name_mphone_lcompany_name_mphone_rreport_year_lreport_year_r
011.7269540.999705__splink__input_table_0__splink__input_table_1955515939pendrell corporationpentzer corporation30.0000080.00000435295.4377531.0washingtonwashington30.0034270.0034272.32178060.034545PNTRL KRPRXNPNTSR KRPRXN20172017
10.9817200.663845__splink__input_table_0__splink__input_table_1800411485spok holdings, incorporatedautohaus holdings, incorporated20.0000080.0000042126.9805721.0delawaredelaware30.3545130.3545132.3217800.580388SPK HLTNKS INKRPRTTATHS HLTNKS INKRPRTT20172017
24.6040020.960504__splink__input_table_0__splink__input_table_1720682731ashford hospitality trust incorporatedashford hospitality trust, incorporated30.0000080.00000435295.4377531.0marylandNone-10.010087NaN1.0000001.000000AXFRT HSPTLT TRST INKRPRTTAXFRT HSPTLT TRST INKRPRTT20172017
33.9010620.937263__splink__input_table_0__splink__input_table_1586521115tx holdings, incorporatedtex holdings, incorporated30.0000080.00000435295.4377531.0georgiadelaware00.0055960.3545130.6143191.000000TKS HLTNKS INKRPRTTTKS HLTNKS INKRPRTT20172017
44.6040020.960504__splink__input_table_0__splink__input_table_1829461757pharma bio serv, incorporatedpharma bio serv us, incorporated30.0000080.00000435295.4377531.0Nonedelaware-1NaN0.3545131.0000001.000000FRM B SRF INKRPRTTFRM B SRF US INKRPRTT20172017
...........................................................................
93430.9817200.663845__splink__input_table_0__splink__input_table_12486881135transenterix incorporatedtrane brands, incorporated20.0000080.0000042126.9805721.0delawaredelaware30.3545130.3545132.3217800.580388TRNSNTRKS INKRPRTTTRN BRNTS INKRPRTT20172017
93443.9010620.937263__splink__input_table_0__splink__input_table_12602833506cree incorporatedj.crew incorporated30.0000080.00000435295.4377531.0north carolinadelaware00.0049260.3545130.6143191.000000KR INKRPRTTJKR INKRPRTT20172017
93450.9817200.663845__splink__input_table_0__splink__input_table_12322583973applied minerals, incorporatedapplied materials spv2, incorporated20.0000080.0000082126.9805721.0delawaredelaware30.3545130.3545132.3217800.580388APLT MNRLS INKRPRTTAPLT MTRLS SPF INKRPRTT20172016
93463.9010620.937263__splink__input_table_0__splink__input_table_12322583970applied minerals, incorporatedapplied materials japan, incorporated30.0000080.00000835295.4377531.0delawarejapan00.3545130.0057950.6143191.000000APLT MNRLS INKRPRTTAPLT MTRLS JPN INKRPRTT20172016
93472.7249340.868616__splink__input_table_0__splink__input_table_1267563285guess incorporatedaquesys, incorporated20.0000080.0000082126.9805721.0delawareus delaware20.3545130.0004624.5112761.000000KS INKRPRTTAKSS INKRPRTT20172016
\n", + "

9348 rows × 24 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name tf_company_name_l tf_company_name_r bf_company_name bf_tf_adj_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation company_name_mphone_l company_name_mphone_r report_year_l report_year_r\n", + "0 11.726954 0.999705 __splink__input_table_0 __splink__input_table_1 95551 5939 pendrell corporation pentzer corporation 3 0.000008 0.000004 35295.437753 1.0 washington washington 3 0.003427 0.003427 2.321780 60.034545 PNTRL KRPRXN PNTSR KRPRXN 2017 2017\n", + "1 0.981720 0.663845 __splink__input_table_0 __splink__input_table_1 80041 1485 spok holdings, incorporated autohaus holdings, incorporated 2 0.000008 0.000004 2126.980572 1.0 delaware delaware 3 0.354513 0.354513 2.321780 0.580388 SPK HLTNKS INKRPRTT ATHS HLTNKS INKRPRTT 2017 2017\n", + "2 4.604002 0.960504 __splink__input_table_0 __splink__input_table_1 72068 2731 ashford hospitality trust incorporated ashford hospitality trust, incorporated 3 0.000008 0.000004 35295.437753 1.0 maryland None -1 0.010087 NaN 1.000000 1.000000 AXFRT HSPTLT TRST INKRPRTT AXFRT HSPTLT TRST INKRPRTT 2017 2017\n", + "3 3.901062 0.937263 __splink__input_table_0 __splink__input_table_1 58652 1115 tx holdings, incorporated tex holdings, incorporated 3 0.000008 0.000004 35295.437753 1.0 georgia delaware 0 0.005596 0.354513 0.614319 1.000000 TKS HLTNKS INKRPRTT TKS HLTNKS INKRPRTT 2017 2017\n", + "4 4.604002 0.960504 __splink__input_table_0 __splink__input_table_1 82946 1757 pharma bio serv, incorporated pharma bio serv us, incorporated 3 0.000008 0.000004 35295.437753 1.0 None delaware -1 NaN 0.354513 1.000000 1.000000 FRM B SRF INKRPRTT FRM B SRF US INKRPRTT 2017 2017\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
... ... ... ... ... ... ... ...\n", + "9343 0.981720 0.663845 __splink__input_table_0 __splink__input_table_1 248688 1135 transenterix incorporated trane brands, incorporated 2 0.000008 0.000004 2126.980572 1.0 delaware delaware 3 0.354513 0.354513 2.321780 0.580388 TRNSNTRKS INKRPRTT TRN BRNTS INKRPRTT 2017 2017\n", + "9344 3.901062 0.937263 __splink__input_table_0 __splink__input_table_1 260283 3506 cree incorporated j.crew incorporated 3 0.000008 0.000004 35295.437753 1.0 north carolina delaware 0 0.004926 0.354513 0.614319 1.000000 KR INKRPRTT JKR INKRPRTT 2017 2017\n", + "9345 0.981720 0.663845 __splink__input_table_0 __splink__input_table_1 232258 3973 applied minerals, incorporated applied materials spv2, incorporated 2 0.000008 0.000008 2126.980572 1.0 delaware delaware 3 0.354513 0.354513 2.321780 0.580388 APLT MNRLS INKRPRTT APLT MTRLS SPF INKRPRTT 2017 2016\n", + "9346 3.901062 0.937263 __splink__input_table_0 __splink__input_table_1 232258 3970 applied minerals, incorporated applied materials japan, incorporated 3 0.000008 0.000008 35295.437753 1.0 delaware japan 0 0.354513 0.005795 0.614319 1.000000 APLT MNRLS INKRPRTT APLT MTRLS JPN INKRPRTT 2017 2016\n", + "9347 2.724934 0.868616 __splink__input_table_0 __splink__input_table_1 267563 285 guess incorporated aquesys, incorporated 2 0.000008 0.000008 2126.980572 1.0 delaware us delaware 2 0.354513 0.000462 4.511276 1.000000 KS INKRPRTT AKSS INKRPRTT 2017 2016\n", + "\n", + "[9348 rows x 24 columns]" + ] + }, + "execution_count": 201, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# TODO: this needs to be improved, maybe just do a fuzzy match on string name?\n", + "sec_ex21_preds_df" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "defdf953-4af7-4d43-b7cf-5ae95360d70f", + "metadata": {}, + "outputs": [], + "source": [ + "# add the Ex. 
21 subsidiaries that don't get a matching CIK to the SEC side\n", + "# run on all the data\n", + "# save the mapping of subsidiaries that are greater than a certain threshold (unclear why the blocking isn't working)\n", + "# get the subsidiaries that are less than a certain threshold\n", + "# transform them to have columns that match with the SEC df\n", + "# add them to the SEC side" + ] + }, + { + "cell_type": "markdown", + "id": "46d967d4-3722-437d-b2f0-37cbac17624f", + "metadata": {}, + "source": [ + "# Link SEC and EIA" + ] + }, + { + "cell_type": "markdown", + "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f", + "metadata": {}, + "source": [ + "## Exploratory Analysis" + ] + }, + { + "cell_type": "code", + "execution_count": 205, + "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05", + "metadata": {}, + "outputs": [], + "source": [ + "db_api = DuckDBAPI()" + ] + }, + { + "cell_type": "code", + "execution_count": 233, + "id": "ac4e560b-6946-4cc7-b2bc-6d5f4b154da6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 233, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completeness_chart(sec_match_df, db_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 234, + "id": "02063bcd-8301-4a70-aab1-0bbf6119cf8b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.LayerChart(...)" + ] + }, + "execution_count": 234, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "completeness_chart(eia_match_df, db_api=db_api)" + ] + }, + { + "cell_type": "code", + "execution_count": 209, + "id": "c4542c1f-d826-43c1-9af5-ce6473b79d90", + "metadata": {}, + "outputs": [], + "source": [ + "# could sub in zip code for street address?\n", + "match_cols = [\"company_name\", \"state\", \"city\", \"street_address\", \"zip_code\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 210, + "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 210, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 211, + "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 211, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "profile_columns(eia_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + ] + }, + { + "cell_type": "markdown", + "id": "69f5fc54-f479-495c-86fc-48accda883d0", + "metadata": {}, + "source": [ + "## Blocking" + ] + }, + { + "cell_type": "code", + "execution_count": 300, + "id": "6402e556-b87c-47ca-bc30-ced2b42e6626", + "metadata": {}, + "outputs": [], + "source": [ + "br0 = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"\n", + "br1 = \"l.report_year = r.report_year and l.street_address = r.street_address\"\n", + "# br2 = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city\"\n", + "br4 = \"l.report_year = r.report_year and l.phone_number = r.phone_number\"" + ] + }, + { + "cell_type": "code", + "execution_count": 257, + "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'number_of_comparisons_generated_pre_filter_conditions': 618634,\n", + " 'number_of_comparisons_to_be_scored_post_filter_conditions': 618634,\n", + " 'filter_conditions_identified': '',\n", + " 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n", + " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" + ] + }, + "execution_count": 257, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "counts = count_comparisons_from_blocking_rule(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rule=br0,\n", + " link_type=\"link_only\",\n", + " 
unique_id_column_name='record_id',\n", + " db_api=db_api,\n", + ")\n", + "\n", + "counts" + ] + }, + { + "cell_type": "code", + "execution_count": 259, + "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
key_0key_1key_2count_lcount_rblock_count
02023boston0211011313415142
12022boston0211011611012760
22021boston02110113889944
\n", + "
" + ], + "text/plain": [ + " key_0 key_1 key_2 count_l count_r block_count\n", + "0 2023 boston 02110 113 134 15142\n", + "1 2022 boston 02110 116 110 12760\n", + "2 2021 boston 02110 113 88 9944" + ] + }, + "execution_count": 259, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "result = n_largest_blocks(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rule=br3,\n", + " link_type=\"link_only\",\n", + " db_api=db_api,\n", + " n_largest=3\n", + ")\n", + "\n", + "result.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 302, + "id": "4e1a9844-5d98-4cac-a083-eef134f083ce", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.Chart(...)" + ] + }, + "execution_count": 302, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "blocking_rules_for_analysis = [\n", + " br0, br1\n", + "]\n", + "\n", + "\n", + "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", + " table_or_tables=[sec_match_df, eia_match_df],\n", + " blocking_rules=blocking_rules_for_analysis,\n", + " db_api=db_api,\n", + " unique_id_column_name='record_id',\n", + " link_type=\"link_only\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "377b0017-e46f-4d06-8cb5-af2b7725fc0e", + "metadata": {}, + "source": [ + "## Create Model" + ] + }, + { + "cell_type": "code", + "execution_count": 382, + "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'JaccardAtThresholds' of \"company_name\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n", + " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n", + " - 'Jaccard distance of 'company_name >= 0.9'' with SQL rule: jaccard(\"company_name_l\", \"company_name_r\") >= 0.9\n", + " - 'Jaccard distance of 'company_name >= 0.7'' with SQL rule: jaccard(\"company_name_l\", \"company_name_r\") >= 0.7\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "# company_name_comparison = cl.NameComparison(\n", + "# \"company_name\",\n", + " # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n", + "# )\n", + "company_name_comparison = cl.JaccardAtThresholds(\n", + " \"company_name\",\n", + " # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n", + ")\n", + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + 
}, + { + "cell_type": "code", + "execution_count": 373, + "id": "4298a288-c306-4d75-9d72-e5b8f87774ce", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'LevenshteinAtThresholds' of \"street_address\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'street_address is NULL' with SQL rule: \"street_address_l\" IS NULL OR \"street_address_r\" IS NULL\n", + " - 'Exact match on street_address' with SQL rule: \"street_address_l\" = \"street_address_r\"\n", + " - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 1\n", + " - 'Levenshtein distance of street_address <= 2' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 2\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "address_comparison = cl.LevenshteinAtThresholds(\n", + " \"street_address\",\n", + " # size_threshold_or_thresholds=[1,2,3]\n", + ")\n", + "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 267, + "id": "63ed7cd2-d803-4d17-b730-c9fc17df0607", + "metadata": {}, + "outputs": [], + "source": [ + "zip_code_comparison = cl.ExactMatch(\"zip_code\").configure(term_frequency_adjustments=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 268, + "id": "974a3982-38a1-45cb-9875-b8d4584c808d", + "metadata": {}, + "outputs": [], + "source": [ + "state_comparison = cl.ExactMatch(\"state\").configure(term_frequency_adjustments=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 269, + "id": "7592619b-340a-4496-8195-9ce932cae699", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'NameComparison' of \"city\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'city is NULL' with SQL rule: \"city_l\" 
IS NULL OR \"city_r\" IS NULL\n", + " - 'Exact match on city' with SQL rule: \"city_l\" = \"city_r\"\n", + " - 'Jaro-Winkler distance of city >= 0.9' with SQL rule: jaro_winkler_similarity(\"city_l\", \"city_r\") >= 0.9\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "city_comparison = cl.NameComparison(\n", + " \"city\",\n", + " jaro_winkler_thresholds=[0.9]\n", + " # dmeta_col_name=\"company_name_mphone\" # this was breaking it for some reason\n", + ")\n", + "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 383, + "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407", + "metadata": {}, + "outputs": [], + "source": [ + "settings = SettingsCreator(\n", + " link_type=\"link_only\",\n", + " unique_id_column_name=\"record_id\",\n", + " comparisons=[\n", + " company_name_comparison,\n", + " address_comparison,\n", + " zip_code_comparison,\n", + " state_comparison,\n", + " city_comparison\n", + " ],\n", + " blocking_rules_to_generate_predictions=[\n", + " br0, br1\n", + " ],\n", + " retain_intermediate_calculation_columns=True,\n", + ")\n", + "\n", + "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())" + ] + }, + { + "cell_type": "code", + "execution_count": 384, + "id": "36cae876-783d-4bff-89df-9d30cc5e60d6", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "358d0a088e2441deaef798c55ad97068", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Probability two random records match is estimated to be 2.18e-05.\n", + "This means that amongst all possible pairwise record comparisons, one in 45,828.17 are expected to match. 
With 40,620,617,120 total possible comparisons, we expect a total of around 886,367.78 matching pairs\n" + ] + } + ], + "source": [ + "deterministic_rules = [\n", + " block_on(\"company_name\", \"company_name\"),\n", + " block_on(\"phone_number\"),\n", + " block_on(\"street_address\"),\n", + " \"jaccard(r.company_name, l.company_name) >= .9 and l.city = r.city\",\n", + " \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city\",\n", + "]\n", + "\n", + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.9)" + ] + }, + { + "cell_type": "code", + "execution_count": 385, + "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "----- Estimating u probabilities using random sampling -----\n", + "\n", + "Estimated u probabilities using random sampling\n", + "\n", + "Your model is not yet fully trained. Missing estimates for:\n", + " - company_name (no m values are trained).\n", + " - street_address (no m values are trained).\n", + " - zip_code (no m values are trained).\n", + " - state (no m values are trained).\n", + " - city (no m values are trained).\n" + ] + } + ], + "source": [ + "linker.training.estimate_u_using_random_sampling(max_pairs=1e7)" + ] + }, + { + "cell_type": "code", + "execution_count": 386, + "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"company_name\" = r.\"company_name\") AND (l.\"company_name\" = r.\"company_name\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - street_address\n", + " - zip_code\n", + " - state\n", + " - city\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since 
they are used in the blocking rules: \n", + " - company_name\n", + "\n", + "Iteration 1: Largest change in params was 0.804 in the m_probability of street_address, level `All other comparisons`\n", + "Iteration 2: Largest change in params was 0.0737 in the m_probability of state, level `Exact match on state`\n", + "Iteration 3: Largest change in params was -0.039 in the m_probability of state, level `All other comparisons`\n", + "Iteration 4: Largest change in params was 0.021 in the m_probability of city, level `All other comparisons`\n", + "Iteration 5: Largest change in params was 0.00805 in the m_probability of city, level `All other comparisons`\n", + "Iteration 6: Largest change in params was -0.00338 in the m_probability of state, level `All other comparisons`\n", + "Iteration 7: Largest change in params was 0.00164 in the m_probability of state, level `Exact match on state`\n", + "Iteration 8: Largest change in params was 0.000825 in the m_probability of state, level `Exact match on state`\n", + "Iteration 9: Largest change in params was -0.000425 in the m_probability of state, level `All other comparisons`\n", + "Iteration 10: Largest change in params was -0.000223 in the m_probability of state, level `All other comparisons`\n", + "Iteration 11: Largest change in params was 0.000118 in the m_probability of state, level `Exact match on state`\n", + "Iteration 12: Largest change in params was 6.29e-05 in the m_probability of state, level `Exact match on state`\n", + "\n", + "EM converged after 12 iterations\n", + "\n", + "Your model is not yet fully trained. 
Missing estimates for:\n", + " - company_name (no m values are trained).\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"company_name\", \"company_name\")\n", + "training_session_fname_sname = (\n", + " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 387, + "id": "9581aa18-3352-429a-86c4-6078bcf13a55", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "----- Starting EM training session -----\n", + "\n", + "Estimating the m probabilities of the model by blocking on:\n", + "(l.\"street_address\" = r.\"street_address\") AND (l.\"street_address\" = r.\"street_address\")\n", + "\n", + "Parameter estimates will be made for the following comparison(s):\n", + " - company_name\n", + " - zip_code\n", + " - state\n", + " - city\n", + "\n", + "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", + " - street_address\n", + "\n", + "Iteration 1: Largest change in params was -0.929 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 2: Largest change in params was 0.0355 in probability_two_random_records_match\n", + "Iteration 3: Largest change in params was 0.00843 in the m_probability of state, level `All other comparisons`\n", + "Iteration 4: Largest change in params was -0.00612 in the m_probability of state, level `Exact match on state`\n", + "Iteration 5: Largest change in params was -0.00431 in the m_probability of state, level `Exact match on state`\n", + "Iteration 6: Largest change in params was -0.00301 in the m_probability of state, level `Exact match on state`\n", + "Iteration 7: Largest change in params was 0.0021 in the m_probability of state, level `All other comparisons`\n", + "Iteration 8: Largest change in params was -0.00146 in the m_probability of state, level `Exact match on 
state`\n", + "Iteration 9: Largest change in params was 0.00101 in the m_probability of state, level `All other comparisons`\n", + "Iteration 10: Largest change in params was -0.000704 in the m_probability of state, level `Exact match on state`\n", + "Iteration 11: Largest change in params was 0.000489 in the m_probability of state, level `All other comparisons`\n", + "Iteration 12: Largest change in params was -0.00034 in the m_probability of state, level `Exact match on state`\n", + "Iteration 13: Largest change in params was -0.000236 in the m_probability of state, level `Exact match on state`\n", + "Iteration 14: Largest change in params was 0.000164 in the m_probability of state, level `All other comparisons`\n", + "Iteration 15: Largest change in params was -0.000114 in the m_probability of state, level `Exact match on state`\n", + "Iteration 16: Largest change in params was -7.88e-05 in the m_probability of state, level `Exact match on state`\n", + "\n", + "EM converged after 16 iterations\n", + "\n", + "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" + ] + } + ], + "source": [ + "training_blocking_rule = block_on(\"street_address\", \"street_address\")\n", + "training_session_fname_sname = (\n", + " linker.training.estimate_parameters_using_expectation_maximisation(training_blocking_rule)\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 388, + "id": "8ad317ed-1db9-4932-9815-6e9e0efa9580", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 388, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 389, + "id": "5e21bf55-64ac-4f4b-8f1c-d7507b5e7af6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 389, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# company_name doesn't look good here\n", + "linker.visualisations.m_u_parameters_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 285, + "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91", + "metadata": {}, + "outputs": [], + "source": [ + "settings = linker.misc.save_model_to_json(\n", + " \"model_test.json\", overwrite=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b", + "metadata": {}, + "source": [ + "## Make Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 390, + "id": "94e96441-89b6-4516-aa6a-4d1593ce03be", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Blocking time: 0.28 seconds\n" + ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "1680da9f410c424d8e5648fc98c88022", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Predict time: 3.06 seconds\n" + ] + } + ], + "source": [ + "df_predictions = linker.inference.predict(threshold_match_probability=0.5)" + ] + }, + { + "cell_type": "code", + "execution_count": 391, + "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0", + "metadata": {}, + "outputs": [], + "source": [ + "preds_df = df_predictions.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 392, + "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_lcompany_name_rgamma_company_namebf_company_namestreet_address_lstreet_address_rgamma_street_addressbf_street_addresszip_code_lzip_code_rgamma_zip_codetf_zip_code_ltf_zip_code_rbf_zip_codebf_tf_adj_zip_codestate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rreport_year_lreport_year_rmatch_key
112110.0543320.509414__splink__input_table_0__splink__input_table_18576268295citi trends incorporatedgeorgia pacific corporation11.462842104 coleman boulevardNone-11.000000314083132600.0000450.0001030.4029181.000000gaga10.0233740.02337422.5980541.815434savannahsavannah20.0004540.000454215.5596819.129471ST TRNTS INKRPRTTJRJ PSFK KRPRXN202120080
116660.0980350.516982__splink__input_table_0__splink__input_table_19461575114chicopee bancorp, incorporatedchicopee city of00.84580070 center street725 front street00.844089010130102100.0000360.0000610.4029181.000000mama10.0429500.04295022.5980540.987961chicopeechicopee20.0001170.000117215.55968135.431042XKP BNKRP INKRPRTTXKP ST OF201220120
116650.0980350.516982__splink__input_table_0__splink__input_table_19461475115chicopee bancorp, incorporatedchicopee city of00.84580070 center street725 front street00.844089010130102100.0000360.0000610.4029181.000000mama10.0429500.04295022.5980540.987961chicopeechicopee20.0001170.000117215.55968135.431042XKP BNKRP INKRPRTTXKP ST OF201120110
116680.0980350.516982__splink__input_table_0__splink__input_table_19461875118chicopee bancorp, incorporatedchicopee city of00.84580070 center street725 front street00.844089010130102100.0000360.0000610.4029181.000000mama10.0429500.04295022.5980540.987961chicopeechicopee20.0001170.000117215.55968135.431042XKP BNKRP INKRPRTTXKP ST OF200820080
116690.0980350.516982__splink__input_table_0__splink__input_table_19462075116chicopee bancorp, incorporatedchicopee city of00.84580070 center streetp o box 40500.844089010130102100.0000360.0000610.4029181.000000mama10.0429500.04295022.5980540.987961chicopeechicopee20.0001170.000117215.55968135.431042XKP BNKRP INKRPRTTXKP ST OF201020100
...........................................................................................................................
1004345.0265911.000000__splink__input_table_0__splink__input_table_117769867483green mountain power corporationgreen mountain power corporation39751.372250163 acorn lane163 acorn lane324859.333063054460544610.0001430.0001431447.9883422.894003vtvt10.0026800.00268022.59805415.835981colchestercolchester20.0001980.000198215.55968120.959208KRN MNTN PWR KRPRXNKRN MNTN PWR KRPRXN200120010
1005145.0265911.000000__splink__input_table_0__splink__input_table_117770267479green mountain power corporationgreen mountain power corporation39751.372250163 acorn lane163 acorn lane324859.333063054460544610.0001430.0001431447.9883422.894003vtvt10.0026800.00268022.59805415.835981colchestercolchester20.0001980.000198215.55968120.959208KRN MNTN PWR KRPRXNKRN MNTN PWR KRPRXN200520050
1005045.0265911.000000__splink__input_table_0__splink__input_table_117770167480green mountain power corporationgreen mountain power corporation39751.372250163 acorn lane163 acorn lane324859.333063054460544610.0001430.0001431447.9883422.894003vtvt10.0026800.00268022.59805415.835981colchestercolchester20.0001980.000198215.55968120.959208KRN MNTN PWR KRPRXNKRN MNTN PWR KRPRXN200420040
1004945.0265911.000000__splink__input_table_0__splink__input_table_117769967482green mountain power corporationgreen mountain power corporation39751.372250163 acorn lane163 acorn lane324859.333063054460544610.0001430.0001431447.9883422.894003vtvt10.0026800.00268022.59805415.835981colchestercolchester20.0001980.000198215.55968120.959208KRN MNTN PWR KRPRXNKRN MNTN PWR KRPRXN200220020
1003545.0265911.000000__splink__input_table_0__splink__input_table_117770067481green mountain power corporationgreen mountain power corporation39751.372250163 acorn lane163 acorn lane324859.333063054460544610.0001430.0001431447.9883422.894003vtvt10.0026800.00268022.59805415.835981colchestercolchester20.0001980.000198215.55968120.959208KRN MNTN PWR KRPRXNKRN MNTN PWR KRPRXN200320030
\n", + "

12713 rows × 40 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name bf_company_name street_address_l street_address_r gamma_street_address bf_street_address zip_code_l zip_code_r gamma_zip_code tf_zip_code_l tf_zip_code_r bf_zip_code bf_tf_adj_zip_code state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r report_year_l report_year_r match_key\n", + "11211 0.054332 0.509414 __splink__input_table_0 __splink__input_table_1 85762 68295 citi trends incorporated georgia pacific corporation 1 1.462842 104 coleman boulevard None -1 1.000000 31408 31326 0 0.000045 0.000103 0.402918 1.000000 ga ga 1 0.023374 0.023374 22.598054 1.815434 savannah savannah 2 0.000454 0.000454 215.559681 9.129471 ST TRNTS INKRPRTT JRJ PSFK KRPRXN 2021 2008 0\n", + "11666 0.098035 0.516982 __splink__input_table_0 __splink__input_table_1 94615 75114 chicopee bancorp, incorporated chicopee city of 0 0.845800 70 center street 725 front street 0 0.844089 01013 01021 0 0.000036 0.000061 0.402918 1.000000 ma ma 1 0.042950 0.042950 22.598054 0.987961 chicopee chicopee 2 0.000117 0.000117 215.559681 35.431042 XKP BNKRP INKRPRTT XKP ST OF 2012 2012 0\n", + "11665 0.098035 0.516982 __splink__input_table_0 __splink__input_table_1 94614 75115 chicopee bancorp, incorporated chicopee city of 0 0.845800 70 center street 725 front street 0 0.844089 01013 01021 0 0.000036 0.000061 0.402918 1.000000 ma ma 1 0.042950 0.042950 22.598054 0.987961 chicopee chicopee 2 0.000117 0.000117 215.559681 35.431042 XKP BNKRP INKRPRTT XKP ST OF 2011 2011 0\n", + "11668 0.098035 0.516982 __splink__input_table_0 __splink__input_table_1 94618 75118 chicopee bancorp, incorporated chicopee city of 0 0.845800 70 center street 725 front street 0 0.844089 01013 01021 0 0.000036 0.000061 0.402918 1.000000 ma 
ma 1 0.042950 0.042950 22.598054 0.987961 chicopee chicopee 2 0.000117 0.000117 215.559681 35.431042 XKP BNKRP INKRPRTT XKP ST OF 2008 2008 0\n", + "11669 0.098035 0.516982 __splink__input_table_0 __splink__input_table_1 94620 75116 chicopee bancorp, incorporated chicopee city of 0 0.845800 70 center street p o box 405 0 0.844089 01013 01021 0 0.000036 0.000061 0.402918 1.000000 ma ma 1 0.042950 0.042950 22.598054 0.987961 chicopee chicopee 2 0.000117 0.000117 215.559681 35.431042 XKP BNKRP INKRPRTT XKP ST OF 2010 2010 0\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "10043 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177698 67483 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2001 2001 0\n", + "10051 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177702 67479 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2005 2005 0\n", + "10050 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177701 67480 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2004 2004 0\n", + 
"10049 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177699 67482 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2002 2002 0\n", + "10035 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177700 67481 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2003 2003 0\n", + "\n", + "[12713 rows x 40 columns]" + ] + }, + "execution_count": 392, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df.sort_values(by=\"match_probability\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "4f63fb3d-5fac-476d-9271-347412121902", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mozilla_sec_eia", + "language": "python", + "name": "mozilla_sec_eia" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/pyproject.toml b/pyproject.toml index c38ae7e..e0129cb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,7 +14,7 @@ readme = {file = "README.rst", content-type = "text/x-rst"} authors = [ {name = "Catalyst Cooperative", email = "pudl@catalyst.coop"} ] -requires-python = ">=3.10,<3.12" +requires-python = 
">=3.10,<=3.12" dynamic = ["version"] license = {file = "LICENSE.txt"} dependencies = [ @@ -30,6 +30,7 @@ dependencies = [ "google-cloud-secret-manager>=2,<3", "google-cloud-storage>=2,<3", "hypothesis", + "jellyfish>=1.1", "matplotlib>=3.8,<4", "mlflow>=2.12", "opencv-python", @@ -62,6 +63,7 @@ classifiers = [ "Programming Language :: Python :: 3 :: Only", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", ] keywords = [ "template", @@ -95,7 +97,7 @@ docs = [ "furo>=2022.4.7", "sphinx>=6,<8.1", # The default Python documentation engine "sphinx-autoapi>=2,<4", # Generates documentation from docstrings - "sphinx-issues>=1.2,<5", # Allows references to GitHub issues + "sphinx-issues>=5", # Allows references to GitHub issues ] tests = [ @@ -201,8 +203,8 @@ lint.ignore = [ "EXE002", ] -# Assume Python 3.11 -target-version = "py311" +# Assume Python 3.12 +target-version = "py312" line-length = 88 # Don't automatically concatenate strings -- sometimes we forget a comma! @@ -231,6 +233,6 @@ inline-quotes = "double" multiline-quotes = "double" [tool.mypy] -python_version = "3.10" +python_version = "3.12" warn_return_any = true warn_unused_configs = true diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py new file mode 100644 index 0000000..0fec63c --- /dev/null +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py @@ -0,0 +1 @@ +from . 
import preprocessing diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py new file mode 100644 index 0000000..9080cd7 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py @@ -0,0 +1,288 @@ +"""Preprocessing for EIA and SEC input data before record linkage.""" + +import jellyfish +import numpy as np +import pandas as pd + +from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive +from pudl.analysis.record_linkage import name_cleaner + +EIA_COL_MAP = { + "utility_name_eia": "company_name", # TODO: should be linking to owner or operator name? + "address_2": "street_address_2", +} + +EX21_COL_MAP = {"subsidiary": "company_name", "loc": "loc_of_incorporation"} + +SEC_COL_MAP = { + "company_conformed_name": "company_name", + "street_1": "street_address", + "street_2": "street_address_2", + "zip": "zip_code", + "business_phone": "phone_number", + "date_filed": "report_date", +} + +SHARED_COLS = [ + "report_date", + "report_year", + "company_name", + "street_address", + "street_address_2", + "city", + "state", # could use state of incorporation from SEC + "zip_code", + "phone_number", +] + +STR_COLS = [ + "company_name", + "street_address", + "street_address_2", + "city", + "state", + "zip_code", +] + +INVALID_NAMES = [ + "llc", + "limited liability company", + "limited", + "ltd", + "iiii", + "inc", + "incorporated", + "partnership", + "i", + "name", + "company", + "&", + "", +] + +state_code_dict = { + # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#States. 
+ "AK": "Alaska", + "AL": "Alabama", + "AR": "Arkansas", + "AZ": "Arizona", + "CA": "California", + "CO": "Colorado", + "CT": "Connecticut", + "DE": "Delaware", + "FL": "Florida", + "GA": "Georgia", + "HI": "Hawaii", + "IA": "Iowa", + "ID": "Idaho", + "IL": "Illinois", + "IN": "Indiana", + "KS": "Kansas", + "KY": "Kentucky", + "LA": "Louisiana", + "MA": "Massachusetts", + "MD": "Maryland", + "ME": "Maine", + "MI": "Michigan", + "MN": "Minnesota", + "MO": "Missouri", + "MS": "Mississippi", + "MT": "Montana", + "NC": "North Carolina", + "ND": "North Dakota", + "NE": "Nebraska", + "NH": "New Hampshire", + "NJ": "New Jersey", + "NM": "New Mexico", + "NV": "Nevada", + "NY": "New York", + "OH": "Ohio", + "OK": "Oklahoma", + "OR": "Oregon", + "PA": "Pennsylvania", + "RI": "Rhode Island", + "SC": "South Carolina", + "SD": "South Dakota", + "TN": "Tennessee", + "TX": "Texas", + "UT": "Utah", + "VA": "Virginia", + "VT": "Vermont", + "WA": "Washington", + "WI": "Wisconsin", + "WV": "West Virginia", + "WY": "Wyoming", + # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Federal_district. + "DC": "District of Columbia", + # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Inhabited_territories. + "AS": "American Samoa", + "GU": "Guam GU", + "MP": "Northern Mariana Islands", + "PR": "Puerto Rico PR", + "VI": "U.S. 
Virgin Islands", +} +state_code_to_name = {k.lower(): v.lower() for k, v in state_code_dict.items()} + +company_name_cleaner = name_cleaner.CompanyNameCleaner( + cleaning_rules_list=[ + "remove_word_the_from_the_end", + "remove_word_the_from_the_beginning", + "replace_amperstand_between_space_by_AND", + "replace_hyphen_by_space", + "replace_hyphen_between_spaces_by_single_space", + "replace_underscore_by_space", + "replace_underscore_between_spaces_by_single_space", + # "remove_all_punctuation", + # "remove_numbers", + # "remove_math_symbols", + "remove_words_in_parentheses", + "remove_parentheses", + "remove_brackets", + "remove_curly_brackets", + "enforce_single_space_between_words", + ] +) + + +def _add_report_year_to_sec(sec_df): + """Merge metadata on to get a report year for extracted SEC data. + + Expects filename to be the index of the SEC dataframe. + """ + archive = GCSArchive() + md = archive.get_metadata() + return sec_df.merge( + md[["date_filed"]], how="left", left_index=True, right_index=True + ) + + +# TODO: this is in PUDL, pull out into helper function +def _get_metaphone(row, col_name): + if pd.isnull(row[col_name]): + return None + return jellyfish.metaphone(row[col_name]) + + +def _clean_company_name(df): + df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning( + df[["company_name"]] + ) + df = df[df["company_name_clean"] != ""] + df = df.rename(columns={"company_name": "company_name_raw"}).rename( + columns={"company_name_clean": "company_name"} + ) + return df + + +def clean_sec_df(df): + """Shared cleaning for SEC 10K and Ex. 21 dataframes. + + Arguments: + df: Ex. 21 or SEC 10K basic info dataframe with columns + company_name, loc_of_incorporation, and report_year. 
+ """ + df[["company_name", "loc_of_incorporation"]] = ( + df[["company_name", "loc_of_incorporation"]] + .fillna(pd.NA) + .apply(lambda x: x.str.strip().str.lower()) + ) + df.loc[:, "company_name"] = df["company_name"].replace("", pd.NA) + df.loc[:, "loc_of_incorporation"] = df["loc_of_incorporation"].replace("", pd.NA) + df = _clean_company_name(df) + df = df[ + (~df["company_name"].isin(INVALID_NAMES)) + & ~(df["company_name_raw"].isin(INVALID_NAMES)) + ] + df = df.fillna(np.nan) + df = df.drop_duplicates( + subset=["company_name", "loc_of_incorporation", "report_year"] + ) + return df + + +def _remove_weird_sec_cols(sec_df): + for weird_col in ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"]: + if weird_col not in sec_df: + continue + normal_col = weird_col[1:] + sec_df.loc[:, normal_col] = sec_df[normal_col].where( + sec_df[weird_col].isnull(), sec_df[weird_col] + ) + return sec_df + + +# TODO: for now split these into separate cleaning functions +# later unite them into one cleaning function +def prepare_sec10k_basic_info_df(sec_df): + """Preprocess SEC 10k basic information dataframe for record linkage.""" + sec_df = _add_report_year_to_sec(sec_df) + sec_df = sec_df.rename(columns=SEC_COL_MAP).reset_index() + sec_df.loc[:, "report_year"] = ( + sec_df["report_date"].astype("datetime64[ns]").dt.year + ) + sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace( + state_code_to_name + ) + # TODO: maybe shouldn't expand the state names and comparison should + # just be an exact match or nothing? + # sec_df.loc[:, "state"] = sec_df["state"].replace(state_code_to_name) + # TODO: needs a record_id_sec column? 
+ # sec_df = sec_df.rename(columns={"record_id_sec": "record_id"}) + sec_df = _remove_weird_sec_cols(sec_df) + sec_df = clean_sec_df(sec_df) + sec_df[STR_COLS] = sec_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) + sec_df.loc[:, "company_name_mphone"] = sec_df.apply( + _get_metaphone, axis=1, args=("company_name",) + ) + sec_df = sec_df.reset_index(names="record_id") + return sec_df + + +def prepare_ex21_df(ex21_df): + """Preprocess Ex. 21 extracted dataframe for record linkage.""" + ex21_df = ex21_df.rename(columns=EX21_COL_MAP) + # TODO: move this to general preprocessing function? + ex21_df.loc[:, "loc_of_incorporation"] = ex21_df["loc_of_incorporation"].replace( + state_code_to_name + ) + ex21_df = clean_sec_df(ex21_df) + ex21_df.loc[:, "company_name_mphone"] = ex21_df.apply( + _get_metaphone, axis=1, args=("company_name",) + ) + ex21_df = ex21_df.reset_index(names="record_id") + return ex21_df + + +def prepare_eia_df(eia_df): + """Preprocess EIA utility dataframe for record linkage.""" + eia_df = eia_df.rename(columns=EIA_COL_MAP) + eia_df.loc[:, "report_year"] = ( + eia_df["report_date"].astype("datetime64[ns]").dt.year + ) + eia_df = eia_df.fillna(np.nan) + eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) + eia_df = _clean_company_name(eia_df) + eia_df.loc[:, "company_name_mphone"] = eia_df.apply( + _get_metaphone, axis=1, args=("company_name",) + ) + eia_df = eia_df.reset_index(names="record_id") + return eia_df + + +""" +def preprocessing(eia_df, sec_df): + # TODO: reorganize to be more similar to ferc to eia match structure + eia_df = eia_df.rename(columns=EIA_COL_MAP) + + # TODO: fill out this prepare for matching function + # eia_df = prepare_for_matching(eia_df) + # sec_df = prepare_for_matching(sec_df) + sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace( + state_code_to_name + ) + sec_df.loc[:, "loc_of_incorporation"] = sec_df["loc_of_incorporation"].where( + 
~sec_df["loc_of_incorporation"].isnull(), sec_df["city"] + ) + sec_df = sec_df.rename(columns={"record_id_sec": "record_id"}) + eia_df = eia_df.rename(columns={"record_id_eia": "record_id"}) +""" From 2dbdcaad4db68711985635f4cbc20f05b678bac8 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Tue, 22 Oct 2024 14:16:32 -0700 Subject: [PATCH 125/161] clean up feature creation in paragraph classifier --- .../exhibit21_layout_classifier.ipynb | 22 +++++++------------ 1 file changed, 8 insertions(+), 14 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb index 8315fc1..a45c2e3 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "ee4ed368-7d01-4cb8-952f-f7941900d669", "metadata": { "tags": [] @@ -71,14 +71,9 @@ " \"\"\"Compute features from bounding boxes in inference dataset.\"\"\"\n", " df = pd.DataFrame(record[\"bboxes\"], columns=BBOX_COLS_PDF)\n", " features = {}\n", - " features[\"n_bboxes\"] = len(df)\n", - "\n", - " # block density wasn't a very useful feature, maybe rework?\n", - " # Calculate the bounding box density of the area of the page with text\n", - " # x_width = df[\"bottom_right_x_pdf\"].max() - df[\"top_left_x_pdf\"].min()\n", - " # y_height = df[\"bottom_right_y_pdf\"].max() - df[\"top_left_y_pdf\"].min()\n", - " # text_area = x_width * y_height\n", - " # features[\"block_density\"] = features[\"n_bboxes\"] / text_area\n", + " \n", + " y_height = df[\"bottom_right_y_pdf\"].max() - df[\"top_left_y_pdf\"].min()\n", + " features[\"block_y_density\"] = len(df) / y_height\n", "\n", " # Calculate average y-distance between bounding boxes for a given document\n", " df = df.sort_values(by=[\"top_left_y_pdf\", 
\"top_left_x_pdf\"])\n", @@ -87,12 +82,11 @@ " features[\"std_y_distance\"] = y_diffs.std()\n", "\n", " # Calculate x-distance to assess horizontal alignment\n", - " x_diffs = df.groupby(\"top_left_y_pdf\")[\"top_left_x_pdf\"].apply(lambda x: x.diff().dropna())\n", - " features[\"avg_x_distance\"] = x_diffs.mean()\n", - " features[\"std_x_distance\"] = x_diffs.std()\n", + " x_diffs = df.groupby('line_group')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())\n", + " features['avg_x_distance'] = x_diffs.mean()\n", "\n", " # Define a small threshold to group bounding boxes that are on the same line\n", - " y_threshold = 0.1\n", + " y_threshold = 0.5\n", " df.loc[:, \"line_group\"] = (df[\"top_left_y_pdf\"].diff().fillna(0).abs() > y_threshold).cumsum()\n", " boxes_per_line = df.groupby(\"line_group\").size()\n", " features[\"median_boxes_per_line\"] = boxes_per_line.median()\n", @@ -329,7 +323,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.10" + "version": "3.12.0" } }, "nbformat": 4, From cda3225b885f6b705dea7d5e4fa66b20f89b1347 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Tue, 22 Oct 2024 14:49:56 -0700 Subject: [PATCH 126/161] fix feature creation function --- .../sec10k/notebooks/exhibit21_layout_classifier.ipynb | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb index a45c2e3..455910f 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb @@ -80,14 +80,15 @@ " y_diffs = df[\"top_left_y_pdf\"].diff().dropna()\n", " features[\"avg_y_distance\"] = y_diffs.mean()\n", " features[\"std_y_distance\"] = y_diffs.std()\n", + " \n", + " # Define a small threshold to group bounding boxes that are on the same line\n", + 
" y_threshold = 0.5\n", + " df.loc[:, 'line_group'] = (df['top_left_y_pdf'].diff().fillna(0).abs() > y_threshold).cumsum()\n", "\n", " # Calculate x-distance to assess horizontal alignment\n", " x_diffs = df.groupby('line_group')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())\n", " features['avg_x_distance'] = x_diffs.mean()\n", "\n", - " # Define a small threshold to group bounding boxes that are on the same line\n", - " y_threshold = 0.5\n", - " df.loc[:, \"line_group\"] = (df[\"top_left_y_pdf\"].diff().fillna(0).abs() > y_threshold).cumsum()\n", " boxes_per_line = df.groupby(\"line_group\").size()\n", " features[\"median_boxes_per_line\"] = boxes_per_line.median()\n", " return pd.Series(features)" From 509b7a0c23cc72460e447ff3d879a541d78382fb Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 22 Oct 2024 22:28:18 +0000 Subject: [PATCH 127/161] [pre-commit.ci] auto fixes from pre-commit.com hooks For more information, see https://pre-commit.ci --- .../sec10k/notebooks/exhibit21_layout_classifier.ipynb | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb index 455910f..a41f6d3 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_layout_classifier.ipynb @@ -71,7 +71,7 @@ " \"\"\"Compute features from bounding boxes in inference dataset.\"\"\"\n", " df = pd.DataFrame(record[\"bboxes\"], columns=BBOX_COLS_PDF)\n", " features = {}\n", - " \n", + "\n", " y_height = df[\"bottom_right_y_pdf\"].max() - df[\"top_left_y_pdf\"].min()\n", " features[\"block_y_density\"] = len(df) / y_height\n", "\n", @@ -80,14 +80,14 @@ " y_diffs = df[\"top_left_y_pdf\"].diff().dropna()\n", " features[\"avg_y_distance\"] = y_diffs.mean()\n", " 
features[\"std_y_distance\"] = y_diffs.std()\n", - " \n", + "\n", " # Define a small threshold to group bounding boxes that are on the same line\n", " y_threshold = 0.5\n", - " df.loc[:, 'line_group'] = (df['top_left_y_pdf'].diff().fillna(0).abs() > y_threshold).cumsum()\n", + " df.loc[:, \"line_group\"] = (df[\"top_left_y_pdf\"].diff().fillna(0).abs() > y_threshold).cumsum()\n", "\n", " # Calculate x-distance to assess horizontal alignment\n", - " x_diffs = df.groupby('line_group')['top_left_x_pdf'].apply(lambda x: x.diff().dropna())\n", - " features['avg_x_distance'] = x_diffs.mean()\n", + " x_diffs = df.groupby(\"line_group\")[\"top_left_x_pdf\"].apply(lambda x: x.diff().dropna())\n", + " features[\"avg_x_distance\"] = x_diffs.mean()\n", "\n", " boxes_per_line = df.groupby(\"line_group\").size()\n", " features[\"median_boxes_per_line\"] = boxes_per_line.median()\n", From 8855e5ed1451489aa7225cb1ffdcfd4607d9fc63 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Tue, 22 Oct 2024 15:45:19 -0700 Subject: [PATCH 128/161] small fixes to read in comments in tracking dataframe --- src/mozilla_sec_eia/library/validation_helpers.py | 3 ++- src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py | 3 +-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/mozilla_sec_eia/library/validation_helpers.py b/src/mozilla_sec_eia/library/validation_helpers.py index a05d4d0..adb278f 100644 --- a/src/mozilla_sec_eia/library/validation_helpers.py +++ b/src/mozilla_sec_eia/library/validation_helpers.py @@ -24,7 +24,8 @@ def load_validation_data( ) -> pd.DataFrame: """Load csv with validation data from `package_data` directory.""" df = pd.read_csv( - resources.files("mozilla_sec_eia.package_data.validation_data") / filename + resources.files("mozilla_sec_eia.package_data.validation_data") / filename, + comment="#", ) if index_cols is not None: df = df.set_index(index_cols) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py 
b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py index 5f79109..91617dd 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py @@ -86,8 +86,7 @@ def iob_to_label(label): def _is_cik_in_training_data(labeled_json_filename, tracking_df): - # TODO: for now CIK is stored as an int, update when fixed - cik = int(labeled_json_filename.split("/")[-1].split("-")[0]) + cik = labeled_json_filename.split("/")[-1].split("-")[0] return cik in tracking_df.CIK.unique() From 590ba60766b7b31501bb20c30e18329c6a499ef0 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Tue, 22 Oct 2024 22:18:33 -0700 Subject: [PATCH 129/161] updates to model pipeline --- .../models/sec10k/ex_21/data/common.py | 8 +- .../notebooks/exhibit21_extractor.ipynb | 129 ++++++------------ 2 files changed, 48 insertions(+), 89 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py index 91617dd..157b538 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py @@ -1,6 +1,7 @@ """Implement methods used to construct both inference and training sets.""" import json +import logging import os from pathlib import Path @@ -11,6 +12,8 @@ from ...utils.pdf import get_pdf_data_from_path +logger = logging.getLogger(f"catalystcoop.{__name__}") + LABEL_PRIORITY = [ "I-Subsidiary", "I-Loc", @@ -86,7 +89,8 @@ def iob_to_label(label): def _is_cik_in_training_data(labeled_json_filename, tracking_df): - cik = labeled_json_filename.split("/")[-1].split("-")[0] + cik = int(labeled_json_filename.split("/")[-1].split("-")[0]) + logger.warning(f"CIK: {cik}") return cik in tracking_df.CIK.unique() @@ -97,6 +101,7 @@ def format_label_studio_output( """Format Label Studio output JSONs into dataframe.""" labeled_df = pd.DataFrame() tracking_df = validation_helpers.load_training_data("ex21_labels.csv") + 
logger.warning(f"tracking_df: {tracking_df.CIK.unique()}") for json_filename in os.listdir(labeled_json_dir): if not json_filename[0].isdigit() or json_filename.endswith(".json"): @@ -105,6 +110,7 @@ def format_label_studio_output( with Path.open(json_file_path) as j: doc_dict = json.loads(j.read()) filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0] + logger.warning(f"FILENAME: {filename}") # check if old local naming schema is being used if len(filename.split("-")) == 6: filename = "-".join(filename.split("-")[2:]) diff --git a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb index 53e16c8..7e2852f 100644 --- a/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb +++ b/src/mozilla_sec_eia/models/sec10k/notebooks/exhibit21_extractor.ipynb @@ -38,29 +38,14 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": null, "id": "48f185de-95ef-4194-9245-93f8d603d2e6", "metadata": { "tags": [ "parameters" ] }, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-16 17:11:06 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_training_data using PickledObjectFilesystemIOManager...\n", - "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. 
If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-16 17:11:12 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_failed_parsing_metadata using PickledObjectFilesystemIOManager...\n", - "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-16 17:11:12 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_inference_dataset using PickledObjectFilesystemIOManager...\n", - "No dagster instance configuration file (dagster.yaml) found at /home/zach/catalyst/workspace. Defaulting to loading and storing all metadata with /home/zach/catalyst/workspace. If this is the desired behavior, create an empty dagster.yaml file in /home/zach/catalyst/workspace.\n", - "2024-10-16 17:11:15 -0400 - dagster - DEBUG - system - Loading file from: /home/zach/catalyst/workspace/storage/ex21_validation_set using PickledObjectFilesystemIOManager...\n" - ] - } - ], + "outputs": [], "source": [ "import dagstermill\n", "\n", @@ -443,14 +428,13 @@ " \"doc_dict\": model_inputs[\"doc_dict\"],\n", " }\n", "\n", - " def postprocess(self, all_outputs):\n", + " def postprocess(self, output_dict):\n", " \"\"\"Return logits, model predictions, and the extracted dataframe.\"\"\"\n", - " logits = all_outputs[\"logits\"]\n", - " predictions = all_outputs[\"logits\"].argmax(-1).squeeze().tolist()\n", - " output_df = self.extract_table(all_outputs)\n", - " return logits, predictions, output_df\n", + " output_df = self.extract_table(output_dict)\n", + " output_dict[\"output_df\"] = output_df\n", + " return output_dict\n", "\n", - " def extract_table(self, all_outputs):\n", + " def extract_table(self, output_dict):\n", " 
\"\"\"Extract a structured table from a set of inference predictions.\n", "\n", " This function essentially works by stacking bounding boxes and predictions\n", @@ -463,9 +447,9 @@ " \"\"\"\n", " # TODO: when model more mature, break this into sub functions to make it\n", " # clearer what's going on\n", - " predictions = all_outputs[\"predictions\"]\n", - " encoding = all_outputs[\"raw_encoding\"]\n", - " doc_dict = all_outputs[\"doc_dict\"]\n", + " predictions = output_dict[\"predictions\"]\n", + " encoding = output_dict[\"raw_encoding\"]\n", + " doc_dict = output_dict[\"doc_dict\"]\n", "\n", " token_boxes_tensor = encoding[\"bbox\"].flatten(start_dim=0, end_dim=1)\n", " predictions_tensor = torch.tensor(predictions)\n", @@ -496,6 +480,7 @@ " df = df.merge(words_df, how=\"left\", on=BBOX_COLS).drop_duplicates(\n", " subset=BBOX_COLS + [\"pred\", \"word\"]\n", " )\n", + " df = df.sort_values(by=[\"top_left_y\", \"top_left_x\"])\n", " # rows that are the first occurrence in a new group (subsidiary, loc, own_per)\n", " # should always have a B entity label. 
Manually override labels so this is true.\n", " first_in_group_df = df[\n", @@ -541,69 +526,22 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 1, "id": "4d802e00-1ca4-40b3-b15b-561711a9db70", "metadata": { "tags": [] }, "outputs": [ { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d0779d02915a4503b0cd92d3df38cf88", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading artifacts: 0%| | 0/1 [00:00, skipping schema inference\n" + "ename": "NameError", + "evalue": "name 'training_run_id' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[1], line 12\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmozilla_sec_eia\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mmodels\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01msec10k\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mex_21\u001b[39;00m\u001b[38;5;21;01m.\u001b[39;00m\u001b[38;5;21;01mex21_validation_helpers\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m (\n\u001b[1;32m 8\u001b[0m clean_extracted_df,\n\u001b[1;32m 9\u001b[0m )\n\u001b[1;32m 11\u001b[0m \u001b[38;5;66;03m# If a model was trained in this notebook, use it. 
Otherwise, use\u001b[39;00m\n\u001b[0;32m---> 12\u001b[0m model_uri \u001b[38;5;241m=\u001b[39m \u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mruns:/\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[43mtraining_run_id\u001b[49m\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m/layoutlm_extractor\u001b[39m\u001b[38;5;124m\"\u001b[39m\n\u001b[1;32m 13\u001b[0m model_info \u001b[38;5;241m=\u001b[39m mlflow\u001b[38;5;241m.\u001b[39mmodels\u001b[38;5;241m.\u001b[39mget_model_info(model_uri)\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_get_data\u001b[39m(dataset):\n", + "\u001b[0;31mNameError\u001b[0m: name 'training_run_id' is not defined" ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "601bb4ae91dd4a218fe5be047f4829d0", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "Downloading artifacts: 0%| | 0/17 [00:00 Date: Wed, 23 Oct 2024 09:53:30 -0700 Subject: [PATCH 130/161] take out logging messages --- src/mozilla_sec_eia/library/validation_helpers.py | 1 + src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py | 6 ------ 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/src/mozilla_sec_eia/library/validation_helpers.py b/src/mozilla_sec_eia/library/validation_helpers.py index adb278f..03044b2 100644 --- a/src/mozilla_sec_eia/library/validation_helpers.py +++ b/src/mozilla_sec_eia/library/validation_helpers.py @@ -92,6 +92,7 @@ def strip_down_company_names(ser: pd.Series) -> pd.Series: Used to compare subsidiary name columns during validation. 
""" + # TODO: unify with PUDL # this JSON is taken from PUDL package data (used for CompanyNameCleaner) json_source = ( resources.files("mozilla_sec_eia.package_data") / "us_legal_forms.json" diff --git a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py index 157b538..08b0440 100644 --- a/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py +++ b/src/mozilla_sec_eia/models/sec10k/ex_21/data/common.py @@ -1,7 +1,6 @@ """Implement methods used to construct both inference and training sets.""" import json -import logging import os from pathlib import Path @@ -12,8 +11,6 @@ from ...utils.pdf import get_pdf_data_from_path -logger = logging.getLogger(f"catalystcoop.{__name__}") - LABEL_PRIORITY = [ "I-Subsidiary", "I-Loc", @@ -90,7 +87,6 @@ def iob_to_label(label): def _is_cik_in_training_data(labeled_json_filename, tracking_df): cik = int(labeled_json_filename.split("/")[-1].split("-")[0]) - logger.warning(f"CIK: {cik}") return cik in tracking_df.CIK.unique() @@ -101,7 +97,6 @@ def format_label_studio_output( """Format Label Studio output JSONs into dataframe.""" labeled_df = pd.DataFrame() tracking_df = validation_helpers.load_training_data("ex21_labels.csv") - logger.warning(f"tracking_df: {tracking_df.CIK.unique()}") for json_filename in os.listdir(labeled_json_dir): if not json_filename[0].isdigit() or json_filename.endswith(".json"): @@ -110,7 +105,6 @@ def format_label_studio_output( with Path.open(json_file_path) as j: doc_dict = json.loads(j.read()) filename = doc_dict["task"]["data"]["ocr"].split("/")[-1].split(".")[0] - logger.warning(f"FILENAME: {filename}") # check if old local naming schema is being used if len(filename.split("-")) == 6: filename = "-".join(filename.split("-")[2:]) From 61c8abf9ffa357483607334bb68e6b9ac1c13c87 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Wed, 23 Oct 2024 12:52:03 -0700 Subject: [PATCH 131/161] make pudl editable --- environment.yml | 6 +++--- 
.../models/sec_eia_record_linkage/preprocessing.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 3ad1cd4..a902ea3 100644 --- a/environment.yml +++ b/environment.yml @@ -29,6 +29,6 @@ dependencies: # Use pip to install the package defined by this repo for development: - pip: - - git+https://github.com/catalyst-cooperative/pudl.git@main - # - -e /Users/katielamb/CatalystCoop/pudl[dev,docs,tests,types] - - --editable ./[dev,docs,tests,types] + # - git+https://github.com/catalyst-cooperative/pudl.git@main + - -e /Users/katielamb/CatalystCoop/pudl + - --editable ./[dev,docs,tests,types] diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py index 9080cd7..ebb7843 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py @@ -166,7 +166,7 @@ def _get_metaphone(row, col_name): def _clean_company_name(df): df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning( df[["company_name"]] - ) + ).str.strip() df = df[df["company_name_clean"] != ""] df = df.rename(columns={"company_name": "company_name_raw"}).rename( columns={"company_name_clean": "company_name"} From e5148d8c70477e158a154463185f2b9494f52fa8 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Wed, 27 Nov 2024 15:18:41 -0500 Subject: [PATCH 132/161] add in record linkage modules --- notebooks/16-kl-splink-ex21-filer-link.ipynb | 5833 +++++++++++------ notebooks/18-kl-splink-sec-eia.ipynb | 4305 ++++++++---- src/mozilla_sec_eia/models/sec10k/__init__.py | 10 +- .../models/sec10k/sec_output_table.py | 327 + .../models/sec10k/utils/cloud.py | 14 + .../create_eia_input.py | 76 + .../sec_eia_record_linkage/preprocessing.py | 218 +- .../package_data/formDStateCodes.xsd.xml | 328 + 8 files changed, 8005 insertions(+), 3106 deletions(-) create mode 100644 
src/mozilla_sec_eia/models/sec10k/sec_output_table.py create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py create mode 100644 src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml diff --git a/notebooks/16-kl-splink-ex21-filer-link.ipynb b/notebooks/16-kl-splink-ex21-filer-link.ipynb index 2e656d3..efef952 100644 --- a/notebooks/16-kl-splink-ex21-filer-link.ipynb +++ b/notebooks/16-kl-splink-ex21-filer-link.ipynb @@ -15,20 +15,50 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 260, "id": "e1222c94-36cd-4bae-95fb-089e5411e490", "metadata": { "tags": [] }, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[autoreload of mozilla_sec_eia.models.sec10k.utils.cloud failed: Traceback (most recent call last):\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 274, in check\n", + " superreload(m, reload, self.old_objects, self.shell)\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 500, in superreload\n", + " update_generic(old_obj, new_obj)\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 397, in update_generic\n", + " update(a, b)\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/IPython/extensions/autoreload.py\", line 335, in update_class\n", + " if (old_obj == new_obj) is True:\n", + " ^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py\", line 834, in __eq__\n", + " return dict(self.items()) == dict(other.items())\n", + " ^^^^^^^^^^^^^^^^^^\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/_collections_abc.py\", line 893, in __iter__\n", + " for key in self._mapping:\n", + 
" File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/pydantic/_internal/_mock_val_ser.py\", line 46, in __iter__\n", + " return self._get_built().__iter__()\n", + " ^^^^^^^^^^^^^^^^^\n", + " File \"/Users/katielamb/mambaforge/envs/mozilla_sec_eia/lib/python3.12/site-packages/pydantic/_internal/_mock_val_ser.py\", line 57, in _get_built\n", + " raise PydanticUserError(self._error_message, code=self._code)\n", + "pydantic.errors.PydanticUserError: Pydantic models should inherit from BaseModel, BaseModel cannot be instantiated directly\n", + "\n", + "For further information visit https://errors.pydantic.dev/2.9/u/base-model-instantiated\n", + "]\n" + ] + } + ], "source": [ "import numpy as np\n", "import pandas as pd\n", "from upath import UPath\n", "\n", - "# from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive\n", - "# from pudl.analysis.record_linkage import name_cleaner\n", - "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df" + "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename\n", + "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df, add_sec_company_id_to_subsidiaries" ] }, { @@ -42,372 +72,296 @@ { "cell_type": "code", "execution_count": 3, - "id": "67da3bf4-abbd-40c2-850b-1c73953625c8", + "id": "c29d0b75-759f-445c-adac-b2a6baf1fd0e", "metadata": { "tags": [] }, "outputs": [], "source": [ - "raw_eia_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet\")" + "# for now try just training on 2023\n", + "raw_sec_df = pd.concat([pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet\"),\n", + " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet\"),\n", + " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet\"),\n", + " 
pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet\"),\n", + " ]\n", + " )" ] }, { "cell_type": "code", "execution_count": 4, - "id": "28bdfdfd-beeb-4097-b4d3-b58a7c30f64d", + "id": "dbf3b15c-3a5a-4b74-a929-71aec18750a1", "metadata": { "tags": [] }, "outputs": [], "source": [ - "eia_df = raw_eia_df.copy()" + "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n", + "raw_sec_df.columns.name = None" ] }, { "cell_type": "code", "execution_count": 5, - "id": "ee54bb48-cbe4-4261-9545-d4b2bdcb731e", - "metadata": { - "tags": [] - }, + "id": "a8ec4fad-c92f-4cfc-a3d2-409a72a2df1e", + "metadata": {}, "outputs": [], "source": [ - "mergers_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet\")" + "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")" ] }, { "cell_type": "code", "execution_count": 6, - "id": "8e69b4ba-8e7b-4d17-bc8c-a06f059f6015", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "raw_eia861_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "ce60f760-5b94-4889-92c5-ac0ed5cd6d82", + "id": "8e7a642d-7718-4101-b851-f1f4ee07180e", "metadata": { "tags": [] }, "outputs": [], "source": [ - "missing_utils = raw_eia861_df[~raw_eia861_df.utility_id_eia.isin(raw_eia_df.utility_id_eia.unique())].utility_id_eia.unique()" + "raw_ex21_df = pd.DataFrame()\n", + "for file in ex21_path.iterdir():\n", + " if file.name.split(\".\")[-1] == \"parquet\":\n", + " report_year = file.name[:4]\n", + " # for now just train with 2023\n", + " if report_year != \"2023\":\n", + " continue\n", + " year_quarter_df = pd.read_parquet(ex21_path / file.name)\n", + " year_quarter_df.loc[:, \"report_year\"] = report_year\n", + " year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], 
format=\"%Y\").dt.year\n", + " raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])" ] }, { - "cell_type": "code", - "execution_count": 8, - "id": "a3ef2365-e459-44b3-94b0-77020cd606f2", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "f3d5db08-3c42-4715-9f0d-4d02674b828a", + "metadata": {}, "source": [ - "harvested_df = pd.concat([\n", - " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", - " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", - " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", - " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", - "])" + "# Preprocessing" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "59fd9d69-b700-43ec-bb7a-f99eea1e0ec9", - "metadata": { - "tags": [] - }, + "execution_count": null, + "id": "39706c77-90db-4f49-8011-47a9777a88b6", + "metadata": {}, "outputs": [], "source": [ - "eia861_df = raw_eia861_df.merge(harvested_df, on=[\"report_date\", \"utility_id_eia\"], how=\"left\").drop_duplicates(subset=[\"report_date\", \"utility_id_eia\"])" + "sec_df = prepare_sec10k_basic_info_df(raw_sec_df)" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "a47d17c1-0df1-412f-9687-3d540266f005", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 157, + "id": "98d4f59e-d61f-4a24-84bc-6caa0d761e07", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + 
"/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:233: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + " )\n" + ] + } + ], "source": [ - "mergers_df = mergers_df[mergers_df[\"new_parent\"].notna()]\n", - "eia861_df = eia861_df.merge(mergers_df[[\"report_date\", \"new_parent\", \"merge_address\", \"merge_city\", \"merge_state\"]], \n", - " how=\"left\", \n", - " left_on=[\"report_date\", \"utility_name_eia\"],\n", - " right_on=[\"report_date\", \"new_parent\"]\n", - " )\n", - "eia861_df = eia861_df.rename(columns={\"merge_address\": \"street_address\", \"merge_city\": \"city\"})\n", - "eia861_df = eia861_df.groupby([\"report_date\", \"utility_id_eia\"]).first().reset_index()" + "ex21_df = prepare_ex21_df(raw_ex21_df)" ] }, { "cell_type": "code", - "execution_count": 11, - "id": "fa6515b1-5012-4ec0-af96-f9fda11a9c5d", - "metadata": { - "tags": [] - }, + "execution_count": 69, + "id": "34a86ec8-5b6c-4147-8f94-021fa271174c", + "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
report_dateutility_id_eiastateutility_name_eianew_parentstreet_addresscitymerge_state
299332009-01-0117698LASouthwestern Electric Power CoSouthwestern Electric Power Co1 Riverside PlazaColumbusOH
332582010-01-0117698ARSouthwestern Electric Power CoSouthwestern Electric Power Co1 Riverside PlazaColumbusOH
490012015-01-0111788IAConsumers EnergyConsumers EnergyOne Enrgy PlazaJacksonMI
568532017-01-0119157IAMiEnergy CooperativeMiEnergy Cooperative31110 Cooperative WayRushfordMN
708202021-01-0140165AZDixie Escalante R E A, IncDixie Escalante R E A, Inc495 N 3200 WFlowellUT
\n", - "
" - ], "text/plain": [ - " report_date utility_id_eia state utility_name_eia new_parent street_address city merge_state\n", - "29933 2009-01-01 17698 LA Southwestern Electric Power Co Southwestern Electric Power Co 1 Riverside Plaza Columbus OH\n", - "33258 2010-01-01 17698 AR Southwestern Electric Power Co Southwestern Electric Power Co 1 Riverside Plaza Columbus OH\n", - "49001 2015-01-01 11788 IA Consumers Energy Consumers Energy One Enrgy Plaza Jackson MI\n", - "56853 2017-01-01 19157 IA MiEnergy Cooperative MiEnergy Cooperative 31110 Cooperative Way Rushford MN\n", - "70820 2021-01-01 40165 AZ Dixie Escalante R E A, Inc Dixie Escalante R E A, Inc 495 N 3200 W Flowell UT" + "True" ] }, - "execution_count": 11, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "eia861_df[(eia861_df.state != eia861_df.merge_state) & (eia861_df.merge_state.notna())]" + "ex21_df.record_id.is_unique" ] }, { "cell_type": "code", - "execution_count": 12, - "id": "8ff7b788-5fef-4e88-94ff-89b25619aed8", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 70, + "id": "505b0c45-1748-4517-8cac-d2acf2fa9037", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 70, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "eia861_df[\"state\"] = eia861_df[\"state\"].where(eia861_df[\"merge_state\"].isnull(), eia861_df[\"merge_state\"])" + "sec_df.record_id.is_unique" ] }, { "cell_type": "code", - "execution_count": 13, - "id": "17885342-b464-4f4d-ac75-b7be4d4ec7cc", + "execution_count": null, + "id": "11caf325-8530-430d-a3d2-a54043447021", "metadata": { "tags": [] }, "outputs": [], "source": [ - "eia861_df = eia861_df.drop(columns=[\"new_parent\", \"merge_state\"])" + "# sec_df has filename as unique ID\n", + "sec_df.filename.is_unique" ] }, { - "cell_type": "code", - "execution_count": 14, - "id": "fb71f68d-92da-468b-b8a5-02f5ba4b4459", - "metadata": { - 
"tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "ceed053b-f6ae-4aad-8b12-b2083ba8e236", + "metadata": {}, "source": [ - "eia_df = pd.concat([eia_df, eia861_df])" + "Note: not removing paragraph layout docs, but maybe should" ] }, { - "cell_type": "code", - "execution_count": 15, - "id": "85402523-e28a-4410-b933-eb71572b9a00", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "6de284e1-2b76-418d-ac5e-9a84bd275c51", + "metadata": {}, "source": [ - "eia_df = eia_df.drop_duplicates(subset=[\"utility_id_eia\", \"report_date\"], keep=\"first\")" + "# Try to just match on cleaned name and location" ] }, { "cell_type": "code", - "execution_count": 16, - "id": "94e824d6-dd6a-47db-9447-3363e8d14fe0", - "metadata": { - "tags": [] - }, + "execution_count": 170, + "id": "2c9a384d-a9e1-4e4a-829f-e92f1a007c90", + "metadata": {}, "outputs": [], "source": [ - "# not sure at what point this stops being a datetime\n", - "eia_df[\"report_date\"] = eia_df[\"report_date\"].astype(\"datetime64[ns]\")" + "sec_match_df = sec_df.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation\"])" ] }, { "cell_type": "code", - "execution_count": 17, - "id": "56857668-ecd5-4c62-9286-e50c334750c5", - "metadata": { - "tags": [] - }, + "execution_count": 179, + "id": "4bab406d-b1e0-495b-beee-90ae6b0c036b", + "metadata": {}, "outputs": [], "source": [ - "# there are nulls from non harvested 861 utilities\n", - "eia_df = eia_df.dropna(subset=\"utility_name_eia\")" + "merged_df = sec_match_df.merge(ex21_df, how=\"inner\", on=\"company_name\", suffixes=(\"_sec\", \"_ex21\"))" ] }, { "cell_type": "code", - "execution_count": 32, - "id": "c29d0b75-759f-445c-adac-b2a6baf1fd0e", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 185, + "id": "b8732fda-9f0a-412c-b7ba-8f307ee7b213", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 florida\n", + "1 delaware\n", + "2 missouri\n", + "3 
delaware\n", + "4 NaN\n", + " ... \n", + "515 delaware\n", + "516 delaware\n", + "517 delaware\n", + "518 delaware\n", + "519 delaware\n", + "Name: loc_of_incorporation_sec, Length: 520, dtype: object" + ] + }, + "execution_count": 185, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# for now try just training on 2023\n", - "raw_sec_df = pd.concat([pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q1.parquet\"),\n", - " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q2.parquet\"),\n", - " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q3.parquet\"),\n", - " pd.read_parquet(\"gs://sec10k-outputs/v2/basic_10k_company_info/2023q4.parquet\"),\n", - " ]\n", - " )" + "merged_df[\"loc_of_incorporation_sec\"]" ] }, { "cell_type": "code", - "execution_count": 52, - "id": "dbf3b15c-3a5a-4b74-a929-71aec18750a1", - "metadata": { - "tags": [] - }, + "execution_count": 209, + "id": "3427d77c-3c3f-4a05-99db-7f96d3f0f193", + "metadata": {}, "outputs": [], "source": [ - "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n", - "raw_sec_df.columns.name = None" + "merged_df.loc[:, \"loc_tokens_sec\"] = merged_df[\"loc_of_incorporation_sec\"].fillna(\"\").str.lower().str.split()\n", + "merged_df.loc[:, \"loc_tokens_ex21\"] = merged_df[\"loc_of_incorporation_ex21\"].fillna(\"\").str.lower().str.split()\n", + "merged_df[\"loc_overlap\"] = merged_df.apply(\n", + " lambda row: len(set(row[\"loc_tokens_sec\"]) & set(row[\"loc_tokens_ex21\"])), axis=1\n", + ")\n", + "\n", + "# Select the row with the highest word overlap for each CIK and company name\n", + "closest_match = merged_df.loc[merged_df.groupby([\"central_index_key\", \"company_name\"])['loc_overlap'].idxmax()].reset_index(drop=True)" ] }, { "cell_type": "code", - "execution_count": 24, - "id": "a8ec4fad-c92f-4cfc-a3d2-409a72a2df1e", + "execution_count": 210, + "id": 
"92cc6570-f34c-4782-9bbf-0cdeaf2ce044", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "False 480\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 210, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")" + "# this should be 0\n", + "closest_match.duplicated(subset=[\"company_name\", \"loc_of_incorporation_ex21\"]).value_counts()" ] }, { "cell_type": "code", - "execution_count": 35, - "id": "8e7a642d-7718-4101-b851-f1f4ee07180e", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 200, + "id": "d0c650d0-303d-43a4-9ae3-35c4fb6d481b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "central_index_key\n", + "False 480\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 200, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "raw_ex21_df = pd.DataFrame()\n", - "for file in ex21_path.iterdir():\n", - " if file.name.split(\".\")[-1] == \"parquet\":\n", - " report_year = file.name[:4]\n", - " # for now just train with 2023\n", - " if report_year != \"2023\":\n", - " continue\n", - " year_quarter_df = pd.read_parquet(ex21_path / file.name)\n", - " year_quarter_df.loc[:, \"report_year\"] = report_year\n", - " year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n", - " raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])" + "# it's okay if there's duplication here, but not ideal\n", + "# multiple subsidiaries can point to the same CIK\n", + "closest_match.central_index_key.duplicated().value_counts()" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "7daad7a6-c590-4324-9e31-2bb5c9fa4d6c", - "metadata": { - "tags": [] - }, + "execution_count": 201, + "id": "2b3a2c1f-7df4-4515-8727-a339303ebd4e", + "metadata": {}, "outputs": [ { "data": { @@ -430,588 +384,570 @@ " \n", " \n", " 
\n", - " utility_id_eia\n", - " utility_id_pudl\n", - " utility_name_eia\n", - " report_date\n", - " street_address\n", + " record_id_sec\n", + " filename\n", + " phone_number\n", + " central_index_key\n", " city\n", + " company_name_raw_sec\n", + " date_of_name_change\n", + " film_number\n", + " fiscal_year_end\n", + " form_type\n", + " former_conformed_name\n", + " irs_number\n", + " organization_name\n", + " sec_act\n", + " sec_file_number\n", + " standard_industrial_classification\n", " state\n", + " state_of_incorporation\n", + " street_address\n", + " street_address_2\n", " zip_code\n", - " plants_reported_owner\n", - " plants_reported_operator\n", - " ...\n", - " contact_lastname\n", - " contact_title\n", - " phone_number\n", - " phone_extension\n", - " contact_firstname_2\n", - " contact_lastname_2\n", - " contact_title_2\n", - " phone_number_2\n", - " phone_extension_2\n", - " data_maturity\n", + " report_date\n", + " report_year_sec\n", + " loc_of_incorporation_sec\n", + " company_name\n", + " company_name_no_legal_sec\n", + " company_name_mphone_sec\n", + " record_id_ex21\n", + " id\n", + " company_name_raw_ex21\n", + " loc_of_incorporation_ex21\n", + " own_per\n", + " report_year_ex21\n", + " company_name_no_legal_ex21\n", + " company_name_mphone_ex21\n", + " loc_tokens_sec\n", + " loc_tokens_ex21\n", + " loc_overlap\n", " \n", " \n", " \n", " \n", - " 33\n", - " 66292\n", - " 16386.0\n", - " Desert Willow Energy Storage\n", - " 2023-01-01\n", - " 100 Bayview Circle\n", - " Newport Beach\n", - " CA\n", - " None\n", - " None\n", - " None\n", - " ...\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " provisional\n", - " \n", - " \n", - " 35\n", - " 66291\n", - " 16385.0\n", - " Portage Solar Plant\n", - " 2023-01-01\n", - " N8917\n", - " Portage\n", - " WI\n", - " 53901\n", - " None\n", - " None\n", - " ...\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " 
None\n", - " None\n", - " None\n", - " None\n", - " provisional\n", + " 0\n", + " 7990\n", + " edgar/data/910638/0000910638-23-000009.txt\n", + " 8033263900\n", + " 0000910638\n", + " rock hill\n", + " 3d systems corp\n", + " 19930816\n", + " 23738595\n", + " 1231\n", + " 10-k\n", + " 3 d systems corp\n", + " 954431352\n", + " NaN\n", + " 1934 act\n", + " 001-34220\n", + " services-prepackaged software [7372]\n", + " sc\n", + " de\n", + " 333 three d systems circle\n", + " NaN\n", + " 29730\n", + " 2023-03-16\n", + " 2023\n", + " delaware\n", + " 3d systems corporation\n", + " 3d systems\n", + " T SSTMS\n", + " 150739\n", + " 910638-0000910638-23-000009\n", + " 3d systems corporation\n", + " delaware\n", + " NaN\n", + " 2023\n", + " 3d systems\n", + " T SSTMS\n", + " [delaware]\n", + " [delaware]\n", + " 1\n", " \n", " \n", - " 37\n", - " 66290\n", - " 16384.0\n", - " NSF Energy One LLC\n", - " 2023-01-01\n", - " 1241 University Ave\n", - " Rochester\n", - " NY\n", - " 14607\n", - " None\n", - " None\n", - " ...\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " None\n", - " provisional\n", + " 1\n", + " 7526\n", + " edgar/data/824142/0000824142-23-000019.txt\n", + " 9185832266\n", + " 0000824142\n", + " tulsa\n", + " aaon, inc.\n", + " 19920703\n", + " 23675207\n", + " 1231\n", + " 10-k\n", + " aaon inc\n", + " 870448736\n", + " NaN\n", + " 1934 act\n", + " 000-18953\n", + " air cond & warm air heating equip & comm & ind...\n", + " ok\n", + " nv\n", + " 2425 south yukon ave.\n", + " NaN\n", + " 74107\n", + " 2023-02-27\n", + " 2023\n", + " nevada\n", + " aaon incorporated\n", + " aaon\n", + " N\n", + " 142821\n", + " 824142-0000824142-23-000019\n", + " aaon, inc\n", + " oklahoma\n", + " NaN\n", + " 2023\n", + " aaon\n", + " N\n", + " [nevada]\n", + " [oklahoma]\n", + " 0\n", " \n", " \n", "\n", - "

3 rows × 27 columns

\n", "" ], "text/plain": [ - " utility_id_eia utility_id_pudl utility_name_eia report_date \\\n", - "33 66292 16386.0 Desert Willow Energy Storage 2023-01-01 \n", - "35 66291 16385.0 Portage Solar Plant 2023-01-01 \n", - "37 66290 16384.0 NSF Energy One LLC 2023-01-01 \n", - "\n", - " street_address city state zip_code plants_reported_owner \\\n", - "33 100 Bayview Circle Newport Beach CA None None \n", - "35 N8917 Portage WI 53901 None \n", - "37 1241 University Ave Rochester NY 14607 None \n", - "\n", - " plants_reported_operator ... contact_lastname contact_title phone_number \\\n", - "33 None ... None None None \n", - "35 None ... None None None \n", - "37 None ... None None None \n", - "\n", - " phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 \\\n", - "33 None None None None \n", - "35 None None None None \n", - "37 None None None None \n", - "\n", - " phone_number_2 phone_extension_2 data_maturity \n", - "33 None None provisional \n", - "35 None None provisional \n", - "37 None None provisional \n", - "\n", - "[3 rows x 27 columns]" + " record_id_sec filename phone_number central_index_key city company_name_raw_sec date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number standard_industrial_classification state state_of_incorporation street_address street_address_2 zip_code report_date report_year_sec loc_of_incorporation_sec company_name company_name_no_legal_sec company_name_mphone_sec record_id_ex21 id company_name_raw_ex21 loc_of_incorporation_ex21 own_per report_year_ex21 company_name_no_legal_ex21 company_name_mphone_ex21 loc_tokens_sec loc_tokens_ex21 loc_overlap\n", + "0 7990 edgar/data/910638/0000910638-23-000009.txt 8033263900 0000910638 rock hill 3d systems corp 19930816 23738595 1231 10-k 3 d systems corp 954431352 NaN 1934 act 001-34220 services-prepackaged software [7372] sc de 333 three d systems circle NaN 29730 2023-03-16 2023 delaware 3d 
systems corporation 3d systems T SSTMS 150739 910638-0000910638-23-000009 3d systems corporation delaware NaN 2023 3d systems T SSTMS [delaware] [delaware] 1\n", + "1 7526 edgar/data/824142/0000824142-23-000019.txt 9185832266 0000824142 tulsa aaon, inc. 19920703 23675207 1231 10-k aaon inc 870448736 NaN 1934 act 000-18953 air cond & warm air heating equip & comm & ind... ok nv 2425 south yukon ave. NaN 74107 2023-02-27 2023 nevada aaon incorporated aaon N 142821 824142-0000824142-23-000019 aaon, inc oklahoma NaN 2023 aaon N [nevada] [oklahoma] 0" ] }, - "execution_count": 25, + "execution_count": 201, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "eia_df[(eia_df.street_address.notnull())].head(3)" + "closest_match.head(2)" ] }, { "cell_type": "code", - "execution_count": 26, - "id": "30c02757-45c0-403c-aa38-7422d3549a2b", - "metadata": { - "tags": [] - }, + "execution_count": 241, + "id": "78dfc42c-3921-444e-8342-d34fc2fd1a7a", + "metadata": {}, "outputs": [], "source": [ - "eia_subset = eia_df[eia_df.report_date == \"2020-01-01\"]" + "ex21_with_cik = ex21_df.merge(\n", + " closest_match[[\"company_name\", \"central_index_key\", \"loc_of_incorporation_ex21\"]].rename(columns={\"loc_of_incorporation_ex21\": \"loc_of_incorporation\"}),\n", + " how=\"left\",\n", + " on=[\"company_name\", \"loc_of_incorporation\"],\n", + ").rename(columns={\"central_index_key\": \"subsidiary_cik\"})" ] }, { "cell_type": "code", - "execution_count": 58, - "id": "1c0365a3-51d2-455b-8863-bc4dc22572f9", - "metadata": { - "tags": [] - }, + "execution_count": 242, + "id": "1f4bca08-3a65-484d-ac6b-cb7d4584b4e7", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = ex21_with_cik.merge(closest_match[[\"company_name\", \"central_index_key\"]],\n", + " how=\"left\",\n", + " on=\"company_name\"\n", + " ).rename(columns={\"central_index_key\": \"company_name_merge_cik\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 243, + "id": 
"5462d9bb-23dd-45fb-b5bf-35396caba399", + "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
utility_id_eiautility_id_pudlutility_name_eiareport_datestreet_addresscitystatezip_codeplants_reported_ownerplants_reported_operator...contact_lastnamecontact_titlephone_numberphone_extensioncontact_firstname_2contact_lastname_2contact_title_2phone_number_2phone_extension_2data_maturity
71566541690.0Duke Energy Corp2010-01-01P O Box 1006CharlotteNC28202NoneNone...NoneNoneNoneNoneNoneNoneNoneNoneNonefinal
71568541690.0Duke Energy Corp2008-01-01NoneCharlotteNC28201NoneNone...NoneNoneNoneNoneNoneNoneNoneNoneNonefinal
71569541690.0Duke Energy Corp2007-01-01NoneCharlotteNC28201NoneNone...AshcraftSr. Engineering TechnologistNoneNoneRobertMc MurryDir Carolinas Integrated ResouNoneNonefinal
71570541690.0Duke Energy Corp2006-01-01NoneCharlotteNC28201NoneNone...DuckworthPlanning Engineer704-382-4327382StevenJesterDirector, Rate Admn & Cust Inq704-382-4887Nonefinal
71571541690.0Duke Energy Corp2005-01-01NoneCharlotteNC28201NoneNone...DuckworthPlanning Engineer704-382-4327382StevenJesterDirector, Rate Admn & Cust Inq704-382-4887Nonefinal
71572541690.0Duke Energy Corp2004-01-01NoneCharlotteNC28201NoneNone...DuckworthPlanning Engineer704-382-43270StevenJesterDirector, Rate Admn & Cust Inq704-382-4887Nonefinal
71573541690.0Duke Energy Corp2003-01-01NoneCharlotteNC28201NoneNone...DuckworthProcess LeaderNone0StevenJesterNoneNoneNonefinal
71574541690.0Duke Energy Corp2002-01-01NoneCharlotteNC28201NoneNone...Scott HenryProcess LeaderNone0NoneNoneMgr Reg Policy $ ResNoneNonefinal
71575541690.0Duke Energy Corp2001-01-01NoneCharlotteNC28201NoneNone...R S HenryNoneNone0NoneNoneMgr Operating Plann & AnalysisNoneNonefinal
\n", - "

9 rows × 27 columns

\n", - "
" - ], "text/plain": [ - " utility_id_eia utility_id_pudl utility_name_eia report_date \\\n", - "71566 5416 90.0 Duke Energy Corp 2010-01-01 \n", - "71568 5416 90.0 Duke Energy Corp 2008-01-01 \n", - "71569 5416 90.0 Duke Energy Corp 2007-01-01 \n", - "71570 5416 90.0 Duke Energy Corp 2006-01-01 \n", - "71571 5416 90.0 Duke Energy Corp 2005-01-01 \n", - "71572 5416 90.0 Duke Energy Corp 2004-01-01 \n", - "71573 5416 90.0 Duke Energy Corp 2003-01-01 \n", - "71574 5416 90.0 Duke Energy Corp 2002-01-01 \n", - "71575 5416 90.0 Duke Energy Corp 2001-01-01 \n", - "\n", - " street_address city state zip_code plants_reported_owner \\\n", - "71566 P O Box 1006 Charlotte NC 28202 None \n", - "71568 None Charlotte NC 28201 None \n", - "71569 None Charlotte NC 28201 None \n", - "71570 None Charlotte NC 28201 None \n", - "71571 None Charlotte NC 28201 None \n", - "71572 None Charlotte NC 28201 None \n", - "71573 None Charlotte NC 28201 None \n", - "71574 None Charlotte NC 28201 None \n", - "71575 None Charlotte NC 28201 None \n", - "\n", - " plants_reported_operator ... contact_lastname \\\n", - "71566 None ... None \n", - "71568 None ... None \n", - "71569 None ... Ashcraft \n", - "71570 None ... Duckworth \n", - "71571 None ... Duckworth \n", - "71572 None ... Duckworth \n", - "71573 None ... Duckworth \n", - "71574 None ... Scott Henry \n", - "71575 None ... R S Henry \n", - "\n", - " contact_title phone_number phone_extension \\\n", - "71566 None None None \n", - "71568 None None None \n", - "71569 Sr. 
Engineering Technologist None None \n", - "71570 Planning Engineer 704-382-4327 382 \n", - "71571 Planning Engineer 704-382-4327 382 \n", - "71572 Planning Engineer 704-382-4327 0 \n", - "71573 Process Leader None 0 \n", - "71574 Process Leader None 0 \n", - "71575 None None 0 \n", - "\n", - " contact_firstname_2 contact_lastname_2 contact_title_2 \\\n", - "71566 None None None \n", - "71568 None None None \n", - "71569 Robert Mc Murry Dir Carolinas Integrated Resou \n", - "71570 Steven Jester Director, Rate Admn & Cust Inq \n", - "71571 Steven Jester Director, Rate Admn & Cust Inq \n", - "71572 Steven Jester Director, Rate Admn & Cust Inq \n", - "71573 Steven Jester None \n", - "71574 None None Mgr Reg Policy $ Res \n", - "71575 None None Mgr Operating Plann & Analysis \n", - "\n", - " phone_number_2 phone_extension_2 data_maturity \n", - "71566 None None final \n", - "71568 None None final \n", - "71569 None None final \n", - "71570 704-382-4887 None final \n", - "71571 704-382-4887 None final \n", - "71572 704-382-4887 None final \n", - "71573 None None final \n", - "71574 None None final \n", - "71575 None None final \n", - "\n", - "[9 rows x 27 columns]" + "subsidiary_cik\n", + "True 191387\n", + "False 480\n", + "Name: count, dtype: int64" ] }, - "execution_count": 58, + "execution_count": 243, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "eia_df[(eia_df.utility_name_eia.str.contains(\"Duke Energy Corp\")) & (eia_df.state == \"NC\")].drop_duplicates()" - ] - }, - { - "cell_type": "markdown", - "id": "f3d5db08-3c42-4715-9f0d-4d02674b828a", - "metadata": {}, - "source": [ - "# Preprocessing" + "ex21_with_cik.subsidiary_cik.isnull().value_counts()" ] }, { "cell_type": "code", - "execution_count": 82, - "id": "39706c77-90db-4f49-8011-47a9777a88b6", + "execution_count": 244, + "id": "a38c45ad-56f3-49ad-bd62-fb91c4d89940", "metadata": {}, "outputs": [], "source": [ - "sec_df = prepare_sec10k_basic_info_df(raw_sec_df)" + "# if a subsidiary 
doesn't have a CIK and has a null location\n", + "# but its name was assigned a CIK (with a different location)\n", + "# then assign that CIK to the subsidiary\n", + "ex21_with_cik[\"subsidiary_cik\"] = ex21_with_cik[\"subsidiary_cik\"].where(\n", + " ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), \n", + " ex21_with_cik[\"company_name_merge_cik\"]\n", + ")" ] }, { "cell_type": "code", - "execution_count": 83, - "id": "98d4f59e-d61f-4a24-84bc-6caa0d761e07", + "execution_count": 245, + "id": "4cca9da1-8371-4b45-b88d-8c2911209707", "metadata": {}, - "outputs": [], - "source": [ - "ex21_df = prepare_ex21_df(raw_ex21_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 84, - "id": "11caf325-8530-430d-a3d2-a54043447021", - "metadata": { - "tags": [] - }, "outputs": [ { "data": { "text/plain": [ - "True" + "subsidiary_cik\n", + "True 191386\n", + "False 481\n", + "Name: count, dtype: int64" ] }, - "execution_count": 84, + "execution_count": 245, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# sec_df has filename as unique ID\n", - "sec_df.filename.is_unique" + "ex21_with_cik.subsidiary_cik.isnull().value_counts()" ] }, { - "cell_type": "markdown", - "id": "ceed053b-f6ae-4aad-8b12-b2083ba8e236", + "cell_type": "code", + "execution_count": 252, + "id": "e5b57a88-ffaa-4834-bea4-c5b4779bd551", "metadata": {}, + "outputs": [], "source": [ - "Note: not removing paragraph layout docs, but maybe should" + "archive = GCSArchive()\n", + "md = archive.get_metadata()" ] }, { - "cell_type": "markdown", - "id": "1bb694c9-cfbd-4e2f-b69c-9996a588d2d2", - "metadata": { - "tags": [] - }, + "cell_type": "code", + "execution_count": 261, + "id": "a33be6e3-056f-4e4a-acd4-9a6dc6f98c90", + "metadata": {}, + "outputs": [], "source": [ - "# Match Ex. 
21 Subsidiaries to a SEC filer" + "ex21_with_cik.loc[:, \"filename\"] = convert_ex21_id_to_filename(ex21_with_cik)" ] }, { - "cell_type": "markdown", - "id": "01d3a5e1-ad17-4266-b2ef-358f246749db", - "metadata": { - "tags": [] - }, + "cell_type": "code", + "execution_count": 263, + "id": "d0dec8af-d730-4a06-af5e-f390fa228ac8", + "metadata": {}, + "outputs": [], "source": [ - "## Preprocessing" + "ex21_with_cik = ex21_with_cik.merge(md[\"cik\"], how=\"left\", left_on=\"filename\", right_index=True).rename(columns={\"cik\": \"parent_cik\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 264, + "id": "228a1d4b-bc19-49eb-b557-4f26d1febbd9", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)" + ] + }, + { + "cell_type": "code", + "execution_count": 265, + "id": "c1b88c44-81d7-4d9d-a2a3-be1b030348bd", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
record_ididcompany_name_rawloc_of_incorporationown_perreport_yearcompany_namecompany_name_no_legalcompany_name_mphonesubsidiary_cikcompany_name_merge_cikfilenameparent_ciksec_company_id
1644821644821000045-0000950170-23-030037nicholas data services, incflorida100.02023nicholas data services incorporatednicholas data servicesNXLS TT SRFSSNaNNaNedgar/data/1000045/0000950170-23-030037.txt10000451000045_1
1644811644811000045-0000950170-23-030037nicholas financial, incflorida100.02023nicholas financial incorporatednicholas financialNXLS FNNXL00010000450001000045edgar/data/1000045/0000950170-23-030037.txt10000450001000045
89891000209-0000950170-23-007273medallion bankutahNaN2023medallion bankmedallion bankMTLN BNKNaNNaNedgar/data/1000209/0000950170-23-007273.txt10002091000209_1
88881000209-0000950170-23-007273freshstart venture capital corpnew yorkNaN2023freshstart venture capital corporationfreshstart venture capitalFRXSTRT FNTR KPTLNaNNaNedgar/data/1000209/0000950170-23-007273.txt10002091000209_2
87871000209-0000950170-23-007273medallion capital, incminnesotaNaN2023medallion capital incorporatedmedallion capitalMTLN KPTLNaNNaNedgar/data/1000209/0000950170-23-007273.txt10002091000209_3
.............................................
1619571619579984-0000009984-23-000060barnes molding solutions korea limitedkoreaNaN2023barnes molding solutions korea limitedbarnes molding solutions koreaBRNS MLTNK SLXNS KRNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_99
1619561619569984-0000009984-23-000060barnes molding solutions (jiangsu) co., ltdchinaNaN2023barnes molding solutions company limitedbarnes molding solutionsBRNS MLTNK SLXNSNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_100
1619551619559984-0000009984-23-000060barnes korea ltdkoreaNaN2023barnes korea limitedbarnes koreaBRNS KRNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_101
1619651619659984-0000009984-23-000060gimatic automation india pvt ltdindiaNaN2023gimatic automation india pvt limitedgimatic automation india pvtJMTK ATMXN INT PFTNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_102
1620181620189984-0000009984-23-000060synventive molding solutions ltdabrazilNaN2023synventive molding solutions ltdasynventive molding solutions ltdaSNFNTF MLTNK SLXNS LTTNaNNaNedgar/data/9984/0000009984-23-000060.txt99849984_103
\n", + "

191867 rows × 14 columns

\n", + "
" + ], + "text/plain": [ + " record_id id company_name_raw loc_of_incorporation own_per report_year company_name company_name_no_legal company_name_mphone subsidiary_cik company_name_merge_cik filename parent_cik sec_company_id\n", + "164482 164482 1000045-0000950170-23-030037 nicholas data services, inc florida 100.0 2023 nicholas data services incorporated nicholas data services NXLS TT SRFSS NaN NaN edgar/data/1000045/0000950170-23-030037.txt 1000045 1000045_1\n", + "164481 164481 1000045-0000950170-23-030037 nicholas financial, inc florida 100.0 2023 nicholas financial incorporated nicholas financial NXLS FNNXL 0001000045 0001000045 edgar/data/1000045/0000950170-23-030037.txt 1000045 0001000045\n", + "89 89 1000209-0000950170-23-007273 medallion bank utah NaN 2023 medallion bank medallion bank MTLN BNK NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_1\n", + "88 88 1000209-0000950170-23-007273 freshstart venture capital corp new york NaN 2023 freshstart venture capital corporation freshstart venture capital FRXSTRT FNTR KPTL NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_2\n", + "87 87 1000209-0000950170-23-007273 medallion capital, inc minnesota NaN 2023 medallion capital incorporated medallion capital MTLN KPTL NaN NaN edgar/data/1000209/0000950170-23-007273.txt 1000209 1000209_3\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "161957 161957 9984-0000009984-23-000060 barnes molding solutions korea limited korea NaN 2023 barnes molding solutions korea limited barnes molding solutions korea BRNS MLTNK SLXNS KR NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_99\n", + "161956 161956 9984-0000009984-23-000060 barnes molding solutions (jiangsu) co., ltd china NaN 2023 barnes molding solutions company limited barnes molding solutions BRNS MLTNK SLXNS NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_100\n", + "161955 161955 9984-0000009984-23-000060 barnes korea ltd korea NaN 2023 barnes korea limited barnes korea BRNS KR NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_101\n", + "161965 161965 9984-0000009984-23-000060 gimatic automation india pvt ltd india NaN 2023 gimatic automation india pvt limited gimatic automation india pvt JMTK ATMXN INT PFT NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_102\n", + "162018 162018 9984-0000009984-23-000060 synventive molding solutions ltda brazil NaN 2023 synventive molding solutions ltda synventive molding solutions ltda SNFNTF MLTNK SLXNS LTT NaN NaN edgar/data/9984/0000009984-23-000060.txt 9984 9984_103\n", + "\n", + "[191867 rows x 14 columns]" + ] + }, + "execution_count": 265, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_with_cik" + ] + }, + { + "cell_type": "code", + "execution_count": 266, + "id": "192d3cac-b156-4e5c-8148-0cbdc3e8900d", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik.to_parquet(\"ex21_2023.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "1bb694c9-cfbd-4e2f-b69c-9996a588d2d2", + "metadata": { + "tags": [] + }, + "source": [ + "# Match Ex. 
21 Subsidiaries to a SEC filer" + ] + }, + { + "cell_type": "markdown", + "id": "01d3a5e1-ad17-4266-b2ef-358f246749db", + "metadata": { + "tags": [] + }, + "source": [ + "## Preprocessing" ] }, { @@ -1136,7 +1072,7 @@ }, { "cell_type": "code", - "execution_count": 86, + "execution_count": 24, "id": "24890018-8efb-445f-ad91-ca316edccbe8", "metadata": {}, "outputs": [], @@ -1146,7 +1082,7 @@ }, { "cell_type": "code", - "execution_count": 87, + "execution_count": 25, "id": "83f859df-1764-4e97-addc-0064bdcb31b7", "metadata": { "tags": [] @@ -1156,12 +1092,12 @@ "data": { "text/plain": [ "loc_of_incorporation\n", - "False 6359\n", - "True 748\n", + "False 6382\n", + "True 749\n", "Name: count, dtype: int64" ] }, - "execution_count": 87, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1172,7 +1108,7 @@ }, { "cell_type": "code", - "execution_count": 88, + "execution_count": 26, "id": "e9d0828f-0ad8-41ea-a449-ddd274a888d0", "metadata": { "tags": [] @@ -1192,7 +1128,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 71, "id": "4ca07927-185d-4bc6-978a-e8788a8f77b3", "metadata": { "tags": [] @@ -1202,30 +1138,30 @@ "data": { "text/plain": [ "company_name\n", - "rush truck center 120\n", - "encompass health rehabilitation hospital 79\n", - "rush peterbilt truck center 57\n", - "branch 52\n", - "sci funeral services, llc iowa limited liability company 33\n", - "partnership limited partnership 32\n", - "alderwoods group, llc de limited liability company 27\n", - "encompass health rehabilitation hospital of 26\n", - "u haul co. 
of 26\n", - "at&t 25\n", - "corporation 21\n", - "amh portfolio management 20\n", - "rush bus center 20\n", - "limited partnership limited partnership 18\n", - "rapy limited partnership 15\n", - "rush isuzu trucks 15\n", - "colgate palmolive limited 14\n", - "ecolab limited 11\n", - "rush truck centres 11\n", - "johnson and johnson limited 11\n", + "rush truck center 120\n", + "encompass health rehabilitation hospital 79\n", + "rush peterbilt truck center 57\n", + "branch 52\n", + "sci funeral services llc iowa limited liability company 33\n", + "partnership limited partnership 32\n", + "alderwoods group llc de limited liability company 27\n", + "encompass health rehabilitation hospital of 26\n", + "u haul co of 26\n", + "at and t 25\n", + "corporation 21\n", + "amh portfolio management 20\n", + "rush bus center 20\n", + "limited partnership limited partnership 18\n", + "therapy limited partnership 15\n", + "rush isuzu trucks 15\n", + "colgate palmolive limited 14\n", + "johnson and johnson limited 11\n", + "ecolab limited 11\n", + "rush truck centres 11\n", "Name: count, dtype: int64" ] }, - "execution_count": 89, + "execution_count": 71, "metadata": {}, "output_type": "execute_result" } @@ -1236,7 +1172,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 72, "id": "8a4839e5-a2e5-4098-826a-4d340cdde638", "metadata": { "tags": [] @@ -1247,6 +1183,48 @@ "sec_match_df = sec_match_df[[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" ] }, + { + "cell_type": "code", + "execution_count": 73, + "id": "baab7dfc-4efb-4c08-b090-32dd47025e15", + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/var/folders/c0/5zrbrqhx17d5jm6t03bw2nkw0000gn/T/ipykernel_26291/3959766958.py:2: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See 
the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " sec_match_df.loc[:, \"company_name_mphone_list\"] = sec_match_df[\"company_name_mphone\"].str.split(\" \")\n", + "/var/folders/c0/5zrbrqhx17d5jm6t03bw2nkw0000gn/T/ipykernel_26291/3959766958.py:3: SettingWithCopyWarning: \n", + "A value is trying to be set on a copy of a slice from a DataFrame.\n", + "Try using .loc[row_indexer,col_indexer] = value instead\n", + "\n", + "See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy\n", + " ex21_match_df.loc[:, \"company_name_mphone_list\"] = ex21_match_df[\"company_name_mphone\"].str.split(\" \")\n" + ] + } + ], + "source": [ + "# TEMP\n", + "sec_match_df.loc[:, \"company_name_mphone_list\"] = sec_match_df[\"company_name_mphone\"].str.split(\" \")\n", + "ex21_match_df.loc[:, \"company_name_mphone_list\"] = ex21_match_df[\"company_name_mphone\"].str.split(\" \")" + ] + }, + { + "cell_type": "code", + "execution_count": 74, + "id": "a1a6634e-e554-4a94-8a57-c2755048db22", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df.loc[:, \"loc_list\"] = sec_match_df[\"loc_of_incorporation\"].str.replace(\",\", '').str.split(\" \")\n", + "ex21_match_df.loc[:, \"loc_list\"] = ex21_match_df[\"loc_of_incorporation\"].str.replace(\",\", '').str.split(\" \")" + ] + }, { "cell_type": "markdown", "id": "c294372b-159c-4c90-a031-61c34532b965", @@ -1257,7 +1235,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 75, "id": "c9dbc620-ed49-4a8e-9d02-6b6f2e0a14cf", "metadata": { "tags": [] @@ -1272,7 +1250,7 @@ }, { "cell_type": "code", - "execution_count": 102, + "execution_count": 76, "id": "422ca098-e4e7-4284-8b04-74e976e36023", "metadata": { "tags": [] @@ -1284,7 +1262,7 @@ }, { "cell_type": "code", - "execution_count": 93, + "execution_count": 77, "id": 
"232b5718-c1ed-4e63-8384-b4acf33210d3", "metadata": { "tags": [] @@ -1295,115 +1273,23 @@ "text/html": [ "\n", "\n", - "
\n", - "" - ], - "text/plain": [ - "alt.LayerChart(...)" - ] - }, - "execution_count": 93, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# sometimes this will show up as 100% non null in loc_of_incorporation, not sure why\n", - "completeness_chart(ex21_match_df[match_cols], db_api=db_api)" - ] - }, - { - "cell_type": "code", - "execution_count": 94, - "id": "520a9b86", - "metadata": { - "tags": [] - }, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 94, + "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "completeness_chart(sec_match_df[match_cols], db_api=db_api)" + "# sometimes this will show up as 100% complete in loc_of_incorporation, not sure why\n", + "completeness_chart([ex21_match_df[match_cols], sec_match_df[match_cols]], db_api=db_api)" ] }, { @@ -1475,7 +1362,7 @@ }, { "cell_type": "code", - "execution_count": 95, + "execution_count": 78, "id": "a5c26016-2c59-4335-bd39-8b2e7ea91840", "metadata": { "tags": [] @@ -1486,23 +1373,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 95, + "execution_count": 78, "metadata": {}, "output_type": "execute_result" } @@ -1566,7 +1453,7 @@ }, { "cell_type": "code", - "execution_count": 96, + "execution_count": 79, "id": "2a57f717-140f-434d-8998-983b8bf38ac5", "metadata": { "tags": [] @@ -1577,23 +1464,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 96, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } @@ -1671,7 +1558,7 @@ }, { "cell_type": "code", - "execution_count": 97, + "execution_count": 36, "id": "fb6d143b-5201-4b31-849c-97db80781ade", "metadata": { "tags": [] @@ -1684,19 +1571,19 @@ }, { "cell_type": "code", - "execution_count": 103, + "execution_count": 80, "id": "22766c9f-7371-483f-82b0-015549a84357", "metadata": { "tags": [] }, "outputs": [], "source": [ - "br = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"" + "br = \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\"" ] }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 81, "id": "60937a9c-dff6-4d68-808f-81b8228fc9f6", "metadata": { "tags": [] @@ -1705,14 +1592,14 @@ { "data": { "text/plain": [ - "{'number_of_comparisons_generated_pre_filter_conditions': 2069828,\n", - " 'number_of_comparisons_to_be_scored_post_filter_conditions': 2069828,\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 531298,\n", + " 'number_of_comparisons_to_be_scored_post_filter_conditions': 531298,\n", " 'filter_conditions_identified': '',\n", - " 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 3) = SUBSTRING(r.company_name_mphone, 1, 3)',\n", + " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n", " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" ] }, - "execution_count": 104, + "execution_count": 81, "metadata": {}, "output_type": "execute_result" } @@ -1735,7 +1622,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 82, "id": 
"67717313-2c17-4b6b-b984-8f7bc955c678", "metadata": { "tags": [] @@ -1763,7 +1650,6 @@ " \n", " \n", " key_0\n", - " key_1\n", " count_l\n", " count_r\n", " block_count\n", @@ -1772,40 +1658,37 @@ " \n", " \n", " 0\n", - " 2023\n", - " STR\n", - " 68\n", - " 1297\n", - " 88196\n", + " AMRK\n", + " 56\n", + " 625\n", + " 35000\n", " \n", " \n", " 1\n", - " 2023\n", - " INT\n", - " 62\n", - " 1275\n", - " 79050\n", + " FRST\n", + " 56\n", + " 555\n", + " 31080\n", " \n", " \n", " 2\n", - " 2023\n", - " KRN\n", - " 60\n", - " 1290\n", - " 77400\n", + " INTR\n", + " 30\n", + " 659\n", + " 19770\n", " \n", " \n", "\n", "" ], "text/plain": [ - " key_0 key_1 count_l count_r block_count\n", - "0 2023 STR 68 1297 88196\n", - "1 2023 INT 62 1275 79050\n", - "2 2023 KRN 60 1290 77400" + " key_0 count_l count_r block_count\n", + "0 AMRK 56 625 35000\n", + "1 FRST 56 555 31080\n", + "2 INTR 30 659 19770" ] }, - "execution_count": 106, + "execution_count": 82, "metadata": {}, "output_type": "execute_result" } @@ -1824,7 +1707,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 84, "id": "6fe6fb99-f5fd-4538-a8bc-c9dd41f4ff9c", "metadata": {}, "outputs": [ @@ -1833,23 +1716,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 107, + "execution_count": 84, "metadata": {}, "output_type": "execute_result" } @@ -1913,8 +1796,7 @@ ")\n", "\n", "blocking_rules_for_analysis = [\n", - " # block_on(\"substr(l.company_name_mphone,1,3)\", \"substr(r.company_name_mphone,1,3)\"),\n", - " \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"\n", + " br\n", "]\n", "\n", "\n", @@ -1939,7 +1821,7 @@ }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 44, "id": "1f12d114-22fd-4f12-a0be-6a62500e80d5", "metadata": { "tags": [] @@ -1952,54 +1834,104 @@ }, { "cell_type": "code", - "execution_count": 109, - "id": "bb13b160-b554-45d6-a575-5fa2de061350", - "metadata": { - "tags": [] - }, + "execution_count": 120, + "id": "e9cf27ac-6f65-4c73-9e11-9445a8977531", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Comparison 'NameComparison' of \"company_name\".\n", + "Comparison 'ExactMatch' of \"company_name\".\n", "Similarity is assessed using the following ComparisonLevels:\n", " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n", " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n", - " - 'Jaro-Winkler distance of company_name >= 0.92' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.92\n", - " - 'Jaro-Winkler distance of company_name >= 0.88' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.88\n", - " - 'Jaro-Winkler distance of company_name >= 0.7' with SQL rule: jaro_winkler_similarity(\"company_name_l\", \"company_name_r\") >= 0.7\n", " - 'All other comparisons' with SQL rule: ELSE\n", "\n" ] } ], "source": [ - "company_name_comparison = cl.NameComparison(\n", + "company_name_comparison = cl.ExactMatch(\n", " \"company_name\",\n", - " # 
dmeta_col_name=\"company_name_mphone\" # this was breaking it for some reason\n", ")\n", "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 110, - "id": "7d2697d3-efdb-4be4-8911-18b457f5bab4", - "metadata": { - "tags": [] - }, + "execution_count": 85, + "id": "a0d056b4-b7b5-4f01-ad60-3ffc2bec54eb", + "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "Comparison 'JaroWinklerAtThresholds' of \"loc_of_incorporation\".\n", + "Comparison 'LevenshteinAtThresholds' of \"company_name\".\n", "Similarity is assessed using the following ComparisonLevels:\n", - " - 'loc_of_incorporation is NULL' with SQL rule: \"loc_of_incorporation_l\" IS NULL OR \"loc_of_incorporation_r\" IS NULL\n", + " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n", + " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n", + " - 'Levenshtein distance of company_name <= 1' with SQL rule: levenshtein(\"company_name_l\", \"company_name_r\") <= 1\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "company_name_comparison = cl.LevenshteinAtThresholds(\n", + " \"company_name\",\n", + " distance_threshold_or_thresholds=[1]\n", + ")\n", + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 81, + "id": "bf199c98-5239-4a1e-8856-19d74e42b7db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'ArrayIntersectAtSizes' of \"company_name_mphone_list\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'company_name_mphone_list is NULL' with SQL rule: \"company_name_mphone_list_l\" IS NULL OR \"company_name_mphone_list_r\" IS NULL\n", + " - 'Array intersection size >= 3' with SQL rule: 
array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 3\n", + " - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 2\n", + " - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect(\"company_name_mphone_list_l\", \"company_name_mphone_list_r\")) >= 1\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "company_name_comparison = cl.ArrayIntersectAtSizes(\n", + " \"company_name_mphone_list\",\n", + " size_threshold_or_thresholds=[3,2,1]\n", + ")\n", + "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "id": "7d2697d3-efdb-4be4-8911-18b457f5bab4", + "metadata": { + "tags": [] + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'JaroWinklerAtThresholds' of \"loc_of_incorporation\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'loc_of_incorporation is NULL' with SQL rule: \"loc_of_incorporation_l\" IS NULL OR \"loc_of_incorporation_r\" IS NULL\n", " - 'Exact match on loc_of_incorporation' with SQL rule: \"loc_of_incorporation_l\" = \"loc_of_incorporation_r\"\n", " - 'Jaro-Winkler distance of loc_of_incorporation >= 0.9' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.9\n", - " - 'Jaro-Winkler distance of loc_of_incorporation >= 0.7' with SQL rule: jaro_winkler_similarity(\"loc_of_incorporation_l\", \"loc_of_incorporation_r\") >= 0.7\n", " - 'All other comparisons' with SQL rule: ELSE\n", "\n" ] @@ -2009,13 +1941,42 @@ "# try with Levenshtein too\n", "location_comparison = cl.JaroWinklerAtThresholds(\n", " \"loc_of_incorporation\",\n", + " score_threshold_or_thresholds=[0.9]\n", + ")\n", + 
"print(location_comparison.get_comparison(\"duckdb\").human_readable_description)" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "id": "f3529a5a-7ced-46dd-af22-7bb44ed92aa2", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'ArrayIntersectAtSizes' of \"loc_list\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'loc_list is NULL' with SQL rule: \"loc_list_l\" IS NULL OR \"loc_list_r\" IS NULL\n", + " - 'Array intersection size >= 2' with SQL rule: array_length(list_intersect(\"loc_list_l\", \"loc_list_r\")) >= 2\n", + " - 'Array intersection size >= 1' with SQL rule: array_length(list_intersect(\"loc_list_l\", \"loc_list_r\")) >= 1\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], + "source": [ + "location_comparison = cl.ArrayIntersectAtSizes(\n", + " \"loc_list\",\n", + " size_threshold_or_thresholds=[2,1]\n", ")\n", "print(location_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 111, + "execution_count": 123, "id": "92c1ad6b-4516-4ab4-90eb-394669c4a02b", "metadata": { "tags": [] @@ -2048,7 +2009,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 124, "id": "e9eb59b9-49cc-45b7-8ffa-b8f7e5372608", "metadata": { "tags": [] @@ -2057,7 +2018,7 @@ { "data": { "application/vnd.jupyter.widget-view+json": { - "model_id": "f8061ccbd73c426daa2d35dbf68e55fb", + "model_id": "d8daffbf12a14f72a247e47fc2fa719a", "version_major": 2, "version_minor": 0 }, @@ -2072,25 +2033,25 @@ "name": "stderr", "output_type": "stream", "text": [ - "Probability two random records match is estimated to be 0.000689.\n", - "This means that amongst all possible pairwise record comparisons, one in 1,452.36 are expected to match. 
With 1,365,709,548 total possible comparisons, we expect a total of around 940,336.47 matching pairs\n" + "Probability two random records match is estimated to be 8.21e-05.\n", + "This means that amongst all possible pairwise record comparisons, one in 12,184.39 are expected to match. With 1,368,717,009 total possible comparisons, we expect a total of around 112,333.68 matching pairs\n" ] } ], "source": [ "deterministic_rules = [\n", " block_on(\"company_name_mphone\", \"company_name_mphone\"),\n", - " \"jaccard(r.company_name, l.company_name) >= .9 and l.loc_of_incorporation = r.loc_of_incorporation\",\n", - " \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .8\",\n", + " \"jaccard(r.company_name, l.company_name) >= .95 and l.loc_of_incorporation = r.loc_of_incorporation\",\n", + " \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and jaccard(r.company_name, l.company_name) >= .95\",\n", " # \"substr(l.company_name_mphone,1,5) = substr(r.company_name_mphone,1,5) and l.loc_of_incorporation = r.loc_of_incorporation\"\n", "]\n", "\n", - "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.85)" + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)" ] }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 125, "id": "5117653e-e72b-4c13-b923-d1228b39d357", "metadata": { "tags": [] @@ -2100,27 +2061,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "----- Estimating u probabilities using random sampling -----\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "e732ac0702e4459b82b86d2de5c9d9fc", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": 
"stderr", - "output_type": "stream", - "text": [ + "----- Estimating u probabilities using random sampling -----\n", "\n", "Estimated u probabilities using random sampling\n", "\n", @@ -2136,7 +2077,7 @@ }, { "cell_type": "code", - "execution_count": 114, + "execution_count": 126, "id": "8b089a0d-4c91-4b4d-9806-ed83c9bd59b9", "metadata": { "tags": [] @@ -2158,31 +2099,31 @@ "\n", "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", "\n", - "Iteration 1: Largest change in params was -0.213 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`\n", - "Iteration 2: Largest change in params was 0.243 in the m_probability of loc_of_incorporation, level `All other comparisons`\n", - "Iteration 3: Largest change in params was 0.0314 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.88`\n", - "Iteration 4: Largest change in params was 0.0052 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 5: Largest change in params was 0.0087 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 6: Largest change in params was 0.0133 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 7: Largest change in params was 0.0188 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 8: Largest change in params was 0.0246 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 9: Largest change in params was 0.0297 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 10: Largest change in params was 0.0332 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 11: Largest change in 
params was 0.0346 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 12: Largest change in params was 0.0336 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 13: Largest change in params was 0.0306 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 14: Largest change in params was 0.0264 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 15: Largest change in params was 0.0218 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 16: Largest change in params was 0.0173 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 17: Largest change in params was 0.0134 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 18: Largest change in params was 0.0102 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 19: Largest change in params was 0.00758 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 20: Largest change in params was 0.00559 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 21: Largest change in params was 0.00409 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 22: Largest change in params was 0.00298 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 23: Largest change in params was 0.00216 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 24: Largest change in params was 0.00156 in the m_probability of company_name, level 
`Jaro-Winkler distance of company_name >= 0.7`\n", - "Iteration 25: Largest change in params was 0.00112 in the m_probability of company_name, level `Jaro-Winkler distance of company_name >= 0.7`\n", + "Iteration 1: Largest change in params was -0.38 in the m_probability of loc_of_incorporation, level `Exact match on loc_of_incorporation`\n", + "Iteration 2: Largest change in params was 0.027 in the m_probability of loc_of_incorporation, level `All other comparisons`\n", + "Iteration 3: Largest change in params was -0.000274 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 4: Largest change in params was -0.00056 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 5: Largest change in params was 0.00112 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 6: Largest change in params was 0.00214 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 7: Largest change in params was 0.00387 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 8: Largest change in params was -0.00648 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 9: Largest change in params was 0.00989 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 10: Largest change in params was 0.0137 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 11: Largest change in params was 0.0171 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 12: Largest change in params was -0.0197 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 13: Largest change in params was 0.0209 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 14: Largest change in params was -0.0209 in the m_probability of company_name, level `Exact match on 
company_name`\n", + "Iteration 15: Largest change in params was -0.0201 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 16: Largest change in params was -0.0187 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 17: Largest change in params was -0.017 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 18: Largest change in params was 0.0153 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 19: Largest change in params was -0.0136 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 20: Largest change in params was -0.0121 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 21: Largest change in params was -0.0107 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 22: Largest change in params was -0.0094 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 23: Largest change in params was 0.00828 in the m_probability of company_name, level `All other comparisons`\n", + "Iteration 24: Largest change in params was -0.00728 in the m_probability of company_name, level `Exact match on company_name`\n", + "Iteration 25: Largest change in params was -0.00641 in the m_probability of company_name, level `Exact match on company_name`\n", "\n", "EM converged after 25 iterations\n", "\n", @@ -2199,7 +2140,7 @@ }, { "cell_type": "code", - "execution_count": 115, + "execution_count": 127, "id": "88e058bc-800d-4da4-92aa-6ddb7377b4bf", "metadata": { "tags": [] @@ -2210,23 +2151,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 115, + "execution_count": 127, "metadata": {}, "output_type": "execute_result" } @@ -2290,7 +2231,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 128, "id": "673a4776-1de1-46ce-a411-f7fd1668d54f", "metadata": { "tags": [] @@ -2301,23 +2242,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.HConcatChart(...)" ] }, - "execution_count": 116, + "execution_count": 128, "metadata": {}, "output_type": "execute_result" } @@ -2381,7 +2322,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 107, "id": "ebf9e326-38f1-4d78-b302-15867cda1009", "metadata": {}, "outputs": [], @@ -2409,7 +2350,7 @@ }, { "cell_type": "code", - "execution_count": 120, + "execution_count": 129, "id": "72ff6575-68e3-4256-8253-85eb2564501f", "metadata": { "tags": [] @@ -2419,28 +2360,8 @@ "name": "stderr", "output_type": "stream", "text": [ - "Blocking time: 0.37 seconds\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "d550d84b328c4d3082bd7cf5d03b803b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Predict time: 78.84 seconds\n" + "Blocking time: 0.20 seconds\n", + "Predict time: 0.12 seconds\n" ] } ], @@ -2450,7 +2371,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 130, "id": "24e14675-11cf-4c46-a592-7733326113d2", "metadata": { "tags": [] @@ -2462,7 +2383,27 @@ }, { "cell_type": "code", - "execution_count": 123, + "execution_count": 131, + "id": "d50332a5-a8dc-444b-be92-b9d29f73763e", + "metadata": {}, + "outputs": [], + "source": [ + "preds_df = preds_df.merge(sec_df[[\"record_id\", \"company_name_raw\"]], how=\"left\", left_on=\"record_id_l\", right_on=\"record_id\").rename(columns={\"company_name_raw\": \"company_name_sec\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 132, + "id": "fddbed17-3d71-4c85-95d5-c3d0fd517f9d", + "metadata": {}, + "outputs": [], + "source": [ + "preds_df = preds_df.merge(ex21_df[[\"record_id\", \"company_name_raw\"]], how=\"left\", left_on=\"record_id_r\", 
right_on=\"record_id\").rename(columns={\"company_name_raw\": \"company_name_ex21\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 133, "id": "3d733c2a-7004-4ce8-8d3f-25ed1e720c36", "metadata": { "tags": [] @@ -2498,10 +2439,7 @@ " company_name_l\n", " company_name_r\n", " gamma_company_name\n", - " tf_company_name_l\n", - " tf_company_name_r\n", " bf_company_name\n", - " bf_tf_adj_company_name\n", " loc_of_incorporation_l\n", " loc_of_incorporation_r\n", " gamma_loc_of_incorporation\n", @@ -2509,376 +2447,2601 @@ " tf_loc_of_incorporation_r\n", " bf_loc_of_incorporation\n", " bf_tf_adj_loc_of_incorporation\n", - " report_year_l\n", - " report_year_r\n", " company_name_mphone_l\n", " company_name_mphone_r\n", + " record_id_x\n", + " company_name_sec\n", + " record_id_y\n", + " company_name_ex21\n", + " \n", + " \n", + " \n", + " \n", + " 0\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 8180\n", + " 159390\n", + " national instruments corporation\n", + " national instruments corporation\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " republic of korea\n", + " 0\n", + " 0.372842\n", + " 0.000234\n", + " 0.551065\n", + " 1.0\n", + " NXNL INSTRMNTS\n", + " NXNL INSTRMNTS\n", + " 8180\n", + " national instruments corp\n", + " 159390\n", + " national instruments (korea) corporation\n", + " \n", + " \n", + " 176\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6034\n", + " 107265\n", + " afternext healthtech acquisition corporation\n", + " afternext healthtech acquisition corporation\n", + " 1\n", + " 2.492261e+06\n", + " e9\n", + " cayman islands\n", + " 0\n", + " 0.001069\n", + " 0.015387\n", + " 0.551065\n", + " 1.0\n", + " AFTRNKST HL0TX AKKSXN\n", + " AFTRNKST HL0TX AKKSXN\n", + " 6034\n", + " afternext healthtech acquisition corp.\n", + " 107265\n", + " afternext healthtech acquisition corp\n", + " \n", + " \n", + " 178\n", + " 
6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6799\n", + " 117610\n", + " gap incorporated\n", + " gap incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " puerto rico\n", + " 0\n", + " 0.372842\n", + " 0.001548\n", + " 0.551065\n", + " 1.0\n", + " KP\n", + " KP\n", + " 6799\n", + " gap inc\n", + " 117610\n", + " gap (puerto rico), inc\n", + " \n", + " \n", + " 183\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 5811\n", + " 170135\n", + " rockley photonics holdings limited\n", + " rockley photonics holdings limited\n", + " 1\n", + " 2.492261e+06\n", + " e9\n", + " cayman islands\n", + " 0\n", + " 0.001069\n", + " 0.015387\n", + " 0.551065\n", + " 1.0\n", + " RKL FTNKS HLTNKS\n", + " RKL FTNKS HLTNKS\n", + " 5811\n", + " rockley photonics holdings ltd\n", + " 170135\n", + " rockley photonics holdings limited\n", + " \n", + " \n", + " 184\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6799\n", + " 117608\n", + " gap incorporated\n", + " gap incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " california\n", + " 0\n", + " 0.372842\n", + " 0.015978\n", + " 0.551065\n", + " 1.0\n", + " KP\n", + " KP\n", + " 6799\n", + " gap inc\n", + " 117608\n", + " gap (itm) inc\n", + " \n", + " \n", + " 186\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6799\n", + " 117605\n", + " gap incorporated\n", + " gap incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " canada\n", + " 0\n", + " 0.372842\n", + " 0.012191\n", + " 0.551065\n", + " 1.0\n", + " KP\n", + " KP\n", + " 6799\n", + " gap inc\n", + " 117605\n", + " gap (canada) inc\n", + " \n", + " \n", + " 412\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 1524\n", + " 165843\n", + " aircastle limited\n", 
+ " aircastle limited\n", + " 1\n", + " 2.492261e+06\n", + " d0\n", + " ireland\n", + " 0\n", + " 0.000150\n", + " 0.008315\n", + " 0.551065\n", + " 1.0\n", + " ARKSTL\n", + " ARKSTL\n", + " 1524\n", + " aircastle ltd\n", + " 165843\n", + " aircastle (ireland) limited\n", + " \n", + " \n", + " 189\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6753\n", + " 115383\n", + " arthur j gallagher and company\n", + " arthur j gallagher and company\n", + " 1\n", + " 2.492261e+06\n", + " illinois\n", + " delaware\n", + " 0\n", + " 0.006115\n", + " 0.372842\n", + " 0.551065\n", + " 1.0\n", + " AR0R J KLKHR ANT\n", + " AR0R J KLKHR ANT\n", + " 6753\n", + " arthur j. gallagher & co.\n", + " 115383\n", + " arthur j. gallagher & co\n", + " \n", + " \n", + " 193\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6651\n", + " 110797\n", + " flowserve corporation\n", + " flowserve corporation\n", + " 1\n", + " 2.492261e+06\n", + " new york\n", + " mauritius\n", + " 0\n", + " 0.009913\n", + " 0.001075\n", + " 0.551065\n", + " 1.0\n", + " FLSRF\n", + " FLSRF\n", + " 6651\n", + " flowserve corp\n", + " 110797\n", + " flowserve (mauritius) corporation\n", + " \n", + " \n", + " 406\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 578\n", + " 24844\n", + " united parcel service incorporated\n", + " united parcel service incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " ohio\n", + " 0\n", + " 0.372842\n", + " 0.008136\n", + " 0.551065\n", + " 1.0\n", + " UNTT PRSL SRFS\n", + " UNTT PRSL SRFS\n", + " 578\n", + " united parcel service inc\n", + " 24844\n", + " united parcel service, inc\n", + " \n", + " \n", + " 198\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 5812\n", + " 171905\n", + " nextracker incorporated\n", + " nextracker 
incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " united states delaware\n", + " 0\n", + " 0.372842\n", + " 0.002278\n", + " 0.551065\n", + " 1.0\n", + " NKSTRKR\n", + " NKSTRKR\n", + " 5812\n", + " nextracker inc.\n", + " 171905\n", + " nextracker inc\n", + " \n", + " \n", + " 199\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 5843\n", + " 51850\n", + " sculptor acquisition corp i\n", + " sculptor acquisition corp i\n", + " 1\n", + " 2.492261e+06\n", + " e9\n", + " cayman islands\n", + " 0\n", + " 0.001069\n", + " 0.015387\n", + " 0.551065\n", + " 1.0\n", + " SKLPTR AKKSXN I\n", + " SKLPTR AKKSXN I\n", + " 5843\n", + " sculptor acquisition corp i\n", + " 51850\n", + " sculptor acquisition corp i\n", + " \n", + " \n", + " 174\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 7095\n", + " 179994\n", + " cintas corporation\n", + " cintas corporation\n", + " 1\n", + " 2.492261e+06\n", + " washington\n", + " nevada\n", + " 0\n", + " 0.002996\n", + " 0.014652\n", + " 0.551065\n", + " 1.0\n", + " SNTS\n", + " SNTS\n", + " 7095\n", + " cintas corp\n", + " 179994\n", + " cintas corporation\n", + " \n", + " \n", + " 405\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 285\n", + " 12641\n", + " onespan incorporated\n", + " onespan incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " usa, state of delaware\n", + " 0\n", + " 0.372842\n", + " 0.000011\n", + " 0.551065\n", + " 1.0\n", + " ONSPN\n", + " ONSPN\n", + " 285\n", + " onespan inc.\n", + " 12641\n", + " onespan inc\n", + " \n", + " \n", + " 207\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6282\n", + " 97173\n", + " mars acquisition corporation\n", + " mars acquisition corporation\n", + " 1\n", + " 2.492261e+06\n", + " e9\n", + " delaware\n", + " 0\n", + " 
0.001069\n", + " 0.372842\n", + " 0.551065\n", + " 1.0\n", + " MRS AKKSXN\n", + " MRS AKKSXN\n", + " 6282\n", + " mars acquisition corp.\n", + " 97173\n", + " mars acquisition corp\n", + " \n", + " \n", + " 212\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 4834\n", + " 97747\n", + " viatris incorporated\n", + " viatris incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " philippines\n", + " 0\n", + " 0.372842\n", + " 0.001927\n", + " 0.551065\n", + " 1.0\n", + " FTRS\n", + " FTRS\n", + " 4834\n", + " viatris inc\n", + " 97747\n", + " viatris, inc\n", + " \n", + " \n", + " 397\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 1205\n", + " 35911\n", + " turning point brands incorporated\n", + " turning point brands incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " ontario, canada\n", + " 0\n", + " 0.372842\n", + " 0.000852\n", + " 0.551065\n", + " 1.0\n", + " TRNNK PNT BRNTS\n", + " TRNNK PNT BRNTS\n", + " 1205\n", + " turning point brands, inc.\n", + " 35911\n", + " turning point brands (canada) inc\n", + " \n", + " \n", + " 396\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 1171\n", + " 35941\n", + " clearpoint neuro incorporated\n", + " clearpoint neuro incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " canada new brunswick\n", + " 0\n", + " 0.372842\n", + " 0.000006\n", + " 0.551065\n", + " 1.0\n", + " KLRPNT NR\n", + " KLRPNT NR\n", + " 1171\n", + " clearpoint neuro, inc.\n", + " 35941\n", + " clearpoint neuro (canada) inc\n", + " \n", + " \n", + " 393\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 1765\n", + " 51537\n", + " genpact limited\n", + " genpact limited\n", + " 1\n", + " 2.492261e+06\n", + " d0\n", + " united kingdom\n", + " 0\n", + " 0.000150\n", + " 0.031521\n", + " 
0.551065\n", + " 1.0\n", + " JNPKT\n", + " JNPKT\n", + " 1765\n", + " genpact ltd\n", + " 51537\n", + " genpact (uk) ltd\n", + " \n", + " \n", + " 223\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6181\n", + " 106386\n", + " perimeter solutions sa\n", + " perimeter solutions sa\n", + " 1\n", + " 2.492261e+06\n", + " n4\n", + " grand of luxembourg\n", + " 0\n", + " 0.000017\n", + " 0.000011\n", + " 0.551065\n", + " 1.0\n", + " PRMTR SLXNS S\n", + " PRMTR SLXNS S\n", + " 6181\n", + " perimeter solutions, sa\n", + " 106386\n", + " perimeter solutions sa\n", + " \n", + " \n", + " 390\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 949\n", + " 34324\n", + " ceva incorporated\n", + " ceva incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " cayman islands\n", + " 0\n", + " 0.372842\n", + " 0.015387\n", + " 0.551065\n", + " 1.0\n", + " SF\n", + " SF\n", + " 949\n", + " ceva inc\n", + " 34324\n", + " ceva inc\n", + " \n", + " \n", + " 226\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6825\n", + " 123476\n", + " harte hanks incorporated\n", + " harte hanks incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " ohio\n", + " 0\n", + " 0.372842\n", + " 0.008136\n", + " 0.551065\n", + " 1.0\n", + " HRT HNKS\n", + " HRT HNKS\n", + " 6825\n", + " harte hanks inc\n", + " 123476\n", + " harte hanks, inc\n", + " \n", + " \n", + " 228\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 234\n", + " 6600\n", + " jones lang lasalle incorporated\n", + " jones lang lasalle incorporated\n", + " 1\n", + " 2.492261e+06\n", + " maryland\n", + " puerto rico\n", + " 0\n", + " 0.007786\n", + " 0.001548\n", + " 0.551065\n", + " 1.0\n", + " JNS LNK LSL\n", + " JNS LNK LSL\n", + " 234\n", + " jones lang lasalle inc\n", + " 6600\n", + " jones 
lang lasalle (puerto rico), inc\n", + " \n", + " \n", + " 229\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 234\n", + " 6596\n", + " jones lang lasalle incorporated\n", + " jones lang lasalle incorporated\n", + " 1\n", + " 2.492261e+06\n", + " maryland\n", + " philippines\n", + " 0\n", + " 0.007786\n", + " 0.001927\n", + " 0.551065\n", + " 1.0\n", + " JNS LNK LSL\n", + " JNS LNK LSL\n", + " 234\n", + " jones lang lasalle inc\n", + " 6596\n", + " jones lang lasalle (philippines), inc\n", + " \n", + " \n", + " 231\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 2097\n", + " 54939\n", + " optimizerx corporation\n", + " optimizerx corporation\n", + " 1\n", + " 2.492261e+06\n", + " nevada\n", + " michigan\n", + " 0\n", + " 0.014652\n", + " 0.007151\n", + " 0.551065\n", + " 1.0\n", + " OPTMSRKS\n", + " OPTMSRKS\n", + " 2097\n", + " optimizerx corp\n", + " 54939\n", + " optimizerx corporation\n", + " \n", + " \n", + " 201\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6176\n", + " 166072\n", + " phoenix motor incorporated\n", + " phoenix motor incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " us\n", + " 0\n", + " 0.372842\n", + " 0.000908\n", + " 0.551065\n", + " 1.0\n", + " FNKS MTR\n", + " FNKS MTR\n", + " 6176\n", + " phoenix motor inc.\n", + " 166072\n", + " phoenix motor inc\n", + " \n", + " \n", + " 232\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 2117\n", + " 57288\n", + " transocean limited\n", + " transocean limited\n", + " 1\n", + " 2.492261e+06\n", + " v8\n", + " switzerland\n", + " 0\n", + " 0.000033\n", + " 0.006421\n", + " 0.551065\n", + " 1.0\n", + " TRNSSN\n", + " TRNSSN\n", + " 2117\n", + " transocean ltd.\n", + " 57288\n", + " transocean ltd\n", + " \n", + " \n", + " 421\n", + " 6.816691\n", + " 
0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 1348\n", + " 40725\n", + " lazard group limited liability company\n", + " lazard group limited liability company\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " us\n", + " 0\n", + " 0.372842\n", + " 0.000908\n", + " 0.551065\n", + " 1.0\n", + " LSRT KRP\n", + " LSRT KRP\n", + " 1348\n", + " lazard group llc\n", + " 40725\n", + " lazard group llc\n", + " \n", + " \n", + " 169\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6922\n", + " 189462\n", + " analog devices incorporated\n", + " analog devices incorporated\n", + " 1\n", + " 2.492261e+06\n", + " massachusetts\n", + " united states\n", + " 0\n", + " 0.004466\n", + " 0.012146\n", + " 0.551065\n", + " 1.0\n", + " ANLK TFSS\n", + " ANLK TFSS\n", + " 6922\n", + " analog devices inc\n", + " 189462\n", + " analog devices, inc\n", + " \n", + " \n", + " 115\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 2485\n", + " 167379\n", + " ameriguard security services incorporated\n", + " ameriguard security services incorporated\n", + " 1\n", + " 2.492261e+06\n", + " nevada\n", + " california\n", + " 0\n", + " 0.014652\n", + " 0.015978\n", + " 0.551065\n", + " 1.0\n", + " AMRKRT SKRT SRFSS\n", + " AMRKRT SKRT SRFSS\n", + " 2485\n", + " ameriguard security services, inc.\n", + " 167379\n", + " ameriguard security services, inc\n", + " \n", + " \n", + " 116\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 2486\n", + " 167379\n", + " ameriguard security services incorporated\n", + " ameriguard security services incorporated\n", + " 1\n", + " 2.492261e+06\n", + " nevada\n", + " california\n", + " 0\n", + " 0.014652\n", + " 0.015978\n", + " 0.551065\n", + " 1.0\n", + " AMRKRT SKRT SRFSS\n", + " AMRKRT SKRT SRFSS\n", + " 2486\n", + " ameriguard security services, inc.\n", + 
" 167379\n", + " ameriguard security services, inc\n", + " \n", + " \n", + " 120\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 4683\n", + " 95837\n", + " advantage solutions incorporated\n", + " advantage solutions incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " canada\n", + " 0\n", + " 0.372842\n", + " 0.012191\n", + " 0.551065\n", + " 1.0\n", + " ATFNTJ SLXNS\n", + " ATFNTJ SLXNS\n", + " 4683\n", + " advantage solutions inc.\n", + " 95837\n", + " advantage solutions inc\n", + " \n", + " \n", + " 445\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 926\n", + " 165871\n", + " commvault systems incorporated\n", + " commvault systems incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " ontario, canada\n", + " 0\n", + " 0.372842\n", + " 0.000852\n", + " 0.551065\n", + " 1.0\n", + " KMFLT SSTMS\n", + " KMFLT SSTMS\n", + " 926\n", + " commvault systems inc\n", + " 165871\n", + " commvault systems (canada) inc\n", + " \n", + " \n", + " 124\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 4148\n", + " 90738\n", + " firstsun capital bancorp\n", + " firstsun capital bancorp\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " new mexico\n", + " 0\n", + " 0.372842\n", + " 0.000652\n", + " 0.551065\n", + " 1.0\n", + " FRSTSN KPTL BNKRP\n", + " FRSTSN KPTL BNKRP\n", + " 4148\n", + " firstsun capital bancorp\n", + " 90738\n", + " firstsun capital bancorp\n", + " \n", + " \n", + " 126\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 5544\n", + " 26048\n", + " taboola com limited\n", + " taboola com limited\n", + " 1\n", + " 2.492261e+06\n", + " l3\n", + " israel\n", + " 0\n", + " 0.000061\n", + " 0.003057\n", + " 0.551065\n", + " 1.0\n", + " TBL KM\n", + " TBL KM\n", + " 5544\n", + " taboola.com ltd.\n", + " 
26048\n", + " taboola.com ltd\n", + " \n", + " \n", + " 443\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 2\n", + " 96\n", + " henry schein incorporated\n", + " henry schein incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " pennsylvania\n", + " 0\n", + " 0.372842\n", + " 0.007919\n", + " 0.551065\n", + " 1.0\n", + " HNR SXN\n", + " HNR SXN\n", + " 2\n", + " henry schein inc\n", + " 96\n", + " henry schein (lancaster, pa) inc\n", + " \n", + " \n", + " 132\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6668\n", + " 117995\n", + " tomi environmental solutions incorporated\n", + " tomi environmental solutions incorporated\n", + " 1\n", + " 2.492261e+06\n", + " florida\n", + " nevada\n", + " 0\n", + " 0.014691\n", + " 0.014652\n", + " 0.551065\n", + " 1.0\n", + " TM ENFRNMNTL SLXNS\n", + " TM ENFRNMNTL SLXNS\n", + " 6668\n", + " tomi environmental solutions, inc.\n", + " 117995\n", + " tomi environmental solutions, inc\n", + " \n", + " \n", + " 136\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6148\n", + " 107455\n", + " esab corporation\n", + " esab corporation\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " united states\n", + " 0\n", + " 0.372842\n", + " 0.012146\n", + " 0.551065\n", + " 1.0\n", + " ESB\n", + " ESB\n", + " 6148\n", + " esab corp\n", + " 107455\n", + " esab corporation\n", + " \n", + " \n", + " 137\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6958\n", + " 104521\n", + " apache corporation\n", + " apache corporation\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " new jersey\n", + " 0\n", + " 0.372842\n", + " 0.006143\n", + " 0.551065\n", + " 1.0\n", + " APX\n", + " APX\n", + " 6958\n", + " apache corp\n", + " 104521\n", + " apache corporation\n", + " \n", + " \n", + " 138\n", + " 
6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 7011\n", + " 121758\n", + " ncr corporation\n", + " ncr corporation\n", + " 1\n", + " 2.492261e+06\n", + " maryland\n", + " new zealand\n", + " 0\n", + " 0.007786\n", + " 0.002590\n", + " 0.551065\n", + " 1.0\n", + " NKR\n", + " NKR\n", + " 7011\n", + " ncr corp\n", + " 121758\n", + " ncr (nz) corporation\n", + " \n", + " \n", + " 423\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 77\n", + " 165059\n", + " jakks pacific incorporated\n", + " jakks pacific incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " canada\n", + " 0\n", + " 0.372842\n", + " 0.012191\n", + " 0.551065\n", + " 1.0\n", + " JKS PSFK\n", + " JKS PSFK\n", + " 77\n", + " jakks pacific inc\n", + " 165059\n", + " jakks pacific (canada), inc\n", + " \n", + " \n", + " 139\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 4902\n", + " 170051\n", + " gan limited\n", + " gan limited\n", + " 1\n", + " 2.492261e+06\n", + " d0\n", + " england and wales\n", + " 0\n", + " 0.000150\n", + " 0.003536\n", + " 0.551065\n", + " 1.0\n", + " KN\n", + " KN\n", + " 4902\n", + " gan ltd\n", + " 170051\n", + " gan (uk) limited\n", + " \n", + " \n", + " 141\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6613\n", + " 108716\n", + " cts corporation\n", + " cts corporation\n", + " 1\n", + " 2.492261e+06\n", + " indiana\n", + " delaware\n", + " 0\n", + " 0.004060\n", + " 0.372842\n", + " 0.551065\n", + " 1.0\n", + " KTS\n", + " KTS\n", + " 6613\n", + " cts corp\n", + " 108716\n", + " cts corporation\n", + " \n", + " \n", + " 437\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 738\n", + " 29776\n", + " garmin limited\n", + " garmin limited\n", + " 1\n", + " 2.492261e+06\n", + " v8\n", + " 
thailand\n", + " 0\n", + " 0.000033\n", + " 0.002378\n", + " 0.551065\n", + " 1.0\n", + " KRMN\n", + " KRMN\n", + " 738\n", + " garmin ltd\n", + " 29776\n", + " garmin (thailand) ltd\n", + " \n", + " \n", + " 435\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 277\n", + " 9849\n", + " c h robinson worldwide incorporated\n", + " c h robinson worldwide incorporated\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " united states\n", + " 0\n", + " 0.372842\n", + " 0.012146\n", + " 0.551065\n", + " 1.0\n", + " K H RBNSN WRLTWT\n", + " K H RBNSN WRLTWT\n", + " 277\n", + " c. h. robinson worldwide, inc.\n", + " 9849\n", + " c.h. robinson worldwide, inc\n", + " \n", + " \n", + " 146\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6763\n", + " 176423\n", + " richardson electronics limited\n", + " richardson electronics limited\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " thailand\n", + " 0\n", + " 0.372842\n", + " 0.002378\n", + " 0.551065\n", + " 1.0\n", + " RXRTSN ELKTRNKS\n", + " RXRTSN ELKTRNKS\n", + " 6763\n", + " richardson electronics, ltd.\n", + " 176423\n", + " richardson electronics (thailand) limited\n", + " \n", + " \n", + " 149\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 4875\n", + " 98755\n", + " api group corporation\n", + " api group corporation\n", + " 1\n", + " 2.492261e+06\n", + " d8\n", + " delaware\n", + " 0\n", + " 0.000078\n", + " 0.372842\n", + " 0.551065\n", + " 1.0\n", + " AP KRP\n", + " AP KRP\n", + " 4875\n", + " api group corp\n", + " 98755\n", + " api group corporation\n", + " \n", + " \n", + " 432\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 2310\n", + " 167475\n", + " thermon group holdings incorporated\n", + " thermon group holdings incorporated\n", + " 1\n", + " 2.492261e+06\n", + " 
delaware\n", + " delaware, united states\n", + " 0\n", + " 0.372842\n", + " 0.002139\n", + " 0.551065\n", + " 1.0\n", + " 0RMN KRP HLTNKS\n", + " 0RMN KRP HLTNKS\n", + " 2310\n", + " thermon group holdings, inc.\n", + " 167475\n", + " thermon group holdings, inc\n", + " \n", + " \n", + " 156\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 6677\n", + " 118432\n", + " aon public limited company\n", + " aon public limited company\n", + " 1\n", + " 2.492261e+06\n", + " l2\n", + " ireland\n", + " 0\n", + " 0.000111\n", + " 0.008315\n", + " 0.551065\n", + " 1.0\n", + " AN\n", + " AN\n", + " 6677\n", + " aon plc\n", + " 118432\n", + " aon plc\n", + " \n", + " \n", + " 158\n", + " 6.816691\n", + " 0.991207\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 5955\n", + " 80272\n", + " minority equality opportunities acquisition in...\n", + " minority equality opportunities acquisition in...\n", + " 1\n", + " 2.492261e+06\n", + " delaware\n", + " delaware, united states\n", + " 0\n", + " 0.372842\n", + " 0.002139\n", + " 0.551065\n", + " 1.0\n", + " MNRT EKLT OPRTNTS AKKSXN\n", + " MNRT EKLT OPRTNTS AKKSXN\n", + " 5955\n", + " minority equality opportunities acquisition inc.\n", + " 80272\n", + " minority equality opportunities acquisition inc\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name bf_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation company_name_mphone_l company_name_mphone_r record_id_x company_name_sec record_id_y company_name_ex21\n", + "0 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 8180 159390 national instruments corporation national instruments corporation 1 2.492261e+06 
delaware republic of korea 0 0.372842 0.000234 0.551065 1.0 NXNL INSTRMNTS NXNL INSTRMNTS 8180 national instruments corp 159390 national instruments (korea) corporation\n", + "176 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6034 107265 afternext healthtech acquisition corporation afternext healthtech acquisition corporation 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 AFTRNKST HL0TX AKKSXN AFTRNKST HL0TX AKKSXN 6034 afternext healthtech acquisition corp. 107265 afternext healthtech acquisition corp\n", + "178 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117610 gap incorporated gap incorporated 1 2.492261e+06 delaware puerto rico 0 0.372842 0.001548 0.551065 1.0 KP KP 6799 gap inc 117610 gap (puerto rico), inc\n", + "183 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5811 170135 rockley photonics holdings limited rockley photonics holdings limited 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 RKL FTNKS HLTNKS RKL FTNKS HLTNKS 5811 rockley photonics holdings ltd 170135 rockley photonics holdings limited\n", + "184 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117608 gap incorporated gap incorporated 1 2.492261e+06 delaware california 0 0.372842 0.015978 0.551065 1.0 KP KP 6799 gap inc 117608 gap (itm) inc\n", + "186 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6799 117605 gap incorporated gap incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 KP KP 6799 gap inc 117605 gap (canada) inc\n", + "412 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1524 165843 aircastle limited aircastle limited 1 2.492261e+06 d0 ireland 0 0.000150 0.008315 0.551065 1.0 ARKSTL ARKSTL 1524 aircastle ltd 165843 aircastle (ireland) limited\n", + "189 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6753 115383 arthur j gallagher and company arthur j gallagher and company 1 
2.492261e+06 illinois delaware 0 0.006115 0.372842 0.551065 1.0 AR0R J KLKHR ANT AR0R J KLKHR ANT 6753 arthur j. gallagher & co. 115383 arthur j. gallagher & co\n", + "193 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6651 110797 flowserve corporation flowserve corporation 1 2.492261e+06 new york mauritius 0 0.009913 0.001075 0.551065 1.0 FLSRF FLSRF 6651 flowserve corp 110797 flowserve (mauritius) corporation\n", + "406 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 578 24844 united parcel service incorporated united parcel service incorporated 1 2.492261e+06 delaware ohio 0 0.372842 0.008136 0.551065 1.0 UNTT PRSL SRFS UNTT PRSL SRFS 578 united parcel service inc 24844 united parcel service, inc\n", + "198 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5812 171905 nextracker incorporated nextracker incorporated 1 2.492261e+06 delaware united states delaware 0 0.372842 0.002278 0.551065 1.0 NKSTRKR NKSTRKR 5812 nextracker inc. 171905 nextracker inc\n", + "199 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5843 51850 sculptor acquisition corp i sculptor acquisition corp i 1 2.492261e+06 e9 cayman islands 0 0.001069 0.015387 0.551065 1.0 SKLPTR AKKSXN I SKLPTR AKKSXN I 5843 sculptor acquisition corp i 51850 sculptor acquisition corp i\n", + "174 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 7095 179994 cintas corporation cintas corporation 1 2.492261e+06 washington nevada 0 0.002996 0.014652 0.551065 1.0 SNTS SNTS 7095 cintas corp 179994 cintas corporation\n", + "405 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 285 12641 onespan incorporated onespan incorporated 1 2.492261e+06 delaware usa, state of delaware 0 0.372842 0.000011 0.551065 1.0 ONSPN ONSPN 285 onespan inc. 
12641 onespan inc\n", + "207 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6282 97173 mars acquisition corporation mars acquisition corporation 1 2.492261e+06 e9 delaware 0 0.001069 0.372842 0.551065 1.0 MRS AKKSXN MRS AKKSXN 6282 mars acquisition corp. 97173 mars acquisition corp\n", + "212 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4834 97747 viatris incorporated viatris incorporated 1 2.492261e+06 delaware philippines 0 0.372842 0.001927 0.551065 1.0 FTRS FTRS 4834 viatris inc 97747 viatris, inc\n", + "397 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1205 35911 turning point brands incorporated turning point brands incorporated 1 2.492261e+06 delaware ontario, canada 0 0.372842 0.000852 0.551065 1.0 TRNNK PNT BRNTS TRNNK PNT BRNTS 1205 turning point brands, inc. 35911 turning point brands (canada) inc\n", + "396 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1171 35941 clearpoint neuro incorporated clearpoint neuro incorporated 1 2.492261e+06 delaware canada new brunswick 0 0.372842 0.000006 0.551065 1.0 KLRPNT NR KLRPNT NR 1171 clearpoint neuro, inc. 
35941 clearpoint neuro (canada) inc\n", + "393 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1765 51537 genpact limited genpact limited 1 2.492261e+06 d0 united kingdom 0 0.000150 0.031521 0.551065 1.0 JNPKT JNPKT 1765 genpact ltd 51537 genpact (uk) ltd\n", + "223 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6181 106386 perimeter solutions sa perimeter solutions sa 1 2.492261e+06 n4 grand of luxembourg 0 0.000017 0.000011 0.551065 1.0 PRMTR SLXNS S PRMTR SLXNS S 6181 perimeter solutions, sa 106386 perimeter solutions sa\n", + "390 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 949 34324 ceva incorporated ceva incorporated 1 2.492261e+06 delaware cayman islands 0 0.372842 0.015387 0.551065 1.0 SF SF 949 ceva inc 34324 ceva inc\n", + "226 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6825 123476 harte hanks incorporated harte hanks incorporated 1 2.492261e+06 delaware ohio 0 0.372842 0.008136 0.551065 1.0 HRT HNKS HRT HNKS 6825 harte hanks inc 123476 harte hanks, inc\n", + "228 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 234 6600 jones lang lasalle incorporated jones lang lasalle incorporated 1 2.492261e+06 maryland puerto rico 0 0.007786 0.001548 0.551065 1.0 JNS LNK LSL JNS LNK LSL 234 jones lang lasalle inc 6600 jones lang lasalle (puerto rico), inc\n", + "229 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 234 6596 jones lang lasalle incorporated jones lang lasalle incorporated 1 2.492261e+06 maryland philippines 0 0.007786 0.001927 0.551065 1.0 JNS LNK LSL JNS LNK LSL 234 jones lang lasalle inc 6596 jones lang lasalle (philippines), inc\n", + "231 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2097 54939 optimizerx corporation optimizerx corporation 1 2.492261e+06 nevada michigan 0 0.014652 0.007151 0.551065 1.0 OPTMSRKS OPTMSRKS 2097 optimizerx corp 54939 optimizerx corporation\n", + "201 6.816691 0.991207 
__splink__input_table_0 __splink__input_table_1 6176 166072 phoenix motor incorporated phoenix motor incorporated 1 2.492261e+06 delaware us 0 0.372842 0.000908 0.551065 1.0 FNKS MTR FNKS MTR 6176 phoenix motor inc. 166072 phoenix motor inc\n", + "232 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2117 57288 transocean limited transocean limited 1 2.492261e+06 v8 switzerland 0 0.000033 0.006421 0.551065 1.0 TRNSSN TRNSSN 2117 transocean ltd. 57288 transocean ltd\n", + "421 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 1348 40725 lazard group limited liability company lazard group limited liability company 1 2.492261e+06 delaware us 0 0.372842 0.000908 0.551065 1.0 LSRT KRP LSRT KRP 1348 lazard group llc 40725 lazard group llc\n", + "169 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6922 189462 analog devices incorporated analog devices incorporated 1 2.492261e+06 massachusetts united states 0 0.004466 0.012146 0.551065 1.0 ANLK TFSS ANLK TFSS 6922 analog devices inc 189462 analog devices, inc\n", + "115 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2485 167379 ameriguard security services incorporated ameriguard security services incorporated 1 2.492261e+06 nevada california 0 0.014652 0.015978 0.551065 1.0 AMRKRT SKRT SRFSS AMRKRT SKRT SRFSS 2485 ameriguard security services, inc. 167379 ameriguard security services, inc\n", + "116 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2486 167379 ameriguard security services incorporated ameriguard security services incorporated 1 2.492261e+06 nevada california 0 0.014652 0.015978 0.551065 1.0 AMRKRT SKRT SRFSS AMRKRT SKRT SRFSS 2486 ameriguard security services, inc. 
167379 ameriguard security services, inc\n", + "120 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4683 95837 advantage solutions incorporated advantage solutions incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 ATFNTJ SLXNS ATFNTJ SLXNS 4683 advantage solutions inc. 95837 advantage solutions inc\n", + "445 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 926 165871 commvault systems incorporated commvault systems incorporated 1 2.492261e+06 delaware ontario, canada 0 0.372842 0.000852 0.551065 1.0 KMFLT SSTMS KMFLT SSTMS 926 commvault systems inc 165871 commvault systems (canada) inc\n", + "124 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4148 90738 firstsun capital bancorp firstsun capital bancorp 1 2.492261e+06 delaware new mexico 0 0.372842 0.000652 0.551065 1.0 FRSTSN KPTL BNKRP FRSTSN KPTL BNKRP 4148 firstsun capital bancorp 90738 firstsun capital bancorp\n", + "126 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5544 26048 taboola com limited taboola com limited 1 2.492261e+06 l3 israel 0 0.000061 0.003057 0.551065 1.0 TBL KM TBL KM 5544 taboola.com ltd. 26048 taboola.com ltd\n", + "443 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2 96 henry schein incorporated henry schein incorporated 1 2.492261e+06 delaware pennsylvania 0 0.372842 0.007919 0.551065 1.0 HNR SXN HNR SXN 2 henry schein inc 96 henry schein (lancaster, pa) inc\n", + "132 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6668 117995 tomi environmental solutions incorporated tomi environmental solutions incorporated 1 2.492261e+06 florida nevada 0 0.014691 0.014652 0.551065 1.0 TM ENFRNMNTL SLXNS TM ENFRNMNTL SLXNS 6668 tomi environmental solutions, inc. 
117995 tomi environmental solutions, inc\n", + "136 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6148 107455 esab corporation esab corporation 1 2.492261e+06 delaware united states 0 0.372842 0.012146 0.551065 1.0 ESB ESB 6148 esab corp 107455 esab corporation\n", + "137 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6958 104521 apache corporation apache corporation 1 2.492261e+06 delaware new jersey 0 0.372842 0.006143 0.551065 1.0 APX APX 6958 apache corp 104521 apache corporation\n", + "138 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 7011 121758 ncr corporation ncr corporation 1 2.492261e+06 maryland new zealand 0 0.007786 0.002590 0.551065 1.0 NKR NKR 7011 ncr corp 121758 ncr (nz) corporation\n", + "423 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 77 165059 jakks pacific incorporated jakks pacific incorporated 1 2.492261e+06 delaware canada 0 0.372842 0.012191 0.551065 1.0 JKS PSFK JKS PSFK 77 jakks pacific inc 165059 jakks pacific (canada), inc\n", + "139 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4902 170051 gan limited gan limited 1 2.492261e+06 d0 england and wales 0 0.000150 0.003536 0.551065 1.0 KN KN 4902 gan ltd 170051 gan (uk) limited\n", + "141 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6613 108716 cts corporation cts corporation 1 2.492261e+06 indiana delaware 0 0.004060 0.372842 0.551065 1.0 KTS KTS 6613 cts corp 108716 cts corporation\n", + "437 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 738 29776 garmin limited garmin limited 1 2.492261e+06 v8 thailand 0 0.000033 0.002378 0.551065 1.0 KRMN KRMN 738 garmin ltd 29776 garmin (thailand) ltd\n", + "435 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 277 9849 c h robinson worldwide incorporated c h robinson worldwide incorporated 1 2.492261e+06 delaware united states 0 0.372842 0.012146 0.551065 1.0 K H RBNSN WRLTWT K H RBNSN WRLTWT 277 
c. h. robinson worldwide, inc. 9849 c.h. robinson worldwide, inc\n", + "146 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6763 176423 richardson electronics limited richardson electronics limited 1 2.492261e+06 delaware thailand 0 0.372842 0.002378 0.551065 1.0 RXRTSN ELKTRNKS RXRTSN ELKTRNKS 6763 richardson electronics, ltd. 176423 richardson electronics (thailand) limited\n", + "149 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 4875 98755 api group corporation api group corporation 1 2.492261e+06 d8 delaware 0 0.000078 0.372842 0.551065 1.0 AP KRP AP KRP 4875 api group corp 98755 api group corporation\n", + "432 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 2310 167475 thermon group holdings incorporated thermon group holdings incorporated 1 2.492261e+06 delaware delaware, united states 0 0.372842 0.002139 0.551065 1.0 0RMN KRP HLTNKS 0RMN KRP HLTNKS 2310 thermon group holdings, inc. 167475 thermon group holdings, inc\n", + "156 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 6677 118432 aon public limited company aon public limited company 1 2.492261e+06 l2 ireland 0 0.000111 0.008315 0.551065 1.0 AN AN 6677 aon plc 118432 aon plc\n", + "158 6.816691 0.991207 __splink__input_table_0 __splink__input_table_1 5955 80272 minority equality opportunities acquisition in... minority equality opportunities acquisition in... 1 2.492261e+06 delaware delaware, united states 0 0.372842 0.002139 0.551065 1.0 MNRT EKLT OPRTNTS AKKSXN MNRT EKLT OPRTNTS AKKSXN 5955 minority equality opportunities acquisition inc. 
80272 minority equality opportunities acquisition inc" + ] + }, + "execution_count": 133, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df.sort_values(by=\"match_probability\").iloc[0:50]" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "id": "59cc74aa-674b-4c89-95d6-181d0f7c162a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " 
\n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", " \n", " \n", - " 
\n", - " \n", - " \n", + " \n", + " \n", + " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_lcompany_name_rgamma_company_namebf_company_nameloc_of_incorporation_lloc_of_incorporation_rgamma_loc_of_incorporationtf_loc_of_incorporation_ltf_loc_of_incorporation_rbf_loc_of_incorporationbf_tf_adj_loc_of_incorporationcompany_name_mphone_lcompany_name_mphone_r
06.3399090.987805__splink__input_table_0__splink__input_table_18180159390national instruments corporationnational instruments corporation21.774257e+06delawarerepublic of korea00.3728420.0002340.5562301.000000NXNL INSTRMNTSNXNL INSTRMNTS
16.3399090.987805__splink__input_table_0__splink__input_table_17912154757enbridge incorporatedenbridge incorporated21.774257e+06a0alberta00.0000330.0008800.5562301.000000ENBRJENBRJ
26.3399090.987805__splink__input_table_0__splink__input_table_17557140921spectrum pharmaceuticals incorporatedspectrum pharmaceuticals incorporated21.774257e+06delawarecayman islands00.3728420.0153870.5562301.000000SPKTRM FRMSTKLSSPKTRM FRMSTKLS
37.7176390.995272__splink__input_table_0__splink__input_table_18057152329american eagle outfitters incorporatedamerican eagle outfitters incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079AMRKN EKL OTFTRSAMRKN EKL OTFTRS
414.1263620.999944__splink__input_table_0__splink__input_table_1731528974pruco life insurance companypruco life insurance company21.774257e+06arizonaarizona20.0043880.0043882.48746749.368830PRK LF INSRNSPRK LF INSRNS
57.1861560.993180__splink__input_table_0__splink__input_table_17419142779national presto industries incorporatednational presto industries incorporated21.774257e+06wisconsinNone-10.004110NaN1.0000001.000000NXNL PRST INTSTRSNXNL PRST INTSTRS
66.3399090.987805__splink__input_table_0__splink__input_table_17387142016national bankshares incorporatednational bankshares incorporated21.774257e+06virginiacommonwealth virginia00.0062760.0000220.5562301.000000NXNL BNKXRSNXNL BNKXRS
713.6101420.999920__splink__input_table_0__splink__input_table_17387127697national bankshares incorporatednational bankshares incorporated21.774257e+06virginiavirginia20.0062760.0062762.48746734.518756NXNL BNKXRSNXNL BNKXRS
87.7176390.995272__splink__input_table_0__splink__input_table_18258162906thermo fisher scientific incorporatedthermo fisher scientific incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.5810790RM FXR SSNTFK0RM FXR SSNTFK
912.1018550.999773__splink__input_table_0__splink__input_table_1742860197general motors financial company incorporatedgeneral motors financial company incorporated21.774257e+06texastexas20.0178540.0178542.48746712.134323JNRL MTRS FNNXLJNRL MTRS FNNXL
106.3399090.987805__splink__input_table_0__splink__input_table_18258163501thermo fisher scientific incorporatedthermo fisher scientific incorporated21.774257e+06delawaremexico00.3728420.0112050.5562301.0000000RM FXR SSNTFK0RM FXR SSNTFK
116.3399090.987805__splink__input_table_0__splink__input_table_1549852885apollo strategic growth capital iiapollo strategic growth capital ii21.774257e+06e9cayman islands00.0010690.0153870.5562301.000000APL STRTJK KR0 KPTLAPL STRTJK KR0 KPTL
126.3399090.987805__splink__input_table_0__splink__input_table_18258162892thermo fisher scientific incorporatedthermo fisher scientific incorporated21.774257e+06delawarecanada00.3728420.0121910.5562301.0000000RM FXR SSNTFK0RM FXR SSNTFK
136.3399090.987805__splink__input_table_0__splink__input_table_18258162847thermo fisher scientific incorporatedthermo fisher scientific incorporated21.774257e+06delawarerussia00.3728420.0011080.5562301.0000000RM FXR SSNTFK0RM FXR SSNTFK
146.3399090.987805__splink__input_table_0__splink__input_table_149818301intellinetics incorporatedintellinetics incorporated21.774257e+06nevadaohio00.0146520.0081360.5562301.000000INTLNTKSINTLNTKS
156.3399090.987805__splink__input_table_0__splink__input_table_11533165897high sierra technologies incorporatedhigh sierra technologies incorporated21.774257e+06coloradonevada00.0048170.0146520.5562301.000000H SR TXNLJSH SR TXNLJS
1613.9918580.999939__splink__input_table_0__splink__input_table_1212761213lnpr group incorporatedlnpr group incorporated21.774257e+06coloradocolorado20.0048170.0048172.48746744.974148LNPR KRPLNPR KRP
177.1861560.993180__splink__input_table_0__splink__input_table_1931969norwood financial corporationnorwood financial corporation21.774257e+06pennsylvaniaNone-10.007919NaN1.0000001.000000NRWT FNNXLNRWT FNNXL
186.3399090.987805__splink__input_table_0__splink__input_table_11512257nov incorporatednov incorporated21.774257e+06delawaremauritius00.3728420.0010750.5562301.000000NFNF
196.3399090.987805__splink__input_table_0__splink__input_table_128010975juniper networks incorporatedjuniper networks incorporated21.774257e+06delawarecalifornia, usa00.3728420.0002340.5562301.000000JNPR NTWRKSJNPR NTWRKS
203.2523920.905028__splink__input_table_0__splink__input_table_11399157790logiq incorporatedlogiq3 incorporated12.087284e+05delawarecanada00.3728420.0121910.5562301.000000LJKLJK
217.7176390.995272__splink__input_table_0__splink__input_table_11720166283edgio incorporatededgio incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079EJEJ
226.3399090.987805__splink__input_table_0__splink__input_table_12020184709arem pacific corporationarem pacific corporation21.774257e+06delawarearizona00.3728420.0043880.5562301.000000ARM PSFKARM PSFK
237.1861560.993180__splink__input_table_0__splink__input_table_175626596ensign group incorporatedensign group incorporated21.774257e+06Nonenevada-1NaN0.0146521.0000001.000000ENSKN KRPENSKN KRP
247.1861560.993180__splink__input_table_0__splink__input_table_1110424668cco holdings limited liability companycco holdings limited liability company21.774257e+06Nonedelaware-1NaN0.3728421.0000001.000000KK HLTNKSKK HLTNKS
257.7176390.995272__splink__input_table_0__splink__input_table_132111011pc connection incorporatedpc connection incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079KNKXNKNKXN
266.3399090.987805__splink__input_table_0__splink__input_table_147714483polarityte incorporatedpolarityte incorporated21.774257e+06delawarenevada00.3728420.0146520.5562301.000000PLRTTPLRTT
277.7176390.995272__splink__input_table_0__splink__input_table_181025991atlas air worldwide holdings incorporatedatlas air worldwide holdings incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079ATLS AR WRLTWT HLTNKSATLS AR WRLTWT HLTNKS
286.3399090.987805__splink__input_table_0__splink__input_table_11003166010spi energy co limitedspi energy co limited21.774257e+06e9cayman00.0010690.0003450.5562301.000000SP ENRJSP ENRJ
297.7176390.995272__splink__input_table_0__splink__input_table_11012165926bimi international medical incorporatedbimi international medical incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079BM INTRNXNL MTKLBM INTRNXNL MTKL
307.1861560.993180__splink__input_table_0__splink__input_table_1186851876phreesia incorporatedphreesia incorporated21.774257e+06delawareNone-10.372842NaN1.0000001.000000FRXFRX
316.3399090.987805__splink__input_table_0__splink__input_table_1219878290secureworks corporationsecureworks corporation21.774257e+06delawareunited states00.3728420.0121460.5562301.000000SKRWRKSSKRWRKS
327.7176390.995272__splink__input_table_0__splink__input_table_1227358771ryerson holding corporationryerson holding corporation21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079RYRSN HLTNKRYRSN HLTNK
53740.0089140.501545337.1861560.993180__splink__input_table_0__splink__input_table_169167681manitowoc co incorporatedmanitowoc crane companies, llc mcg10.0000050.00000512.5343192219106comfort systems usa incorporatedcomfort systems usa incorporated21.774257e+06Nonearkansas-1NaN0.0012531.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023MNTWK K INKRPRTTMNTWK KRN KMPNS LK MKK1.000000KMFRT SSTMS USKMFRT SSTMS US
14520.0089140.5015453414.3518090.999952__splink__input_table_0__splink__input_table_139951003schneider national, incorporated33.schneider logistics, incorporated10.0000050.00000512.534319478180383winnebago industries incorporatedwinnebago industries incorporated21.774257e+06minnesotaminnesota20.0037540.0037542.48746757.719048WNBK INTSTRSWNBK INTSTRS
356.3399090.987805__splink__input_table_0__splink__input_table_11913166068renewable energy acquisition corporationrenewable energy acquisition corporation21.774257e+06nevadaus00.0146520.0009080.5562301.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023SXNTR NXNL INKRPRTTSXNTR LJSTKS INKRPRTTRNWBL ENRJ AKKSXNRNWBL ENRJ AKKSXN
41850.0089140.501545367.1861560.993180__splink__input_table_0__splink__input_table_14856819wisconsin electric power companywisconsin energy capital corporation10.0000100.00000512.534319257164606riverview bancorp incorporatedriverview bancorp incorporated21.774257e+06washingtonNone-10.002996NaN1.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023WSKNSN ELKTRK PWR KMPNWSKNSN ENRJ KPTL KRPRXN1.000000RFRF BNKRPRFRF BNKRP
39070.0089140.501545377.1861560.993180__splink__input_table_0__splink__input_table_118361390orion energy systems, incorporatedwilson funeral home, incorporated10.0000050.00000512.534319294182945timberland bancorp incorporatedtimberland bancorp incorporated21.774257e+06washingtonNone-10.002996NaN1.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023ORN ENRJ SSTMS INKRPRTTWLSN FNRL HM INKRPRTT1.000000TMBRLNT BNKRPTMBRLNT BNKRP
387.7176390.995272__splink__input_table_0__splink__input_table_141518543lkq corporationlkq corporation21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079LKKLKK
397.7176390.995272__splink__input_table_0__splink__input_table_167423252berkshire hills bancorp incorporatedberkshire hills bancorp incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079BRKXR HLS BNKRPBRKXR HLS BNKRP
406.3399090.987805__splink__input_table_0__splink__input_table_11270181001dolby laboratories incorporateddolby laboratories incorporated21.774257e+06delawarecalifornia00.3728420.0159780.5562301.000000TLB LBRTRSTLB LBRTRS
14260.0089140.501545413.2523920.905028__splink__input_table_0__splink__input_table_139951010schneider national, incorporated40.schneider resources, incorporated1321132984tss incorporateddss incorporated10.0000050.00000512.5343192.087284e+05delawarenew york00.3728420.0099130.5562301.000000wisconsinwisconsin30.0041000.0041002.3217850.18078520232023SXNTR NXNL INKRPRTTSXNTR RSRSS INKRPRTTTSTS
...........................................................................427.7176390.995272__splink__input_table_0__splink__input_table_1148246045anywhere real estate incorporatedanywhere real estate incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079ANHR RL ESTTANHR RL ESTT
467213.2322660.999896436.3399090.987805__splink__input_table_0__splink__input_table_165684608wesbanco incorporatedwesbanco, incorporated30.0000050.00000535295.437753149447625kbr incorporatedkbr incorporated21.774257e+06delawareunited states00.3728420.0121460.5562301.000000west virginiawest virginia30.0012070.0012072.32178170.42967220232023WSBNK INKRPRTTWSBNK INKRPRTT
182913.2570620.999898__splink__input_table_0__splink__input_table_14974974berkshire hathaway energy companyberkshire hathaway energy company40.0000100.000010695779.2731160.053272iowaiowa30.0012460.0012462.32178165.10374520232023BRKXR H0W ENRJ KMPNBRKXR H0W ENRJ KMPNKBRKBR
447.7176390.995272__splink__input_table_0__splink__input_table_11972166348reshape lifesciences incorporatedreshape lifesciences incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079RXP LFSSNSSRXP LFSSNSS
4512.3870180.999813__splink__input_table_0__splink__input_table_11457172081imperalis holding corporationimperalis holding corporation21.774257e+06nevadanevada20.0146520.0146522.48746714.786255IMPRLS HLTNKIMPRLS HLTNK
4612.3870180.999813__splink__input_table_0__splink__input_table_12037172091bitnile metaverse incorporatedbitnile metaverse incorporated21.774257e+06nevadanevada20.0146520.0146522.48746714.786255BTNL MTFRSBTNL MTFRS
477.7176390.995272__splink__input_table_0__splink__input_table_1105835808qvc incorporatedqvc incorporated21.774257e+06delawaredelaware20.3728420.3728422.4874670.581079KFKKFK
645813.5508730.999917489.6928770.998793__splink__input_table_0__splink__input_table_13842749shiftpixy, incorporatedshiftpixy labs, incorporated30.0000050.00000535295.437753170547703irhythm technologies incorporatedirhythm technologies incorporated21.774257e+06delawareus delaware10.3728420.0003235.6832681.000000wyomingwyoming30.0009680.0009682.32178212.54735020232023XFTPKS INKRPRTTXFTPKS LBS INKRPRTT
133013.6214740.999921__splink__input_table_0__splink__input_table_14088476securetech innovations, incorporatedsecuretech innovations, incorporated40.0000100.000010695779.2731160.053272wyomingwyoming30.0009680.0009682.32178212.54735020232023SKRTX INFXNS INKRPRTTSKRTX INFXNS INKRPRTTIRH0M TXNLJSIRH0M TXNLJS
618614.2064360.999947497.1861560.993180__splink__input_table_0__splink__input_table_181162004southwestern public service companysouthwestern public service company40.0000100.000010695779.2731160.053272new mexiconew mexico30.0006450.0006452.32178318.82102420232023S0WSTRN PBLK SRFS KMPNS0WSTRN PBLK SRFS KMPN33813985essex property trust incorporatedessex property trust incorporated21.774257e+06marylandNone-10.007786NaN1.0000001.000000ESKS PRPRT TRSTESKS PRPRT TRST
\n", - "

7540 rows × 24 columns

\n", "
" ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name tf_company_name_l tf_company_name_r bf_company_name bf_tf_adj_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation report_year_l report_year_r company_name_mphone_l company_name_mphone_r\n", - "5374 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 6916 7681 manitowoc co incorporated manitowoc crane companies, llc mcg 1 0.000005 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 MNTWK K INKRPRTT MNTWK KRN KMPNS LK MKK\n", - "1452 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 3995 1003 schneider national, incorporated 33.schneider logistics, incorporated 1 0.000005 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 SXNTR NXNL INKRPRTT SXNTR LJSTKS INKRPRTT\n", - "4185 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 485 6819 wisconsin electric power company wisconsin energy capital corporation 1 0.000010 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 WSKNSN ELKTRK PWR KMPN WSKNSN ENRJ KPTL KRPRXN\n", - "3907 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 1836 1390 orion energy systems, incorporated wilson funeral home, incorporated 1 0.000005 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 ORN ENRJ SSTMS INKRPRTT WLSN FNRL HM INKRPRTT\n", - "1426 0.008914 0.501545 __splink__input_table_0 __splink__input_table_1 3995 1010 schneider national, incorporated 40.schneider resources, incorporated 1 0.000005 0.000005 12.534319 1.000000 wisconsin wisconsin 3 0.004100 0.004100 2.32178 50.180785 2023 2023 SXNTR NXNL INKRPRTT 
SXNTR RSRSS INKRPRTT\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "4672 13.232266 0.999896 __splink__input_table_0 __splink__input_table_1 6568 4608 wesbanco incorporated wesbanco, incorporated 3 0.000005 0.000005 35295.437753 1.000000 west virginia west virginia 3 0.001207 0.001207 2.32178 170.429672 2023 2023 WSBNK INKRPRTT WSBNK INKRPRTT\n", - "1829 13.257062 0.999898 __splink__input_table_0 __splink__input_table_1 497 4974 berkshire hathaway energy company berkshire hathaway energy company 4 0.000010 0.000010 695779.273116 0.053272 iowa iowa 3 0.001246 0.001246 2.32178 165.103745 2023 2023 BRKXR H0W ENRJ KMPN BRKXR H0W ENRJ KMPN\n", - "6458 13.550873 0.999917 __splink__input_table_0 __splink__input_table_1 3842 749 shiftpixy, incorporated shiftpixy labs, incorporated 3 0.000005 0.000005 35295.437753 1.000000 wyoming wyoming 3 0.000968 0.000968 2.32178 212.547350 2023 2023 XFTPKS INKRPRTT XFTPKS LBS INKRPRTT\n", - "1330 13.621474 0.999921 __splink__input_table_0 __splink__input_table_1 4088 476 securetech innovations, incorporated securetech innovations, incorporated 4 0.000010 0.000010 695779.273116 0.053272 wyoming wyoming 3 0.000968 0.000968 2.32178 212.547350 2023 2023 SKRTX INFXNS INKRPRTT SKRTX INFXNS INKRPRTT\n", - "6186 14.206436 0.999947 __splink__input_table_0 __splink__input_table_1 8116 2004 southwestern public service company southwestern public service company 4 0.000010 0.000010 695779.273116 0.053272 new mexico new mexico 3 0.000645 0.000645 2.32178 318.821024 2023 2023 S0WSTRN PBLK SRFS KMPN S0WSTRN PBLK SRFS KMPN\n", - "\n", - "[7540 rows x 24 columns]" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "preds_df.sort_values(by=\"match_probability\")" - ] - }, - { - "cell_type": "code", - "execution_count": 238, - "id": "255272b6-a5c4-4ab8-bebc-d13e77655938", - "metadata": { - "tags": [] - }, - "outputs": [ - { - 
"data": { - "text/plain": [ - "Index(['match_weight', 'match_probability', 'source_dataset_l',\n", - " 'source_dataset_r', 'record_id_l', 'record_id_r', 'company_name_l',\n", - " 'company_name_r', 'gamma_company_name', 'tf_company_name_l',\n", - " 'tf_company_name_r', 'bf_company_name', 'bf_tf_adj_company_name',\n", - " 'loc_of_incorporation_l', 'loc_of_incorporation_r',\n", - " 'gamma_loc_of_incorporation', 'tf_loc_of_incorporation_l',\n", - " 'tf_loc_of_incorporation_r', 'bf_loc_of_incorporation',\n", - " 'bf_tf_adj_loc_of_incorporation', 'company_name_mphone_l',\n", - " 'company_name_mphone_r', 'report_year_l', 'report_year_r'],\n", - " dtype='object')" + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name bf_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation company_name_mphone_l company_name_mphone_r\n", + "0 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8180 159390 national instruments corporation national instruments corporation 2 1.774257e+06 delaware republic of korea 0 0.372842 0.000234 0.556230 1.000000 NXNL INSTRMNTS NXNL INSTRMNTS\n", + "1 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 7912 154757 enbridge incorporated enbridge incorporated 2 1.774257e+06 a0 alberta 0 0.000033 0.000880 0.556230 1.000000 ENBRJ ENBRJ\n", + "2 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 7557 140921 spectrum pharmaceuticals incorporated spectrum pharmaceuticals incorporated 2 1.774257e+06 delaware cayman islands 0 0.372842 0.015387 0.556230 1.000000 SPKTRM FRMSTKLS SPKTRM FRMSTKLS\n", + "3 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 8057 152329 american eagle outfitters incorporated american eagle outfitters incorporated 2 1.774257e+06 delaware delaware 2 0.372842 
0.372842 2.487467 0.581079 AMRKN EKL OTFTRS AMRKN EKL OTFTRS\n", + "4 14.126362 0.999944 __splink__input_table_0 __splink__input_table_1 7315 28974 pruco life insurance company pruco life insurance company 2 1.774257e+06 arizona arizona 2 0.004388 0.004388 2.487467 49.368830 PRK LF INSRNS PRK LF INSRNS\n", + "5 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 7419 142779 national presto industries incorporated national presto industries incorporated 2 1.774257e+06 wisconsin None -1 0.004110 NaN 1.000000 1.000000 NXNL PRST INTSTRS NXNL PRST INTSTRS\n", + "6 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 7387 142016 national bankshares incorporated national bankshares incorporated 2 1.774257e+06 virginia commonwealth virginia 0 0.006276 0.000022 0.556230 1.000000 NXNL BNKXRS NXNL BNKXRS\n", + "7 13.610142 0.999920 __splink__input_table_0 __splink__input_table_1 7387 127697 national bankshares incorporated national bankshares incorporated 2 1.774257e+06 virginia virginia 2 0.006276 0.006276 2.487467 34.518756 NXNL BNKXRS NXNL BNKXRS\n", + "8 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 8258 162906 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 0RM FXR SSNTFK 0RM FXR SSNTFK\n", + "9 12.101855 0.999773 __splink__input_table_0 __splink__input_table_1 7428 60197 general motors financial company incorporated general motors financial company incorporated 2 1.774257e+06 texas texas 2 0.017854 0.017854 2.487467 12.134323 JNRL MTRS FNNXL JNRL MTRS FNNXL\n", + "10 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 163501 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware mexico 0 0.372842 0.011205 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n", + "11 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 5498 52885 apollo strategic growth 
capital ii apollo strategic growth capital ii 2 1.774257e+06 e9 cayman islands 0 0.001069 0.015387 0.556230 1.000000 APL STRTJK KR0 KPTL APL STRTJK KR0 KPTL \n", + "12 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 162892 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware canada 0 0.372842 0.012191 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n", + "13 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 8258 162847 thermo fisher scientific incorporated thermo fisher scientific incorporated 2 1.774257e+06 delaware russia 0 0.372842 0.001108 0.556230 1.000000 0RM FXR SSNTFK 0RM FXR SSNTFK\n", + "14 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 498 18301 intellinetics incorporated intellinetics incorporated 2 1.774257e+06 nevada ohio 0 0.014652 0.008136 0.556230 1.000000 INTLNTKS INTLNTKS\n", + "15 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1533 165897 high sierra technologies incorporated high sierra technologies incorporated 2 1.774257e+06 colorado nevada 0 0.004817 0.014652 0.556230 1.000000 H SR TXNLJS H SR TXNLJS\n", + "16 13.991858 0.999939 __splink__input_table_0 __splink__input_table_1 2127 61213 lnpr group incorporated lnpr group incorporated 2 1.774257e+06 colorado colorado 2 0.004817 0.004817 2.487467 44.974148 LNPR KRP LNPR KRP\n", + "17 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 93 1969 norwood financial corporation norwood financial corporation 2 1.774257e+06 pennsylvania None -1 0.007919 NaN 1.000000 1.000000 NRWT FNNXL NRWT FNNXL\n", + "18 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 151 2257 nov incorporated nov incorporated 2 1.774257e+06 delaware mauritius 0 0.372842 0.001075 0.556230 1.000000 NF NF\n", + "19 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 280 10975 juniper networks incorporated juniper networks incorporated 2 1.774257e+06 delaware 
california, usa 0 0.372842 0.000234 0.556230 1.000000 JNPR NTWRKS JNPR NTWRKS\n", + "20 3.252392 0.905028 __splink__input_table_0 __splink__input_table_1 1399 157790 logiq incorporated logiq3 incorporated 1 2.087284e+05 delaware canada 0 0.372842 0.012191 0.556230 1.000000 LJK LJK\n", + "21 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1720 166283 edgio incorporated edgio incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 EJ EJ\n", + "22 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 2020 184709 arem pacific corporation arem pacific corporation 2 1.774257e+06 delaware arizona 0 0.372842 0.004388 0.556230 1.000000 ARM PSFK ARM PSFK\n", + "23 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 756 26596 ensign group incorporated ensign group incorporated 2 1.774257e+06 None nevada -1 NaN 0.014652 1.000000 1.000000 ENSKN KRP ENSKN KRP\n", + "24 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 1104 24668 cco holdings limited liability company cco holdings limited liability company 2 1.774257e+06 None delaware -1 NaN 0.372842 1.000000 1.000000 KK HLTNKS KK HLTNKS\n", + "25 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 321 11011 pc connection incorporated pc connection incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 KNKXN KNKXN\n", + "26 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 477 14483 polarityte incorporated polarityte incorporated 2 1.774257e+06 delaware nevada 0 0.372842 0.014652 0.556230 1.000000 PLRTT PLRTT\n", + "27 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 810 25991 atlas air worldwide holdings incorporated atlas air worldwide holdings incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 ATLS AR WRLTWT HLTNKS ATLS AR WRLTWT HLTNKS\n", + "28 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1003 166010 spi energy 
co limited spi energy co limited 2 1.774257e+06 e9 cayman 0 0.001069 0.000345 0.556230 1.000000 SP ENRJ SP ENRJ\n", + "29 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1012 165926 bimi international medical incorporated bimi international medical incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 BM INTRNXNL MTKL BM INTRNXNL MTKL\n", + "30 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 1868 51876 phreesia incorporated phreesia incorporated 2 1.774257e+06 delaware None -1 0.372842 NaN 1.000000 1.000000 FRX FRX\n", + "31 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 2198 78290 secureworks corporation secureworks corporation 2 1.774257e+06 delaware united states 0 0.372842 0.012146 0.556230 1.000000 SKRWRKS SKRWRKS\n", + "32 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 2273 58771 ryerson holding corporation ryerson holding corporation 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 RYRSN HLTNK RYRSN HLTNK\n", + "33 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 221 9106 comfort systems usa incorporated comfort systems usa incorporated 2 1.774257e+06 None arkansas -1 NaN 0.001253 1.000000 1.000000 KMFRT SSTMS US KMFRT SSTMS US\n", + "34 14.351809 0.999952 __splink__input_table_0 __splink__input_table_1 478 180383 winnebago industries incorporated winnebago industries incorporated 2 1.774257e+06 minnesota minnesota 2 0.003754 0.003754 2.487467 57.719048 WNBK INTSTRS WNBK INTSTRS\n", + "35 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1913 166068 renewable energy acquisition corporation renewable energy acquisition corporation 2 1.774257e+06 nevada us 0 0.014652 0.000908 0.556230 1.000000 RNWBL ENRJ AKKSXN RNWBL ENRJ AKKSXN\n", + "36 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 257 164606 riverview bancorp incorporated riverview bancorp incorporated 2 1.774257e+06 washington 
None -1 0.002996 NaN 1.000000 1.000000 RFRF BNKRP RFRF BNKRP\n", + "37 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 294 182945 timberland bancorp incorporated timberland bancorp incorporated 2 1.774257e+06 washington None -1 0.002996 NaN 1.000000 1.000000 TMBRLNT BNKRP TMBRLNT BNKRP\n", + "38 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 415 18543 lkq corporation lkq corporation 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 LKK LKK\n", + "39 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 674 23252 berkshire hills bancorp incorporated berkshire hills bancorp incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 BRKXR HLS BNKRP BRKXR HLS BNKRP\n", + "40 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1270 181001 dolby laboratories incorporated dolby laboratories incorporated 2 1.774257e+06 delaware california 0 0.372842 0.015978 0.556230 1.000000 TLB LBRTRS TLB LBRTRS\n", + "41 3.252392 0.905028 __splink__input_table_0 __splink__input_table_1 1321 132984 tss incorporated dss incorporated 1 2.087284e+05 delaware new york 0 0.372842 0.009913 0.556230 1.000000 TS TS\n", + "42 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1482 46045 anywhere real estate incorporated anywhere real estate incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 ANHR RL ESTT ANHR RL ESTT\n", + "43 6.339909 0.987805 __splink__input_table_0 __splink__input_table_1 1494 47625 kbr incorporated kbr incorporated 2 1.774257e+06 delaware united states 0 0.372842 0.012146 0.556230 1.000000 KBR KBR\n", + "44 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1972 166348 reshape lifesciences incorporated reshape lifesciences incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 RXP LFSSNSS RXP LFSSNSS\n", + "45 12.387018 0.999813 __splink__input_table_0 __splink__input_table_1 
1457 172081 imperalis holding corporation imperalis holding corporation 2 1.774257e+06 nevada nevada 2 0.014652 0.014652 2.487467 14.786255 IMPRLS HLTNK IMPRLS HLTNK\n", + "46 12.387018 0.999813 __splink__input_table_0 __splink__input_table_1 2037 172091 bitnile metaverse incorporated bitnile metaverse incorporated 2 1.774257e+06 nevada nevada 2 0.014652 0.014652 2.487467 14.786255 BTNL MTFRS BTNL MTFRS\n", + "47 7.717639 0.995272 __splink__input_table_0 __splink__input_table_1 1058 35808 qvc incorporated qvc incorporated 2 1.774257e+06 delaware delaware 2 0.372842 0.372842 2.487467 0.581079 KFK KFK\n", + "48 9.692877 0.998793 __splink__input_table_0 __splink__input_table_1 1705 47703 irhythm technologies incorporated irhythm technologies incorporated 2 1.774257e+06 delaware us delaware 1 0.372842 0.000323 5.683268 1.000000 IRH0M TXNLJS IRH0M TXNLJS\n", + "49 7.186156 0.993180 __splink__input_table_0 __splink__input_table_1 338 13985 essex property trust incorporated essex property trust incorporated 2 1.774257e+06 maryland None -1 0.007786 NaN 1.000000 1.000000 ESKS PRPRT TRST ESKS PRPRT TRST" ] }, - "execution_count": 238, + "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preds_df.columns" + "preds_df[preds_df.match_probability > .9]" ] }, { "cell_type": "code", - "execution_count": 249, + "execution_count": 79, "id": "8e658c36-7b6f-480f-9d74-37af9510ebe2", "metadata": { "tags": [] @@ -2908,579 +5071,587 @@ " match_probability\n", " company_name_l\n", " company_name_r\n", - " loc_of_incorporation_l\n", - " loc_of_incorporation_r\n", + " loc_list_l\n", + " loc_list_r\n", " company_name_mphone_l\n", " company_name_mphone_r\n", " \n", " \n", " \n", " \n", - " 150\n", - " 0.996128\n", - " santander drive auto receivables trust 2018-1\n", - " santander drive auto receivables trust\n", - " delaware\n", - " delaware\n", - " SNTNTR TRF AT RSFBLS TRST\n", - " SNTNTR TRF AT RSFBLS TRST\n", + " 465\n", + " 0.914612\n", + " conns 
incorporated\n", + " invenco incorporated\n", + " [delaware]\n", + " [delaware]\n", + " KNS\n", + " INFNK\n", " \n", " \n", - " 151\n", - " 0.996128\n", - " santander drive auto receivables trust 2018-5\n", - " santander drive auto receivables trust\n", - " delaware\n", - " delaware\n", - " SNTNTR TRF AT RSFBLS TRST\n", - " SNTNTR TRF AT RSFBLS TRST\n", + " 466\n", + " 0.914612\n", + " vishay intertechnology incorporated\n", + " vishay precision foil, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " FX INTRTXNLJ\n", + " FX PRSXN FL\n", " \n", " \n", - " 152\n", - " 0.996128\n", - " santander drive auto receivables trust 2018-3\n", - " santander drive auto receivables trust\n", - " delaware\n", - " delaware\n", - " SNTNTR TRF AT RSFBLS TRST\n", - " SNTNTR TRF AT RSFBLS TRST\n", + " 467\n", + " 0.980607\n", + " vishay precision group, incorporated\n", + " vishay precision foil, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " FX PRSXN KRP\n", + " FX PRSXN FL\n", " \n", " \n", - " 153\n", - " 0.996128\n", - " santander drive auto receivables trust 2016-1\n", - " santander drive auto receivables trust\n", - " delaware\n", - " delaware\n", - " SNTNTR TRF AT RSFBLS TRST\n", - " SNTNTR TRF AT RSFBLS TRST\n", + " 470\n", + " 0.975104\n", + " jones lang lasalle incorporated\n", + " jones lang lasalle limited\n", + " [maryland]\n", + " [hong, kong]\n", + " JNS LNK LSL\n", + " JNS LNK LSL\n", " \n", " \n", - " 154\n", - " 0.573277\n", - " constellation pharmaceuticals inc\n", - " constellation connect, llc\n", - " delaware\n", - " delaware\n", - " KNSTLXN FRMSTKLS INK\n", - " KNSTLXN KNKT LK\n", + " 471\n", + " 0.951657\n", + " nrg energy, incorporated\n", + " nrg energy, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " NRK ENRJ\n", + " NRK ENRJ\n", " \n", " \n", - " 162\n", - " 0.959568\n", - " consolidated communications holdings, inc.\n", - " consolidated communications of\n", - " delaware\n", - " illinois\n", - " KNSLTTT KMNKXNS HLTNKS 
INK\n", - " KNSLTTT KMNKXNS OF\n", + " 472\n", + " 0.914612\n", + " firstenergy corporation\n", + " firstenergy ventures corporation\n", + " [ohio]\n", + " [ohio]\n", + " FRSTNRJ\n", + " FRSTNRJ FNTRS\n", " \n", " \n", - " 163\n", - " 0.959568\n", - " consolidated communications holdings, inc.\n", - " consolidated communications of\n", - " delaware\n", - " missouri\n", - " KNSLTTT KMNKXNS HLTNKS INK\n", - " KNSLTTT KMNKXNS OF\n", + " 478\n", + " 0.914612\n", + " hudson pacific properties, incorporated\n", + " hudson pacific services, incorporated\n", + " [maryland]\n", + " [maryland]\n", + " HTSN PSFK PRPRTS\n", + " HTSN PSFK SRFSS\n", " \n", " \n", - " 164\n", - " 0.959568\n", - " consolidated communications holdings, inc.\n", - " consolidated communications of\n", - " delaware\n", - " maine\n", - " KNSLTTT KMNKXNS HLTNKS INK\n", - " KNSLTTT KMNKXNS OF\n", + " 479\n", + " 0.980607\n", + " hudson pacific properties, incorporated\n", + " hudson pacific properties, limited partnership\n", + " [maryland]\n", + " [maryland]\n", + " HTSN PSFK PRPRTS\n", + " HTSN PSFK PRPRTS\n", " \n", " \n", - " 165\n", - " 0.959568\n", - " consolidated communications holdings, inc.\n", - " consolidated communications of\n", - " delaware\n", - " kansas\n", - " KNSLTTT KMNKXNS HLTNKS INK\n", - " KNSLTTT KMNKXNS OF\n", + " 481\n", + " 0.914612\n", + " digital ally, incorporated\n", + " digital ally international, incorporated\n", + " [nevada]\n", + " [nevada]\n", + " TJTL AL\n", + " TJTL AL INTRNXNL\n", " \n", " \n", - " 166\n", - " 0.959568\n", - " consolidated communications holdings, inc.\n", - " consolidated communications of\n", - " delaware\n", - " minnesota\n", - " KNSLTTT KMNKXNS HLTNKS INK\n", - " KNSLTTT KMNKXNS OF\n", + " 489\n", + " 0.976947\n", + " cco holdings limited liability company\n", + " rhfw holdings, limited liability company\n", + " NaN\n", + " [delaware]\n", + " KK HLTNKS\n", + " RHF HLTNKS\n", " \n", " \n", - " 167\n", - " 0.959568\n", - " consolidated 
communications holdings, inc.\n", - " consolidated communications of\n", - " delaware\n", - " central\n", - " KNSLTTT KMNKXNS HLTNKS INK\n", - " KNSLTTT KMNKXNS OF\n", + " 493\n", + " 0.975104\n", + " intuitive surgical incorporated\n", + " intuitive surgical limited\n", + " [delaware]\n", + " [united, kingdom]\n", + " INTTF SRJKL\n", + " INTTF SRJKL\n", " \n", " \n", - " 168\n", - " 0.959568\n", - " consolidated communications holdings, inc.\n", - " consolidated communications of\n", - " delaware\n", - " florida\n", - " KNSLTTT KMNKXNS HLTNKS INK\n", - " KNSLTTT KMNKXNS OF\n", + " 494\n", + " 0.975104\n", + " jones lang lasalle incorporated\n", + " jones lang lasalle limited\n", + " [maryland]\n", + " [england]\n", + " JNS LNK LSL\n", + " JNS LNK LSL\n", " \n", " \n", - " 169\n", - " 0.959568\n", - " consolidated communications holdings, inc.\n", - " consolidated communications of\n", - " delaware\n", - " california\n", - " KNSLTTT KMNKXNS HLTNKS INK\n", - " KNSLTTT KMNKXNS OF\n", + " 500\n", + " 0.975104\n", + " becton dickinson and company\n", + " becton, dickinson and company, limited\n", + " [new, jersey]\n", + " [ireland]\n", + " BKTN TKNSN ANT\n", + " BKTN TKNSN ANT\n", " \n", " \n", - " 174\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy one, inc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ ON INK\n", + " 501\n", + " 0.975104\n", + " united parcel service incorporated\n", + " united guaranty services, incorporated\n", + " [delaware]\n", + " [north, carolina]\n", + " UNTT PRSL SRFS\n", + " UNTT KRNT SRFSS\n", " \n", " \n", - " 177\n", - " 0.573277\n", - " verus international, inc.\n", - " emcor international, inc\n", - " delaware\n", - " delaware\n", - " FRS INTRNXNL INK\n", - " EMKR INTRNXNL INK\n", + " 509\n", + " 0.914612\n", + " estee lauder companies incorporated\n", + " estee lauder international, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " EST LTR KMPNS\n", + " EST LTR INTRNXNL\n", " \n", " \n", - 
" 178\n", - " 0.573277\n", - " verus international, inc.\n", - " emcor international, inc\n", - " delaware\n", - " delaware\n", - " FRS INTRNXNL INK\n", - " EMKR INTRNXNL INK\n", + " 510\n", + " 0.914612\n", + " maxcyte, incorporated\n", + " cues, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " MKSST\n", + " KS\n", " \n", " \n", - " 179\n", - " 0.714594\n", - " green plains inc.\n", - " green plains superior llc fka superior\n", - " iowa\n", - " iowa\n", - " KRN PLNS INK\n", - " KRN PLNS SPRR LK FK SPRR\n", + " 515\n", + " 0.980607\n", + " zimmer biomet holdings, incorporated\n", + " zimmer biomet spine, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " SMR BMT HLTNKS\n", + " SMR BMT SPN\n", " \n", " \n", - " 183\n", - " 0.996128\n", - " duke energy corp\n", - " duke energy group, llc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ KRP LK\n", + " 518\n", + " 0.914612\n", + " nordicus partners corporation\n", + " nordco enterprises, incorporated\n", + " [delaware]\n", + " [wilmington, delaware]\n", + " NRTKS PRTNRS\n", + " NRTK ENTRPRSS\n", " \n", " \n", - " 195\n", - " 0.884993\n", - " green stream holdings inc.\n", - " western gas wyoming, l.l.c\n", - " wyoming\n", - " wyoming\n", - " KRN STRM HLTNKS INK\n", - " WSTRN KS YMNK LLK\n", + " 519\n", + " 0.975104\n", + " valero energy corp/tx\n", + " valero energy incorporated\n", + " [delaware]\n", + " [canada]\n", + " FLR ENRJ TKS\n", + " FLR ENRJ\n", " \n", " \n", - " 196\n", - " 0.884993\n", - " green stream holdings inc.\n", - " western gas wyoming, l.l.c\n", - " wyoming\n", - " wyoming\n", - " KRN STRM HLTNKS INK\n", - " WSTRN KS YMNK LLK\n", + " 527\n", + " 0.914612\n", + " nrg energy, incorporated\n", + " nrg energy holdings incorporated\n", + " [delaware]\n", + " [delaware]\n", + " NRK ENRJ\n", + " NRK ENRJ HLTNKS\n", " \n", " \n", - " 197\n", - " 0.992184\n", - " fortress biotech, inc.\n", - " fortress biotech, china, inc\n", - " delaware\n", - " None\n", - " FRTRS 
BTX INK\n", - " FRTRS BTX XN INK\n", + " 528\n", + " 0.914612\n", + " everi holdings incorporated\n", + " edi holdings, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " EFR HLTNKS\n", + " ET HLTNKS\n", " \n", " \n", - " 199\n", - " 0.996128\n", - " duke energy corp\n", - " duke energy china corp\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ XN KRP\n", + " 535\n", + " 0.914612\n", + " estee lauder companies incorporated\n", + " estee lauder incorporated\n", + " [delaware]\n", + " [delaware]\n", + " EST LTR KMPNS\n", + " EST LTR\n", " \n", " \n", - " 200\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy corporate services, inc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ KRPRT SRFSS INK\n", + " 548\n", + " 0.975104\n", + " universal logistics holdings, incorporated\n", + " universal logistics corporation\n", + " [michigan]\n", + " [florida]\n", + " UNFRSL LJSTKS HLTNKS\n", + " UNFRSL LJSTKS\n", " \n", " \n", - " 203\n", - " 0.573277\n", - " apollo global management, inc.\n", - " apollo belenos management llc\n", - " delaware\n", - " delaware\n", - " APL KLBL MNJMNT INK\n", - " APL BLNS MNJMNT LK\n", + " 551\n", + " 0.975104\n", + " alliant energy corporation\n", + " allergan gi corporation\n", + " [wisconsin]\n", + " [delaware]\n", + " ALNT ENRJ\n", + " ALRKN J\n", " \n", " \n", - " 204\n", - " 0.573277\n", - " apollo global management, inc.\n", - " apollo belenos management llc\n", - " delaware\n", - " delaware\n", - " APL KLBL MNJMNT INK\n", - " APL BLNS MNJMNT LK\n", + " 555\n", + " 0.975104\n", + " smartmetric, incorporated\n", + " smartpetro incorporated\n", + " [nevada]\n", + " [philippines]\n", + " SMRTMTRK\n", + " SMRTPTR\n", " \n", " \n", - " 206\n", - " 0.981099\n", - " columbia property trust, inc.\n", - " columbia courtyard, inc\n", - " maryland\n", - " maryland\n", - " KLMB PRPRT TRST INK\n", - " KLMB KRTYRT INK\n", + " 566\n", + " 0.914612\n", + " republic services, incorporated\n", 
+ " republic conduit, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " RPBLK SRFSS\n", + " RPBLK KNTT\n", " \n", " \n", - " 208\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy beckjord, llc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ BKJRT LK\n", + " 571\n", + " 0.975104\n", + " freedom holdings, incorporated\n", + " freedom designs, incorporated\n", + " [maryland]\n", + " [california]\n", + " FRTM HLTNKS\n", + " FRTM TSKNS\n", " \n", " \n", - " 209\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy beckjord storage llc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ BKJRT STRJ LK\n", + " 573\n", + " 0.938457\n", + " ares real estate income trust incorporated\n", + " ares real estate income trust incorporated\n", + " [maryland]\n", + " [delaware]\n", + " ARS RL ESTT INKM TRST\n", + " ARS RL ESTT INKM TRST\n", " \n", " \n", - " 210\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy acp, llc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ AKP LK\n", + " 574\n", + " 0.975104\n", + " bank of new york mellon corporation\n", + " bank of new york mellon sa/nv\n", + " [delaware]\n", + " [belgium]\n", + " BNK OF N YRK MLN\n", + " BNK OF N YRK MLN SNF\n", " \n", " \n", - " 213\n", - " 0.981099\n", - " spirit realty capital, inc.\n", - " spirit reit, inc\n", - " maryland\n", - " maryland\n", - " SPRT RLT KPTL INK\n", - " SPRT RT INK\n", + " 576\n", + " 0.914612\n", + " southern company\n", + " southern wood piedmont company\n", + " [delaware]\n", + " [delaware]\n", + " S0RN\n", + " S0RN WT PTMNT\n", " \n", " \n", - " 215\n", - " 0.573277\n", - " apollo global management, inc.\n", - " apollo na management ii, llc\n", - " delaware\n", - " delaware\n", - " APL KLBL MNJMNT INK\n", - " APL N MNJMNT LK\n", + " 582\n", + " 0.914612\n", + " ameresco, incorporated\n", + " ameripath, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " AMRSK\n", + " AMRP0\n", " \n", 
" \n", - " 216\n", - " 0.573277\n", - " apollo global management, inc.\n", - " apollo na management ii, llc\n", - " delaware\n", - " delaware\n", - " APL KLBL MNJMNT INK\n", - " APL N MNJMNT LK\n", + " 584\n", + " 0.914612\n", + " trevena incorporated\n", + " anr, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " TRFN\n", + " ANR\n", " \n", " \n", - " 225\n", - " 0.992184\n", - " fortress biotech, inc.\n", - " fortress biotech, china, inc\n", - " delaware\n", - " None\n", - " FRTRS BTX INK\n", - " FRTRS BTX XN INK\n", + " 590\n", + " 0.975104\n", + " bank of new york mellon corporation\n", + " bank of new york mellon\n", + " [delaware]\n", + " [new, york]\n", + " BNK OF N YRK MLN\n", + " BNK OF N YRK MLN\n", " \n", " \n", - " 226\n", - " 0.573277\n", - " green brick partners, inc.\n", - " green brick mortgage, llc\n", - " delaware\n", - " delaware\n", - " KRN BRK PRTNRS INK\n", - " KRN BRK MRTKJ LK\n", + " 591\n", + " 0.938457\n", + " xerox holdings corporation\n", + " xerox holdings corporation\n", + " [connecticut]\n", + " [new, york]\n", + " SRKS HLTNKS\n", + " SRKS HLTNKS\n", " \n", " \n", - " 227\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy beckjord storage llc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ BKJRT STRJ LK\n", + " 594\n", + " 0.975104\n", + " jones lang lasalle incorporated\n", + " jones lang lasalle ip, incorporated\n", + " [maryland]\n", + " [delaware]\n", + " JNS LNK LSL\n", + " JNS LNK LSL IP\n", " \n", " \n", - " 228\n", - " 0.959568\n", - " green plains inc.\n", - " green plains madison llc\n", - " iowa\n", - " delaware\n", - " KRN PLNS INK\n", - " KRN PLNS MTSN LK\n", + " 595\n", + " 0.914612\n", + " iron mountain incorporated\n", + " iron mountain global holdings, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " IRN MNTN\n", + " IRN MNTN KLBL HLTNKS\n", " \n", " \n", - " 242\n", - " 0.959568\n", - " great lakes dredge & dock corp\n", - " great lakes dredge & dock do brasil 
ltda\n", - " delaware\n", - " brazil\n", - " KRT LKS TRJ TK KRP\n", - " KRT LKS TRJ TK T BRSL LTT\n", + " 597\n", + " 0.980607\n", + " extreme networks incorporated\n", + " extreme networks ihc, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " EKSTRM NTWRKS\n", + " EKSTRM NTWRKS IK\n", " \n", " \n", - " 243\n", - " 0.573277\n", - " great lakes dredge & dock corp\n", - " great lakes dredge & dock environmental, inc\n", - " delaware\n", - " delaware\n", - " KRT LKS TRJ TK KRP\n", - " KRT LKS TRJ TK ENFRNMNTL INK\n", + " 599\n", + " 0.976947\n", + " q2 holdings, incorporated\n", + " vr holdings, incorporated\n", + " NaN\n", + " [colorado]\n", + " K HLTNKS\n", + " FR HLTNKS\n", " \n", " \n", - " 244\n", - " 0.996128\n", - " great lakes dredge & dock corp\n", - " great lakes dredge & dock company, llc\n", - " delaware\n", - " delaware\n", - " KRT LKS TRJ TK KRP\n", - " KRT LKS TRJ TK KMPN LK\n", + " 600\n", + " 0.980607\n", + " extreme networks incorporated\n", + " extreme networks, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " EKSTRM NTWRKS\n", + " EKSTRM NTWRKS\n", " \n", " \n", - " 251\n", - " 0.573277\n", - " blackstone group inc\n", - " blackstone pb ii l.l.c\n", - " delaware\n", - " delaware\n", - " BLKSTN KRP INK\n", - " BLKSTN PB LLK\n", + " 604\n", + " 0.914612\n", + " cutera incorporated\n", + " vrec, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " KTR\n", + " FRK\n", " \n", " \n", - " 252\n", - " 0.573277\n", - " blackstone group inc\n", - " blackstone pb i l.l.c\n", - " delaware\n", - " delaware\n", - " BLKSTN KRP INK\n", - " BLKSTN PB I LLK\n", + " 605\n", + " 0.975104\n", + " assured guaranty limited\n", + " assured guaranty services limited\n", + " [d0]\n", + " [england]\n", + " ASRT KRNT\n", + " ASRT KRNT SRFSS\n", " \n", " \n", - " 254\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy acp, llc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ AKP LK\n", + " 606\n", + " 0.976947\n", + " 
virtra, incorporated\n", + " viator, incorporated\n", + " [nevada]\n", + " NaN\n", + " FRTR\n", + " FTR\n", " \n", " \n", - " 255\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy shoreham, llc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ XRHM LK\n", + " 618\n", + " 0.975104\n", + " sculptor capital management, incorporated\n", + " sculptor capital management hong kong limited\n", + " [delaware]\n", + " [hong, kong]\n", + " SKLPTR KPTL MNJMNT\n", + " SKLPTR KPTL MNJMNT HNK KNK\n", " \n", " \n", - " 256\n", - " 0.573277\n", - " duke energy corp\n", - " duke energy sam, llc\n", - " delaware\n", - " delaware\n", - " TK ENRJ KRP\n", - " TK ENRJ SM LK\n", + " 625\n", + " 0.975104\n", + " enstar group limited\n", + " enstar limited\n", + " [d0]\n", + " [bermuda]\n", + " ENSTR KRP\n", + " ENSTR\n", " \n", " \n", - " 257\n", - " 0.573277\n", - " blackstone group inc\n", - " blackstone obs l.l.c\n", - " delaware\n", - " delaware\n", - " BLKSTN KRP INK\n", - " BLKSTN OBS LLK\n", + " 626\n", + " 0.975104\n", + " sellas life sciences group, incorporated\n", + " sellas life sciences group limited\n", + " [delaware]\n", + " [bermuda]\n", + " SLS LF SSNSS KRP\n", + " SLS LF SSNSS KRP\n", " \n", " \n", - " 264\n", - " 0.992184\n", - " freightcar america, inc.\n", - " freightcar america leasing, llc\n", - " delaware\n", - " None\n", - " FRTKR AMRK INK\n", - " FRTKR AMRK LSNK LK\n", + " 627\n", + " 0.975104\n", + " intuitive surgical incorporated\n", + " intuitive surgical canada incorporated\n", + " [delaware]\n", + " [canada]\n", + " INTTF SRJKL\n", + " INTTF SRJKL KNT\n", " \n", " \n", - " 265\n", - " 0.992184\n", - " freightcar america, inc.\n", - " freightcar america leasing, llc\n", - " delaware\n", - " None\n", - " FRTKR AMRK INK\n", - " FRTKR AMRK LSNK LK\n", + " 630\n", + " 0.951657\n", + " forestar group incorporated\n", + " forestar group incorporated\n", + " [delaware]\n", + " [delaware]\n", + " FRSTR KRP\n", + " FRSTR KRP\n", " \n", 
" \n", - " 266\n", - " 0.959568\n", - " qurate retail, inc.\n", - " qurate retail group, inc\n", - " englewood\n", - " de\n", - " KRT RTL INK\n", - " KRT RTL KRP INK\n", + " 637\n", + " 0.914612\n", + " dcp midstream, limited partnership\n", + " dcp midstream operating, limited partnership\n", + " [delaware]\n", + " [delaware]\n", + " TKP MTSTRM\n", + " TKP MTSTRM OPRTNK\n", " \n", " \n", - " 267\n", - " 0.884993\n", - " green stream holdings inc.\n", - " western gas wyoming, l.l.c\n", - " wyoming\n", - " wyoming\n", - " KRN STRM HLTNKS INK\n", - " WSTRN KS YMNK LLK\n", + " 639\n", + " 0.951657\n", + " equitable holdings, incorporated\n", + " equitable holdings, incorporated\n", + " [delaware]\n", + " [delaware]\n", + " EKTBL HLTNKS\n", + " EKTBL HLTNKS\n", " \n", " \n", - " 268\n", - " 0.884993\n", - " green stream holdings inc.\n", - " western gas wyoming, l.l.c\n", - " wyoming\n", - " wyoming\n", - " KRN STRM HLTNKS INK\n", - " WSTRN KS YMNK LLK\n", + " 643\n", + " 0.914612\n", + " energy transfer limited partnership\n", + " energy transfer partners, limited liability co...\n", + " [delaware]\n", + " [delaware]\n", + " ENRJ TRNSFR\n", + " ENRJ TRNSFR PRTNRS\n", " \n", " \n", "\n", "" ], "text/plain": [ - " match_probability company_name_l company_name_r loc_of_incorporation_l loc_of_incorporation_r company_name_mphone_l company_name_mphone_r\n", - "150 0.996128 santander drive auto receivables trust 2018-1 santander drive auto receivables trust delaware delaware SNTNTR TRF AT RSFBLS TRST SNTNTR TRF AT RSFBLS TRST\n", - "151 0.996128 santander drive auto receivables trust 2018-5 santander drive auto receivables trust delaware delaware SNTNTR TRF AT RSFBLS TRST SNTNTR TRF AT RSFBLS TRST\n", - "152 0.996128 santander drive auto receivables trust 2018-3 santander drive auto receivables trust delaware delaware SNTNTR TRF AT RSFBLS TRST SNTNTR TRF AT RSFBLS TRST\n", - "153 0.996128 santander drive auto receivables trust 2016-1 santander drive auto receivables trust 
delaware delaware SNTNTR TRF AT RSFBLS TRST SNTNTR TRF AT RSFBLS TRST\n", - "154 0.573277 constellation pharmaceuticals inc constellation connect, llc delaware delaware KNSTLXN FRMSTKLS INK KNSTLXN KNKT LK\n", - "162 0.959568 consolidated communications holdings, inc. consolidated communications of delaware illinois KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", - "163 0.959568 consolidated communications holdings, inc. consolidated communications of delaware missouri KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", - "164 0.959568 consolidated communications holdings, inc. consolidated communications of delaware maine KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", - "165 0.959568 consolidated communications holdings, inc. consolidated communications of delaware kansas KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", - "166 0.959568 consolidated communications holdings, inc. consolidated communications of delaware minnesota KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", - "167 0.959568 consolidated communications holdings, inc. consolidated communications of delaware central KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", - "168 0.959568 consolidated communications holdings, inc. consolidated communications of delaware florida KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", - "169 0.959568 consolidated communications holdings, inc. consolidated communications of delaware california KNSLTTT KMNKXNS HLTNKS INK KNSLTTT KMNKXNS OF\n", - "174 0.573277 duke energy corp duke energy one, inc delaware delaware TK ENRJ KRP TK ENRJ ON INK\n", - "177 0.573277 verus international, inc. emcor international, inc delaware delaware FRS INTRNXNL INK EMKR INTRNXNL INK\n", - "178 0.573277 verus international, inc. emcor international, inc delaware delaware FRS INTRNXNL INK EMKR INTRNXNL INK\n", - "179 0.714594 green plains inc. 
green plains superior llc fka superior iowa iowa KRN PLNS INK KRN PLNS SPRR LK FK SPRR\n", - "183 0.996128 duke energy corp duke energy group, llc delaware delaware TK ENRJ KRP TK ENRJ KRP LK\n", - "195 0.884993 green stream holdings inc. western gas wyoming, l.l.c wyoming wyoming KRN STRM HLTNKS INK WSTRN KS YMNK LLK\n", - "196 0.884993 green stream holdings inc. western gas wyoming, l.l.c wyoming wyoming KRN STRM HLTNKS INK WSTRN KS YMNK LLK\n", - "197 0.992184 fortress biotech, inc. fortress biotech, china, inc delaware None FRTRS BTX INK FRTRS BTX XN INK\n", - "199 0.996128 duke energy corp duke energy china corp delaware delaware TK ENRJ KRP TK ENRJ XN KRP\n", - "200 0.573277 duke energy corp duke energy corporate services, inc delaware delaware TK ENRJ KRP TK ENRJ KRPRT SRFSS INK\n", - "203 0.573277 apollo global management, inc. apollo belenos management llc delaware delaware APL KLBL MNJMNT INK APL BLNS MNJMNT LK\n", - "204 0.573277 apollo global management, inc. apollo belenos management llc delaware delaware APL KLBL MNJMNT INK APL BLNS MNJMNT LK\n", - "206 0.981099 columbia property trust, inc. columbia courtyard, inc maryland maryland KLMB PRPRT TRST INK KLMB KRTYRT INK\n", - "208 0.573277 duke energy corp duke energy beckjord, llc delaware delaware TK ENRJ KRP TK ENRJ BKJRT LK\n", - "209 0.573277 duke energy corp duke energy beckjord storage llc delaware delaware TK ENRJ KRP TK ENRJ BKJRT STRJ LK\n", - "210 0.573277 duke energy corp duke energy acp, llc delaware delaware TK ENRJ KRP TK ENRJ AKP LK\n", - "213 0.981099 spirit realty capital, inc. spirit reit, inc maryland maryland SPRT RLT KPTL INK SPRT RT INK\n", - "215 0.573277 apollo global management, inc. apollo na management ii, llc delaware delaware APL KLBL MNJMNT INK APL N MNJMNT LK\n", - "216 0.573277 apollo global management, inc. apollo na management ii, llc delaware delaware APL KLBL MNJMNT INK APL N MNJMNT LK\n", - "225 0.992184 fortress biotech, inc. 
fortress biotech, china, inc delaware None FRTRS BTX INK FRTRS BTX XN INK\n", - "226 0.573277 green brick partners, inc. green brick mortgage, llc delaware delaware KRN BRK PRTNRS INK KRN BRK MRTKJ LK\n", - "227 0.573277 duke energy corp duke energy beckjord storage llc delaware delaware TK ENRJ KRP TK ENRJ BKJRT STRJ LK\n", - "228 0.959568 green plains inc. green plains madison llc iowa delaware KRN PLNS INK KRN PLNS MTSN LK\n", - "242 0.959568 great lakes dredge & dock corp great lakes dredge & dock do brasil ltda delaware brazil KRT LKS TRJ TK KRP KRT LKS TRJ TK T BRSL LTT\n", - "243 0.573277 great lakes dredge & dock corp great lakes dredge & dock environmental, inc delaware delaware KRT LKS TRJ TK KRP KRT LKS TRJ TK ENFRNMNTL INK\n", - "244 0.996128 great lakes dredge & dock corp great lakes dredge & dock company, llc delaware delaware KRT LKS TRJ TK KRP KRT LKS TRJ TK KMPN LK\n", - "251 0.573277 blackstone group inc blackstone pb ii l.l.c delaware delaware BLKSTN KRP INK BLKSTN PB LLK\n", - "252 0.573277 blackstone group inc blackstone pb i l.l.c delaware delaware BLKSTN KRP INK BLKSTN PB I LLK\n", - "254 0.573277 duke energy corp duke energy acp, llc delaware delaware TK ENRJ KRP TK ENRJ AKP LK\n", - "255 0.573277 duke energy corp duke energy shoreham, llc delaware delaware TK ENRJ KRP TK ENRJ XRHM LK\n", - "256 0.573277 duke energy corp duke energy sam, llc delaware delaware TK ENRJ KRP TK ENRJ SM LK\n", - "257 0.573277 blackstone group inc blackstone obs l.l.c delaware delaware BLKSTN KRP INK BLKSTN OBS LLK\n", - "264 0.992184 freightcar america, inc. freightcar america leasing, llc delaware None FRTKR AMRK INK FRTKR AMRK LSNK LK\n", - "265 0.992184 freightcar america, inc. freightcar america leasing, llc delaware None FRTKR AMRK INK FRTKR AMRK LSNK LK\n", - "266 0.959568 qurate retail, inc. qurate retail group, inc englewood de KRT RTL INK KRT RTL KRP INK\n", - "267 0.884993 green stream holdings inc. 
western gas wyoming, l.l.c wyoming wyoming KRN STRM HLTNKS INK WSTRN KS YMNK LLK\n", - "268 0.884993 green stream holdings inc. western gas wyoming, l.l.c wyoming wyoming KRN STRM HLTNKS INK WSTRN KS YMNK LLK" + " match_probability company_name_l company_name_r loc_list_l loc_list_r company_name_mphone_l company_name_mphone_r\n", + "465 0.914612 conns incorporated invenco incorporated [delaware] [delaware] KNS INFNK\n", + "466 0.914612 vishay intertechnology incorporated vishay precision foil, incorporated [delaware] [delaware] FX INTRTXNLJ FX PRSXN FL\n", + "467 0.980607 vishay precision group, incorporated vishay precision foil, incorporated [delaware] [delaware] FX PRSXN KRP FX PRSXN FL\n", + "470 0.975104 jones lang lasalle incorporated jones lang lasalle limited [maryland] [hong, kong] JNS LNK LSL JNS LNK LSL\n", + "471 0.951657 nrg energy, incorporated nrg energy, incorporated [delaware] [delaware] NRK ENRJ NRK ENRJ\n", + "472 0.914612 firstenergy corporation firstenergy ventures corporation [ohio] [ohio] FRSTNRJ FRSTNRJ FNTRS\n", + "478 0.914612 hudson pacific properties, incorporated hudson pacific services, incorporated [maryland] [maryland] HTSN PSFK PRPRTS HTSN PSFK SRFSS\n", + "479 0.980607 hudson pacific properties, incorporated hudson pacific properties, limited partnership [maryland] [maryland] HTSN PSFK PRPRTS HTSN PSFK PRPRTS\n", + "481 0.914612 digital ally, incorporated digital ally international, incorporated [nevada] [nevada] TJTL AL TJTL AL INTRNXNL\n", + "489 0.976947 cco holdings limited liability company rhfw holdings, limited liability company NaN [delaware] KK HLTNKS RHF HLTNKS\n", + "493 0.975104 intuitive surgical incorporated intuitive surgical limited [delaware] [united, kingdom] INTTF SRJKL INTTF SRJKL\n", + "494 0.975104 jones lang lasalle incorporated jones lang lasalle limited [maryland] [england] JNS LNK LSL JNS LNK LSL\n", + "500 0.975104 becton dickinson and company becton, dickinson and company, limited [new, jersey] [ireland] 
BKTN TKNSN ANT BKTN TKNSN ANT\n", + "501 0.975104 united parcel service incorporated united guaranty services, incorporated [delaware] [north, carolina] UNTT PRSL SRFS UNTT KRNT SRFSS\n", + "509 0.914612 estee lauder companies incorporated estee lauder international, incorporated [delaware] [delaware] EST LTR KMPNS EST LTR INTRNXNL\n", + "510 0.914612 maxcyte, incorporated cues, incorporated [delaware] [delaware] MKSST KS\n", + "515 0.980607 zimmer biomet holdings, incorporated zimmer biomet spine, incorporated [delaware] [delaware] SMR BMT HLTNKS SMR BMT SPN\n", + "518 0.914612 nordicus partners corporation nordco enterprises, incorporated [delaware] [wilmington, delaware] NRTKS PRTNRS NRTK ENTRPRSS\n", + "519 0.975104 valero energy corp/tx valero energy incorporated [delaware] [canada] FLR ENRJ TKS FLR ENRJ\n", + "527 0.914612 nrg energy, incorporated nrg energy holdings incorporated [delaware] [delaware] NRK ENRJ NRK ENRJ HLTNKS\n", + "528 0.914612 everi holdings incorporated edi holdings, incorporated [delaware] [delaware] EFR HLTNKS ET HLTNKS\n", + "535 0.914612 estee lauder companies incorporated estee lauder incorporated [delaware] [delaware] EST LTR KMPNS EST LTR\n", + "548 0.975104 universal logistics holdings, incorporated universal logistics corporation [michigan] [florida] UNFRSL LJSTKS HLTNKS UNFRSL LJSTKS\n", + "551 0.975104 alliant energy corporation allergan gi corporation [wisconsin] [delaware] ALNT ENRJ ALRKN J\n", + "555 0.975104 smartmetric, incorporated smartpetro incorporated [nevada] [philippines] SMRTMTRK SMRTPTR\n", + "566 0.914612 republic services, incorporated republic conduit, incorporated [delaware] [delaware] RPBLK SRFSS RPBLK KNTT\n", + "571 0.975104 freedom holdings, incorporated freedom designs, incorporated [maryland] [california] FRTM HLTNKS FRTM TSKNS\n", + "573 0.938457 ares real estate income trust incorporated ares real estate income trust incorporated [maryland] [delaware] ARS RL ESTT INKM TRST ARS RL ESTT INKM TRST\n", + 
"574 0.975104 bank of new york mellon corporation bank of new york mellon sa/nv [delaware] [belgium] BNK OF N YRK MLN BNK OF N YRK MLN SNF\n", + "576 0.914612 southern company southern wood piedmont company [delaware] [delaware] S0RN S0RN WT PTMNT\n", + "582 0.914612 ameresco, incorporated ameripath, incorporated [delaware] [delaware] AMRSK AMRP0\n", + "584 0.914612 trevena incorporated anr, incorporated [delaware] [delaware] TRFN ANR\n", + "590 0.975104 bank of new york mellon corporation bank of new york mellon [delaware] [new, york] BNK OF N YRK MLN BNK OF N YRK MLN\n", + "591 0.938457 xerox holdings corporation xerox holdings corporation [connecticut] [new, york] SRKS HLTNKS SRKS HLTNKS\n", + "594 0.975104 jones lang lasalle incorporated jones lang lasalle ip, incorporated [maryland] [delaware] JNS LNK LSL JNS LNK LSL IP\n", + "595 0.914612 iron mountain incorporated iron mountain global holdings, incorporated [delaware] [delaware] IRN MNTN IRN MNTN KLBL HLTNKS\n", + "597 0.980607 extreme networks incorporated extreme networks ihc, incorporated [delaware] [delaware] EKSTRM NTWRKS EKSTRM NTWRKS IK\n", + "599 0.976947 q2 holdings, incorporated vr holdings, incorporated NaN [colorado] K HLTNKS FR HLTNKS\n", + "600 0.980607 extreme networks incorporated extreme networks, incorporated [delaware] [delaware] EKSTRM NTWRKS EKSTRM NTWRKS\n", + "604 0.914612 cutera incorporated vrec, incorporated [delaware] [delaware] KTR FRK\n", + "605 0.975104 assured guaranty limited assured guaranty services limited [d0] [england] ASRT KRNT ASRT KRNT SRFSS\n", + "606 0.976947 virtra, incorporated viator, incorporated [nevada] NaN FRTR FTR\n", + "618 0.975104 sculptor capital management, incorporated sculptor capital management hong kong limited [delaware] [hong, kong] SKLPTR KPTL MNJMNT SKLPTR KPTL MNJMNT HNK KNK\n", + "625 0.975104 enstar group limited enstar limited [d0] [bermuda] ENSTR KRP ENSTR\n", + "626 0.975104 sellas life sciences group, incorporated sellas life sciences 
group limited [delaware] [bermuda] SLS LF SSNSS KRP SLS LF SSNSS KRP\n", + "627 0.975104 intuitive surgical incorporated intuitive surgical canada incorporated [delaware] [canada] INTTF SRJKL INTTF SRJKL KNT\n", + "630 0.951657 forestar group incorporated forestar group incorporated [delaware] [delaware] FRSTR KRP FRSTR KRP\n", + "637 0.914612 dcp midstream, limited partnership dcp midstream operating, limited partnership [delaware] [delaware] TKP MTSTRM TKP MTSTRM OPRTNK\n", + "639 0.951657 equitable holdings, incorporated equitable holdings, incorporated [delaware] [delaware] EKTBL HLTNKS EKTBL HLTNKS\n", + "643 0.914612 energy transfer limited partnership energy transfer partners, limited liability co... [delaware] [delaware] ENRJ TRNSFR ENRJ TRNSFR PRTNRS" ] }, - "execution_count": 249, + "execution_count": 79, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preds_df[preds_df.match_probability >= .5][[\"match_probability\", \"company_name_l\", \"company_name_r\", \"loc_of_incorporation_l\", \"loc_of_incorporation_r\", \"company_name_mphone_l\", \"company_name_mphone_r\"]].iloc[150:200]" + "preds_df[preds_df.match_probability >= .9][[\"match_probability\", \"company_name_l\", \"company_name_r\", \"loc_list_l\", \"loc_list_r\", \"company_name_mphone_l\", \"company_name_mphone_r\"]].iloc[150:200]" ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "fb2122d8-ff0a-4117-a91c-17a0523dcfcb", + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb index 111ae84..0d74c13 100644 --- a/notebooks/18-kl-splink-sec-eia.ipynb +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 262, + "execution_count": 2, "id": "1107fe42-197c-4fea-9c48-06d08699af0b", "metadata": {}, "outputs": [], @@ -23,13 +23,16 @@ "from pathlib import Path\n", "\n", "import pandas as pd\n", + "from sklearn.metrics 
import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix\n", "from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n", "from splink.blocking_analysis import count_comparisons_from_blocking_rule, cumulative_comparisons_to_be_scored_from_blocking_rules_chart, n_largest_blocks\n", "import splink.comparison_library as cl\n", + "import splink.comparison_level_library as cll\n", "from splink.exploratory import completeness_chart, profile_columns\n", "from upath import UPath\n", "\n", - "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import prepare_sec10k_basic_info_df, prepare_ex21_df" + "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename\n", + "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import add_sec_company_id_to_subsidiaries, prepare_sec10k_basic_info_df, prepare_eia_df, prepare_ex21_df" ] }, { @@ -53,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 3, "id": "4ab5594d-7d1f-425d-80e1-92c30be73011", "metadata": { "tags": [] @@ -65,7 +68,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 4, "id": "2edc29d4-6c85-4b31-aae6-0de38c846e44", "metadata": { "tags": [] @@ -77,7 +80,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "id": "eaa37762-9f94-4927-9341-0ab09be3c8ab", "metadata": { "tags": [] @@ -89,7 +92,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 6, "id": "3fb7895f-10c5-4450-96f9-77b36471b53e", "metadata": { "tags": [] @@ -101,7 +104,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 7, "id": "06c76b82-1aad-47b2-aecc-6225a286cc40", "metadata": { "tags": [] @@ -118,7 +121,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 8, "id": "d95acde9-1640-4c26-a5d1-c50b6666ccf4", "metadata": { "tags": [] @@ -130,7 +133,7 @@ }, { "cell_type": "code", - "execution_count": 18, + 
"execution_count": 9, "id": "3b7484de-bbc7-47ba-b408-a1af1183018c", "metadata": { "tags": [] @@ -149,7 +152,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 10, "id": "d3d39fc0-130f-4bbd-9cc9-bbaf58808109", "metadata": { "tags": [] @@ -162,7 +165,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 11, "id": "04b6b682-91f4-49e2-9f74-2861548d1dd4", "metadata": {}, "outputs": [], @@ -185,7 +188,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 12, "id": "d4e950a6-ee6c-414c-b5b9-52a4175bf0b7", "metadata": {}, "outputs": [], @@ -195,7 +198,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 13, "id": "14eb7f24-7f7b-43aa-a0df-85e888e43821", "metadata": {}, "outputs": [], @@ -208,140 +211,7 @@ }, { "cell_type": "code", - "execution_count": 29, - "id": "23da5ca1-bd04-44d4-b252-7b114d6d553f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
value
filenamefiler_countblockblock_countkey
edgar/data/100240/0000950144-94-000787.txt0company_data0company_conformed_nameturner broadcasting system inc
central_index_key0000100240
standard_industrial_classification4833
irs_number580950695
state_of_incorporationga
..................
edgar/data/936528/0000936528-23-000207.txt0former_company0date_of_name_change20230928
1former_conformed_namewafd inc
date_of_name_change20230927
2former_conformed_namewashington federal inc
date_of_name_change19950206
\n", - "

7980908 rows × 1 columns

\n", - "
" - ], - "text/plain": [ - " value\n", - "filename filer_count block block_count key \n", - "edgar/data/100240/0000950144-94-000787.txt 0 company_data 0 company_conformed_name turner broadcasting system inc\n", - " central_index_key 0000100240\n", - " standard_industrial_classification 4833\n", - " irs_number 580950695\n", - " state_of_incorporation ga\n", - "... ...\n", - "edgar/data/936528/0000936528-23-000207.txt 0 former_company 0 date_of_name_change 20230928\n", - " 1 former_conformed_name wafd inc\n", - " date_of_name_change 20230927\n", - " 2 former_conformed_name washington federal inc\n", - " date_of_name_change 19950206\n", - "\n", - "[7980908 rows x 1 columns]" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "raw_sec_df" - ] - }, - { - "cell_type": "code", - "execution_count": 87, + "execution_count": null, "id": "1be3364e-9887-42b2-b303-0a24e8681acf", "metadata": { "tags": [] @@ -352,6 +222,16 @@ "raw_sec_df.columns.name = None" ] }, + { + "cell_type": "code", + "execution_count": 15, + "id": "f6f76c8b-ffbf-4e2b-870b-57f1260ba522", + "metadata": {}, + "outputs": [], + "source": [ + "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)" + ] + }, { "cell_type": "markdown", "id": "3bac9280-1183-4aba-b78f-84bcf37ef1e2", @@ -362,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 17, "id": "611da616-45ef-40ae-bc06-8bfbc871274d", "metadata": {}, "outputs": [], @@ -372,7 +252,7 @@ }, { "cell_type": "code", - "execution_count": 91, + "execution_count": 18, "id": "1d6272f2-b6f3-4497-9251-cbeedf794a0b", "metadata": {}, "outputs": [], @@ -390,25 +270,16 @@ { "cell_type": "markdown", "id": "b636d438-ed71-426c-8c2a-9e550fe99958", - "metadata": {}, - "source": [ - "# Preprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": 153, - "id": "f6f76c8b-ffbf-4e2b-870b-57f1260ba522", - "metadata": {}, - "outputs": [], + "metadata": { + "tags": [] + }, "source": [ - 
"# cleaning on both sides\n", - "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)" + "# Preprocess Ex. 21" ] }, { "cell_type": "code", - "execution_count": 157, + "execution_count": 19, "id": "84e26751-663b-45a5-bb4d-fbfbbdca447e", "metadata": {}, "outputs": [ @@ -416,7 +287,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:189: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", + "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:168: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", " df = df.fillna(np.nan)\n" ] } @@ -427,493 +298,638 @@ }, { "cell_type": "code", - "execution_count": 224, - "id": "24defbd5-ccfe-4844-ab87-3adb1b4df2d9", - "metadata": {}, - "outputs": [], - "source": [ - "eia_clean_df = prepare_eia_df(eia_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 228, - "id": "a284b2c9-8edf-4b3f-ab08-5b2cff65ed19", - "metadata": {}, - "outputs": [], - "source": [ - "SHARED_COLS = [\n", - " \"record_id\",\n", - " \"report_date\",\n", - " \"report_year\",\n", - " \"company_name\",\n", - " \"street_address\",\n", - " \"street_address_2\",\n", - " \"city\",\n", - " \"state\", # could use state of incorporation from SEC\n", - " \"zip_code\",\n", - " \"phone_number\",\n", - " \"company_name_mphone\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e90de0d3-3220-4869-80a3-fc7dd381d393", + "execution_count": 34, + "id": "027191c4-82fa-491b-8c73-54551c7fa4e6", "metadata": {}, "outputs": [], "source": [ - "# strip legal terms and then make a list column from company name\n", - "# use this for blocking and comnparison levels\n", - "eia_match_df[\"company_name_mphone_list\"] = eia_match_df[\"company_name_mphone\"].str.split()" + "sec_match_df = sec_clean_df.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation\", \"report_year\"])\n", + "merged_df = sec_match_df.merge(ex21_clean_df, how=\"inner\", on=\"company_name\", suffixes=(\"_sec\", \"_ex21\"))\n", + "merged_df.loc[:, \"loc_tokens_sec\"] = merged_df[\"loc_of_incorporation_sec\"].fillna(\"\").str.lower().str.split()\n", + "merged_df.loc[:, \"loc_tokens_ex21\"] = merged_df[\"loc_of_incorporation_ex21\"].fillna(\"\").str.lower().str.split()\n", + "merged_df[\"loc_overlap\"] = merged_df.apply(\n", + " lambda row: len(set(row[\"loc_tokens_sec\"]) & set(row[\"loc_tokens_ex21\"])), axis=1\n", + ")\n", + 
"merged_df[\"report_year_diff\"] = merged_df.apply(\n", + " lambda row: abs(int(row[\"report_year_sec\"]) - int(row[\"report_year_ex21\"])), axis=1\n", + ")\n", + "# Sort by CIK, company_name, loc_overlap, and report_year_diff\n", + "# so that we can then choose the first record in each CIK, company_name group\n", + "merged_df = merged_df.sort_values(by=[\"central_index_key\", \"company_name\", \"loc_overlap\", \"report_year_diff\"],\n", + " ascending=[True, True, False, True]\n", + " )\n", + "# Select the row with the highest loc overlap and nearest report years for each CIK and company name\n", + "cik_and_company_pairs = merged_df.groupby([\"central_index_key\", \"company_name\"], as_index=False).first()\n", + "# We now have the closest matching CIK and company name pairs\n", + "# We want to get the best matching CIK for each company name and loc of incorporation\n", + "# Select the row with the highest loc overlap and nearest report years for each company name and loc pair\n", + "cik_and_company_pairs = cik_and_company_pairs.sort_values(by=[\"company_name\", \"loc_of_incorporation_ex21\", \"loc_overlap\", \"report_year_diff\"],\n", + " ascending=[True, True, False, True]\n", + " )\n", + "closest_match = cik_and_company_pairs.groupby([\"company_name\", \"loc_of_incorporation_ex21\"], as_index=False).first()\n", + "closest_match = closest_match.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation_ex21\"])" ] }, { "cell_type": "code", - "execution_count": null, - "id": "460c5bd5-f2e2-45c3-86c3-ac203bd053d0", + "execution_count": 35, + "id": "bd9e9f44-7ff8-4615-a5c3-ee8f32439e26", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "False 5808\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# create list column for address information as well" + "# a company name and location of incorporation should match 
to only one CIK\n", + "closest_match.duplicated(subset=[\"company_name\", \"loc_of_incorporation_ex21\"]).value_counts()" ] }, { "cell_type": "code", - "execution_count": 158, - "id": "c3bdc160-1939-4f34-914f-ecb0b5fdb5ac", + "execution_count": 36, + "id": "64572f77-0a64-48a9-83fd-1c0179202010", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
report_datereport_yearcompany_namestreet_addressstreet_address_2citystatezip_codephone_number
02000-03-302000meta group incorporated208 harbor drNaNstamfordct06912-00612039736700
12001-04-022001meta group incorporated208 harbor drNaNstamfordct06912-00612039736700
22002-04-012002meta group incorporated208 harbor drNaNstamfordct06912-00612039736700
\n", - "
" - ], "text/plain": [ - " report_date report_year company_name street_address street_address_2 city state zip_code phone_number\n", - "0 2000-03-30 2000 meta group incorporated 208 harbor dr NaN stamford ct 06912-0061 2039736700\n", - "1 2001-04-02 2001 meta group incorporated 208 harbor dr NaN stamford ct 06912-0061 2039736700\n", - "2 2002-04-01 2002 meta group incorporated 208 harbor dr NaN stamford ct 06912-0061 2039736700" + "central_index_key\n", + "False 5532\n", + "True 276\n", + "Name: count, dtype: int64" ] }, - "execution_count": 158, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "sec_clean_df[SHARED_COLS].head(3)" + "# it's okay if there's duplication here\n", + "# multiple subsidiaries can point to the same CIK\n", + "# and company names can change and they still keep the same CIK\n", + "closest_match.central_index_key.duplicated().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "id": "a669e0b7-c7fb-4c12-9121-0282e616286a", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = ex21_clean_df.merge(\n", + " closest_match[[\"company_name\", \"central_index_key\", \"loc_of_incorporation_ex21\"]].rename(columns={\"loc_of_incorporation_ex21\": \"loc_of_incorporation\"}),\n", + " how=\"left\",\n", + " on=[\"company_name\", \"loc_of_incorporation\"],\n", + ").rename(columns={\"central_index_key\": \"subsidiary_cik\"})" ] }, { "cell_type": "code", - "execution_count": 159, - "id": "9d73fdac-8d97-4030-9772-79ac058b0d33", + "execution_count": 38, + "id": "245697ec-9451-47e7-953b-eba65062ee93", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
report_datereport_yearcompany_namestreet_addressstreet_address_2citystatezip_codephone_number
332023-01-012023desert willow energy storage100 bayview circleNaNnewport beachcaNaNNaN
352023-01-012023portage solar plantn8917NaNportagewi53901NaN
372023-01-012023nsf energy one limited liability company1241 university aveNaNrochesterny14607NaN
\n", - "
" - ], "text/plain": [ - " report_date report_year company_name street_address street_address_2 city state zip_code phone_number\n", - "33 2023-01-01 2023 desert willow energy storage 100 bayview circle NaN newport beach ca NaN NaN\n", - "35 2023-01-01 2023 portage solar plant n8917 NaN portage wi 53901 NaN\n", - "37 2023-01-01 2023 nsf energy one limited liability company 1241 university ave NaN rochester ny 14607 NaN" + "subsidiary_cik\n", + "True 2900030\n", + "False 21674\n", + "Name: count, dtype: int64" ] }, - "execution_count": 159, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "eia_clean_df[~eia_match_df.street_address.isnull()][SHARED_COLS].head(3)" + "ex21_with_cik.subsidiary_cik.isnull().value_counts()" ] }, { "cell_type": "code", - "execution_count": 160, - "id": "db2b1e13-824e-4c86-8065-fc99e9a1186c", + "execution_count": 39, + "id": "1382a2e4-e88e-47bb-93ed-dafc576ec2f4", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = ex21_with_cik.merge(closest_match[[\"company_name\", \"central_index_key\"]],\n", + " how=\"left\",\n", + " on=\"company_name\"\n", + " ).rename(columns={\"central_index_key\": \"company_name_merge_cik\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "id": "5f70e3ff-2494-4eda-bfa2-6989bcf442bb", + "metadata": {}, + "outputs": [], + "source": [ + "# if a subsidiary doesn't have a CIK and has a null location\n", + "# but its company name was assigned a CIK (with a different location)\n", + "# then assign that CIK to the subsidiary\n", + "ex21_with_cik[\"subsidiary_cik\"] = ex21_with_cik[\"subsidiary_cik\"].where(\n", + " ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), \n", + " ex21_with_cik[\"company_name_merge_cik\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "id": "63d4cc13-a4bf-4473-99bb-6d8fcf9a1174", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + 
"subsidiary_cik\n", + "True 2897527\n", + "False 24221\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# there should be fewer null CIKs now\n", + "ex21_with_cik.subsidiary_cik.isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "id": "e25cf09f-8bbd-4dcd-b308-71bc5a357bf5", + "metadata": {}, + "outputs": [], + "source": [ + "archive = GCSArchive()\n", + "md = archive.get_metadata()" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "id": "d17ed466-74d6-44e5-aaca-8dc6793712d4", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik.loc[:, \"filename\"] = convert_ex21_id_to_filename(ex21_with_cik)" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "id": "6303051b-74bf-4043-885e-aaaf6593852d", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = ex21_with_cik.merge(md[\"cik\"],\n", + " how=\"left\",\n", + " left_on=\"filename\",\n", + " right_index=True).rename(columns={\"cik\": \"parent_cik\"})" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "da72f2d4-54a8-487a-82ec-92d9e8df091f", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "eff49691-d17c-4a55-817d-8eeaf83900e4", + "metadata": {}, + "outputs": [], + "source": [ + "# remove the Ex. 21 subsidiaries who were matched to a filing company\n", + "unmatched_ex21_df = ex21_with_cik[ex21_with_cik.subsidiary_cik.isnull()]" + ] + }, + { + "cell_type": "markdown", + "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db", + "metadata": {}, + "source": [ + "# Preprocess SEC and EIA\n", + "\n", + "Does it actually make sense to add in the Ex. 
21 subsidiaries when we only have company name?\n", + "Does it make more sense to do a direct match on company name after\n", + "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?" + ] + }, + { + "cell_type": "code", + "execution_count": 67, + "id": "8453d55d-a3ac-422d-9cef-e7f13d582efe", + "metadata": {}, + "outputs": [], + "source": [ + "# find a way to use state of incorporation even though it's not on the EIA side?\n", + "sec_full_clean_df = pd.concat([sec_clean_df, \n", + " unmatched_ex21_df[[\"sec_company_id\", \"report_year\", \"company_name\", \"company_name_no_legal\", \"company_name_mphone\", \"state_of_incorporation\"]]\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "id": "2bc79d7d-b756-47d5-a61d-a3a761160250", + "metadata": {}, + "outputs": [], + "source": [ + "sec_full_clean_df = sec_full_clean_df.reset_index(drop=True).reset_index(names=\"record_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "166d3c96-93d6-4a22-afbf-8d94dc9ecfb9", + "metadata": {}, + "outputs": [], + "source": [ + "# for now, just use sec_clean_df without Ex. 21 subsidiaries\n", + "sec_clean_df = sec_clean_df.reset_index(drop=True).reset_index(names=\"record_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "24defbd5-ccfe-4844-ab87-3adb1b4df2d9", + "metadata": {}, + "outputs": [], + "source": [ + "eia_clean_df = prepare_eia_df(eia_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "e754b2ef-5a0d-4582-8694-047528dfd339", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
record_ididcompany_name_rawloc_of_incorporationown_perreport_yearcompany_namecompany_name_mphone
0014060-0000916131-94-000015brenton bank and trust companyiowaNaN1994brenton bank and trust companyBRNTN BNK ANT TRST KMPN
1114060-0000916131-94-000015adeliowaNaN1994adelATL
2214060-0000916131-94-000015brenton savings bank, fsb united statesames, iowaNaN1994brenton savings bank, fsb united statesBRNTN SFNKS BNK FSB UNTT STTS
\n", - "
" - ], "text/plain": [ - " record_id id company_name_raw loc_of_incorporation own_per report_year company_name company_name_mphone\n", - "0 0 14060-0000916131-94-000015 brenton bank and trust company iowa NaN 1994 brenton bank and trust company BRNTN BNK ANT TRST KMPN\n", - "1 1 14060-0000916131-94-000015 adel iowa NaN 1994 adel ATL\n", - "2 2 14060-0000916131-94-000015 brenton savings bank, fsb united states ames, iowa NaN 1994 brenton savings bank, fsb united states BRNTN SFNKS BNK FSB UNTT STTS" + "True" ] }, - "execution_count": 160, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ex21_clean_df.head(3)" + "sec_clean_df.record_id.is_unique" ] }, { "cell_type": "code", - "execution_count": 229, - "id": "4ea7c80a-5b5b-4a07-bca0-b6ed1e78dce9", + "execution_count": 28, + "id": "38ad3504-2cde-455f-8896-6a435677541c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "['record_id',\n", - " 'report_date',\n", - " 'report_year',\n", - " 'company_name',\n", - " 'street_address',\n", - " 'street_address_2',\n", - " 'city',\n", - " 'state',\n", - " 'zip_code',\n", - " 'phone_number',\n", - " 'company_name_mphone']" + "True" ] }, - "execution_count": 229, + "execution_count": 28, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "SHARED_COLS" + "eia_clean_df.record_id.is_unique" ] }, { "cell_type": "code", - "execution_count": 231, - "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27", + "execution_count": 30, + "id": "e90de0d3-3220-4869-80a3-fc7dd381d393", "metadata": {}, "outputs": [], "source": [ - "eia_match_df = eia_clean_df[SHARED_COLS]" + "# TODO: move this into preprocessing\n", + "# strip legal terms and then make a list column from company name\n", + "# use this for blocking and comnparison levels\n", + "eia_clean_df.loc[:, \"company_name_mphone_list\"] = eia_clean_df[\"company_name_mphone\"].str.split()" ] }, { "cell_type": "code", - "execution_count": 232, - "id": 
"2b8b6313-abf0-4233-8bad-43b8b9cc1e0b", + "execution_count": 31, + "id": "b71a24f2-51b5-444f-a645-054cc3e25cf8", "metadata": {}, "outputs": [], "source": [ - "sec_match_df = sec_clean_df[SHARED_COLS]" + "sec_clean_df.loc[:, \"company_name_mphone_list\"] = sec_clean_df[\"company_name_mphone\"].str.split()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "eb9c00dc-50a5-49cc-9589-0bf4df917ab3", + "metadata": {}, + "outputs": [], + "source": [ + "eia_clean_df.loc[:, \"zip_code\"] = eia_clean_df[\"zip_code\"].str[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "edead864-7004-4081-ab78-313c14ff81a3", + "metadata": {}, + "outputs": [], + "source": [ + "sec_clean_df.loc[:, \"zip_code\"] = sec_clean_df[\"zip_code\"].str[:5]" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "a5af13b2-9d43-42e6-9477-1fb7d52412cf", + "metadata": {}, + "outputs": [], + "source": [ + "# I think we don't need this column\n", + "eia_clean_df.loc[:, \"street_address_list\"] = eia_clean_df[\"street_address\"].str.split()\n", + "sec_clean_df.loc[:, \"street_address_list\"] = sec_clean_df[\"street_address\"].str.split()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "a284b2c9-8edf-4b3f-ab08-5b2cff65ed19", + "metadata": {}, + "outputs": [], + "source": [ + "SHARED_COLS = [\n", + " \"record_id\",\n", + " \"report_date\",\n", + " \"report_year\",\n", + " \"company_name\",\n", + " \"company_name_no_legal\",\n", + " \"street_address\",\n", + " \"street_address_list\",\n", + " \"street_address_2\",\n", + " \"city\",\n", + " \"state\", # could use state of incorporation from SEC\n", + " \"zip_code\",\n", + " \"phone_number\",\n", + " \"company_name_mphone\",\n", + " \"company_name_mphone_list\"\n", + "]" ] }, { "cell_type": "markdown", - "id": "9a04c196-e926-4502-82ee-c27352352591", - "metadata": { - "jp-MarkdownHeadingCollapsed": true, - "tags": [] - }, + "id": "21b697b0-7d9e-452c-9b8b-ee40fd6bb7bd", + "metadata": {}, 
"source": [ - "# Link in Ex. 21 records" + "create list column for address information as well?" ] }, { "cell_type": "code", - "execution_count": 165, - "id": "c1500344-ff7f-450e-90dd-1105d8e7c637", + "execution_count": 55, + "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27", "metadata": {}, "outputs": [], "source": [ - "# run the Ex.21 to SEC model\n", - "filepath = Path(\"../sec_ex21_model_settings/2023_model.json\")\n", - "with open(filepath, 'r') as file:\n", - " sec_ex21_settings = json.load(file)" + "eia_match_df = eia_clean_df[SHARED_COLS]" ] }, { "cell_type": "code", - "execution_count": 192, - "id": "172ea84f-a0b7-4e9c-b746-322a47663171", + "execution_count": 38, + "id": "2b8b6313-abf0-4233-8bad-43b8b9cc1e0b", "metadata": {}, "outputs": [], "source": [ - "sec_test_df = sec_match_df[sec_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" + "sec_match_df = sec_clean_df[SHARED_COLS]" ] }, { "cell_type": "code", - "execution_count": 193, - "id": "3f8ba4ee-b1e7-4e05-982e-43d8e446eea9", + "execution_count": 43, + "id": "a4a15b86-71cf-4d8d-9c09-f82a70f10273", "metadata": {}, "outputs": [], "source": [ - "ex21_test_df = ex21_match_df[ex21_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" + "match_cols = [\"company_name\", \"state\", \"city\", \"street_address\", \"zip_code\"]" ] }, { "cell_type": "code", - "execution_count": 194, - "id": "2c715d7a-3d6d-4970-8ae3-5a6e1a12e937", + "execution_count": 49, + "id": "842fa02e-5202-445c-b728-72bce42e740d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "14125" + "True 138441\n", + "False 39407\n", + "Name: count, dtype: int64" ] }, - "execution_count": 194, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(sec_test_df)" + "# duplicates exist because of differing report years\n", + 
"eia_match_df.duplicated(subset=match_cols).value_counts()" ] }, { "cell_type": "code", - "execution_count": 195, - "id": "ec13db12-3664-4e00-aa83-7c372039b230", + "execution_count": 52, + "id": "b53e6244-f0ca-4256-bc09-9c3264675389", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "233101" + "True 168445\n", + "False 64515\n", + "Name: count, dtype: int64" ] }, - "execution_count": 195, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "len(ex21_test_df)" + "sec_match_df.duplicated(subset=match_cols).value_counts()" ] }, { "cell_type": "code", - "execution_count": 196, - "id": "d2fcc1da-4435-4b17-8be7-cb34a6917522", + "execution_count": 56, + "id": "baa742ae-1b49-4d0a-84c8-5f864398c8ed", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "id": "63e47f5f-e142-48fa-9ffa-e14d27ee1476", + "metadata": {}, + "outputs": [], + "source": [ + "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "id": "5cf7ca17-b42b-40c6-b6f7-9077acdb1220", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 233, + "execution_count": 129, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# this goes way down when we start matching in the Ex. 21 subsidiaries\n", "completeness_chart(sec_match_df, db_api=db_api)" ] }, { "cell_type": "code", - "execution_count": 234, + "execution_count": 130, "id": "02063bcd-8301-4a70-aab1-0bbf6119cf8b", "metadata": {}, "outputs": [ @@ -1639,23 +1656,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 234, + "execution_count": 130, "metadata": {}, "output_type": "execute_result" } @@ -1719,18 +1736,7 @@ }, { "cell_type": "code", - "execution_count": 209, - "id": "c4542c1f-d826-43c1-9af5-ce6473b79d90", - "metadata": {}, - "outputs": [], - "source": [ - "# could sub in zip code for street address?\n", - "match_cols = [\"company_name\", \"state\", \"city\", \"street_address\", \"zip_code\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 210, + "execution_count": 131, "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11", "metadata": {}, "outputs": [ @@ -1739,23 +1745,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 210, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } @@ -1819,7 +1825,7 @@ }, { "cell_type": "code", - "execution_count": 211, + "execution_count": 132, "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7", "metadata": {}, "outputs": [ @@ -1828,23 +1834,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 211, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" } @@ -1916,34 +1922,38 @@ }, { "cell_type": "code", - "execution_count": 300, + "execution_count": 66, "id": "6402e556-b87c-47ca-bc30-ced2b42e6626", "metadata": {}, "outputs": [], "source": [ - "br0 = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3)\"\n", - "br1 = \"l.report_year = r.report_year and l.street_address = r.street_address\"\n", - "# br2 = \"l.report_year = r.report_year and substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city\"\n", - "br4 = \"l.report_year = r.report_year and l.phone_number = r.phone_number\"" + "# probably shouldn't be blocking on report year, because we don't care that much \n", + "# about report year lining up\n", + "# try overlap between tokens in address or company name\n", + "br0 = \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\"\n", + "br1 = \"l.street_address = r.street_address\"\n", + "br2 = \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city\"\n", + "# br3 = \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.zip_code = r.zip_code\"\n", + "br3 = \"substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2\"" ] }, { "cell_type": "code", - "execution_count": 257, + "execution_count": 67, "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'number_of_comparisons_generated_pre_filter_conditions': 618634,\n", - " 'number_of_comparisons_to_be_scored_post_filter_conditions': 618634,\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 988101,\n", + " 'number_of_comparisons_to_be_scored_post_filter_conditions': 988101,\n", " 
'filter_conditions_identified': '',\n", - " 'equi_join_conditions_identified': 'l.report_year = r.report_year AND SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n", + " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n", " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" ] }, - "execution_count": 257, + "execution_count": 67, "metadata": {}, "output_type": "execute_result" } @@ -1962,7 +1972,7 @@ }, { "cell_type": "code", - "execution_count": 259, + "execution_count": 68, "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3", "metadata": {}, "outputs": [ @@ -1988,8 +1998,6 @@ " \n", " \n", " key_0\n", - " key_1\n", - " key_2\n", " count_l\n", " count_r\n", " block_count\n", @@ -1998,43 +2006,37 @@ " \n", " \n", " 0\n", - " 2023\n", - " boston\n", - " 02110\n", - " 113\n", - " 134\n", - " 15142\n", + " AMRK\n", + " 888\n", + " 85\n", + " 75480\n", " \n", " \n", " 1\n", - " 2022\n", - " boston\n", - " 02110\n", - " 116\n", - " 110\n", - " 12760\n", + " INTR\n", + " 468\n", + " 157\n", + " 73476\n", " \n", " \n", " 2\n", - " 2021\n", - " boston\n", - " 02110\n", - " 113\n", - " 88\n", - " 9944\n", + " FRST\n", + " 836\n", + " 82\n", + " 68552\n", " \n", " \n", "\n", "
" ], "text/plain": [ - " key_0 key_1 key_2 count_l count_r block_count\n", - "0 2023 boston 02110 113 134 15142\n", - "1 2022 boston 02110 116 110 12760\n", - "2 2021 boston 02110 113 88 9944" + " key_0 count_l count_r block_count\n", + "0 AMRK 888 85 75480\n", + "1 INTR 468 157 73476\n", + "2 FRST 836 82 68552" ] }, - "execution_count": 259, + "execution_count": 68, "metadata": {}, "output_type": "execute_result" } @@ -2042,7 +2044,7 @@ "source": [ "result = n_largest_blocks(\n", " table_or_tables=[sec_match_df, eia_match_df],\n", - " blocking_rule=br3,\n", + " blocking_rule=br0,\n", " link_type=\"link_only\",\n", " db_api=db_api,\n", " n_largest=3\n", @@ -2053,32 +2055,46 @@ }, { "cell_type": "code", - "execution_count": 302, + "execution_count": 69, "id": "4e1a9844-5d98-4cac-a083-eef134f083ce", "metadata": {}, "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "bf1ed000055946dcbdc2d64e635de891", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, { "data": { "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 302, + "execution_count": 69, "metadata": {}, "output_type": "execute_result" } ], "source": [ "blocking_rules_for_analysis = [\n", - " br0, br1\n", + " br0, br1, br2, br3\n", "]\n", "\n", "\n", @@ -2161,7 +2177,30 @@ }, { "cell_type": "code", - "execution_count": 382, + "execution_count": 334, + "id": "cb8b02b2-50a1-4525-9516-eecdf9a145db", + "metadata": {}, + "outputs": [], + "source": [ + "# NOT USED\n", + "company_name_comparison = cl.CustomComparison(\n", + " comparison_levels = [\n", + " cll.NullLevel(\"company_name\"),\n", + " cll.ExactMatchLevel(\"company_name\"),\n", + " # cll.ExactMatchLevel(\"company_name_no_legal\"),\n", + " # cll.LevenshteinLevel(\"company_name\", distance_threshold=1),\n", + " cll.JaroWinklerLevel(\"company_name_no_legal\", distance_threshold=.95),\n", + " # cll.ArraySubsetLevel(\"company_name_mphone_list\"),\n", + " cll.ArrayIntersectLevel(\"company_name_mphone_list\", min_intersection=3)\n", + " ],\n", + " output_column_name=\"company_name\",\n", + " comparison_description=None\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 386, "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18", "metadata": {}, "outputs": [ @@ -2169,32 +2208,33 @@ "name": "stdout", "output_type": "stream", "text": [ - "Comparison 'JaccardAtThresholds' of \"company_name\".\n", + "Comparison 'NameComparison' of \"company_name_no_legal\".\n", "Similarity is assessed using the following ComparisonLevels:\n", - " - 'company_name is NULL' with SQL rule: \"company_name_l\" IS NULL OR \"company_name_r\" IS NULL\n", - " - 'Exact match on company_name' with SQL rule: \"company_name_l\" = \"company_name_r\"\n", - " - 'Jaccard distance of 'company_name >= 0.9'' with SQL rule: jaccard(\"company_name_l\", \"company_name_r\") >= 0.9\n", - " - 'Jaccard distance of 'company_name >= 0.7'' with SQL rule: jaccard(\"company_name_l\", \"company_name_r\") >= 0.7\n", + " - 'company_name_no_legal is NULL' 
with SQL rule: \"company_name_no_legal_l\" IS NULL OR \"company_name_no_legal_r\" IS NULL\n", + " - 'Exact match on company_name_no_legal' with SQL rule: \"company_name_no_legal_l\" = \"company_name_no_legal_r\"\n", + " - 'Jaro-Winkler distance of company_name_no_legal >= 0.95' with SQL rule: jaro_winkler_similarity(\"company_name_no_legal_l\", \"company_name_no_legal_r\") >= 0.95\n", " - 'All other comparisons' with SQL rule: ELSE\n", "\n" ] } ], "source": [ - "# company_name_comparison = cl.NameComparison(\n", - "# \"company_name\",\n", - " # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n", - "# )\n", + "company_name_comparison = cl.NameComparison(\n", + " \"company_name_no_legal\",\n", + " jaro_winkler_thresholds=[.95],\n", + ")\n", + "\"\"\"\n", "company_name_comparison = cl.JaccardAtThresholds(\n", " \"company_name\",\n", " # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n", ")\n", + "\"\"\"\n", "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 373, + "execution_count": 449, "id": "4298a288-c306-4d75-9d72-e5b8f87774ce", "metadata": {}, "outputs": [ @@ -2207,7 +2247,6 @@ " - 'street_address is NULL' with SQL rule: \"street_address_l\" IS NULL OR \"street_address_r\" IS NULL\n", " - 'Exact match on street_address' with SQL rule: \"street_address_l\" = \"street_address_r\"\n", " - 'Levenshtein distance of street_address <= 1' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 1\n", - " - 'Levenshtein distance of street_address <= 2' with SQL rule: levenshtein(\"street_address_l\", \"street_address_r\") <= 2\n", " - 'All other comparisons' with SQL rule: ELSE\n", "\n" ] @@ -2216,24 +2255,45 @@ "source": [ "address_comparison = cl.LevenshteinAtThresholds(\n", " \"street_address\",\n", - " # size_threshold_or_thresholds=[1,2,3]\n", - ")\n", + " distance_threshold_or_thresholds=[1]\n", 
+ ").configure(term_frequency_adjustments=True)\n", "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 267, + "execution_count": 422, + "id": "d2e043ed-7f64-4547-992d-7f947a63db6d", + "metadata": {}, + "outputs": [], + "source": [ + "# NOT USED\n", + "address_comparison = cl.CustomComparison(\n", + " comparison_levels = [\n", + " cll.NullLevel(\"street_address\"),\n", + " cll.ExactMatchLevel(\"street_address\"),\n", + " cll.LevenshteinLevel(\"street_address\", distance_threshold=1),\n", + " cll.ArraySubsetLevel(\"street_address_list\"),\n", + " ],\n", + " output_column_name=\"street_address\",\n", + " comparison_description=None\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 388, "id": "63ed7cd2-d803-4d17-b730-c9fc17df0607", "metadata": {}, "outputs": [], "source": [ + "# Use state and city instead of zip code\n", "zip_code_comparison = cl.ExactMatch(\"zip_code\").configure(term_frequency_adjustments=True)" ] }, { "cell_type": "code", - "execution_count": 268, + "execution_count": 450, "id": "974a3982-38a1-45cb-9875-b8d4584c808d", "metadata": {}, "outputs": [], @@ -2243,7 +2303,7 @@ }, { "cell_type": "code", - "execution_count": 269, + "execution_count": 451, "id": "7592619b-340a-4496-8195-9ce932cae699", "metadata": {}, "outputs": [ @@ -2265,14 +2325,13 @@ "city_comparison = cl.NameComparison(\n", " \"city\",\n", " jaro_winkler_thresholds=[0.9]\n", - " # dmeta_col_name=\"company_name_mphone\" # this was breaking it for some reason\n", ")\n", "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 383, + "execution_count": 452, "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407", "metadata": {}, "outputs": [], @@ -2283,12 +2342,12 @@ " comparisons=[\n", " company_name_comparison,\n", " address_comparison,\n", - " zip_code_comparison,\n", + " # zip_code_comparison,\n", " state_comparison,\n", " 
city_comparison\n", " ],\n", " blocking_rules_to_generate_predictions=[\n", - " br0, br1\n", + " br0, br1, br2, br3\n", " ],\n", " retain_intermediate_calculation_columns=True,\n", ")\n", @@ -2298,48 +2357,34 @@ }, { "cell_type": "code", - "execution_count": 384, + "execution_count": 453, "id": "36cae876-783d-4bff-89df-9d30cc5e60d6", "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "358d0a088e2441deaef798c55ad97068", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", "text": [ - "Probability two random records match is estimated to be 2.18e-05.\n", - "This means that amongst all possible pairwise record comparisons, one in 45,828.17 are expected to match. With 40,620,617,120 total possible comparisons, we expect a total of around 886,367.78 matching pairs\n" + "Probability two random records match is estimated to be 1.78e-06.\n", + "This means that amongst all possible pairwise record comparisons, one in 562,858.42 are expected to match. 
With 2,542,342,605 total possible comparisons, we expect a total of around 4,516.84 matching pairs\n" ] } ], "source": [ "deterministic_rules = [\n", - " block_on(\"company_name\", \"company_name\"),\n", - " block_on(\"phone_number\"),\n", - " block_on(\"street_address\"),\n", - " \"jaccard(r.company_name, l.company_name) >= .9 and l.city = r.city\",\n", - " \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city\",\n", + " block_on(\"company_name_mphone\", \"company_name_mphone\"),\n", + " # block_on(\"street_address\"),\n", + " \"jaro_winkler_similarity(r.company_name, l.company_name) >= .95 and l.city = r.city\",\n", + " # \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and jaccard(r.street_address, l.street_address) >= .9\",\n", + " \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and l.street_address = r.street_address\",\n", "]\n", "\n", - "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.9)" + "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)" ] }, { "cell_type": "code", - "execution_count": 385, + "execution_count": null, "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3", "metadata": {}, "outputs": [ @@ -2347,26 +2392,17 @@ "name": "stderr", "output_type": "stream", "text": [ - "----- Estimating u probabilities using random sampling -----\n", - "\n", - "Estimated u probabilities using random sampling\n", - "\n", - "Your model is not yet fully trained. 
Missing estimates for:\n", - " - company_name (no m values are trained).\n", - " - street_address (no m values are trained).\n", - " - zip_code (no m values are trained).\n", - " - state (no m values are trained).\n", - " - city (no m values are trained).\n" + "----- Estimating u probabilities using random sampling -----\n" ] } ], "source": [ - "linker.training.estimate_u_using_random_sampling(max_pairs=1e7)" + "linker.training.estimate_u_using_random_sampling(max_pairs=1e8)" ] }, { "cell_type": "code", - "execution_count": 386, + "execution_count": 427, "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f", "metadata": {}, "outputs": [ @@ -2381,31 +2417,30 @@ "(l.\"company_name\" = r.\"company_name\") AND (l.\"company_name\" = r.\"company_name\")\n", "\n", "Parameter estimates will be made for the following comparison(s):\n", + " - company_name_no_legal\n", " - street_address\n", - " - zip_code\n", " - state\n", " - city\n", "\n", "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", - " - company_name\n", "\n", - "Iteration 1: Largest change in params was 0.804 in the m_probability of street_address, level `All other comparisons`\n", - "Iteration 2: Largest change in params was 0.0737 in the m_probability of state, level `Exact match on state`\n", - "Iteration 3: Largest change in params was -0.039 in the m_probability of state, level `All other comparisons`\n", - "Iteration 4: Largest change in params was 0.021 in the m_probability of city, level `All other comparisons`\n", - "Iteration 5: Largest change in params was 0.00805 in the m_probability of city, level `All other comparisons`\n", - "Iteration 6: Largest change in params was -0.00338 in the m_probability of state, level `All other comparisons`\n", - "Iteration 7: Largest change in params was 0.00164 in the m_probability of state, level `Exact match on state`\n", - "Iteration 8: Largest change in params was 0.000825 in the m_probability of state, level 
`Exact match on state`\n", - "Iteration 9: Largest change in params was -0.000425 in the m_probability of state, level `All other comparisons`\n", - "Iteration 10: Largest change in params was -0.000223 in the m_probability of state, level `All other comparisons`\n", - "Iteration 11: Largest change in params was 0.000118 in the m_probability of state, level `Exact match on state`\n", - "Iteration 12: Largest change in params was 6.29e-05 in the m_probability of state, level `Exact match on state`\n", + "WARNING:\n", + "Level Jaro-Winkler distance of company_name_no_legal >= 0.95 on comparison company_name_no_legal not observed in dataset, unable to train m value\n", + "\n", + "WARNING:\n", + "Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value\n", "\n", - "EM converged after 12 iterations\n", + "Iteration 1: Largest change in params was -0.347 in the m_probability of city, level `All other comparisons`\n", + "Iteration 2: Largest change in params was 0.307 in the m_probability of city, level `All other comparisons`\n", + "Iteration 3: Largest change in params was 0.0403 in the m_probability of city, level `All other comparisons`\n", + "Iteration 4: Largest change in params was 4.46e-05 in the m_probability of city, level `All other comparisons`\n", + "\n", + "EM converged after 4 iterations\n", + "m probability not trained for company_name_no_legal - Jaro-Winkler distance of company_name_no_legal >= 0.95 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n", + "m probability not trained for company_name_no_legal - All other comparisons (comparison vector value: 0). This usually means the comparison level was never observed in the training data.\n", "\n", "Your model is not yet fully trained. 
Missing estimates for:\n", - " - company_name (no m values are trained).\n" + " - company_name_no_legal (some m values are not trained).\n" ] } ], @@ -2418,7 +2453,7 @@ }, { "cell_type": "code", - "execution_count": 387, + "execution_count": 428, "id": "9581aa18-3352-429a-86c4-6078bcf13a55", "metadata": {}, "outputs": [ @@ -2433,32 +2468,28 @@ "(l.\"street_address\" = r.\"street_address\") AND (l.\"street_address\" = r.\"street_address\")\n", "\n", "Parameter estimates will be made for the following comparison(s):\n", - " - company_name\n", - " - zip_code\n", + " - company_name_no_legal\n", " - state\n", " - city\n", "\n", "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", " - street_address\n", "\n", - "Iteration 1: Largest change in params was -0.929 in the m_probability of company_name, level `Exact match on company_name`\n", - "Iteration 2: Largest change in params was 0.0355 in probability_two_random_records_match\n", - "Iteration 3: Largest change in params was 0.00843 in the m_probability of state, level `All other comparisons`\n", - "Iteration 4: Largest change in params was -0.00612 in the m_probability of state, level `Exact match on state`\n", - "Iteration 5: Largest change in params was -0.00431 in the m_probability of state, level `Exact match on state`\n", - "Iteration 6: Largest change in params was -0.00301 in the m_probability of state, level `Exact match on state`\n", - "Iteration 7: Largest change in params was 0.0021 in the m_probability of state, level `All other comparisons`\n", - "Iteration 8: Largest change in params was -0.00146 in the m_probability of state, level `Exact match on state`\n", - "Iteration 9: Largest change in params was 0.00101 in the m_probability of state, level `All other comparisons`\n", - "Iteration 10: Largest change in params was -0.000704 in the m_probability of state, level `Exact match on state`\n", - "Iteration 11: Largest change in params was 
0.000489 in the m_probability of state, level `All other comparisons`\n", - "Iteration 12: Largest change in params was -0.00034 in the m_probability of state, level `Exact match on state`\n", - "Iteration 13: Largest change in params was -0.000236 in the m_probability of state, level `Exact match on state`\n", - "Iteration 14: Largest change in params was 0.000164 in the m_probability of state, level `All other comparisons`\n", - "Iteration 15: Largest change in params was -0.000114 in the m_probability of state, level `Exact match on state`\n", - "Iteration 16: Largest change in params was -7.88e-05 in the m_probability of state, level `Exact match on state`\n", + "Iteration 1: Largest change in params was -0.395 in the m_probability of city, level `All other comparisons`\n", + "Iteration 2: Largest change in params was 0.889 in the m_probability of company_name_no_legal, level `All other comparisons`\n", + "Iteration 3: Largest change in params was 0.285 in probability_two_random_records_match\n", + "Iteration 4: Largest change in params was 0.0152 in probability_two_random_records_match\n", + "Iteration 5: Largest change in params was 0.048 in the m_probability of city, level `All other comparisons`\n", + "Iteration 6: Largest change in params was 0.0559 in the m_probability of city, level `All other comparisons`\n", + "Iteration 7: Largest change in params was 0.0205 in probability_two_random_records_match\n", + "Iteration 8: Largest change in params was 0.00696 in probability_two_random_records_match\n", + "Iteration 9: Largest change in params was 0.0024 in probability_two_random_records_match\n", + "Iteration 10: Largest change in params was 0.000849 in probability_two_random_records_match\n", + "Iteration 11: Largest change in params was 0.000305 in probability_two_random_records_match\n", + "Iteration 12: Largest change in params was 0.00011 in probability_two_random_records_match\n", + "Iteration 13: Largest change in params was 3.98e-05 in 
probability_two_random_records_match\n", "\n", - "EM converged after 16 iterations\n", + "EM converged after 13 iterations\n", "\n", "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" ] @@ -2473,7 +2504,7 @@ }, { "cell_type": "code", - "execution_count": 388, + "execution_count": 429, "id": "8ad317ed-1db9-4932-9815-6e9e0efa9580", "metadata": {}, "outputs": [ @@ -2482,23 +2513,112 @@ "text/html": [ "\n", "\n", + "
\n", + "" + ], + "text/plain": [ + "alt.VConcatChart(...)" + ] + }, + "execution_count": 429, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.match_weights_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 430, + "id": "5e21bf55-64ac-4f4b-8f1c-d7507b5e7af6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", - "
\n", + "
\n", "" + ], + "text/plain": [ + "alt.HConcatChart(...)" + ] + }, + "execution_count": 430, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "linker.visualisations.m_u_parameters_chart()" + ] + }, + { + "cell_type": "code", + "execution_count": 420, + "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91", + "metadata": {}, + "outputs": [], + "source": [ + "settings = linker.misc.save_model_to_json(\n", + " \"model_unsupervised_city_state_0.json\", overwrite=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b", + "metadata": {}, + "source": [ + "## Make Predictions" + ] + }, + { + "cell_type": "code", + "execution_count": 431, + "id": "94e96441-89b6-4516-aa6a-4d1593ce03be", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "3ce1c0af73694400974ca6253619dd5b", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Blocking time: 9.73 seconds\n", + "Predict time: 0.52 seconds\n" + ] + } + ], + "source": [ + "# it's helpful to keep threshold at .5 just to see what makes it into blocking\n", + "# df_predictions = linker.inference.predict(threshold_match_probability=0.5)\n", + "df_predictions = linker.inference.predict()" + ] + }, + { + "cell_type": "code", + "execution_count": 432, + "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0", + "metadata": {}, + "outputs": [], + "source": [ + "preds_df = df_predictions.as_pandas_dataframe()" + ] + }, + { + "cell_type": "code", + "execution_count": 433, + "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rstreet_address_list_lstreet_address_list_rgamma_street_addressbf_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_key
32260-24.0478235.766122e-08__splink__input_table_0__splink__input_table_182087113663sutro biopharmastirling energy systems solar one00.0000190.0000290.9859811.0310 utah ave., suite 150suite 150[310, utah, ave.,, suite, 150][suite, 150]0.00.265921caaz00.1491420.0129500.3106981.0south san franciscophoenix00.0014380.0035110.3984031.0STR BFRMSTRLNK ENRJ SSTMS SLR ON3
27875-24.0478235.766122e-08__splink__input_table_0__splink__input_table_1126035113797corner growth acquisition 2grubb and ellis management services00.0000100.0000190.9859811.0251 lytton avenue, suite 200suite 200[251, lytton, avenue,, suite, 200][suite, 200]0.00.265921capa00.1491420.0301970.3106981.0palo altopittsburgh00.0018500.0036560.3984031.0KRNR KR0 AKKSXNKRB ANT ELS MNJMNT SRFSS3
27993-24.0478235.766122e-08__splink__input_table_0__splink__input_table_112509697905altus powerallegheny ridge wind farm00.0000100.0000380.9859811.02200 atlantic street, 6th floor6th floor[2200, atlantic, street,, 6th, floor][6th, floor]0.00.265921ctca00.0203250.1491420.3106981.0stamfordsan francisco00.0037890.0133740.3984031.0ALTS PWRALKHN RJ WNT FRM3
28003-24.0478235.766122e-08__splink__input_table_0__splink__input_table_111540291508clearway energyclipper windpower00.0000380.0000290.9859811.0300 carnegie center, suite 300suite 300[300, carnegie, center,, suite, 300][suite, 300]0.00.265921njca00.0311590.1491420.3106981.0princetoncarpinteria00.0021180.0001890.3984031.0KLRW ENRJKLPR WNTPWR3
28024-24.0478235.766122e-08__splink__input_table_0__splink__input_table_112500977758benchmark 2020 b21 mortgage trustbountiful city city of00.0000100.0000480.9859811.0200 west street198 south 200 west street[200, west, street][198, south, 200, west, street]0.00.265921nyut00.1130100.0104750.3106981.0new yorkbountiful city00.0869440.0000220.3984031.0BNXMRK B MRTKJ TRSTBNTFL ST ST OF3
...............................................................................................................
1038434NaNNaN__splink__input_table_0__splink__input_table_113778470294farmer brothersfarmers electric ia00.0000290.0000380.9859811.020333 s normandie ave1959 yoder ave,sw[20333, s, normandie, ave][1959, yoder, ave,sw]NaNNaNcaia00.1491420.0165270.3106981.0torrancekalona00.0024850.0000110.3984031.0FRMR BR0RSFRMRS ELKTRK I0
1038441NaNNaN__splink__input_table_0__splink__input_table_1139631137540international game technologyintergen north america00.0000480.0000290.9859811.06355 south buffalo drive4th floor[6355, south, buffalo, drive][4th, floor]NaNNaNnvma00.0192880.0414010.3106981.0las vegasburlington00.0104770.0014150.3984031.0INTRNXNL KM TXNLJINTRJN NR0 AMRK0
1038443NaNNaN__splink__input_table_0__splink__input_table_19085313424monster artsminnesota solar csg 400.0000100.0000290.9859811.0806 east avenida pico200 wellington street west, su[806, east, avenida, pico][200, wellington, street, west,, su]NaNNaNcaNone-10.149142NaN1.0000001.0san clementetoronto00.0003460.0021290.3984031.0MNSTR ARTSMNST SLR KSK0
1038454NaNNaN__splink__input_table_0__splink__input_table_11081361959nxt idnextgrid mastic00.0000380.0000290.9859811.04 research drive, #402879 sanchez street[4, research, drive,, #402][879, sanchez, street]NaNNaNctca00.0203250.1491420.3106981.0sheltonsan francisco00.0003900.0133740.3984031.0NKST ITNKSTKRT MSTK0
1038456NaNNaN__splink__input_table_0__splink__input_table_191657105602coronado biosciencesgarnet energy00.0000190.0000380.9859811.024 new england executive parksuite 102[24, new, england, executive, park][suite, 102]NaNNaNmaca00.0414010.1491420.3106981.0burlingtonwestlake village00.0014150.0006910.3984031.0KRNT BSSNSSKRNT ENRJ0
\n", + "

1038457 rows × 36 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r street_address_list_l street_address_list_r gamma_street_address bf_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", + "32260 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 82087 113663 sutro biopharma stirling energy systems solar one 0 0.000019 0.000029 0.985981 1.0 310 utah ave., suite 150 suite 150 [310, utah, ave.,, suite, 150] [suite, 150] 0.0 0.265921 ca az 0 0.149142 0.012950 0.310698 1.0 south san francisco phoenix 0 0.001438 0.003511 0.398403 1.0 STR BFRM STRLNK ENRJ SSTMS SLR ON 3\n", + "27875 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 126035 113797 corner growth acquisition 2 grubb and ellis management services 0 0.000010 0.000019 0.985981 1.0 251 lytton avenue, suite 200 suite 200 [251, lytton, avenue,, suite, 200] [suite, 200] 0.0 0.265921 ca pa 0 0.149142 0.030197 0.310698 1.0 palo alto pittsburgh 0 0.001850 0.003656 0.398403 1.0 KRNR KR0 AKKSXN KRB ANT ELS MNJMNT SRFSS 3\n", + "27993 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 125096 97905 altus power allegheny ridge wind farm 0 0.000010 0.000038 0.985981 1.0 2200 atlantic street, 6th floor 6th floor [2200, atlantic, street,, 6th, floor] [6th, floor] 0.0 0.265921 ct ca 0 0.020325 0.149142 0.310698 1.0 stamford san francisco 0 0.003789 0.013374 0.398403 1.0 ALTS PWR ALKHN RJ WNT FRM 3\n", + "28003 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 115402 91508 clearway energy clipper windpower 0 
0.000038 0.000029 0.985981 1.0 300 carnegie center, suite 300 suite 300 [300, carnegie, center,, suite, 300] [suite, 300] 0.0 0.265921 nj ca 0 0.031159 0.149142 0.310698 1.0 princeton carpinteria 0 0.002118 0.000189 0.398403 1.0 KLRW ENRJ KLPR WNTPWR 3\n", + "28024 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 125009 77758 benchmark 2020 b21 mortgage trust bountiful city city of 0 0.000010 0.000048 0.985981 1.0 200 west street 198 south 200 west street [200, west, street] [198, south, 200, west, street] 0.0 0.265921 ny ut 0 0.113010 0.010475 0.310698 1.0 new york bountiful city 0 0.086944 0.000022 0.398403 1.0 BNXMRK B MRTKJ TRST BNTFL ST ST OF 3\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "1038434 NaN NaN __splink__input_table_0 __splink__input_table_1 137784 70294 farmer brothers farmers electric ia 0 0.000029 0.000038 0.985981 1.0 20333 s normandie ave 1959 yoder ave,sw [20333, s, normandie, ave] [1959, yoder, ave,sw] NaN NaN ca ia 0 0.149142 0.016527 0.310698 1.0 torrance kalona 0 0.002485 0.000011 0.398403 1.0 FRMR BR0RS FRMRS ELKTRK I 0\n", + "1038441 NaN NaN __splink__input_table_0 __splink__input_table_1 139631 137540 international game technology intergen north america 0 0.000048 0.000029 0.985981 1.0 6355 south buffalo drive 4th floor [6355, south, buffalo, drive] [4th, floor] NaN NaN nv ma 0 0.019288 0.041401 0.310698 1.0 las vegas burlington 0 0.010477 0.001415 0.398403 1.0 INTRNXNL KM TXNLJ INTRJN NR0 AMRK 0\n", + "1038443 NaN NaN __splink__input_table_0 __splink__input_table_1 90853 13424 monster arts minnesota solar csg 4 0 0.000010 0.000029 0.985981 1.0 806 east avenida pico 200 wellington street west, su [806, east, avenida, pico] [200, wellington, street, west,, su] NaN NaN ca None -1 0.149142 NaN 1.000000 1.0 san clemente toronto 0 0.000346 0.002129 0.398403 1.0 MNSTR ARTS MNST SLR KSK 0\n", + "1038454 NaN 
NaN __splink__input_table_0 __splink__input_table_1 108136 1959 nxt id nextgrid mastic 0 0.000038 0.000029 0.985981 1.0 4 research drive, #402 879 sanchez street [4, research, drive,, #402] [879, sanchez, street] NaN NaN ct ca 0 0.020325 0.149142 0.310698 1.0 shelton san francisco 0 0.000390 0.013374 0.398403 1.0 NKST IT NKSTKRT MSTK 0\n", + "1038456 NaN NaN __splink__input_table_0 __splink__input_table_1 91657 105602 coronado biosciences garnet energy 0 0.000019 0.000038 0.985981 1.0 24 new england executive park suite 102 [24, new, england, executive, park] [suite, 102] NaN NaN ma ca 0 0.041401 0.149142 0.310698 1.0 burlington westlake village 0 0.001415 0.000691 0.398403 1.0 KRNT BSSNSS KRNT ENRJ 0\n", + "\n", + "[1038457 rows x 36 columns]" + ] + }, + "execution_count": 433, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_df.sort_values(by=\"match_probability\")" + ] + }, + { + "cell_type": "code", + "execution_count": 434, + "id": "c0b292c8-26ed-407a-866e-75851577d567", + "metadata": {}, + "outputs": [], + "source": [ + "# join on utility_id_eia and CIK\n", + "preds_validation_df = preds_df.merge(sec_clean_df[[\"record_id\", \"central_index_key\", \"company_name_raw\"]],\n", + " how=\"left\",\n", + " left_on=\"record_id_l\",\n", + " right_on=\"record_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 435, + "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0", + "metadata": {}, + "outputs": [], + "source": [ + "preds_validation_df = preds_validation_df.merge(eia_clean_df[[\"record_id\", \"utility_id_eia\"]],\n", + " how=\"left\",\n", + " left_on=\"record_id_r\",\n", + " right_on=\"record_id\")" + ] + }, + { + "cell_type": "code", + "execution_count": 436, + "id": "5103190c-3775-427f-a8f2-cc8a8f79892b", + "metadata": {}, + "outputs": [], + "source": [ + "preds_validation_df = preds_validation_df.sort_values(\n", + " by=[\"central_index_key\", \"utility_id_eia\", \"match_probability\"], ascending=False\n", + 
").drop_duplicates(subset=[\"central_index_key\", \"utility_id_eia\"], keep=\"first\")" + ] + }, + { + "cell_type": "code", + "execution_count": 437, + "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rstreet_address_list_lstreet_address_list_rgamma_street_addressbf_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_keyrecord_id_xcentral_index_keycompany_name_rawrecord_id_yutility_id_eia
8898455.6798070.980865__splink__input_table_0__splink__input_table_15195622658constellation energyconstellation newenergy10.0000290.0000776085.7549191.0000001310 point streetNone[1310, point, street]NaN-1.01.000000mdmd10.0232980.02329814.8563412.034020baltimorebaltimore20.0036780.00367894.807391.654881KNSTLXN ENRJKNSTLXN NWNRJ0519560001868275constellation energy corp2265858491
88410913.0956330.999886__splink__input_table_0__splink__input_table_112026796849evergyevergy20.0000190.000019872345.6896550.0595641200 main street1200 main street[1200, main, street][1200, main, street]2.05.407499momo10.0117440.01174414.8563414.035057kansas citykansas city20.0019730.00197394.807393.085372EFRJEFRJ01202670001711269evergy, inc.9684964428
89394112.4865670.999826__splink__input_table_0__splink__input_table_112022296211consol energyconsol energy20.0000580.000058872345.6896550.019855275 technology drive275 technology drive[275, technology, drive][275, technology, drive]2.05.407499papa10.0301970.03019714.8563411.569346canonsburgcanonsburg20.0003900.00039094.8073915.603165KNSL ENRJKNSL ENRJ01202220001710366consol energy inc.962114299
9435949.1612740.998256__splink__input_table_0__splink__input_table_111927183669vistra energyvistra energy20.0000190.000019872345.6896550.0595646555 sierra drive6555 sierra drive[6555, sierra, drive][6555, sierra, drive]2.05.407499txtx10.0808660.08086614.8563410.586015irvingirving20.0043800.00438094.807391.389595FSTR ENRJFSTR ENRJ01192710001692819vistra energy corp.8366962723
8604147.5763110.994788__splink__input_table_0__splink__input_table_111927471441vistravistra20.0000580.000058872345.6896550.0198556555 sierra drive6555 sierra drive[6555, sierra, drive][6555, sierra, drive]2.05.407499txtx10.0808660.08086614.8563410.586015irvingirving20.0043800.00438094.807391.389595FSTRFSTR01192740001692819vistra corp.714415504
..............................................................................................................................
102676512.0871330.999770__splink__input_table_0__splink__input_table_115310679761archer daniels midlandarcher daniels midland20.0000580.000058872345.6896550.0198554666 faries pkwy4666 faries pkwy[4666, faries, pkwy][4666, faries, pkwy]2.05.407499ilil10.0331910.03319114.8563411.427770decaturdecatur20.0004680.00046894.8073913.002638ARXR TNLS MTLNTARXR TNLS MTLNT01531060000007084archer daniels midland co79761772
6568339.8099770.998887__splink__input_table_0__splink__input_table_115054679913appalachian powerappalachian power20.0000770.000077872345.6896550.0148911 riverside plaza1 riverside plaza[1, riverside, plaza][1, riverside, plaza]2.05.407499ohoh10.0187700.01877014.8563412.524754columbuscolumbus20.0030090.00300994.807392.022633APLXN PWRAPLXN PWR01505460000006879appalachian power co79913733
64074710.8880460.999473__splink__input_table_0__splink__input_table_114474380319american crystal sugar /mn/american crystal sugar10.0000100.0000296085.7549191.000000101 n 3rd stNone[101, n, 3rd, st]NaN-1.01.000000mnmn10.0259960.02599614.8563411.822919moorheadmoorhead20.0000890.00008994.8073968.263848AMRKN KRSTL SKR MNAMRKN KRSTL SKR01447430000004828american crystal sugar co /mn/80319491
9985789.9905540.999018__splink__input_table_0__splink__input_table_1257580977alabama poweralabama power20.0000670.000067872345.6896550.017018600 n 18th stNone[600, n, 18th, st]NaN-1.01.000000alal10.0052800.00528014.8563418.975778birminghambirmingham20.0019950.00199594.807393.050898ALBM PWRALBM PWR025750000003153alabama power co80977195
9129149.4344940.998557__splink__input_table_0__splink__input_table_113297679317air products and chemicals /de/air products and chemicals10.0000190.0000486085.7549191.0000007201 hamilton blvd7201 hamilton blvd[7201, hamilton, blvd][7201, hamilton, blvd]2.05.407499papa10.0301970.03019714.8563411.569346allentownallentown20.0011370.00113794.807395.354027AR PRTKTS ANT XMKLS TAR PRTKTS ANT XMKLS01329760000002969air products & chemicals inc /de/79317991
\n", + "

197 rows × 41 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r street_address_list_l street_address_list_r gamma_street_address bf_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x central_index_key company_name_raw record_id_y utility_id_eia\n", + "889845 5.679807 0.980865 __splink__input_table_0 __splink__input_table_1 51956 22658 constellation energy constellation newenergy 1 0.000029 0.000077 6085.754919 1.000000 1310 point street None [1310, point, street] NaN -1.0 1.000000 md md 1 0.023298 0.023298 14.856341 2.034020 baltimore baltimore 2 0.003678 0.003678 94.80739 1.654881 KNSTLXN ENRJ KNSTLXN NWNRJ 0 51956 0001868275 constellation energy corp 22658 58491\n", + "884109 13.095633 0.999886 __splink__input_table_0 __splink__input_table_1 120267 96849 evergy evergy 2 0.000019 0.000019 872345.689655 0.059564 1200 main street 1200 main street [1200, main, street] [1200, main, street] 2.0 5.407499 mo mo 1 0.011744 0.011744 14.856341 4.035057 kansas city kansas city 2 0.001973 0.001973 94.80739 3.085372 EFRJ EFRJ 0 120267 0001711269 evergy, inc. 96849 64428\n", + "893941 12.486567 0.999826 __splink__input_table_0 __splink__input_table_1 120222 96211 consol energy consol energy 2 0.000058 0.000058 872345.689655 0.019855 275 technology drive 275 technology drive [275, technology, drive] [275, technology, drive] 2.0 5.407499 pa pa 1 0.030197 0.030197 14.856341 1.569346 canonsburg canonsburg 2 0.000390 0.000390 94.80739 15.603165 KNSL ENRJ KNSL ENRJ 0 120222 0001710366 consol energy inc. 
96211 4299\n", + "943594 9.161274 0.998256 __splink__input_table_0 __splink__input_table_1 119271 83669 vistra energy vistra energy 2 0.000019 0.000019 872345.689655 0.059564 6555 sierra drive 6555 sierra drive [6555, sierra, drive] [6555, sierra, drive] 2.0 5.407499 tx tx 1 0.080866 0.080866 14.856341 0.586015 irving irving 2 0.004380 0.004380 94.80739 1.389595 FSTR ENRJ FSTR ENRJ 0 119271 0001692819 vistra energy corp. 83669 62723\n", + "860414 7.576311 0.994788 __splink__input_table_0 __splink__input_table_1 119274 71441 vistra vistra 2 0.000058 0.000058 872345.689655 0.019855 6555 sierra drive 6555 sierra drive [6555, sierra, drive] [6555, sierra, drive] 2.0 5.407499 tx tx 1 0.080866 0.080866 14.856341 0.586015 irving irving 2 0.004380 0.004380 94.80739 1.389595 FSTR FSTR 0 119274 0001692819 vistra corp. 71441 5504\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "1026765 12.087133 0.999770 __splink__input_table_0 __splink__input_table_1 153106 79761 archer daniels midland archer daniels midland 2 0.000058 0.000058 872345.689655 0.019855 4666 faries pkwy 4666 faries pkwy [4666, faries, pkwy] [4666, faries, pkwy] 2.0 5.407499 il il 1 0.033191 0.033191 14.856341 1.427770 decatur decatur 2 0.000468 0.000468 94.80739 13.002638 ARXR TNLS MTLNT ARXR TNLS MTLNT 0 153106 0000007084 archer daniels midland co 79761 772\n", + "656833 9.809977 0.998887 __splink__input_table_0 __splink__input_table_1 150546 79913 appalachian power appalachian power 2 0.000077 0.000077 872345.689655 0.014891 1 riverside plaza 1 riverside plaza [1, riverside, plaza] [1, riverside, plaza] 2.0 5.407499 oh oh 1 0.018770 0.018770 14.856341 2.524754 columbus columbus 2 0.003009 0.003009 94.80739 2.022633 APLXN PWR APLXN PWR 0 150546 0000006879 appalachian power co 79913 733\n", + "640747 10.888046 0.999473 __splink__input_table_0 __splink__input_table_1 144743 80319 american crystal sugar /mn/ american crystal sugar 1 0.000010 0.000029 6085.754919 1.000000 101 n 3rd st None [101, n, 3rd, st] NaN -1.0 1.000000 mn mn 1 0.025996 0.025996 14.856341 1.822919 moorhead moorhead 2 0.000089 0.000089 94.80739 68.263848 AMRKN KRSTL SKR MN AMRKN KRSTL SKR 0 144743 0000004828 american crystal sugar co /mn/ 80319 491\n", + "998578 9.990554 0.999018 __splink__input_table_0 __splink__input_table_1 2575 80977 alabama power alabama power 2 0.000067 0.000067 872345.689655 0.017018 600 n 18th st None [600, n, 18th, st] NaN -1.0 1.000000 al al 1 0.005280 0.005280 14.856341 8.975778 birmingham birmingham 2 0.001995 0.001995 94.80739 3.050898 ALBM PWR ALBM PWR 0 2575 0000003153 alabama power co 80977 195\n", + "912914 9.434494 0.998557 __splink__input_table_0 __splink__input_table_1 132976 79317 air products and chemicals /de/ air products and chemicals 1 0.000019 0.000048 6085.754919 1.000000 7201 hamilton blvd 7201 hamilton blvd [7201, hamilton, blvd] [7201, hamilton, 
blvd] 2.0 5.407499 pa pa 1 0.030197 0.030197 14.856341 1.569346 allentown allentown 2 0.001137 0.001137 94.80739 5.354027 AR PRTKTS ANT XMKLS T AR PRTKTS ANT XMKLS 0 132976 0000002969 air products & chemicals inc /de/ 79317 991\n", + "\n", + "[197 rows x 41 columns]" + ] + }, + "execution_count": 437, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "preds_validation_df[preds_validation_df.match_probability > .9]" + ] + }, + { + "cell_type": "code", + "execution_count": 89, + "id": "11190456-12a9-49df-b863-7a6f674e39eb", + "metadata": {}, + "outputs": [], + "source": [ + "validation_df = pd.read_csv(\"sec_eia_validation_set.csv\", dtype={\"central_index_key\": str})" + ] + }, + { + "cell_type": "code", + "execution_count": 90, + "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81", + "metadata": {}, + "outputs": [], + "source": [ + "validation_df[\"central_index_key\"] = validation_df[\"central_index_key\"].str.zfill(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 438, + "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = validation_df.merge(\n", + " preds_validation_df[[\"record_id_l\", \"record_id_r\", \"central_index_key\", \"utility_id_eia\", \"match_probability\", \"gamma_company_name_no_legal\"]].drop_duplicates(keep=\"first\"),\n", + " how=\"left\",\n", + " on=[\"central_index_key\", \"utility_id_eia\"],\n", + " indicator=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 439, + "id": "4d45f339-7a5b-466a-81f5-c71e425a77df", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df[\"predicted_match\"] = merged_df[\"_merge\"].map({\"both\": 1, \"left_only\": 0})" + ] + }, + { + "cell_type": "code", + "execution_count": 440, + "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df[\"predicted_match\"] = merged_df[\"predicted_match\"].where(\n", + " (merged_df.match_probability > .95),\n", + " 
0\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 441, + "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
central_index_keyutility_id_eiasec_company_nameeia_company_namematchrecord_id_lrecord_id_rmatch_probabilitygamma_company_name_no_legal_mergepredicted_match
00000003153195alabama power coNaN12575809770.9990182both1.0
1000186894158702fluence energy, inc.Fluence0126809216150.0000020both0.0
200000410917140georgia power coNaN150428682420.0298532both0.0
300000221984062columbus southern power co /oh/Columbus Southern Power Co1129635963000.9976281both1.0
400013261605416duke energy corpNaN137661715550.9263522both0.0
5000003037154905duke energy carolinas, llcDuke Energy Carolinas LLC11332611185430.9879162both1.0
6000086944657140berkshire realty co inc /deBerkshire Wind Power Cooperative Corp0198821894150.0000300both0.0
7000009212218195southern cosouthern co services inc0504171118240.0000630both0.0
8000009212217650southern coSouthern Power Co050417496130.0043150both0.0
9000007548814328pacific gas & electric coNaN12898554800.6249912both0.0
1000010312966526firstenergy corpFirstEnergy014192697160.9997072both1.0
11000103129654776firstenergy corpFirstEnergy Nuclear Generation Corp0141921021630.0000660both0.0
1200010312966458firstenergy corpFirst Energy Services0141921620330.0000660both0.0
13000103129632208firstenergy corpFirst Energy Corp1141921218550.0106971both0.0
14000010012224211tucson electric power coNaN1715415070.9997982both1.0
15000009627118454tampa electric coNaN1231716479820.9892282both1.0
1600007159575248dominion energy, incNaN115937718780.9982822both1.0
17000101387159883nrg energy, incNRG Energy Gas & Wind Holdings Inc07168174540.0025750both0.0
18000101387113377nrg energy incNRG Energy Inc17173950290.9888012both1.0
19000078881613994oglethorpe power corpNaN1172902564780.9997682both1.0
2000000186753266central maine power coNaN11267711766630.8977002both0.0
\n", + "
" + ], + "text/plain": [ + " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", + "0 0000003153 195 alabama power co NaN 1 2575 80977 0.999018 2 both 1.0\n", + "1 0001868941 58702 fluence energy, inc. Fluence 0 126809 21615 0.000002 0 both 0.0\n", + "2 0000041091 7140 georgia power co NaN 1 50428 68242 0.029853 2 both 0.0\n", + "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 129635 96300 0.997628 1 both 1.0\n", + "4 0001326160 5416 duke energy corp NaN 1 37661 71555 0.926352 2 both 0.0\n", + "5 0000030371 54905 duke energy carolinas, llc Duke Energy Carolinas LLC 1 133261 118543 0.987916 2 both 1.0\n", + "6 0000869446 57140 berkshire realty co inc /de Berkshire Wind Power Cooperative Corp 0 198821 89415 0.000030 0 both 0.0\n", + "7 0000092122 18195 southern co southern co services inc 0 50417 111824 0.000063 0 both 0.0\n", + "8 0000092122 17650 southern co Southern Power Co 0 50417 49613 0.004315 0 both 0.0\n", + "9 0000075488 14328 pacific gas & electric co NaN 1 2898 55480 0.624991 2 both 0.0\n", + "10 0001031296 6526 firstenergy corp FirstEnergy 0 14192 69716 0.999707 2 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 14192 102163 0.000066 0 both 0.0\n", + "12 0001031296 6458 firstenergy corp First Energy Services 0 14192 162033 0.000066 0 both 0.0\n", + "13 0001031296 32208 firstenergy corp First Energy Corp 1 14192 121855 0.010697 1 both 0.0\n", + "14 0000100122 24211 tucson electric power co NaN 1 715 41507 0.999798 2 both 1.0\n", + "15 0000096271 18454 tampa electric co NaN 1 231716 47982 0.989228 2 both 1.0\n", + "16 0000715957 5248 dominion energy, inc NaN 1 15937 71878 0.998282 2 both 1.0\n", + "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 7168 17454 0.002575 0 both 0.0\n", + "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 7173 95029 
0.988801 2 both 1.0\n", + "19 0000788816 13994 oglethorpe power corp NaN 1 172902 56478 0.999768 2 both 1.0\n", + "20 0000018675 3266 central maine power co NaN 1 126771 176663 0.897700 2 both 0.0" + ] + }, + "execution_count": 441, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.head(50)" + ] + }, + { + "cell_type": "code", + "execution_count": 442, + "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea", + "metadata": {}, + "outputs": [], + "source": [ + "precision = precision_score(merged_df['match'], merged_df['predicted_match'])\n", + "recall = recall_score(merged_df['match'], merged_df['predicted_match'])\n", + "accuracy = accuracy_score(merged_df['match'], merged_df['predicted_match'])\n", + "# roc_auc = roc_auc_score(merged_df['match'], merged_df['match_probability'])\n", + "\n", + "# Confusion matrix\n", + "conf_matrix = confusion_matrix(merged_df['match'], merged_df['predicted_match'])" + ] + }, + { + "cell_type": "code", + "execution_count": 443, + "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(np.float64(0.8888888888888888),\n", + " np.float64(0.6153846153846154),\n", + " 0.7142857142857143)" + ] + }, + "execution_count": 443, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "precision, recall, accuracy" + ] + }, + { + "cell_type": "code", + "execution_count": 444, + "id": "08932be5-b90c-440d-9efb-156cb4d63c93", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Predicted NegativePredicted Positive
Negative71
Positive58
\n", + "
" + ], + "text/plain": [ + " Predicted Negative Predicted Positive\n", + "Negative 7 1\n", + "Positive 5 8" + ] + }, + "execution_count": 444, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.DataFrame(\n", + " conf_matrix,\n", + " index=[\"Negative\", \"Positive\"],\n", + " columns=[\"Predicted Negative\", \"Predicted Positive\"]\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 445, + "id": "025c80e9-5055-4eaa-a873-38b910cd7f94", + "metadata": {}, + "outputs": [], + "source": [ + "incorrect_df = merged_df[merged_df.match != merged_df.predicted_match]" + ] + }, + { + "cell_type": "code", + "execution_count": 446, + "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
central_index_keyutility_id_eiasec_company_nameeia_company_namematchrecord_id_lrecord_id_rmatch_probabilitygamma_company_name_no_legal_mergepredicted_match
200000410917140georgia power coNaN150428682420.0298532both0.0
400013261605416duke energy corpNaN137661715550.9263522both0.0
9000007548814328pacific gas & electric coNaN12898554800.6249912both0.0
1000010312966526firstenergy corpFirstEnergy014192697160.9997072both1.0
13000103129632208firstenergy corpFirst Energy Corp1141921218550.0106971both0.0
2000000186753266central maine power coNaN11267711766630.8977002both0.0
\n", + "
" ], "text/plain": [ - "alt.VConcatChart(...)" + " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", + "2 0000041091 7140 georgia power co NaN 1 50428 68242 0.029853 2 both 0.0\n", + "4 0001326160 5416 duke energy corp NaN 1 37661 71555 0.926352 2 both 0.0\n", + "9 0000075488 14328 pacific gas & electric co NaN 1 2898 55480 0.624991 2 both 0.0\n", + "10 0001031296 6526 firstenergy corp FirstEnergy 0 14192 69716 0.999707 2 both 1.0\n", + "13 0001031296 32208 firstenergy corp First Energy Corp 1 14192 121855 0.010697 1 both 0.0\n", + "20 0000018675 3266 central maine power co NaN 1 126771 176663 0.897700 2 both 0.0" ] }, - "execution_count": 388, + "execution_count": 446, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "linker.visualisations.match_weights_chart()" + "incorrect_df" ] }, { "cell_type": "code", - "execution_count": 389, - "id": "5e21bf55-64ac-4f4b-8f1c-d7507b5e7af6", + "execution_count": 447, + "id": "c425a676-aa6e-4d8f-b814-931da392c2ff", + "metadata": {}, + "outputs": [], + "source": [ + "recs_to_view = []\n", + "for idx, rec in incorrect_df.iterrows():\n", + " full_rec = preds_validation_df[\n", + " (preds_validation_df.record_id_l == rec.record_id_l) & \n", + " (preds_validation_df.record_id_r == rec.record_id_r)\n", + " ].squeeze()\n", + " if full_rec.empty:\n", + " continue\n", + " recs_to_view.append(full_rec.to_dict())" + ] + }, + { + "cell_type": "code", + "execution_count": 448, + "id": "ff55f2cb-7ce1-4697-99e7-bf22918f7ed1", "metadata": {}, "outputs": [ { @@ -2571,23 +4618,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ - "alt.HConcatChart(...)" + "alt.LayerChart(...)" ] }, - "execution_count": 389, + "execution_count": 448, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# company_name doesn't look good here\n", - "linker.visualisations.m_u_parameters_chart()" - ] - }, - { - "cell_type": "code", - "execution_count": 285, - "id": "fedb78e1-ee73-4d1e-8a96-3b27f6561a91", - "metadata": {}, - "outputs": [], - "source": [ - "settings = linker.misc.save_model_to_json(\n", - " \"model_test.json\", overwrite=True\n", - ")" + "linker.visualisations.waterfall_chart(recs_to_view, filter_nulls=True)" ] }, { "cell_type": "markdown", - "id": "31f9d73d-cfa4-41fa-906f-c8501a29283b", - "metadata": {}, - "source": [ - "## Make Predictions" - ] - }, - { - "cell_type": "code", - "execution_count": 390, - "id": "94e96441-89b6-4516-aa6a-4d1593ce03be", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Blocking time: 0.28 seconds\n" - ] - }, - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "1680da9f410c424d8e5648fc98c88022", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Predict time: 3.06 seconds\n" - ] - } - ], - "source": [ - "df_predictions = linker.inference.predict(threshold_match_probability=0.5)" - ] - }, - { - "cell_type": "code", - "execution_count": 391, - "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0", + "id": "a2ba43b6-a664-462a-823f-e3f08585bb51", "metadata": {}, - "outputs": [], "source": [ - "preds_df = df_predictions.as_pandas_dataframe()" + "# Save good predictions" ] }, { "cell_type": "code", - "execution_count": 392, - "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e", + "execution_count": 192, + "id": 
"92172e2f-39ba-49e3-8312-98597256ca4f", "metadata": {}, "outputs": [ { @@ -2755,11 +4740,17 @@ " company_name_l\n", " company_name_r\n", " gamma_company_name\n", + " tf_company_name_l\n", + " tf_company_name_r\n", " bf_company_name\n", + " bf_tf_adj_company_name\n", " street_address_l\n", " street_address_r\n", " gamma_street_address\n", + " tf_street_address_l\n", + " tf_street_address_r\n", " bf_street_address\n", + " bf_tf_adj_street_address\n", " zip_code_l\n", " zip_code_r\n", " gamma_zip_code\n", @@ -2767,13 +4758,6 @@ " tf_zip_code_r\n", " bf_zip_code\n", " bf_tf_adj_zip_code\n", - " state_l\n", - " state_r\n", - " gamma_state\n", - " tf_state_l\n", - " tf_state_r\n", - " bf_state\n", - " bf_tf_adj_state\n", " city_l\n", " city_r\n", " gamma_city\n", @@ -2783,226 +4767,221 @@ " bf_tf_adj_city\n", " company_name_mphone_l\n", " company_name_mphone_r\n", - " report_year_l\n", - " report_year_r\n", + " street_address_list_l\n", + " street_address_list_r\n", " match_key\n", " \n", " \n", " \n", " \n", - " 11211\n", - " 0.054332\n", - " 0.509414\n", + " 199607\n", + " 4.265490\n", + " 0.950575\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 85762\n", - " 68295\n", - " citi trends incorporated\n", - " georgia pacific corporation\n", - " 1\n", - " 1.462842\n", - " 104 coleman boulevard\n", - " None\n", - " -1\n", + " 20077\n", + " 117512\n", + " prt group incorporated\n", + " pratt and whitney power systems\n", + " 0\n", + " 0.000019\n", + " 0.000010\n", + " 0.991220\n", " 1.000000\n", - " 31408\n", - " 31326\n", + " 80 lamberton rd\n", + " mail stop 191-13\n", " 0\n", - " 0.000045\n", - " 0.000103\n", - " 0.402918\n", + " 0.000036\n", + " 0.000012\n", + " 0.865948\n", " 1.000000\n", - " ga\n", - " ga\n", + " 06095\n", + " 06095\n", " 1\n", - " 0.023374\n", - " 0.023374\n", - " 22.598054\n", - " 1.815434\n", - " savannah\n", - " savannah\n", + " 0.000191\n", + " 0.000191\n", + " 1148.002189\n", + " 3.403266\n", + " windsor\n", + " windsor\n", " 
2\n", - " 0.000454\n", - " 0.000454\n", - " 215.559681\n", - " 9.129471\n", - " ST TRNTS INKRPRTT\n", - " JRJ PSFK KRPRXN\n", - " 2021\n", - " 2008\n", + " 0.000279\n", + " 0.000279\n", + " 126.999683\n", + " 24.882561\n", + " PRT KRP\n", + " PRT ANT HTN PWR SSTMS\n", + " [80, lamberton, rd]\n", + " [mail, stop, 191-13]\n", " 0\n", " \n", " \n", - " 11666\n", - " 0.098035\n", - " 0.516982\n", + " 12041\n", + " 4.277468\n", + " 0.950964\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 94615\n", - " 75114\n", - " chicopee bancorp, incorporated\n", - " chicopee city of\n", - " 0\n", - " 0.845800\n", - " 70 center street\n", - " 725 front street\n", - " 0\n", - " 0.844089\n", - " 01013\n", - " 01021\n", + " 219453\n", + " 113555\n", + " cogentrix energy incorporated\n", + " green country energy limited liability company\n", " 0\n", - " 0.000036\n", - " 0.000061\n", - " 0.402918\n", + " 0.000019\n", + " 0.000038\n", + " 0.991220\n", " 1.000000\n", - " ma\n", - " ma\n", - " 1\n", - " 0.042950\n", - " 0.042950\n", - " 22.598054\n", - " 0.987961\n", - " chicopee\n", - " chicopee\n", + " 9405 arrowpoint blvd\n", + " 9405 arrowpoint blvd\n", " 2\n", - " 0.000117\n", - " 0.000117\n", - " 215.559681\n", - " 35.431042\n", - " XKP BNKRP INKRPRTT\n", - " XKP ST OF\n", - " 2012\n", - " 2012\n", - " 0\n", + " 0.000534\n", + " 0.000534\n", + " 14580.390627\n", + " 0.015600\n", + " 28273\n", + " 28273\n", + " 1\n", + " 0.001256\n", + " 0.001256\n", + " 1148.002189\n", + " 0.516567\n", + " charlotte\n", + " chalotte\n", + " 1\n", + " 0.014155\n", + " 0.000022\n", + " 79.923487\n", + " 1.000000\n", + " KJNTRKS ENRJ\n", + " KRN KNTR ENRJ\n", + " [9405, arrowpoint, blvd]\n", + " [9405, arrowpoint, blvd]\n", + " 1\n", " \n", " \n", - " 11665\n", - " 0.098035\n", - " 0.516982\n", + " 12805\n", + " 4.277468\n", + " 0.950964\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 94614\n", - " 75115\n", - " chicopee bancorp, incorporated\n", - " chicopee city 
of\n", - " 0\n", - " 0.845800\n", - " 70 center street\n", - " 725 front street\n", + " 219453\n", + " 115755\n", + " cogentrix energy incorporated\n", + " jackson county power limited liability company\n", " 0\n", - " 0.844089\n", - " 01013\n", - " 01021\n", - " 0\n", - " 0.000036\n", - " 0.000061\n", - " 0.402918\n", + " 0.000019\n", + " 0.000029\n", + " 0.991220\n", " 1.000000\n", - " ma\n", - " ma\n", - " 1\n", - " 0.042950\n", - " 0.042950\n", - " 22.598054\n", - " 0.987961\n", - " chicopee\n", - " chicopee\n", + " 9405 arrowpoint blvd\n", + " 9405 arrowpoint blvd\n", " 2\n", - " 0.000117\n", - " 0.000117\n", - " 215.559681\n", - " 35.431042\n", - " XKP BNKRP INKRPRTT\n", - " XKP ST OF\n", - " 2011\n", - " 2011\n", - " 0\n", + " 0.000534\n", + " 0.000534\n", + " 14580.390627\n", + " 0.015600\n", + " 28273\n", + " 28273\n", + " 1\n", + " 0.001256\n", + " 0.001256\n", + " 1148.002189\n", + " 0.516567\n", + " charlotte\n", + " chaarlotte\n", + " 1\n", + " 0.014155\n", + " 0.000011\n", + " 79.923487\n", + " 1.000000\n", + " KJNTRKS ENRJ\n", + " JKSN KNT PWR\n", + " [9405, arrowpoint, blvd]\n", + " [9405, arrowpoint, blvd]\n", + " 1\n", " \n", " \n", - " 11668\n", - " 0.098035\n", - " 0.516982\n", + " 8137\n", + " 4.278093\n", + " 0.950984\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 94618\n", - " 75118\n", - " chicopee bancorp, incorporated\n", - " chicopee city of\n", + " 64813\n", + " 3879\n", + " rand logistics incorporated\n", + " norridgewock river road solar limited liabilit...\n", " 0\n", - " 0.845800\n", - " 70 center street\n", - " 725 front street\n", - " 0\n", - " 0.844089\n", - " 01013\n", - " 01021\n", - " 0\n", - " 0.000036\n", - " 0.000061\n", - " 0.402918\n", + " 0.000029\n", + " 0.000019\n", + " 0.991220\n", " 1.000000\n", - " ma\n", - " ma\n", + " 333 washington street\n", + " 333 washington street\n", + " 2\n", + " 0.001056\n", + " 0.001056\n", + " 14580.390627\n", + " 0.007888\n", + " 07302\n", + " 07302\n", " 1\n", - " 
0.042950\n", - " 0.042950\n", - " 22.598054\n", - " 0.987961\n", - " chicopee\n", - " chicopee\n", + " 0.002332\n", + " 0.002332\n", + " 1148.002189\n", + " 0.278152\n", + " jersey city\n", + " jersey city\n", " 2\n", - " 0.000117\n", - " 0.000117\n", - " 215.559681\n", - " 35.431042\n", - " XKP BNKRP INKRPRTT\n", - " XKP ST OF\n", - " 2008\n", - " 2008\n", - " 0\n", + " 0.002998\n", + " 0.002998\n", + " 126.999683\n", + " 2.312506\n", + " RNT LJSTKS\n", + " NRJWK RFR RT SLR\n", + " [333, washington, street]\n", + " [333, washington, street]\n", + " 1\n", " \n", " \n", - " 11669\n", - " 0.098035\n", - " 0.516982\n", + " 8136\n", + " 4.278093\n", + " 0.950984\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 94620\n", - " 75116\n", - " chicopee bancorp, incorporated\n", - " chicopee city of\n", + " 64813\n", + " 5193\n", + " rand logistics incorporated\n", + " anderson solar farm limited liability company\n", " 0\n", - " 0.845800\n", - " 70 center street\n", - " p o box 405\n", - " 0\n", - " 0.844089\n", - " 01013\n", - " 01021\n", - " 0\n", - " 0.000036\n", - " 0.000061\n", - " 0.402918\n", + " 0.000029\n", + " 0.000029\n", + " 0.991220\n", " 1.000000\n", - " ma\n", - " ma\n", + " 333 washington street\n", + " 333 washington street\n", + " 2\n", + " 0.001056\n", + " 0.001056\n", + " 14580.390627\n", + " 0.007888\n", + " 07302\n", + " 07302\n", " 1\n", - " 0.042950\n", - " 0.042950\n", - " 22.598054\n", - " 0.987961\n", - " chicopee\n", - " chicopee\n", + " 0.002332\n", + " 0.002332\n", + " 1148.002189\n", + " 0.278152\n", + " jersey city\n", + " jersey city\n", " 2\n", - " 0.000117\n", - " 0.000117\n", - " 215.559681\n", - " 35.431042\n", - " XKP BNKRP INKRPRTT\n", - " XKP ST OF\n", - " 2010\n", - " 2010\n", - " 0\n", + " 0.002998\n", + " 0.002998\n", + " 126.999683\n", + " 2.312506\n", + " RNT LJSTKS\n", + " ANTRSN SLR FRM\n", + " [333, washington, street]\n", + " [333, washington, street]\n", + " 1\n", " \n", " \n", " ...\n", @@ -3045,258 
+5024,252 @@ " ...\n", " ...\n", " ...\n", - " ...\n", " \n", " \n", - " 10043\n", - " 45.026591\n", + " 199278\n", + " 27.514584\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 177698\n", - " 67483\n", - " green mountain power corporation\n", - " green mountain power corporation\n", - " 3\n", - " 9751.372250\n", - " 163 acorn lane\n", - " 163 acorn lane\n", - " 3\n", - " 24859.333063\n", - " 05446\n", - " 05446\n", - " 1\n", - " 0.000143\n", - " 0.000143\n", - " 1447.988342\n", - " 2.894003\n", - " vt\n", - " vt\n", + " 27759\n", + " 142183\n", + " diamond brands incorporated\n", + " diamond brands incorporated\n", + " 2\n", + " 0.000029\n", + " 0.000029\n", + " 7612.680596\n", + " 0.037986\n", + " 1800 cloquet avenue\n", + " 1800 cloquet avenue\n", + " 2\n", + " 0.000036\n", + " 0.000036\n", + " 14580.390627\n", + " 0.233998\n", + " 55720\n", + " 55720\n", " 1\n", - " 0.002680\n", - " 0.002680\n", - " 22.598054\n", - " 15.835981\n", - " colchester\n", - " colchester\n", + " 0.000078\n", + " 0.000078\n", + " 1148.002189\n", + " 8.265075\n", + " cloquet\n", + " cloquet\n", " 2\n", - " 0.000198\n", - " 0.000198\n", - " 215.559681\n", - " 20.959208\n", - " KRN MNTN PWR KRPRXN\n", - " KRN MNTN PWR KRPRXN\n", - " 2001\n", - " 2001\n", + " 0.000078\n", + " 0.000078\n", + " 126.999683\n", + " 88.866289\n", + " TMNT BRNTS\n", + " TMNT BRNTS\n", + " [1800, cloquet, avenue]\n", + " [1800, cloquet, avenue]\n", " 0\n", " \n", " \n", - " 10051\n", - " 45.026591\n", + " 485070\n", + " 27.655362\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 177702\n", - " 67479\n", - " green mountain power corporation\n", - " green mountain power corporation\n", - " 3\n", - " 9751.372250\n", - " 163 acorn lane\n", - " 163 acorn lane\n", - " 3\n", - " 24859.333063\n", - " 05446\n", - " 05446\n", - " 1\n", - " 0.000143\n", - " 0.000143\n", - " 1447.988342\n", - " 2.894003\n", - " vt\n", - " vt\n", + " 50420\n", + " 95697\n", + 
" gulf power company\n", + " gulf power company\n", + " 2\n", + " 0.000038\n", + " 0.000038\n", + " 7612.680596\n", + " 0.028490\n", + " one energy place\n", + " one energy place\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 14580.390627\n", + " 0.350997\n", + " 32520\n", + " 32520\n", " 1\n", - " 0.002680\n", - " 0.002680\n", - " 22.598054\n", - " 15.835981\n", - " colchester\n", - " colchester\n", + " 0.000056\n", + " 0.000056\n", + " 1148.002189\n", + " 11.571104\n", + " pensacola\n", + " pensacola\n", " 2\n", - " 0.000198\n", - " 0.000198\n", - " 215.559681\n", - " 20.959208\n", - " KRN MNTN PWR KRPRXN\n", - " KRN MNTN PWR KRPRXN\n", - " 2005\n", - " 2005\n", + " 0.000111\n", + " 0.000111\n", + " 126.999683\n", + " 62.206402\n", + " KLF PWR\n", + " KLF PWR\n", + " [one, energy, place]\n", + " [one, energy, place]\n", " 0\n", " \n", " \n", - " 10050\n", - " 45.026591\n", + " 331565\n", + " 27.977290\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 177701\n", - " 67480\n", - " green mountain power corporation\n", - " green mountain power corporation\n", - " 3\n", - " 9751.372250\n", - " 163 acorn lane\n", - " 163 acorn lane\n", - " 3\n", - " 24859.333063\n", - " 05446\n", - " 05446\n", - " 1\n", - " 0.000143\n", - " 0.000143\n", - " 1447.988342\n", - " 2.894003\n", - " vt\n", - " vt\n", + " 170775\n", + " 78563\n", + " berry petroleum company\n", + " berry petroleum company\n", + " 2\n", + " 0.000096\n", + " 0.000096\n", + " 7612.680596\n", + " 0.011396\n", + " 28700 hovey hills rd\n", + " 28700 hovey hills rd\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 14580.390627\n", + " 0.350997\n", + " 93268\n", + " 93268\n", " 1\n", - " 0.002680\n", - " 0.002680\n", - " 22.598054\n", - " 15.835981\n", - " colchester\n", - " colchester\n", + " 0.000045\n", + " 0.000045\n", + " 1148.002189\n", + " 14.463881\n", + " taft\n", + " taft\n", " 2\n", - " 0.000198\n", - " 0.000198\n", - " 215.559681\n", - " 20.959208\n", - " KRN MNTN 
PWR KRPRXN\n", - " KRN MNTN PWR KRPRXN\n", - " 2004\n", - " 2004\n", + " 0.000045\n", + " 0.000045\n", + " 126.999683\n", + " 155.516006\n", + " BR PTRLM\n", + " BR PTRLM\n", + " [28700, hovey, hills, rd]\n", + " [28700, hovey, hills, rd]\n", " 0\n", " \n", " \n", - " 10049\n", - " 45.026591\n", + " 869341\n", + " 28.977290\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 177699\n", - " 67482\n", - " green mountain power corporation\n", - " green mountain power corporation\n", - " 3\n", - " 9751.372250\n", - " 163 acorn lane\n", - " 163 acorn lane\n", - " 3\n", - " 24859.333063\n", - " 05446\n", - " 05446\n", - " 1\n", - " 0.000143\n", - " 0.000143\n", - " 1447.988342\n", - " 2.894003\n", - " vt\n", - " vt\n", + " 39609\n", + " 141382\n", + " eme homer city generation limited partnership\n", + " eme homer city generation limited partnership\n", + " 2\n", + " 0.000038\n", + " 0.000038\n", + " 7612.680596\n", + " 0.028490\n", + " 1750 power plant road\n", + " 1750 power plant road\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 14580.390627\n", + " 0.350997\n", + " 15748\n", + " 15748\n", " 1\n", - " 0.002680\n", - " 0.002680\n", - " 22.598054\n", - " 15.835981\n", - " colchester\n", - " colchester\n", + " 0.000045\n", + " 0.000045\n", + " 1148.002189\n", + " 14.463881\n", + " homer city\n", + " homer city\n", " 2\n", - " 0.000198\n", - " 0.000198\n", - " 215.559681\n", - " 20.959208\n", - " KRN MNTN PWR KRPRXN\n", - " KRN MNTN PWR KRPRXN\n", - " 2002\n", - " 2002\n", + " 0.000056\n", + " 0.000056\n", + " 126.999683\n", + " 124.412805\n", + " EM HMR ST JNRXN\n", + " EM HMR ST JNRXN\n", + " [1750, power, plant, road]\n", + " [1750, power, plant, road]\n", " 0\n", " \n", " \n", - " 10035\n", - " 45.026591\n", + " 73212\n", + " 29.544331\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 177700\n", - " 67481\n", - " green mountain power corporation\n", - " green mountain power corporation\n", - " 
3\n", - " 9751.372250\n", - " 163 acorn lane\n", - " 163 acorn lane\n", - " 3\n", - " 24859.333063\n", - " 05446\n", - " 05446\n", - " 1\n", - " 0.000143\n", - " 0.000143\n", - " 1447.988342\n", - " 2.894003\n", - " vt\n", - " vt\n", + " 224681\n", + " 50859\n", + " selkirk cogen partners limited partnership\n", + " selkirk cogen partners limited partnership\n", + " 2\n", + " 0.000058\n", + " 0.000058\n", + " 7612.680596\n", + " 0.018993\n", + " 24 power park drive\n", + " 24 power park drive\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 14580.390627\n", + " 0.350997\n", + " 12158\n", + " 12158\n", " 1\n", - " 0.002680\n", - " 0.002680\n", - " 22.598054\n", - " 15.835981\n", - " colchester\n", - " colchester\n", + " 0.000034\n", + " 0.000034\n", + " 1148.002189\n", + " 19.285174\n", + " selkirk\n", + " selkirk\n", " 2\n", - " 0.000198\n", - " 0.000198\n", - " 215.559681\n", - " 20.959208\n", - " KRN MNTN PWR KRPRXN\n", - " KRN MNTN PWR KRPRXN\n", - " 2003\n", - " 2003\n", + " 0.000033\n", + " 0.000033\n", + " 126.999683\n", + " 207.354675\n", + " SLKRK KJN PRTNRS\n", + " SLKRK KJN PRTNRS\n", + " [24, power, park, drive]\n", + " [24, power, park, drive]\n", " 0\n", " \n", " \n", "\n", - "

12713 rows × 40 columns

\n", + "

3014 rows × 39 columns

\n", "" ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name bf_company_name street_address_l street_address_r gamma_street_address bf_street_address zip_code_l zip_code_r gamma_zip_code tf_zip_code_l tf_zip_code_r bf_zip_code bf_tf_adj_zip_code state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r report_year_l report_year_r match_key\n", - "11211 0.054332 0.509414 __splink__input_table_0 __splink__input_table_1 85762 68295 citi trends incorporated georgia pacific corporation 1 1.462842 104 coleman boulevard None -1 1.000000 31408 31326 0 0.000045 0.000103 0.402918 1.000000 ga ga 1 0.023374 0.023374 22.598054 1.815434 savannah savannah 2 0.000454 0.000454 215.559681 9.129471 ST TRNTS INKRPRTT JRJ PSFK KRPRXN 2021 2008 0\n", - "11666 0.098035 0.516982 __splink__input_table_0 __splink__input_table_1 94615 75114 chicopee bancorp, incorporated chicopee city of 0 0.845800 70 center street 725 front street 0 0.844089 01013 01021 0 0.000036 0.000061 0.402918 1.000000 ma ma 1 0.042950 0.042950 22.598054 0.987961 chicopee chicopee 2 0.000117 0.000117 215.559681 35.431042 XKP BNKRP INKRPRTT XKP ST OF 2012 2012 0\n", - "11665 0.098035 0.516982 __splink__input_table_0 __splink__input_table_1 94614 75115 chicopee bancorp, incorporated chicopee city of 0 0.845800 70 center street 725 front street 0 0.844089 01013 01021 0 0.000036 0.000061 0.402918 1.000000 ma ma 1 0.042950 0.042950 22.598054 0.987961 chicopee chicopee 2 0.000117 0.000117 215.559681 35.431042 XKP BNKRP INKRPRTT XKP ST OF 2011 2011 0\n", - "11668 0.098035 0.516982 __splink__input_table_0 __splink__input_table_1 94618 75118 chicopee bancorp, incorporated chicopee city of 0 0.845800 70 center street 725 front street 0 0.844089 01013 01021 0 0.000036 0.000061 0.402918 1.000000 ma 
ma 1 0.042950 0.042950 22.598054 0.987961 chicopee chicopee 2 0.000117 0.000117 215.559681 35.431042 XKP BNKRP INKRPRTT XKP ST OF 2008 2008 0\n", - "11669 0.098035 0.516982 __splink__input_table_0 __splink__input_table_1 94620 75116 chicopee bancorp, incorporated chicopee city of 0 0.845800 70 center street p o box 405 0 0.844089 01013 01021 0 0.000036 0.000061 0.402918 1.000000 ma ma 1 0.042950 0.042950 22.598054 0.987961 chicopee chicopee 2 0.000117 0.000117 215.559681 35.431042 XKP BNKRP INKRPRTT XKP ST OF 2010 2010 0\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "10043 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177698 67483 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2001 2001 0\n", - "10051 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177702 67479 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2005 2005 0\n", - "10050 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177701 67480 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2004 2004 0\n", - 
"10049 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177699 67482 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2002 2002 0\n", - "10035 45.026591 1.000000 __splink__input_table_0 __splink__input_table_1 177700 67481 green mountain power corporation green mountain power corporation 3 9751.372250 163 acorn lane 163 acorn lane 3 24859.333063 05446 05446 1 0.000143 0.000143 1447.988342 2.894003 vt vt 1 0.002680 0.002680 22.598054 15.835981 colchester colchester 2 0.000198 0.000198 215.559681 20.959208 KRN MNTN PWR KRPRXN KRN MNTN PWR KRPRXN 2003 2003 0\n", - "\n", - "[12713 rows x 40 columns]" + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name tf_company_name_l tf_company_name_r bf_company_name bf_tf_adj_company_name street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address zip_code_l zip_code_r gamma_zip_code tf_zip_code_l tf_zip_code_r bf_zip_code bf_tf_adj_zip_code city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r street_address_list_l street_address_list_r match_key\n", + "199607 4.265490 0.950575 __splink__input_table_0 __splink__input_table_1 20077 117512 prt group incorporated pratt and whitney power systems 0 0.000019 0.000010 0.991220 1.000000 80 lamberton rd mail stop 191-13 0 0.000036 0.000012 0.865948 1.000000 06095 06095 1 0.000191 0.000191 1148.002189 3.403266 windsor windsor 2 0.000279 0.000279 126.999683 24.882561 PRT KRP PRT ANT HTN PWR SSTMS [80, lamberton, rd] [mail, stop, 191-13] 0\n", + "12041 4.277468 0.950964 
__splink__input_table_0 __splink__input_table_1 219453 113555 cogentrix energy incorporated green country energy limited liability company 0 0.000019 0.000038 0.991220 1.000000 9405 arrowpoint blvd 9405 arrowpoint blvd 2 0.000534 0.000534 14580.390627 0.015600 28273 28273 1 0.001256 0.001256 1148.002189 0.516567 charlotte chalotte 1 0.014155 0.000022 79.923487 1.000000 KJNTRKS ENRJ KRN KNTR ENRJ [9405, arrowpoint, blvd] [9405, arrowpoint, blvd] 1\n", + "12805 4.277468 0.950964 __splink__input_table_0 __splink__input_table_1 219453 115755 cogentrix energy incorporated jackson county power limited liability company 0 0.000019 0.000029 0.991220 1.000000 9405 arrowpoint blvd 9405 arrowpoint blvd 2 0.000534 0.000534 14580.390627 0.015600 28273 28273 1 0.001256 0.001256 1148.002189 0.516567 charlotte chaarlotte 1 0.014155 0.000011 79.923487 1.000000 KJNTRKS ENRJ JKSN KNT PWR [9405, arrowpoint, blvd] [9405, arrowpoint, blvd] 1\n", + "8137 4.278093 0.950984 __splink__input_table_0 __splink__input_table_1 64813 3879 rand logistics incorporated norridgewock river road solar limited liabilit... 0 0.000029 0.000019 0.991220 1.000000 333 washington street 333 washington street 2 0.001056 0.001056 14580.390627 0.007888 07302 07302 1 0.002332 0.002332 1148.002189 0.278152 jersey city jersey city 2 0.002998 0.002998 126.999683 2.312506 RNT LJSTKS NRJWK RFR RT SLR [333, washington, street] [333, washington, street] 1\n", + "8136 4.278093 0.950984 __splink__input_table_0 __splink__input_table_1 64813 5193 rand logistics incorporated anderson solar farm limited liability company 0 0.000029 0.000029 0.991220 1.000000 333 washington street 333 washington street 2 0.001056 0.001056 14580.390627 0.007888 07302 07302 1 0.002332 0.002332 1148.002189 0.278152 jersey city jersey city 2 0.002998 0.002998 126.999683 2.312506 RNT LJSTKS ANTRSN SLR FRM [333, washington, street] [333, washington, street] 1\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "199278 27.514584 1.000000 __splink__input_table_0 __splink__input_table_1 27759 142183 diamond brands incorporated diamond brands incorporated 2 0.000029 0.000029 7612.680596 0.037986 1800 cloquet avenue 1800 cloquet avenue 2 0.000036 0.000036 14580.390627 0.233998 55720 55720 1 0.000078 0.000078 1148.002189 8.265075 cloquet cloquet 2 0.000078 0.000078 126.999683 88.866289 TMNT BRNTS TMNT BRNTS [1800, cloquet, avenue] [1800, cloquet, avenue] 0\n", + "485070 27.655362 1.000000 __splink__input_table_0 __splink__input_table_1 50420 95697 gulf power company gulf power company 2 0.000038 0.000038 7612.680596 0.028490 one energy place one energy place 2 0.000024 0.000024 14580.390627 0.350997 32520 32520 1 0.000056 0.000056 1148.002189 11.571104 pensacola pensacola 2 0.000111 0.000111 126.999683 62.206402 KLF PWR KLF PWR [one, energy, place] [one, energy, place] 0\n", + "331565 27.977290 1.000000 __splink__input_table_0 __splink__input_table_1 170775 78563 berry petroleum company berry petroleum company 2 0.000096 0.000096 7612.680596 0.011396 28700 hovey hills rd 28700 hovey hills rd 2 0.000024 0.000024 14580.390627 0.350997 93268 93268 1 0.000045 0.000045 1148.002189 14.463881 taft taft 2 0.000045 0.000045 126.999683 155.516006 BR PTRLM BR PTRLM [28700, hovey, hills, rd] [28700, hovey, hills, rd] 0\n", + "869341 28.977290 1.000000 __splink__input_table_0 __splink__input_table_1 39609 141382 eme homer city generation limited partnership eme homer city generation limited partnership 2 0.000038 0.000038 7612.680596 0.028490 1750 power plant road 1750 power plant road 2 0.000024 0.000024 14580.390627 0.350997 15748 15748 1 0.000045 0.000045 1148.002189 14.463881 homer city homer city 2 0.000056 0.000056 126.999683 124.412805 EM HMR ST JNRXN EM HMR ST JNRXN [1750, power, plant, road] [1750, power, plant, road] 0\n", + "73212 29.544331 1.000000 __splink__input_table_0 __splink__input_table_1 
224681 50859 selkirk cogen partners limited partnership selkirk cogen partners limited partnership 2 0.000058 0.000058 7612.680596 0.018993 24 power park drive 24 power park drive 2 0.000024 0.000024 14580.390627 0.350997 12158 12158 1 0.000034 0.000034 1148.002189 19.285174 selkirk selkirk 2 0.000033 0.000033 126.999683 207.354675 SLKRK KJN PRTNRS SLKRK KJN PRTNRS [24, power, park, drive] [24, power, park, drive] 0\n", + "\n", + "[3014 rows x 39 columns]" ] }, - "execution_count": 392, + "execution_count": 192, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preds_df.sort_values(by=\"match_probability\")" + "preds_df[preds_df.match_probability >= .95].sort_values(by=\"match_probability\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "4f63fb3d-5fac-476d-9271-347412121902", + "id": "288ffe20-c69e-4c96-8835-765c06303bf2", "metadata": {}, "outputs": [], "source": [] diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 6a33b78..2974628 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -26,13 +26,14 @@ mlflow_train_test_io_managers, ) -from . import basic_10k, ex_21, extract +from . 
import basic_10k, ex_21, extract, sec_output_table from .utils.cloud import cloud_interface_resource basic_10k_assets = load_assets_from_modules([basic_10k]) ex21_assets = load_assets_from_package_module(ex_21) ex21_data_assets = load_assets_from_modules([ex_21.data]) shared_assets = load_assets_from_modules([extract]) +sec_output_assets = load_assets_from_modules([sec_output_table]) basic_10k_production_job = model_jobs.create_production_model_job( "basic_10k_extraction", @@ -53,6 +54,9 @@ ], ) +sec_output_table_production_job = model_jobs.create_production_model_job( + "sec_output_table_creation", sec_output_table.production_assets +) exhibit21_extractor = define_dagstermill_asset( name="train_exhibit21_extractor", @@ -97,13 +101,15 @@ + ex21_assets + shared_assets + [exhibit21_extractor, exhibit21_layout_classifier] - + ex21_data_assets, + + ex21_data_assets + + sec_output_assets, jobs=[ basic_10k_production_job, basic_10k_validation_job, ex21_production_job, ex21_training_job, ex21_layout_classifier_training_job, + sec_output_table_production_job, ], resources={ "cloud_interface": cloud_interface_resource, diff --git a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py new file mode 100644 index 0000000..6f0f900 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py @@ -0,0 +1,327 @@ +"""Module for creating an SEC 10K output table with filing companies and subsidiary companies.""" + +import re +from importlib import resources +from pathlib import Path + +import numpy as np +import pandas as pd +from dagster import AssetIn, AssetOut, multi_asset + +from mozilla_sec_eia.models.sec10k.utils.cloud import ( + GCSArchive, + convert_ex21_id_to_filename, +) +from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import ( + company_name_cleaner, +) + +# TODO: should this be a shared asset? Can you use the existing sec_10k_filing_metadata with all year quarters? 
+archive = GCSArchive() +md = archive.get_metadata() + +INVALID_NAMES = [ + "llc", + "limited liability company", + "limited", + "ltd", + "iiii", + "inc", + "incorporated", + "partnership", + "i", + "name", + "company", + "&", + "", +] + + +def _remove_weird_sec_cols(sec_df) -> pd.DataFrame: + weird_cols = ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"] + for weird_col in weird_cols: + if weird_col not in sec_df: + continue + normal_col = weird_col[1:] + sec_df.loc[:, normal_col] = sec_df[normal_col].where( + sec_df[weird_col].isnull(), sec_df[weird_col] + ) + sec_df = sec_df.drop(columns=[weird_col]) + return sec_df + + +def _add_report_year_to_sec(sec_df) -> pd.DataFrame: + """Merge metadata on to get a report year for extracted SEC data. + + Expects filename to be the index of the SEC dataframe. + """ + sec_df = sec_df.merge( + md[["date_filed"]], how="left", left_index=True, right_index=True + ) + sec_df.loc[:, "report_year"] = ( + sec_df["report_date"].astype("datetime64[ns]").dt.year + ) + return sec_df + + +def _flatten_sec_companies_across_time(sec_df) -> pd.DataFrame: + """Keep only the most recent record for each unique SEC CIK. + + Note that this drops old records for companies that have changed + names or addresses across time. + TODO: create an asset that tracks name and address chnages across + time. + """ + sec_df = _add_report_year_to_sec(sec_df) + sec_df = ( + sec_df.sort_values(by="report_year", ascending=False) + .groupby("central_index_key") + .first() + ) + return sec_df + + +def get_sec_state_code_dict() -> dict[str, str]: + """Create a dictionary mapping state codes to their names. + + Table found at https://www.sec.gov/submit-filings/filer-support-resources/edgar-state-country-codes + Published by SEC and reports valid state codes + for filers of Form D. Used to standardize the state codes + in the SEC 10K filings. 
The expanded names of the state codes + are comments in the XML file, so we have to read the XML in as + text and parse it. + """ + # TODO: make a check to see if SEC has published a new version of this table + xml_filepath = ( + resources.files("mozilla_sec_eia.package_data") / "formDStateCodes.xsd.xml" + ) + with Path.open(xml_filepath) as file: + xml_text = file.read() + + pattern = r'.*?' + state_code_dict = { + code.lower(): name.lower() + for code, name in re.findall(pattern, xml_text, re.DOTALL) + } + return state_code_dict + + +def clean_loc_of_incorporation(df) -> pd.DataFrame: + """Clean location of incorporation column in SEC basic 10K or Ex. 21 dataframe. + + Arguments: + df: Ex. 21 or SEC 10K basic info dataframe with loc_of_incorporation + column. + """ + state_code_to_name = get_sec_state_code_dict() + df.loc[:, "loc_of_incorporation"] = df["state_of_incorporation"].replace( + state_code_to_name + ) + df["loc_of_incorporation"] = ( + df["loc_of_incorporation"] + .fillna(pd.NA) + .apply(lambda x: x.str.strip().str.lower()) + .replace("", pd.NA) + ) + return df + + +def clean_company_name(df) -> pd.DataFrame: + """Clean company name column in SEC basic 10K or Ex. 21 dataframe. + + Arguments: + df: Ex. 21 or SEC 10K basic info dataframe with company_name + column. + """ + df["company_name"] = ( + df["company_name"] + .fillna(pd.NA) + .apply(lambda x: x.str.strip().str.lower()) + .replace("", pd.NA) + ) + df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning( + df[["company_name"]] + ).str.strip() + df = df[ + (~df["company_name"].isin(INVALID_NAMES)) + & (~df["company_name_clean"].isin(INVALID_NAMES)) + ] + df = df.fillna(np.nan) + + return df + + +def add_parent_company_cik(ex21_df: pd.DataFrame) -> pd.DataFrame: + """Add the CIK of the parent company to Ex. 
21 subsidiaries.""" + ex21_df = ex21_df.merge( + md["cik"], how="left", left_on="filename", right_index=True + ).rename(columns={"cik": "parent_company_cik"}) + + +def match_ex21_subsidiaries_to_filer_company( + basic10k_df: pd.DataFrame, ex21_df: pd.DataFrame +) -> pd.DataFrame: + """Match Ex. 21 subsidiaries to filer companies. + + We want to assign CIKs to Ex. 21 subsidiaries if they in turn + file a 10k. To do this, we merge the Ex. 21 subsidiaries to 10k + filers on comapny name. If there are multiple matches with the same + company name we take the company with the most overlap in location of + incorporation and nearest report years. Then we merge the CIK back onto + the Ex. 21 df. + + Returns: + A dataframe of the Ex. 21 subsidiaries with a column for the + subsidiaries CIK (null if the subsidiary doesn't file). + """ + basic10k_df = basic10k_df.drop_duplicates( + subset=[ + "central_index_key", + "company_name", + "loc_of_incorporation", + "report_year", + ] + ) + merged_df = basic10k_df.merge( + ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21") + ) + # split up the location of incorporation on whitespace, creating a column + # with lists of word tokens + merged_df.loc[:, "loc_tokens_sec"] = ( + merged_df["loc_of_incorporation_sec"].fillna("").str.lower().str.split() + ) + merged_df.loc[:, "loc_tokens_ex21"] = ( + merged_df["loc_of_incorporation_ex21"].fillna("").str.lower().str.split() + ) + # get the number of words overlapping between location of incorporation tokens + merged_df["loc_overlap"] = merged_df.apply( + lambda row: len(set(row["loc_tokens_sec"]) & set(row["loc_tokens_ex21"])), + axis=1, + ) + # get the difference in report years + merged_df["report_year_diff"] = merged_df.apply( + lambda row: abs(int(row["report_year_sec"]) - int(row["report_year_ex21"])), + axis=1, + ) + merged_df = merged_df.sort_values( + by=[ + "company_name", + "loc_of_incorporation_ex21", + "loc_overlap", + "report_year_diff", + ], + ascending=[True, 
True, False, True], + ) + # Select the row with the highest loc overlap and nearest report years + # for each company name and location pair + closest_match_df = merged_df.groupby( + ["company_name", "loc_of_incorporation_ex21"], as_index=False + ).first() + ex21_with_cik_df = ex21_df.merge( + closest_match_df[ + ["company_name", "central_index_key", "loc_of_incorporation_ex21"] + ].rename(columns={"loc_of_incorporation_ex21": "loc_of_incorporation"}), + how="left", + on=["company_name", "loc_of_incorporation"], + ).rename(columns={"central_index_key": "subsidiary_cik"}) + # if a subsidiary doesn't have a CIK and has a null location + # but its company name was assigned a CIK (with a different location) + # then assign that CIK to the subsidiary + ex21_with_cik_df = ex21_with_cik_df.merge( + closest_match_df[["company_name", "central_index_key"]], + how="left", + on="company_name", + ).rename(columns={"central_index_key": "company_name_merge_cik"}) + ex21_with_cik_df["subsidiary_cik"] = ex21_with_cik_df["subsidiary_cik"].where( + ~(ex21_with_cik_df.subsidiary_cik.isnull()) + | ~(ex21_with_cik_df.loc_of_incorporation.isnull()), + ex21_with_cik_df["company_name_merge_cik"], + ) + ex21_with_cik_df = ex21_with_cik_df.rename( + columns={"subsidiary_cik": "central_index_key"} + ) + return ex21_with_cik_df + + +@multi_asset( + ins={ + "ex21_df": AssetIn("ex21_company_ownership_info"), + }, + outs={ + "clean_ex21_subsidiary_table": AssetOut( + io_manager_key="pandas_parquet_io_manager", + ) + }, +) +def clean_ex21_table(ex21_df: pd.DataFrame) -> pd.DataFrame: + """Clean Ex. 21 table of subsidiaries before combing with basic 10k table.""" + ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df) + ex21_df = clean_loc_of_incorporation(ex21_df) + ex21_df = clean_company_name(ex21_df) + ex21_df = add_parent_company_cik(ex21_df) + # flatten out the Ex. 
21 table + ex21_df = ex21_df.drop_duplicates( + subset=["parent_company_cik", "company_name", "loc_of_incorporation"] + ) + return ex21_df + + +@multi_asset( + ins={ + "basic_10k_df": AssetIn("basic_10k_company_info"), + "clean_ex21_df": AssetIn("clean_ex21_subsidiary_table"), + # specify an io_manager_key? + }, + outs={ + "out_sec_10k__parents_and_subsidiaries": AssetOut( + io_manager_key="pandas_parquet_io_manager", + # specify a dagster_type? + ), + }, +) +def sec_output_table( + basic_10k_df: pd.DataFrame, clean_ex21_df: pd.DataFrame +) -> pd.DataFrame: + """Asset for creating an SEC 10K output table. + + Flatten the table across time to only keep the most recent record + for each CIK. Add in Ex. 21 subsidiaries and link them to already present + filing companies. Create an sec_company_id for subsidiaries that aren't linked + to a CIK. + """ + basic_10k_df = basic_10k_df.reset_index().pivot_table( + values="value", index="filename", columns="key", aggfunc="first" + ) + basic_10k_df.columns.name = None + basic_10k_df = _remove_weird_sec_cols(basic_10k_df) + + # add a location of incorporation to better match it to Ex. 
21 subsidiaries + basic_10k_df = clean_loc_of_incorporation(basic_10k_df) + basic_10k_df = clean_company_name(basic_10k_df) + ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company( + basic10k_df=basic_10k_df, ex21_df=clean_ex21_df + ) + basic_10k_df = basic_10k_df.merge( + ex21_df_with_cik[["central_index_key", "parent_company_cik", "own_per"]], + how="left", + on="central_index_key", + ) + basic_10k_df.loc[:, "files_10k"] = True + basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"] + ex21_non_filing_subs_df = ex21_df_with_cik[ + ex21_df_with_cik["central_index_key"].isnull() + ] + ex21_non_filing_subs_df.loc[:, "files_10k"] = False + # create a sec_company_id for the subsidiaries that don't have a CIK + ex21_non_filing_subs_df.loc[:, "sec_company_id"] = ( + ex21_non_filing_subs_df["company_name"].str + + ex21_non_filing_subs_df["loc_of_incorporation"].str + ) + out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df]) + # this drops records for earlier company names and addresses + # that have since changed, so we lose some information + out_df = _flatten_sec_companies_across_time(out_df) + return out_df + + +production_assets = [sec_output_table] diff --git a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py index 1a5ec96..f2e284a 100644 --- a/src/mozilla_sec_eia/models/sec10k/utils/cloud.py +++ b/src/mozilla_sec_eia/models/sec10k/utils/cloud.py @@ -29,6 +29,20 @@ def _compute_md5(file_path: Path) -> str: return base64.b64encode(hash_md5.digest()).decode() +def convert_ex21_id_to_filename(df: pd.DataFrame, id_col_name: str = "id"): + """Convert the ID column to GCS archive filenames. + + The extracted Ex. 21 tables have an ID that doesn't match + the filenames in the GCS archive. Create a new column "filename" + that converts this ID column into the GCS archive filename + for that filing. 
+ """ + df.loc[:, "filename"] = ( + "edgar/data/" + df[id_col_name].str.replace("-", "/", n=1) + ".txt" + ) + return df + + class Exhibit21(BaseModel): """This is a class to wrap Exhibit 21's, which are included in many SEC 10ks.""" diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py new file mode 100644 index 0000000..d0266b9 --- /dev/null +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py @@ -0,0 +1,76 @@ +"""Create an EIA input utilities table that's ready for record linkage with the SEC 10K companies.""" + +import pandas as pd + + +# TODO: make Dagster inputs instead of reading from AWS? +def get_eia861_utilities_table(): + """Get the utilities contained in EIA Form 861. + + TODO: In PUDL we should eventually implement an actual thorough + harvesting of utilities from all EIA Form 861 tables, but this is + good enough for now. + """ + raw_eia861_df = pd.read_parquet( + "s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet" + ) + harvested_df = pd.concat( + [ + pd.read_parquet( + "s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet" + )[["report_date", "utility_id_eia", "utility_name_eia"]], + pd.read_parquet( + "s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet" + )[["report_date", "utility_id_eia", "utility_name_eia"]], + pd.read_parquet( + "s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet" + )[["report_date", "utility_id_eia", "utility_name_eia"]], + pd.read_parquet( + "s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet" + )[["report_date", "utility_id_eia", "utility_name_eia"]], + ] + ) + eia861_df = raw_eia861_df.merge( + harvested_df, on=["report_date", "utility_id_eia"], how="left" + ).drop_duplicates(subset=["report_date", "utility_id_eia"]) + mergers_df = pd.read_parquet( + 
"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet" + ) + mergers_df = mergers_df[mergers_df["new_parent"].notna()] + eia861_df = eia861_df.merge( + mergers_df[ + ["report_date", "new_parent", "merge_address", "merge_city", "merge_state"] + ], + how="left", + left_on=["report_date", "utility_name_eia"], + right_on=["report_date", "new_parent"], + ) + eia861_df = eia861_df.rename( + columns={"merge_address": "street_address", "merge_city": "city"} + ) + eia861_df = ( + eia861_df.groupby(["report_date", "utility_id_eia"]).first().reset_index() + ) + + eia861_df["state"] = eia861_df["state"].where( + eia861_df["merge_state"].isnull(), eia861_df["merge_state"] + ) + eia861_df = eia861_df.drop(columns=["new_parent", "merge_state"]) + return eia861_df + + +# TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS? +def get_eia_utilities_table(): + """Create a table of EIA Form 860 and 861 utilities.""" + raw_eia_df = pd.read_parquet( + "s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet" + ) + eia861_df = get_eia861_utilities_table() + eia_df = pd.concat([raw_eia_df, eia861_df]) + eia_df = eia_df.drop_duplicates( + subset=["utility_id_eia", "report_date"], keep="first" + ) + eia_df["report_date"] = eia_df["report_date"].astype("datetime64[ns]") + # there are nulls from non harvested 861 utilities + eia_df = eia_df.dropna(subset="utility_name_eia") + return eia_df diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py index ebb7843..12c4704 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py @@ -1,5 +1,9 @@ """Preprocessing for EIA and SEC input data before record linkage.""" +import re +from importlib import resources +from pathlib import Path + import jellyfish import numpy as np import pandas as pd @@ -60,82 +64,15 @@ "", ] 
-state_code_dict = { - # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#States. - "AK": "Alaska", - "AL": "Alabama", - "AR": "Arkansas", - "AZ": "Arizona", - "CA": "California", - "CO": "Colorado", - "CT": "Connecticut", - "DE": "Delaware", - "FL": "Florida", - "GA": "Georgia", - "HI": "Hawaii", - "IA": "Iowa", - "ID": "Idaho", - "IL": "Illinois", - "IN": "Indiana", - "KS": "Kansas", - "KY": "Kentucky", - "LA": "Louisiana", - "MA": "Massachusetts", - "MD": "Maryland", - "ME": "Maine", - "MI": "Michigan", - "MN": "Minnesota", - "MO": "Missouri", - "MS": "Mississippi", - "MT": "Montana", - "NC": "North Carolina", - "ND": "North Dakota", - "NE": "Nebraska", - "NH": "New Hampshire", - "NJ": "New Jersey", - "NM": "New Mexico", - "NV": "Nevada", - "NY": "New York", - "OH": "Ohio", - "OK": "Oklahoma", - "OR": "Oregon", - "PA": "Pennsylvania", - "RI": "Rhode Island", - "SC": "South Carolina", - "SD": "South Dakota", - "TN": "Tennessee", - "TX": "Texas", - "UT": "Utah", - "VA": "Virginia", - "VT": "Vermont", - "WA": "Washington", - "WI": "Wisconsin", - "WV": "West Virginia", - "WY": "Wyoming", - # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Federal_district. - "DC": "District of Columbia", - # https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States#Inhabited_territories. - "AS": "American Samoa", - "GU": "Guam GU", - "MP": "Northern Mariana Islands", - "PR": "Puerto Rico PR", - "VI": "U.S. 
Virgin Islands", -} -state_code_to_name = {k.lower(): v.lower() for k, v in state_code_dict.items()} company_name_cleaner = name_cleaner.CompanyNameCleaner( cleaning_rules_list=[ "remove_word_the_from_the_end", "remove_word_the_from_the_beginning", - "replace_amperstand_between_space_by_AND", + "replace_ampersand_by_AND", "replace_hyphen_by_space", - "replace_hyphen_between_spaces_by_single_space", "replace_underscore_by_space", - "replace_underscore_between_spaces_by_single_space", - # "remove_all_punctuation", - # "remove_numbers", - # "remove_math_symbols", - "remove_words_in_parentheses", + "remove_text_punctuation", "remove_parentheses", "remove_brackets", "remove_curly_brackets", @@ -143,7 +80,38 @@ ] ) +legal_term_remover = name_cleaner.CompanyNameCleaner( + cleaning_rules_list=[], handle_legal_terms=2 +) + + +# TODO: remove +def get_sec_state_code_dict(): + """Create a dictionary mapping state codes to their names. + + Table found at https://www.sec.gov/submit-filings/filer-support-resources/edgar-state-country-codes + Published by SEC and reports valid state codes + for filers of Form D. Used to standardize the state codes + in the SEC 10K filings. The expanded names of the state codes + are comments in the XML file, so we have to read the XML in as + text and parse it. + """ + # TODO: make a check to see if SEC has published a new version of this table + xml_filepath = ( + resources.files("mozilla_sec_eia.package_data") / "formDStateCodes.xsd.xml" + ) + with Path.open(xml_filepath) as file: + xml_text = file.read() + + pattern = r'.*?' + state_code_dict = { + code.lower(): name.lower() + for code, name in re.findall(pattern, xml_text, re.DOTALL) + } + return state_code_dict + +# TODO: moved to output table module, take out def _add_report_year_to_sec(sec_df): """Merge metadata on to get a report year for extracted SEC data. 
@@ -151,9 +119,13 @@ def _add_report_year_to_sec(sec_df): """ archive = GCSArchive() md = archive.get_metadata() - return sec_df.merge( + sec_df = sec_df.merge( md[["date_filed"]], how="left", left_index=True, right_index=True ) + sec_df.loc[:, "report_year"] = ( + sec_df["report_date"].astype("datetime64[ns]").dt.year + ) + return sec_df # TODO: this is in PUDL, pull out into helper function @@ -163,6 +135,7 @@ def _get_metaphone(row, col_name): return jellyfish.metaphone(row[col_name]) +# TODO: deduplicate this with what's already been done def _clean_company_name(df): df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning( df[["company_name"]] @@ -171,9 +144,13 @@ def _clean_company_name(df): df = df.rename(columns={"company_name": "company_name_raw"}).rename( columns={"company_name_clean": "company_name"} ) + df.loc[:, "company_name_no_legal"] = legal_term_remover.apply_name_cleaning( + df[["company_name"]] + ) return df +# TODO: deduplicate this with what's already been done def clean_sec_df(df): """Shared cleaning for SEC 10K and Ex. 21 dataframes. 
@@ -185,29 +162,32 @@ def clean_sec_df(df): df[["company_name", "loc_of_incorporation"]] .fillna(pd.NA) .apply(lambda x: x.str.strip().str.lower()) + .replace("", pd.NA) ) - df.loc[:, "company_name"] = df["company_name"].replace("", pd.NA) - df.loc[:, "loc_of_incorporation"] = df["loc_of_incorporation"].replace("", pd.NA) df = _clean_company_name(df) + df.loc[:, "company_name_mphone"] = df.apply( + _get_metaphone, axis=1, args=("company_name_no_legal",) + ) df = df[ (~df["company_name"].isin(INVALID_NAMES)) - & ~(df["company_name_raw"].isin(INVALID_NAMES)) + & (~df["company_name_raw"].isin(INVALID_NAMES)) ] df = df.fillna(np.nan) - df = df.drop_duplicates( - subset=["company_name", "loc_of_incorporation", "report_year"] - ) + return df +# TODO: moved to output table module, take out def _remove_weird_sec_cols(sec_df): - for weird_col in ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"]: + weird_cols = ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"] + for weird_col in weird_cols: if weird_col not in sec_df: continue normal_col = weird_col[1:] sec_df.loc[:, normal_col] = sec_df[normal_col].where( sec_df[weird_col].isnull(), sec_df[weird_col] ) + sec_df = sec_df.drop(columns=[weird_col]) return sec_df @@ -215,26 +195,35 @@ def _remove_weird_sec_cols(sec_df): # later unite them into one cleaning function def prepare_sec10k_basic_info_df(sec_df): """Preprocess SEC 10k basic information dataframe for record linkage.""" - sec_df = _add_report_year_to_sec(sec_df) + # sec_df = _add_report_year_to_sec(sec_df) sec_df = sec_df.rename(columns=SEC_COL_MAP).reset_index() - sec_df.loc[:, "report_year"] = ( - sec_df["report_date"].astype("datetime64[ns]").dt.year - ) - sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace( - state_code_to_name - ) + # state_code_to_name = get_sec_state_code_dict() + # sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace( + # state_code_to_name + # ) # TODO: maybe 
shouldn't expand the state names and comparison should # just be an exact match or nothing? # sec_df.loc[:, "state"] = sec_df["state"].replace(state_code_to_name) # TODO: needs a record_id_sec column? # sec_df = sec_df.rename(columns={"record_id_sec": "record_id"}) - sec_df = _remove_weird_sec_cols(sec_df) + # sec_df = _remove_weird_sec_cols(sec_df) sec_df = clean_sec_df(sec_df) sec_df[STR_COLS] = sec_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) - sec_df.loc[:, "company_name_mphone"] = sec_df.apply( - _get_metaphone, axis=1, args=("company_name",) + # TODO: cluster/mark these duplicates so they can be assigned + # IDs post matching + sec_df = sec_df.drop_duplicates( + subset=[ + "central_index_key", + "report_year", + "company_name", + "standard_industrial_classification", + "city", + "state", + "street_address", + "zip_code", + ] ) - sec_df = sec_df.reset_index(names="record_id") + sec_df.loc[:, "sec_company_id"] = sec_df["central_index_key"] return sec_df @@ -242,14 +231,20 @@ def prepare_ex21_df(ex21_df): """Preprocess Ex. 21 extracted dataframe for record linkage.""" ex21_df = ex21_df.rename(columns=EX21_COL_MAP) # TODO: move this to general preprocessing function? + state_code_to_name = get_sec_state_code_dict() ex21_df.loc[:, "loc_of_incorporation"] = ex21_df["loc_of_incorporation"].replace( state_code_to_name ) + name_to_state_code = {v: k for k, v in state_code_to_name.items()} + # need this? 
+ ex21_df.loc[:, "state_of_incorporation"] = ex21_df["loc_of_incorporation"].replace( + name_to_state_code + ) ex21_df = clean_sec_df(ex21_df) - ex21_df.loc[:, "company_name_mphone"] = ex21_df.apply( - _get_metaphone, axis=1, args=("company_name",) + ex21_df = ex21_df.drop_duplicates( + subset=["company_name", "loc_of_incorporation", "report_year"] ) - ex21_df = ex21_df.reset_index(names="record_id") + # ex21_df = ex21_df.reset_index(drop=True).reset_index(names="record_id") return ex21_df @@ -263,26 +258,35 @@ def prepare_eia_df(eia_df): eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) eia_df = _clean_company_name(eia_df) eia_df.loc[:, "company_name_mphone"] = eia_df.apply( - _get_metaphone, axis=1, args=("company_name",) + _get_metaphone, axis=1, args=("company_name_no_legal",) ) - eia_df = eia_df.reset_index(names="record_id") + eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id") return eia_df -""" -def preprocessing(eia_df, sec_df): - # TODO: reorganize to be more similar to ferc to eia match structure - eia_df = eia_df.rename(columns=EIA_COL_MAP) +def add_sec_company_id_to_subsidiaries(ex21_df: pd.DataFrame): + """Add sec_company_id onto SEC Ex. 21 subsidiaries. - # TODO: fill out this prepare for matching function - # eia_df = prepare_for_matching(eia_df) - # sec_df = prepare_for_matching(sec_df) - sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace( - state_code_to_name + At this point, the passed in Ex. 21 dataframe should have been + matched to SEC 10K filers with record linkage and assigned a CIK + where applicable (if the subsidiary files with the SEC). Take the + subsidiaries that don't have a CIK and create an sec_company_id + for those companies. + + Arguments: + ex21_df: A dataframe of subsidiaries from SEC Ex. 21 filings with + columns subsidiary_cik, company_name (of the subsidiary), + and loc_of_incorporation. 
+ """ + ex21_df = ex21_df.sort_values(by="parent_cik") + ex21_df = ex21_df.drop_duplicates(subset=["company_name", "loc_of_incorporation"]) + ex21_df.loc[:, "sec_company_id"] = ( + ex21_df["parent_cik"] + + "_" + + (ex21_df.groupby("parent_cik").cumcount() + 1).astype(str) ) - sec_df.loc[:, "loc_of_incorporation"] = sec_df["loc_of_incorporation"].where( - ~sec_df["loc_of_incorporation"].isnull(), sec_df["city"] + # override sec_company_id with CIK where a subsidiary has an assigned CIK + ex21_df.loc[:, "sec_company_id"] = ex21_df["sec_company_id"].where( + ex21_df["subsidiary_cik"].isnull(), ex21_df["subsidiary_cik"] ) - sec_df = sec_df.rename(columns={"record_id_sec": "record_id"}) - eia_df = eia_df.rename(columns={"record_id_eia": "record_id"}) -""" + return ex21_df diff --git a/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml new file mode 100644 index 0000000..d5b3c3d --- /dev/null +++ b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml @@ -0,0 +1,328 @@ + + + + + + + + + + + + Set of valid State and Country Codes according to EDGAR. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 88f17f2ed1f8c264d5b35f568d64701d074f800c Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Fri, 29 Nov 2024 15:18:57 -0500 Subject: [PATCH 133/161] fix errors with asset creation --- .../models/sec10k/sec_output_table.py | 81 ++++++++++++------- 1 file changed, 53 insertions(+), 28 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py index 6f0f900..a00cc99 100644 --- a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py +++ b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py @@ -1,5 +1,7 @@ """Module for creating an SEC 10K output table with filing companies and subsidiary companies.""" +import logging + import re from importlib import resources from pathlib import Path @@ -16,9 +18,16 @@ company_name_cleaner, ) +from .extract import ( + sec10k_filing_metadata, + year_quarter_partitions, +) + +logger = logging.getLogger(f"catalystcoop.{__name__}") + # TODO: should this be a shared asset? Can you use the existing sec_10k_filing_metadata with all year quarters? 
-archive = GCSArchive() -md = archive.get_metadata() +# archive = GCSArchive() +# md = archive.get_metadata() INVALID_NAMES = [ "llc", @@ -50,21 +59,20 @@ def _remove_weird_sec_cols(sec_df) -> pd.DataFrame: return sec_df -def _add_report_year_to_sec(sec_df) -> pd.DataFrame: +def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame: """Merge metadata on to get a report year for extracted SEC data. Expects filename to be the index of the SEC dataframe. """ - sec_df = sec_df.merge( - md[["date_filed"]], how="left", left_index=True, right_index=True - ) + sec_df = sec_df.merge(md[["filename", "date_filed"]], how="left", on=["filename"]) + sec_df = sec_df.rename(columns={"date_filed": "report_date"}) sec_df.loc[:, "report_year"] = ( sec_df["report_date"].astype("datetime64[ns]").dt.year ) return sec_df -def _flatten_sec_companies_across_time(sec_df) -> pd.DataFrame: +def _flatten_sec_companies_across_time(sec_df: pd.DataFrame) -> pd.DataFrame: """Keep only the most recent record for each unique SEC CIK. Note that this drops old records for companies that have changed @@ -72,7 +80,6 @@ def _flatten_sec_companies_across_time(sec_df) -> pd.DataFrame: TODO: create an asset that tracks name and address chnages across time. """ - sec_df = _add_report_year_to_sec(sec_df) sec_df = ( sec_df.sort_values(by="report_year", ascending=False) .groupby("central_index_key") @@ -113,14 +120,16 @@ def clean_loc_of_incorporation(df) -> pd.DataFrame: df: Ex. 21 or SEC 10K basic info dataframe with loc_of_incorporation column. 
""" - state_code_to_name = get_sec_state_code_dict() - df.loc[:, "loc_of_incorporation"] = df["state_of_incorporation"].replace( - state_code_to_name - ) + if "state_of_incorporation" in df: + state_code_to_name = get_sec_state_code_dict() + df.loc[:, "loc_of_incorporation"] = df["state_of_incorporation"].replace( + state_code_to_name + ) df["loc_of_incorporation"] = ( df["loc_of_incorporation"] .fillna(pd.NA) - .apply(lambda x: x.str.strip().str.lower()) + .str.strip() + .str.lower() .replace("", pd.NA) ) return df @@ -134,10 +143,7 @@ def clean_company_name(df) -> pd.DataFrame: column. """ df["company_name"] = ( - df["company_name"] - .fillna(pd.NA) - .apply(lambda x: x.str.strip().str.lower()) - .replace("", pd.NA) + df["company_name"].fillna(pd.NA).str.strip().str.lower().replace("", pd.NA) ) df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning( df[["company_name"]] @@ -151,11 +157,12 @@ def clean_company_name(df) -> pd.DataFrame: return df -def add_parent_company_cik(ex21_df: pd.DataFrame) -> pd.DataFrame: +def add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame: """Add the CIK of the parent company to Ex. 
21 subsidiaries.""" - ex21_df = ex21_df.merge( - md["cik"], how="left", left_on="filename", right_index=True - ).rename(columns={"cik": "parent_company_cik"}) + ex21_df = ex21_df.merge(md[["filename", "cik"]], how="left", on="filename").rename( + columns={"cik": "parent_company_cik"} + ) + return ex21_df def match_ex21_subsidiaries_to_filer_company( @@ -185,6 +192,9 @@ def match_ex21_subsidiaries_to_filer_company( merged_df = basic10k_df.merge( ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21") ) + logger.info(f"basic 10k cols: {basic10k_df.columns}") + logger.info(f"ex21 cols: {ex21_df.columns}") + logger.info(f"merged cols: {merged_df.columns}") # split up the location of incorporation on whitespace, creating a column # with lists of word tokens merged_df.loc[:, "loc_tokens_sec"] = ( @@ -252,13 +262,20 @@ def match_ex21_subsidiaries_to_filer_company( io_manager_key="pandas_parquet_io_manager", ) }, + partitions_def=year_quarter_partitions, ) -def clean_ex21_table(ex21_df: pd.DataFrame) -> pd.DataFrame: +def clean_ex21_table( + ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame +) -> pd.DataFrame: """Clean Ex. 21 table of subsidiaries before combing with basic 10k table.""" ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df) + ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata) + ex21_df = ex21_df.rename( + columns={"subsidiary": "company_name", "loc": "loc_of_incorporation"} + ) ex21_df = clean_loc_of_incorporation(ex21_df) ex21_df = clean_company_name(ex21_df) - ex21_df = add_parent_company_cik(ex21_df) + ex21_df = add_parent_company_cik(ex21_df, sec10k_filing_metadata) # flatten out the Ex. 21 table ex21_df = ex21_df.drop_duplicates( subset=["parent_company_cik", "company_name", "loc_of_incorporation"] @@ -278,9 +295,12 @@ def clean_ex21_table(ex21_df: pd.DataFrame) -> pd.DataFrame: # specify a dagster_type? 
), }, + partitions_def=year_quarter_partitions, ) def sec_output_table( - basic_10k_df: pd.DataFrame, clean_ex21_df: pd.DataFrame + basic_10k_df: pd.DataFrame, + clean_ex21_df: pd.DataFrame, + sec10k_filing_metadata: pd.DataFrame, ) -> pd.DataFrame: """Asset for creating an SEC 10K output table. @@ -293,10 +313,14 @@ def sec_output_table( values="value", index="filename", columns="key", aggfunc="first" ) basic_10k_df.columns.name = None + basic_10k_df = basic_10k_df.reset_index() basic_10k_df = _remove_weird_sec_cols(basic_10k_df) - + basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata) # add a location of incorporation to better match it to Ex. 21 subsidiaries basic_10k_df = clean_loc_of_incorporation(basic_10k_df) + basic_10k_df = basic_10k_df.rename( + columns={"company_conformed_name": "company_name"} + ) basic_10k_df = clean_company_name(basic_10k_df) ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company( basic10k_df=basic_10k_df, ex21_df=clean_ex21_df @@ -314,8 +338,9 @@ def sec_output_table( ex21_non_filing_subs_df.loc[:, "files_10k"] = False # create a sec_company_id for the subsidiaries that don't have a CIK ex21_non_filing_subs_df.loc[:, "sec_company_id"] = ( - ex21_non_filing_subs_df["company_name"].str - + ex21_non_filing_subs_df["loc_of_incorporation"].str + ex21_non_filing_subs_df["company_name"] + + "_" + + ex21_non_filing_subs_df["loc_of_incorporation"] ) out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df]) # this drops records for earlier company names and addresses @@ -324,4 +349,4 @@ def sec_output_table( return out_df -production_assets = [sec_output_table] +production_assets = [sec_output_table, sec10k_filing_metadata] From c9b62baa83f22f4dfcf7b3048e1009be5685d3f1 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Sat, 30 Nov 2024 18:08:12 -0500 Subject: [PATCH 134/161] clean up sec output table creation --- .../20-kl-validate-sec-output-table.ipynb | 1064 +++++++++++++++++ .../models/sec10k/sec_output_table.py 
| 105 +- 2 files changed, 1124 insertions(+), 45 deletions(-) create mode 100644 notebooks/20-kl-validate-sec-output-table.ipynb diff --git a/notebooks/20-kl-validate-sec-output-table.ipynb b/notebooks/20-kl-validate-sec-output-table.ipynb new file mode 100644 index 0000000..2b28fb9 --- /dev/null +++ b/notebooks/20-kl-validate-sec-output-table.ipynb @@ -0,0 +1,1064 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "id": "d383d1dd-6cdc-45ea-a371-105046c009e2", + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "3c58ad67-151d-4054-a972-a1e7ee12949f", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from upath import UPath" + ] + }, + { + "cell_type": "markdown", + "id": "8d178634-b494-4769-93e3-c0213e4a0326", + "metadata": {}, + "source": [ + "### Read in SEC output table" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "25e8183d-3248-440c-aa4e-e7ee7db4c487", + "metadata": {}, + "outputs": [], + "source": [ + "# review outputs from Dagster\n", + "sec_out_df = pd.read_parquet(UPath(\"gs://sec10k-outputs/v2/out_sec_10k__parents_and_subsidiaries/2023q1.parquet\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3881bfbd-cdc3-4f9c-92af-9e74d7758e51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sec_company_idfilenamebusiness_phonecentral_index_keycitycompany_namedate_of_name_changefilm_numberfiscal_year_endform_type...street_1street_2zipreport_datereport_yearlocation_of_inccompany_name_cleanparent_company_cikown_perfiles_10k
00000001800edgar/data/1800/0001628280-23-004026.txt22466761000000001800abbott parkabbott laboratoriesNone23642562123110-k...100 abbott park roadNone60064-35002023-02-172023illinoisabbott laboratoriesNoneNoneTrue
10000001800_3a nutrition (vietnam) company limi...edgar/data/1800/0001628280-23-004026.txtNoneNoneNone3a nutrition (vietnam) company limitedNoneNoneNoneNone...NoneNoneNone2023-02-172023viet nam3a nutrition vietnam company limited0000001800NoneFalse
20000001800_abbott (jiaxing) nutrition co., ltd...edgar/data/1800/0001628280-23-004026.txtNoneNoneNoneabbott (jiaxing) nutrition co., ltdNoneNoneNoneNone...NoneNoneNone2023-02-172023chinaabbott jiaxing nutrition co limited0000001800NoneFalse
30000001800_abbott (shanghai) diagnostics sales...edgar/data/1800/0001628280-23-004026.txtNoneNoneNoneabbott (shanghai) diagnostics sales co., ltdNoneNoneNoneNone...NoneNoneNone2023-02-172023chinaabbott shanghai diagnostics sales co limited0000001800NoneFalse
40000001800_abbott (uk) finance limited_united ...edgar/data/1800/0001628280-23-004026.txtNoneNoneNoneabbott (uk) finance limitedNoneNoneNoneNone...NoneNoneNone2023-02-172023united kingdomabbott uk finance limited0000001800NoneFalse
..................................................................
1713580001951118edgar/data/1951118/0001853620-23-000117.txt(248) 991-67000001951118farmington hillsmercedes-benz auto receivables trust 2022-1None23764946123110-k...35555 w. twelve mile rd.suite 100483312023-03-272023delawaremercedes benz auto receivables trust 2022 1NoneNoneTrue
1713590001951752edgar/data/1951752/0001951752-23-000016.txt31359434950001951752dearbornford credit auto owner trust 2022-dNone23751556123110-k...c/o ford motor co , whq ste 801-c1one american road481262023-03-222023Noneford credit auto owner trust 2022 dNoneNoneTrue
1713600001954336edgar/data/1477336/0001954336-23-000024.txt313-656-55000001954336wilmingtonally auto receivables trust 2022-3None23759320123110-k...1209 orange streetNone198012023-03-242023delawareally auto receivables trust 2022 3NoneNoneTrue
1713610001954436edgar/data/1954436/0000929638-23-001050.txt(214) 572-82760001954436irvingexeter automobile receivables trust 2022-6None23784761123110-k...2101 w. john carpenter freewayNone750632023-03-312023delawareexeter automobile receivables trust 2022 6NoneNoneTrue
1713620001955010edgar/data/1955010/0001140361-23-012122.txt212-326-15000001955010new yorkoha senior private lending fund (u) llcNone23740150123110-k...one vanderbilt, 16th floorNone100172023-03-172023delawareoha senior private lending fund u limited liab...NoneNoneTrue
\n", + "

171363 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " sec_company_id \\\n", + "0 0000001800 \n", + "1 0000001800_3a nutrition (vietnam) company limi... \n", + "2 0000001800_abbott (jiaxing) nutrition co., ltd... \n", + "3 0000001800_abbott (shanghai) diagnostics sales... \n", + "4 0000001800_abbott (uk) finance limited_united ... \n", + "... ... \n", + "171358 0001951118 \n", + "171359 0001951752 \n", + "171360 0001954336 \n", + "171361 0001954436 \n", + "171362 0001955010 \n", + "\n", + " filename business_phone \\\n", + "0 edgar/data/1800/0001628280-23-004026.txt 2246676100 \n", + "1 edgar/data/1800/0001628280-23-004026.txt None \n", + "2 edgar/data/1800/0001628280-23-004026.txt None \n", + "3 edgar/data/1800/0001628280-23-004026.txt None \n", + "4 edgar/data/1800/0001628280-23-004026.txt None \n", + "... ... ... \n", + "171358 edgar/data/1951118/0001853620-23-000117.txt (248) 991-6700 \n", + "171359 edgar/data/1951752/0001951752-23-000016.txt 3135943495 \n", + "171360 edgar/data/1477336/0001954336-23-000024.txt 313-656-5500 \n", + "171361 edgar/data/1954436/0000929638-23-001050.txt (214) 572-8276 \n", + "171362 edgar/data/1955010/0001140361-23-012122.txt 212-326-1500 \n", + "\n", + " central_index_key city \\\n", + "0 0000001800 abbott park \n", + "1 None None \n", + "2 None None \n", + "3 None None \n", + "4 None None \n", + "... ... ... \n", + "171358 0001951118 farmington hills \n", + "171359 0001951752 dearborn \n", + "171360 0001954336 wilmington \n", + "171361 0001954436 irving \n", + "171362 0001955010 new york \n", + "\n", + " company_name date_of_name_change \\\n", + "0 abbott laboratories None \n", + "1 3a nutrition (vietnam) company limited None \n", + "2 abbott (jiaxing) nutrition co., ltd None \n", + "3 abbott (shanghai) diagnostics sales co., ltd None \n", + "4 abbott (uk) finance limited None \n", + "... ... ... 
\n", + "171358 mercedes-benz auto receivables trust 2022-1 None \n", + "171359 ford credit auto owner trust 2022-d None \n", + "171360 ally auto receivables trust 2022-3 None \n", + "171361 exeter automobile receivables trust 2022-6 None \n", + "171362 oha senior private lending fund (u) llc None \n", + "\n", + " film_number fiscal_year_end form_type ... \\\n", + "0 23642562 1231 10-k ... \n", + "1 None None None ... \n", + "2 None None None ... \n", + "3 None None None ... \n", + "4 None None None ... \n", + "... ... ... ... ... \n", + "171358 23764946 1231 10-k ... \n", + "171359 23751556 1231 10-k ... \n", + "171360 23759320 1231 10-k ... \n", + "171361 23784761 1231 10-k ... \n", + "171362 23740150 1231 10-k ... \n", + "\n", + " street_1 street_2 zip \\\n", + "0 100 abbott park road None 60064-3500 \n", + "1 None None None \n", + "2 None None None \n", + "3 None None None \n", + "4 None None None \n", + "... ... ... ... \n", + "171358 35555 w. twelve mile rd. suite 100 48331 \n", + "171359 c/o ford motor co , whq ste 801-c1 one american road 48126 \n", + "171360 1209 orange street None 19801 \n", + "171361 2101 w. john carpenter freeway None 75063 \n", + "171362 one vanderbilt, 16th floor None 10017 \n", + "\n", + " report_date report_year location_of_inc \\\n", + "0 2023-02-17 2023 illinois \n", + "1 2023-02-17 2023 viet nam \n", + "2 2023-02-17 2023 china \n", + "3 2023-02-17 2023 china \n", + "4 2023-02-17 2023 united kingdom \n", + "... ... ... ... 
\n", + "171358 2023-03-27 2023 delaware \n", + "171359 2023-03-22 2023 None \n", + "171360 2023-03-24 2023 delaware \n", + "171361 2023-03-31 2023 delaware \n", + "171362 2023-03-17 2023 delaware \n", + "\n", + " company_name_clean parent_company_cik \\\n", + "0 abbott laboratories None \n", + "1 3a nutrition vietnam company limited 0000001800 \n", + "2 abbott jiaxing nutrition co limited 0000001800 \n", + "3 abbott shanghai diagnostics sales co limited 0000001800 \n", + "4 abbott uk finance limited 0000001800 \n", + "... ... ... \n", + "171358 mercedes benz auto receivables trust 2022 1 None \n", + "171359 ford credit auto owner trust 2022 d None \n", + "171360 ally auto receivables trust 2022 3 None \n", + "171361 exeter automobile receivables trust 2022 6 None \n", + "171362 oha senior private lending fund u limited liab... None \n", + "\n", + " own_per files_10k \n", + "0 None True \n", + "1 None False \n", + "2 None False \n", + "3 None False \n", + "4 None False \n", + "... ... ... 
\n", + "171358 None True \n", + "171359 None True \n", + "171360 None True \n", + "171361 None True \n", + "171362 None True \n", + "\n", + "[171363 rows x 27 columns]" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df" + ] + }, + { + "cell_type": "markdown", + "id": "3447dcdb-4506-4de0-9201-9711ff9259ee", + "metadata": {}, + "source": [ + "### There are a combination of SEC 10K filers and subsidiary companies:" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "id": "0d654dfc-2fb2-41d3-9ff8-6fe70732a04a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "files_10k\n", + "False 165824\n", + "True 5539\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df.files_10k.value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "6797b5b7-be91-430a-a30c-cc26c62aa7b1", + "metadata": {}, + "source": [ + "### `sec_company_id` and `central_index_key` should be unique:" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "67e0e789-feb0-4866-ba82-8346c62c1bef", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df.sec_company_id.is_unique" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "053d65c9-dbdd-4622-a4ee-badc7db2a88d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df.central_index_key.dropna().is_unique" + ] + }, + { + "cell_type": "markdown", + "id": "b7e05e03-fa05-4655-a085-c66afcfba442", + "metadata": {}, + "source": [ + "### Location of incorporation should be clean and standardized for filers and subsidiaries." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 32, + "id": "cb33b703-be24-4ddc-a9f2-148850c3f4af", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_of_inc\n", + "delaware 3076\n", + "nevada 300\n", + "maryland 299\n", + "cayman islands 135\n", + "north carolina 92\n", + "new york 74\n", + "florida 74\n", + "pennsylvania 71\n", + "california 57\n", + "texas 56\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[sec_out_df.files_10k][\"location_of_inc\"].value_counts().head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "id": "cb6fc7b5-b9c0-46ae-991c-cae41f86e8f3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_of_inc\n", + "bahamas 1\n", + "germany 1\n", + "hong kong 1\n", + "china 1\n", + "virgin islands, u.s. 1\n", + "quebec, canada 1\n", + "new brunswick, canada 1\n", + "new hampshire 1\n", + "netherlands antilles 1\n", + "malaysia 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[sec_out_df.files_10k][\"location_of_inc\"].value_counts().tail(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "id": "5373ced2-75e9-4229-b927-3ad4b8d33e39", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_of_inc\n", + "delaware 67546\n", + "united kingdom 4979\n", + "cayman islands 3000\n", + "texas 2881\n", + "netherlands 2615\n", + "california 2566\n", + "germany 2381\n", + "china 2305\n", + "florida 2130\n", + "australia 1938\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.files_10k][\"location_of_inc\"].value_counts().head(10)" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "id": 
"3ceb1aa2-c622-4a97-9293-281325637f09", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "location_of_inc\n", + "ontario, can 1\n", + "british col, can 1\n", + "hong kong china china 1\n", + "zhongshan, china 1\n", + "jacksonville, florida 1\n", + "toronto, ontario, canada 1\n", + "limassol, cyprus 1\n", + "doncaster, syorkshire, uk 1\n", + "manchester, england 1\n", + "cote 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.files_10k][\"location_of_inc\"].value_counts().tail(10)" + ] + }, + { + "cell_type": "markdown", + "id": "95d51bdb-c378-45bc-9848-4a2a8895b470", + "metadata": {}, + "source": [ + "### All non SEC 10K filers should have a `parent_company_cik`" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "id": "89cd6bdb-a06c-40ae-8b49-c610e769f9c8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "parent_company_cik\n", + "False 165824\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.files_10k][\"parent_company_cik\"].isnull().value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "8f4bd494-951f-417f-ba56-fa0202d741a5", + "metadata": {}, + "source": [ + "### When run on all year quarters, all `parent_company_cik` should appear in `central_index_key` column" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "id": "d024bc29-d0b1-45cd-a0a2-c9b66e73e0d6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "2954" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_parent_company_cik = len(set(sec_out_df.parent_company_cik))\n", + "n_parent_company_cik" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "id": "d70660f2-559e-4ec1-8167-1bfdce45c287", + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "2832" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_overlap = len(set(sec_out_df.parent_company_cik).intersection(set(sec_out_df.central_index_key)))\n", + "n_overlap" + ] + }, + { + "cell_type": "code", + "execution_count": 47, + "id": "0eb86d64-5ca0-423a-864c-dbfb00b5b9fa", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "122" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "n_parent_company_cik - n_overlap" + ] + }, + { + "cell_type": "markdown", + "id": "60366af2-259a-4a87-a93f-2180d8777c67", + "metadata": {}, + "source": [ + "### There should be filer companies that have a `parent_company_cik` because they were matched to a subsidiary" + ] + }, + { + "cell_type": "code", + "execution_count": 49, + "id": "b5c53dab-3be5-48f1-90f6-583acfb452ab", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "parent_company_cik\n", + "True 5474\n", + "False 65\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 49, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[sec_out_df.files_10k].parent_company_cik.isnull().value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": "5358a4e1-38a7-489d-bf1a-f53de58447ba", + "metadata": {}, + "source": [ + "### There should be no non-filer companies that have a CIK" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "id": "4a19df26-79c3-4aa1-bcbf-916b822346ca", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "central_index_key\n", + "True 165824\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 50, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.files_10k].central_index_key.isnull().value_counts()" + ] + }, + { + "cell_type": "markdown", + "id": 
"bde4f03f-e5b0-4895-ade6-ae44b260e78e", + "metadata": {}, + "source": [ + "### There should be no duplicated `company_name`, `location_of_inc`, `parent_company_cik` records" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "id": "ca87709a-daa7-4396-83a4-0f5bb8ec2cd4", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sec_company_idfilenamebusiness_phonecentral_index_keycitycompany_namedate_of_name_changefilm_numberfiscal_year_endform_type...street_1street_2zipreport_datereport_yearlocation_of_inccompany_name_cleanparent_company_cikown_perfiles_10k
\n", + "

0 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [sec_company_id, filename, business_phone, central_index_key, city, company_name, date_of_name_change, film_number, fiscal_year_end, form_type, former_conformed_name, irs_number, sec_act, sec_file_number, standard_industrial_classification, state, state_of_incorporation, street_1, street_2, zip, report_date, report_year, location_of_inc, company_name_clean, parent_company_cik, own_per, files_10k]\n", + "Index: []\n", + "\n", + "[0 rows x 27 columns]" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[sec_out_df.duplicated(subset=[\"company_name\", \"location_of_inc\", \"parent_company_cik\"])]" + ] + }, + { + "cell_type": "markdown", + "id": "bca9e395-bd96-4183-b299-46cd589d97d5", + "metadata": {}, + "source": [ + "### There can be companies with the same name, location, and CIK, but different parent companies." + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "cc1880f3-a9d3-4f8a-a42b-2f9ff428ca45", + "metadata": {}, + "outputs": [], + "source": [ + "sec_out_df = sec_out_df.fillna({\"central_index_key\": pd.NA})" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "id": "f87257df-00f7-48a8-882a-fb1ea8c27e18", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
company_namelocation_of_inccentral_index_keyparent_company_cik
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [company_name, location_of_inc, central_index_key, parent_company_cik]\n", + "Index: []" + ] + }, + "execution_count": 59, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "sec_out_df[~sec_out_df.central_index_key.isnull() \n", + " & (sec_out_df.duplicated(\n", + " subset=[\"company_name\", \"location_of_inc\", \"central_index_key\"], keep=False\n", + " ))][[\"company_name\", \"location_of_inc\", \"central_index_key\", \"parent_company_cik\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e2169181-dcd8-4b43-b03e-9526f597147d", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "mozilla_sec_eia", + "language": "python", + "name": "mozilla_sec_eia" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.0" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py index a00cc99..1ccaea9 100644 --- a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py +++ b/src/mozilla_sec_eia/models/sec10k/sec_output_table.py @@ -1,7 +1,6 @@ """Module for creating an SEC 10K output table with filing companies and subsidiary companies.""" import logging - import re from importlib import resources from pathlib import Path @@ -11,7 +10,6 @@ from dagster import AssetIn, AssetOut, multi_asset from mozilla_sec_eia.models.sec10k.utils.cloud import ( - GCSArchive, convert_ex21_id_to_filename, ) from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import ( @@ -25,10 +23,6 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -# TODO: should this be a shared asset? 
Can you use the existing sec_10k_filing_metadata with all year quarters? -# archive = GCSArchive() -# md = archive.get_metadata() - INVALID_NAMES = [ "llc", "limited liability company", @@ -73,18 +67,20 @@ def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFr def _flatten_sec_companies_across_time(sec_df: pd.DataFrame) -> pd.DataFrame: - """Keep only the most recent record for each unique SEC CIK. + """Keep only the most recent record for each unique SEC company. Note that this drops old records for companies that have changed - names or addresses across time. + names or addresses across time. Also, we group by sec_company_id not + CIK, so filer companies and subsidiary companies are unique in the + output dataframe. TODO: create an asset that tracks name and address chnages across time. """ sec_df = ( sec_df.sort_values(by="report_year", ascending=False) - .groupby("central_index_key") + .groupby("sec_company_id") .first() - ) + ).reset_index() return sec_df @@ -113,20 +109,19 @@ def get_sec_state_code_dict() -> dict[str, str]: return state_code_dict -def clean_loc_of_incorporation(df) -> pd.DataFrame: +def clean_location_of_inc(df) -> pd.DataFrame: """Clean location of incorporation column in SEC basic 10K or Ex. 21 dataframe. Arguments: - df: Ex. 21 or SEC 10K basic info dataframe with loc_of_incorporation + df: Ex. 21 or SEC 10K basic info dataframe with location_of_inc column. 
""" if "state_of_incorporation" in df: - state_code_to_name = get_sec_state_code_dict() - df.loc[:, "loc_of_incorporation"] = df["state_of_incorporation"].replace( - state_code_to_name - ) - df["loc_of_incorporation"] = ( - df["loc_of_incorporation"] + df.loc[:, "location_of_inc"] = df["state_of_incorporation"] + state_code_to_name = get_sec_state_code_dict() + df.loc[:, "location_of_inc"] = ( + df["location_of_inc"] + .replace(state_code_to_name) .fillna(pd.NA) .str.strip() .str.lower() @@ -162,6 +157,9 @@ def add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFr ex21_df = ex21_df.merge(md[["filename", "cik"]], how="left", on="filename").rename( columns={"cik": "parent_company_cik"} ) + ex21_df.loc[:, "parent_company_cik"] = ( + ex21_df["parent_company_cik"].astype(str).str.zfill(10) + ) return ex21_df @@ -185,23 +183,20 @@ def match_ex21_subsidiaries_to_filer_company( subset=[ "central_index_key", "company_name", - "loc_of_incorporation", + "location_of_inc", "report_year", ] ) merged_df = basic10k_df.merge( ex21_df, how="inner", on="company_name", suffixes=("_sec", "_ex21") ) - logger.info(f"basic 10k cols: {basic10k_df.columns}") - logger.info(f"ex21 cols: {ex21_df.columns}") - logger.info(f"merged cols: {merged_df.columns}") # split up the location of incorporation on whitespace, creating a column # with lists of word tokens merged_df.loc[:, "loc_tokens_sec"] = ( - merged_df["loc_of_incorporation_sec"].fillna("").str.lower().str.split() + merged_df["location_of_inc_sec"].fillna("").str.lower().str.split() ) merged_df.loc[:, "loc_tokens_ex21"] = ( - merged_df["loc_of_incorporation_ex21"].fillna("").str.lower().str.split() + merged_df["location_of_inc_ex21"].fillna("").str.lower().str.split() ) # get the number of words overlapping between location of incorporation tokens merged_df["loc_overlap"] = merged_df.apply( @@ -216,23 +211,28 @@ def match_ex21_subsidiaries_to_filer_company( merged_df = merged_df.sort_values( by=[ "company_name", - 
"loc_of_incorporation_ex21", + "location_of_inc_ex21", "loc_overlap", "report_year_diff", ], ascending=[True, True, False, True], ) # Select the row with the highest loc overlap and nearest report years - # for each company name and location pair + # for each company name, location, and parent company record closest_match_df = merged_df.groupby( - ["company_name", "loc_of_incorporation_ex21"], as_index=False + ["company_name", "location_of_inc_ex21", "parent_company_cik"], as_index=False ).first() ex21_with_cik_df = ex21_df.merge( closest_match_df[ - ["company_name", "central_index_key", "loc_of_incorporation_ex21"] - ].rename(columns={"loc_of_incorporation_ex21": "loc_of_incorporation"}), + [ + "company_name", + "parent_company_cik", + "location_of_inc_ex21", + "central_index_key", + ] + ].rename(columns={"location_of_inc_ex21": "location_of_inc"}), how="left", - on=["company_name", "loc_of_incorporation"], + on=["company_name", "location_of_inc", "parent_company_cik"], ).rename(columns={"central_index_key": "subsidiary_cik"}) # if a subsidiary doesn't have a CIK and has a null location # but its company name was assigned a CIK (with a different location) @@ -244,15 +244,33 @@ def match_ex21_subsidiaries_to_filer_company( ).rename(columns={"central_index_key": "company_name_merge_cik"}) ex21_with_cik_df["subsidiary_cik"] = ex21_with_cik_df["subsidiary_cik"].where( ~(ex21_with_cik_df.subsidiary_cik.isnull()) - | ~(ex21_with_cik_df.loc_of_incorporation.isnull()), + | ~(ex21_with_cik_df.location_of_inc.isnull()), ex21_with_cik_df["company_name_merge_cik"], ) + ex21_with_cik_df = ex21_with_cik_df.drop(columns="company_name_merge_cik") ex21_with_cik_df = ex21_with_cik_df.rename( columns={"subsidiary_cik": "central_index_key"} ) return ex21_with_cik_df +def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame: + """Create an sec_company_id for Ex. 21 subsidiaries. + + This is a unique identifier string for Ex. 21 subsidiaries. 
+ This ID is necessary for tracking subsidiaries who aren't ultimately + matched to a 10K filer company. + """ + ex21_df.loc[:, "sec_company_id"] = ( + ex21_df["parent_company_cik"] + + "_" + + ex21_df["company_name"] + + "_" + + ex21_df["location_of_inc"] + ) + return ex21_df + + @multi_asset( ins={ "ex21_df": AssetIn("ex21_company_ownership_info"), @@ -267,19 +285,21 @@ def match_ex21_subsidiaries_to_filer_company( def clean_ex21_table( ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame ) -> pd.DataFrame: - """Clean Ex. 21 table of subsidiaries before combing with basic 10k table.""" + """Clean Ex. 21 table of subsidiaries before combining with basic 10k table.""" ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df) + ex21_df = ex21_df.drop(columns=["id"]) ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata) ex21_df = ex21_df.rename( - columns={"subsidiary": "company_name", "loc": "loc_of_incorporation"} + columns={"subsidiary": "company_name", "loc": "location_of_inc"} ) - ex21_df = clean_loc_of_incorporation(ex21_df) + ex21_df = clean_location_of_inc(ex21_df) ex21_df = clean_company_name(ex21_df) ex21_df = add_parent_company_cik(ex21_df, sec10k_filing_metadata) - # flatten out the Ex. 21 table - ex21_df = ex21_df.drop_duplicates( - subset=["parent_company_cik", "company_name", "loc_of_incorporation"] - ) + # add an sec_company_id, ultimately this ID become the subsidiary's CIK + # if the subsidiary is matched to an SEC filer + ex21_df = create_sec_company_id_for_ex21_subs(ex21_df=ex21_df) + ex21_df = _flatten_sec_companies_across_time(ex21_df) + return ex21_df @@ -317,7 +337,7 @@ def sec_output_table( basic_10k_df = _remove_weird_sec_cols(basic_10k_df) basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata) # add a location of incorporation to better match it to Ex. 
21 subsidiaries - basic_10k_df = clean_loc_of_incorporation(basic_10k_df) + basic_10k_df = clean_location_of_inc(basic_10k_df) basic_10k_df = basic_10k_df.rename( columns={"company_conformed_name": "company_name"} ) @@ -332,16 +352,11 @@ def sec_output_table( ) basic_10k_df.loc[:, "files_10k"] = True basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"] + # get the subsidiary companies that weren't matched to a 10K filing company ex21_non_filing_subs_df = ex21_df_with_cik[ ex21_df_with_cik["central_index_key"].isnull() ] ex21_non_filing_subs_df.loc[:, "files_10k"] = False - # create a sec_company_id for the subsidiaries that don't have a CIK - ex21_non_filing_subs_df.loc[:, "sec_company_id"] = ( - ex21_non_filing_subs_df["company_name"] - + "_" - + ex21_non_filing_subs_df["loc_of_incorporation"] - ) out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df]) # this drops records for earlier company names and addresses # that have since changed, so we lose some information From 01b2d23084063dd7572bf009adba8aa822d50c7f Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Sat, 30 Nov 2024 18:08:39 -0500 Subject: [PATCH 135/161] splink notebook change --- notebooks/18-kl-splink-sec-eia.ipynb | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb index 0d74c13..19ab082 100644 --- a/notebooks/18-kl-splink-sec-eia.ipynb +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -527,6 +527,32 @@ "unmatched_ex21_df = ex21_with_cik[ex21_with_cik.subsidiary_cik.isnull()]" ] }, + { + "cell_type": "code", + "execution_count": 53, + "id": "56f41505-421e-4bf7-bfc4-93500e0c5e71", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 a_1\n", + "1 b_2\n", + "2 c_3\n", + "dtype: object" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = pd.DataFrame({\"text1\": [\"a\", \"b\", \"c\"]})\n", + "df2 = 
pd.DataFrame({\"text2\": [\"1\", \"2\", \"3\"]})\n", + "df1[\"text1\"] + \"_\" + df2[\"text2\"]" + ] + }, { "cell_type": "markdown", "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db", From 30d22c96bdff9d893887295479e8c880fe955e1d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 30 Nov 2024 23:16:10 +0000 Subject: [PATCH 136/161] [pre-commit.ci] auto fixes from pre-commit.com hooks For more information, see https://pre-commit.ci --- .../package_data/formDStateCodes.xsd.xml | 656 +++++++++--------- 1 file changed, 328 insertions(+), 328 deletions(-) diff --git a/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml index d5b3c3d..2ec0c2b 100644 --- a/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml +++ b/src/mozilla_sec_eia/package_data/formDStateCodes.xsd.xml @@ -1,328 +1,328 @@ - - - - - - - - - - - - Set of valid State and Country Codes according to EDGAR. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + Set of valid State and Country Codes according to EDGAR. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + From 3fca3c9792e40fa7e916d625f417f3e121bf3504 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Sat, 30 Nov 2024 18:18:21 -0500 Subject: [PATCH 137/161] fix pre commit --- src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py index 0fec63c..ef43757 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py @@ -1 +1,3 @@ +"""Implement record linkage model between SEC companies and EIA utilities.""" + from . 
import preprocessing From d53ab255c57df3bb46c38429d4e589189ace04f0 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Sun, 1 Dec 2024 09:29:35 -0500 Subject: [PATCH 138/161] update python dependency in test environment --- test_environment.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_environment.yml b/test_environment.yml index 5fa9b2d..c3d51d5 100644 --- a/test_environment.yml +++ b/test_environment.yml @@ -6,7 +6,7 @@ channels: dependencies: # Packages required for setting up the environment - pip>=21,<24 - - python>=3.10,<3.12 + - python>=3.10,<=3.12 - setuptools>=66,<69 # Packages specified in setup.py that need or benefit from binary conda packages From 2eb1555a74f976205391247d03dd3d10614760eb Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Sun, 1 Dec 2024 10:21:48 -0500 Subject: [PATCH 139/161] update github tox env --- .github/workflows/tox-pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml index 730fd08..4fff900 100644 --- a/.github/workflows/tox-pytest.yml +++ b/.github/workflows/tox-pytest.yml @@ -11,7 +11,7 @@ jobs: id-token: write strategy: matrix: - python-version: ["3.10", "3.11"] + python-version: ["3.10", "3.11", "3.12"] fail-fast: false defaults: run: From 44eb70beb63ced07af28eeba77e2676541099334 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Mon, 2 Dec 2024 13:31:27 -0500 Subject: [PATCH 140/161] restructure input table assets --- .../library/record_linkage_utils.py | 109 ++++++++++++++ src/mozilla_sec_eia/models/sec10k/__init__.py | 11 +- .../models/sec_eia_record_linkage/__init__.py | 50 ++++++- .../sec_eia_record_linkage/preprocessing.py | 137 +----------------- .../sec_eia_splink_config.py | 57 ++++++++ ...te_eia_input.py => transform_eia_input.py} | 43 +++++- .../transform_sec_input.py} | 112 +++++++------- 7 files changed, 319 insertions(+), 200 deletions(-) create mode 100644 
src/mozilla_sec_eia/library/record_linkage_utils.py create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py rename src/mozilla_sec_eia/models/sec_eia_record_linkage/{create_eia_input.py => transform_eia_input.py} (70%) rename src/mozilla_sec_eia/models/{sec10k/sec_output_table.py => sec_eia_record_linkage/transform_sec_input.py} (85%) diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py new file mode 100644 index 0000000..9a33392 --- /dev/null +++ b/src/mozilla_sec_eia/library/record_linkage_utils.py @@ -0,0 +1,109 @@ +"""Utility functions for cleaning strings during modeling preprocessing steps.""" + +import jellyfish +import pandas as pd + +from pudl.analysis.record_linkage import name_cleaner + +INVALID_NAMES = [ + "llc", + "limited liability company", + "limited", + "ltd", + "iiii", + "inc", + "incorporated", + "partnership", + "i", + "name", + "company", + "&", + "", +] + +company_name_cleaner = name_cleaner.CompanyNameCleaner( + cleaning_rules_list=[ + "remove_word_the_from_the_end", + "remove_word_the_from_the_beginning", + "replace_ampersand_by_AND", + "replace_hyphen_by_space", + "replace_underscore_by_space", + "remove_text_punctuation", + "remove_parentheses", + "remove_brackets", + "remove_curly_brackets", + "enforce_single_space_between_words", + ] +) + +legal_term_remover = name_cleaner.CompanyNameCleaner( + cleaning_rules_list=[], handle_legal_terms=2 +) + + +def clean_company_name( + df: pd.DataFrame, col_name: str = "company_name" +) -> pd.DataFrame: + """Conduct cleaning on a company name column and add column without legal terms. + + Uses the PUDL name cleaner object to do basic cleaning on `col_name` column + such as stripping punctuation, correcting case, normalizing legal + terms etc. The clean column becomes the `col_name` column and the original + `col_name` column is renamed to `{col_name}_raw`. 
Also adds a column called + `{col_name}_no_legal` which has legal terms stripped from the clean strings. + + Arguments: + df: The dataframe that is to be cleaned. Must contain `col_name` column. + col_name: The name of the column with the company name strings. + + Returns: + pd.DataFrame: The original dataframe with `col_name` now containing + cleaned strings and an additional column with the raw strings + and a column with the legal terms stripped from the company name. + """ + df[col_name] = df[col_name].fillna(pd.NA).str.strip().str.lower().replace("", pd.NA) + df.loc[:, f"{col_name}_clean"] = company_name_cleaner.apply_name_cleaning( + df[[col_name]] + ).str.strip() + df = df[df[f"{col_name}_clean"] != ""] + df = df.rename(columns={col_name: f"{col_name}_raw"}).rename( + columns={f"{col_name}_clean": col_name} + ) + df.loc[:, f"{col_name}_no_legal"] = legal_term_remover.apply_name_cleaning( + df[[col_name]] + ) + return df + + +def drop_invalid_names( + df: pd.DataFrame, col_name: str = "company_name" +) -> pd.DataFrame: + """Drop rows that have invalid company names, like just 'llc', or 'partnership'.""" + return df[(~df[col_name].isin(INVALID_NAMES))] + + +# TODO: this is in PUDL, deduplicate +def get_metaphone_col(col: pd.Series) -> pd.Series: + """Get the metaphones of the strings in a column.""" + return col.apply(jellyfish.metaphone) + + +def transform_company_name(df: pd.DataFrame) -> pd.DataFrame: + """Apply cleaning, get metaphone col, drop invalid rows.""" + df = clean_company_name(df) + df.loc[:, "company_name_mphone"] = get_metaphone_col(df["company_name_no_legal"]) + df = drop_invalid_names(df, "company_name_clean") + return df + + +def fill_street_address_nulls( + df: pd.DataFrame, + address_col: str = "street_address", + secondary_address_col: str = "street_address_2", +) -> pd.DataFrame: + """Fill null street address with value from secondary address column.""" + df[address_col] = pd.where( + (~df[address_col].isnull()) | 
(df[secondary_address_col].isnull()), + df[secondary_address_col], + ) + return df diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 1bf9be9..4fd2f14 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -28,14 +28,14 @@ mlflow_train_test_io_managers, ) -from . import basic_10k, ex_21, extract, sec_output_table +from . import basic_10k, ex_21, extract from .utils.cloud import cloud_interface_resource basic_10k_assets = load_assets_from_modules([basic_10k]) ex21_assets = load_assets_from_package_module(ex_21) ex21_data_assets = load_assets_from_modules([ex_21.data]) shared_assets = load_assets_from_modules([extract]) -sec_output_assets = load_assets_from_modules([sec_output_table]) + basic_10k_production_job = model_jobs.create_production_model_job( "basic_10k_extraction", @@ -57,9 +57,6 @@ description="Run exhibit 21 extraction pipeline on archived filings.", ) -sec_output_table_production_job = model_jobs.create_production_model_job( - "sec_output_table_creation", sec_output_table.production_assets -) finetune_layoutlm = define_dagstermill_asset( name="layoutlm", @@ -139,8 +136,7 @@ finetune_layoutlm, train_exhibit21_layout_classifier, ] - + ex21_data_assets - + sec_output_assets, + + ex21_data_assets, jobs=[ basic_10k_production_job, basic_10k_validation_job, @@ -148,7 +144,6 @@ finetune_layoutlm_job, exhibit21_extraction_validation_job, exhibit21_layout_classifier_training_job, - sec_output_table_production_job, ], resources={ "cloud_interface": cloud_interface_resource, diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py index ef43757..c87c0cb 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py @@ -1,3 +1,51 @@ """Implement record linkage model between SEC companies 
and EIA utilities.""" -from . import preprocessing +from dagster import Definitions, load_assets_from_modules +from dagstermill import ( + ConfigurableLocalOutputNotebookIOManager, +) +from upath import UPath + +from mozilla_sec_eia.library import model_jobs +from mozilla_sec_eia.library.generic_io_managers import ( + PandasParquetIOManager, + PickleUPathIOManager, +) +from mozilla_sec_eia.library.mlflow import ( + MlflowPyfuncModelIOManager, + mlflow_interface_resource, + mlflow_train_test_io_managers, +) +from mozilla_sec_eia.models.sec10k.utils.cloud import cloud_interface_resource + +from . import transform_eia_input, transform_sec_input + +eia_assets = load_assets_from_modules([transform_eia_input]) +sec_assets = load_assets_from_modules([transform_sec_input]) + +eia_input_table_production_job = model_jobs.create_production_model_job( + "eia_input_table_creation", transform_eia_input.production_assets +) +sec_input_table_production_job = model_jobs.create_production_model_job( + "sec_input_table_creation", transform_sec_input.production_assets +) + +defs = Definitions( + sec_assets, + jobs=[eia_input_table_production_job, sec_input_table_production_job], + resources={ + "cloud_interface": cloud_interface_resource, + "mlflow_interface": mlflow_interface_resource, + "pandas_parquet_io_manager": PandasParquetIOManager( + base_path=UPath("gs://sec10k-outputs/v2") + ), + "pickle_gcs_io_manager": PickleUPathIOManager( + base_path=UPath("gs://sec10k-outputs/dagster_storage") + ), + "pyfunc_model_io_manager": MlflowPyfuncModelIOManager( + mlflow_interface=mlflow_interface_resource + ), + "output_notebook_io_manager": ConfigurableLocalOutputNotebookIOManager(), + } + | mlflow_train_test_io_managers, +) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py index 12c4704..3caa182 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py +++ 
b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py @@ -1,14 +1,9 @@ """Preprocessing for EIA and SEC input data before record linkage.""" -import re -from importlib import resources -from pathlib import Path - import jellyfish import numpy as np import pandas as pd -from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive from pudl.analysis.record_linkage import name_cleaner EIA_COL_MAP = { @@ -16,8 +11,6 @@ "address_2": "street_address_2", } -EX21_COL_MAP = {"subsidiary": "company_name", "loc": "loc_of_incorporation"} - SEC_COL_MAP = { "company_conformed_name": "company_name", "street_1": "street_address", @@ -85,49 +78,6 @@ ) -# TODO: remove -def get_sec_state_code_dict(): - """Create a dictionary mapping state codes to their names. - - Table found at https://www.sec.gov/submit-filings/filer-support-resources/edgar-state-country-codes - Published by SEC and reports valid state codes - for filers of Form D. Used to standardize the state codes - in the SEC 10K filings. The expanded names of the state codes - are comments in the XML file, so we have to read the XML in as - text and parse it. - """ - # TODO: make a check to see if SEC has published a new version of this table - xml_filepath = ( - resources.files("mozilla_sec_eia.package_data") / "formDStateCodes.xsd.xml" - ) - with Path.open(xml_filepath) as file: - xml_text = file.read() - - pattern = r'.*?' - state_code_dict = { - code.lower(): name.lower() - for code, name in re.findall(pattern, xml_text, re.DOTALL) - } - return state_code_dict - - -# TODO: moved to output table module, take out -def _add_report_year_to_sec(sec_df): - """Merge metadata on to get a report year for extracted SEC data. - - Expects filename to be the index of the SEC dataframe. 
- """ - archive = GCSArchive() - md = archive.get_metadata() - sec_df = sec_df.merge( - md[["date_filed"]], how="left", left_index=True, right_index=True - ) - sec_df.loc[:, "report_year"] = ( - sec_df["report_date"].astype("datetime64[ns]").dt.year - ) - return sec_df - - # TODO: this is in PUDL, pull out into helper function def _get_metaphone(row, col_name): if pd.isnull(row[col_name]): @@ -135,7 +85,7 @@ def _get_metaphone(row, col_name): return jellyfish.metaphone(row[col_name]) -# TODO: deduplicate this with what's already been done +# TODO: delete def _clean_company_name(df): df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning( df[["company_name"]] @@ -150,7 +100,7 @@ def _clean_company_name(df): return df -# TODO: deduplicate this with what's already been done +# TODO: delete def clean_sec_df(df): """Shared cleaning for SEC 10K and Ex. 21 dataframes. @@ -158,12 +108,6 @@ def clean_sec_df(df): df: Ex. 21 or SEC 10K basic info dataframe with columns company_name, loc_of_incorporation, and report_year. 
""" - df[["company_name", "loc_of_incorporation"]] = ( - df[["company_name", "loc_of_incorporation"]] - .fillna(pd.NA) - .apply(lambda x: x.str.strip().str.lower()) - .replace("", pd.NA) - ) df = _clean_company_name(df) df.loc[:, "company_name_mphone"] = df.apply( _get_metaphone, axis=1, args=("company_name_no_legal",) @@ -177,40 +121,13 @@ def clean_sec_df(df): return df -# TODO: moved to output table module, take out -def _remove_weird_sec_cols(sec_df): - weird_cols = ["]fiscal_year_end", "]irs_number", "]state_of_incorporation"] - for weird_col in weird_cols: - if weird_col not in sec_df: - continue - normal_col = weird_col[1:] - sec_df.loc[:, normal_col] = sec_df[normal_col].where( - sec_df[weird_col].isnull(), sec_df[weird_col] - ) - sec_df = sec_df.drop(columns=[weird_col]) - return sec_df - - -# TODO: for now split these into separate cleaning functions -# later unite them into one cleaning function +# TODO: delete def prepare_sec10k_basic_info_df(sec_df): """Preprocess SEC 10k basic information dataframe for record linkage.""" - # sec_df = _add_report_year_to_sec(sec_df) sec_df = sec_df.rename(columns=SEC_COL_MAP).reset_index() - # state_code_to_name = get_sec_state_code_dict() - # sec_df.loc[:, "loc_of_incorporation"] = sec_df["state_of_incorporation"].replace( - # state_code_to_name - # ) - # TODO: maybe shouldn't expand the state names and comparison should - # just be an exact match or nothing? - # sec_df.loc[:, "state"] = sec_df["state"].replace(state_code_to_name) - # TODO: needs a record_id_sec column? - # sec_df = sec_df.rename(columns={"record_id_sec": "record_id"}) - # sec_df = _remove_weird_sec_cols(sec_df) sec_df = clean_sec_df(sec_df) sec_df[STR_COLS] = sec_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) - # TODO: cluster/mark these duplicates so they can be assigned - # IDs post matching + # TODO: does this actually drop anything? 
sec_df = sec_df.drop_duplicates( subset=[ "central_index_key", @@ -223,31 +140,17 @@ def prepare_sec10k_basic_info_df(sec_df): "zip_code", ] ) - sec_df.loc[:, "sec_company_id"] = sec_df["central_index_key"] return sec_df +# TODO: delete def prepare_ex21_df(ex21_df): """Preprocess Ex. 21 extracted dataframe for record linkage.""" - ex21_df = ex21_df.rename(columns=EX21_COL_MAP) - # TODO: move this to general preprocessing function? - state_code_to_name = get_sec_state_code_dict() - ex21_df.loc[:, "loc_of_incorporation"] = ex21_df["loc_of_incorporation"].replace( - state_code_to_name - ) - name_to_state_code = {v: k for k, v in state_code_to_name.items()} - # need this? - ex21_df.loc[:, "state_of_incorporation"] = ex21_df["loc_of_incorporation"].replace( - name_to_state_code - ) ex21_df = clean_sec_df(ex21_df) - ex21_df = ex21_df.drop_duplicates( - subset=["company_name", "loc_of_incorporation", "report_year"] - ) - # ex21_df = ex21_df.reset_index(drop=True).reset_index(names="record_id") return ex21_df +# TODO: delete def prepare_eia_df(eia_df): """Preprocess EIA utility dataframe for record linkage.""" eia_df = eia_df.rename(columns=EIA_COL_MAP) @@ -262,31 +165,3 @@ def prepare_eia_df(eia_df): ) eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id") return eia_df - - -def add_sec_company_id_to_subsidiaries(ex21_df: pd.DataFrame): - """Add sec_company_id onto SEC Ex. 21 subsidiaries. - - At this point, the passed in Ex. 21 dataframe should have been - matched to SEC 10K filers with record linkage and assigned a CIK - where applicable (if the subsidiary files with the SEC). Take the - subsidiaries that don't have a CIK and create an sec_company_id - for those companies. - - Arguments: - ex21_df: A dataframe of subsidiaries from SEC Ex. 21 filings with - columns subsidiary_cik, company_name (of the subsidiary), - and loc_of_incorporation. 
- """ - ex21_df = ex21_df.sort_values(by="parent_cik") - ex21_df = ex21_df.drop_duplicates(subset=["company_name", "loc_of_incorporation"]) - ex21_df.loc[:, "sec_company_id"] = ( - ex21_df["parent_cik"] - + "_" - + (ex21_df.groupby("parent_cik").cumcount() + 1).astype(str) - ) - # override sec_company_id with CIK where a subsidiary has an assigned CIK - ex21_df.loc[:, "sec_company_id"] = ex21_df["sec_company_id"].where( - ex21_df["subsidiary_cik"].isnull(), ex21_df["subsidiary_cik"] - ) - return ex21_df diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py new file mode 100644 index 0000000..3a5edae --- /dev/null +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py @@ -0,0 +1,57 @@ +"""Configuration file for the splink SEC to EIA record linkage model.""" + +import splink.comparison_library as cl +from splink import block_on + +STR_COLS = [ + "company_name", + "street_address", + "street_address_2", + "city", + "state", + "zip_code", +] + +SHARED_COLS = [ + "record_id", + "report_date", + "report_year", + "company_name", + "company_name_no_legal", + "company_name_mphone", + "street_address", + "street_address_2", + "city", + "state", # could use state of incorporation from SEC + "zip_code", + "phone_number", +] + +MATCH_COLS = ["company_name", "state", "city", "street_address"] + +BLOCKING_RULES = [ + "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)", + "l.street_address = r.street_address", + "substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city", + "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2", +] + +company_name_comparison = cl.NameComparison( + "company_name_no_legal", + jaro_winkler_thresholds=[0.95], +) + +address_comparison = cl.LevenshteinAtThresholds( + 
"street_address", distance_threshold_or_thresholds=[1] +).configure(term_frequency_adjustments=True) +print(address_comparison.get_comparison("duckdb").human_readable_description) + +state_comparison = cl.ExactMatch("state").configure(term_frequency_adjustments=True) +city_comparison = cl.NameComparison("city", jaro_winkler_thresholds=[0.9]) + +# blocking rules for estimating probability two random records match +deterministic_blocking_rules = [ + block_on("company_name_mphone", "company_name_mphone"), + "jaro_winkler_similarity(r.company_name, l.company_name) >= .95 and l.city = r.city", + "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and l.street_address = r.street_address", +] diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py similarity index 70% rename from src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py rename to src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py index d0266b9..4da5c1b 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/create_eia_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py @@ -1,10 +1,23 @@ """Create an EIA input utilities table that's ready for record linkage with the SEC 10K companies.""" +import numpy as np import pandas as pd +from dagster import AssetOut, multi_asset + +from mozilla_sec_eia.library.record_linkage_utils import ( + fill_street_address_nulls, + transform_company_name, +) +from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS + +EIA_COL_MAP = { + "utility_name_eia": "company_name", # TODO: should be linking to owner or operator name? + "address_2": "street_address_2", +} # TODO: make Dagster inputs instead of reading from AWS? -def get_eia861_utilities_table(): +def harvest_eia861_utilities(): """Get the utilities contained in EIA Form 861. 
TODO: In PUDL we should eventually implement an actual thorough @@ -59,18 +72,36 @@ def get_eia861_utilities_table(): return eia861_df +@multi_asset( + outs={ + "core_eia__parents_and_subsidiaries": AssetOut( + io_manager_key="pandas_parquet_io_manager" + ) + # TODO: allow year partitions? + } +) # TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS? -def get_eia_utilities_table(): +def eia_rl_input_table(): """Create a table of EIA Form 860 and 861 utilities.""" raw_eia_df = pd.read_parquet( "s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet" ) - eia861_df = get_eia861_utilities_table() + eia861_df = harvest_eia861_utilities() eia_df = pd.concat([raw_eia_df, eia861_df]) eia_df = eia_df.drop_duplicates( subset=["utility_id_eia", "report_date"], keep="first" - ) + ).dropna(subset="utility_name_eia") + eia_df = eia_df.rename(columns=EIA_COL_MAP) eia_df["report_date"] = eia_df["report_date"].astype("datetime64[ns]") - # there are nulls from non harvested 861 utilities - eia_df = eia_df.dropna(subset="utility_name_eia") + eia_df.loc[:, "report_year"] = eia_df["report_date"].dt.year + eia_df = transform_company_name(eia_df) + eia_df.loc[:, "zip_code"] = eia_df["zip_code"].str[:5] + eia_df = fill_street_address_nulls(eia_df) + eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) + eia_df = eia_df.fillna(np.nan) + eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id") + return eia_df + + +production_assets = [eia_rl_input_table] diff --git a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py similarity index 85% rename from src/mozilla_sec_eia/models/sec10k/sec_output_table.py rename to src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 1ccaea9..7c4aab0 100644 --- a/src/mozilla_sec_eia/models/sec10k/sec_output_table.py +++ 
b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -9,20 +9,32 @@ import pandas as pd from dagster import AssetIn, AssetOut, multi_asset +from mozilla_sec_eia.library.record_linkage_utils import ( + fill_street_address_nulls, + transform_company_name, +) from mozilla_sec_eia.models.sec10k.utils.cloud import ( convert_ex21_id_to_filename, ) -from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import ( - company_name_cleaner, -) +from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS -from .extract import ( +from ..sec10k.extract import ( sec10k_filing_metadata, year_quarter_partitions, ) logger = logging.getLogger(f"catalystcoop.{__name__}") + +EX21_COL_MAP = {"subsidiary": "company_name", "loc": "loc_of_incorporation"} +SEC_COL_MAP = { + "company_conformed_name": "company_name", + "street_1": "street_address", + "street_2": "street_address_2", + "zip": "zip_code", + "business_phone": "phone_number", +} + INVALID_NAMES = [ "llc", "limited liability company", @@ -130,29 +142,7 @@ def clean_location_of_inc(df) -> pd.DataFrame: return df -def clean_company_name(df) -> pd.DataFrame: - """Clean company name column in SEC basic 10K or Ex. 21 dataframe. - - Arguments: - df: Ex. 21 or SEC 10K basic info dataframe with company_name - column. - """ - df["company_name"] = ( - df["company_name"].fillna(pd.NA).str.strip().str.lower().replace("", pd.NA) - ) - df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning( - df[["company_name"]] - ).str.strip() - df = df[ - (~df["company_name"].isin(INVALID_NAMES)) - & (~df["company_name_clean"].isin(INVALID_NAMES)) - ] - df = df.fillna(np.nan) - - return df - - -def add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame: +def _add_parent_company_cik(ex21_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFrame: """Add the CIK of the parent company to Ex. 
21 subsidiaries.""" ex21_df = ex21_df.merge(md[["filename", "cik"]], how="left", on="filename").rename( columns={"cik": "parent_company_cik"} @@ -276,48 +266,74 @@ def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame: "ex21_df": AssetIn("ex21_company_ownership_info"), }, outs={ - "clean_ex21_subsidiary_table": AssetOut( + "transformed_ex21_subsidiary_table": AssetOut( io_manager_key="pandas_parquet_io_manager", ) }, partitions_def=year_quarter_partitions, ) -def clean_ex21_table( +def transform_ex21_table( ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame ) -> pd.DataFrame: - """Clean Ex. 21 table of subsidiaries before combining with basic 10k table.""" + """Transform Ex. 21 table of subsidiaries before combining with basic 10k table.""" ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df) ex21_df = ex21_df.drop(columns=["id"]) ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata) - ex21_df = ex21_df.rename( - columns={"subsidiary": "company_name", "loc": "location_of_inc"} - ) + ex21_df = ex21_df.rename(columns=EX21_COL_MAP) ex21_df = clean_location_of_inc(ex21_df) - ex21_df = clean_company_name(ex21_df) - ex21_df = add_parent_company_cik(ex21_df, sec10k_filing_metadata) + # TODO: what to do with the clean company name? 
+ ex21_df = transform_company_name(ex21_df) + ex21_df = _add_parent_company_cik(ex21_df, sec10k_filing_metadata) # add an sec_company_id, ultimately this ID become the subsidiary's CIK # if the subsidiary is matched to an SEC filer ex21_df = create_sec_company_id_for_ex21_subs(ex21_df=ex21_df) ex21_df = _flatten_sec_companies_across_time(ex21_df) + ex21_df = ex21_df.fillna(np.nan) return ex21_df +def transform_basic10k_table( + basic_10k_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame +) -> pd.DataFrame: + """Transformations on SEC basic 10K filer table to prepare for record linkage.""" + basic_10k_df = basic_10k_df.reset_index().pivot_table( + values="value", index="filename", columns="key", aggfunc="first" + ) + basic_10k_df.columns.name = None + # TODO: chain these function calls together + basic_10k_df = basic_10k_df.reset_index() + basic_10k_df = _remove_weird_sec_cols(basic_10k_df) + basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata) + basic_10k_df = basic_10k_df.rename(columns=SEC_COL_MAP) + # add a location of incorporation to better match it to Ex. 21 subsidiaries + basic_10k_df = clean_location_of_inc(basic_10k_df) + basic_10k_df = transform_company_name(basic_10k_df) + basic_10k_df.loc[:, "zip_code"] = basic_10k_df["zip_code"].str[:5] + basic_10k_df = fill_street_address_nulls(basic_10k_df) + basic_10k_df.loc[:, "files_10k"] = True + basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"] + basic_10k_df[STR_COLS] = basic_10k_df[STR_COLS].apply( + lambda x: x.str.strip().str.lower() + ) + return basic_10k_df + + @multi_asset( ins={ "basic_10k_df": AssetIn("basic_10k_company_info"), - "clean_ex21_df": AssetIn("clean_ex21_subsidiary_table"), + "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), # specify an io_manager_key? 
}, outs={ - "out_sec_10k__parents_and_subsidiaries": AssetOut( + "core_sec_10k__parents_and_subsidiaries": AssetOut( io_manager_key="pandas_parquet_io_manager", # specify a dagster_type? ), }, partitions_def=year_quarter_partitions, ) -def sec_output_table( +def sec_rl_input_table( basic_10k_df: pd.DataFrame, clean_ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame, @@ -329,19 +345,7 @@ def sec_output_table( filing companies. Create an sec_company_id for subsidiaries that aren't linked to a CIK. """ - basic_10k_df = basic_10k_df.reset_index().pivot_table( - values="value", index="filename", columns="key", aggfunc="first" - ) - basic_10k_df.columns.name = None - basic_10k_df = basic_10k_df.reset_index() - basic_10k_df = _remove_weird_sec_cols(basic_10k_df) - basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata) - # add a location of incorporation to better match it to Ex. 21 subsidiaries - basic_10k_df = clean_location_of_inc(basic_10k_df) - basic_10k_df = basic_10k_df.rename( - columns={"company_conformed_name": "company_name"} - ) - basic_10k_df = clean_company_name(basic_10k_df) + basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata) ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company( basic10k_df=basic_10k_df, ex21_df=clean_ex21_df ) @@ -350,18 +354,18 @@ def sec_output_table( how="left", on="central_index_key", ) - basic_10k_df.loc[:, "files_10k"] = True - basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"] # get the subsidiary companies that weren't matched to a 10K filing company ex21_non_filing_subs_df = ex21_df_with_cik[ ex21_df_with_cik["central_index_key"].isnull() ] ex21_non_filing_subs_df.loc[:, "files_10k"] = False out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df]) + out_df = out_df.fillna(np.nan) # this drops records for earlier company names and addresses # that have since changed, so we lose some information out_df = 
_flatten_sec_companies_across_time(out_df) + return out_df -production_assets = [sec_output_table, sec10k_filing_metadata] +production_assets = [sec_rl_input_table, sec10k_filing_metadata] From 7563df8e57bcbf34296c4bcde674d63ae145426f Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Mon, 2 Dec 2024 13:37:20 -0500 Subject: [PATCH 141/161] include pseudo code of SEC output table module --- .../models/sec_eia_record_linkage/sec_output_table.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py new file mode 100644 index 0000000..7f974ad --- /dev/null +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py @@ -0,0 +1,10 @@ +"""Module for creating the SEC company output table which connects to EIA company data.""" + + +# the input to this method is "core_sec_10k__parents_and_subsidiaries" +def sec_output_table(): + """Connect SEC to EIA and format an output table.""" + # run record linkage to connect SEC to EIA? 
+ # add a utility_id_eia column onto the core table + # drop the following columns: company_name_no_legal, company_name_mphone, any other intermediate columns + pass From 3c88ff2f9d0a2578e5bc337aef5075e8575e4952 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Tue, 3 Dec 2024 11:28:37 -0500 Subject: [PATCH 142/161] Try using conda env to run tox --- .github/workflows/tox-pytest.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml index 4fff900..f03718e 100644 --- a/.github/workflows/tox-pytest.yml +++ b/.github/workflows/tox-pytest.yml @@ -11,7 +11,7 @@ jobs: id-token: write strategy: matrix: - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.11", "3.12"] fail-fast: false defaults: run: @@ -70,7 +70,7 @@ jobs: - name: Run PyTest with Tox run: | - tox + conda run -n mozilla-sec-eia tox - name: Upload test coverage report to CodeCov uses: codecov/codecov-action@v5 From fb3d772ba21b50e78fa98abe3d922887bc9d1509 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Tue, 3 Dec 2024 11:37:11 -0500 Subject: [PATCH 143/161] Add PUDL dependency and restrict to Py3.12 --- .github/workflows/tox-pytest.yml | 2 +- pyproject.toml | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml index f03718e..cf278df 100644 --- a/.github/workflows/tox-pytest.yml +++ b/.github/workflows/tox-pytest.yml @@ -11,7 +11,7 @@ jobs: id-token: write strategy: matrix: - python-version: ["3.11", "3.12"] + python-version: ["3.12"] fail-fast: false defaults: run: diff --git a/pyproject.toml b/pyproject.toml index a87becb..72536e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -19,6 +19,7 @@ dynamic = ["version"] license = {file = "LICENSE.txt"} dependencies = [ "accelerate>=0.21.0,<2.0", # Hugging Face dependency for PyTorch models + "catalystcoop.pudl @ git+https://github.com/catalyst-cooperative/pudl.git", 
"cloud-sql-python-connector[pg8000]", "dagster>=1.7.15", # 1.7.13 & 1.7.14 were both breaking things "dagster-mlflow", From 390770fb58760d1c37ac8ffdd614498401524f77 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Tue, 3 Dec 2024 11:40:30 -0500 Subject: [PATCH 144/161] Guess you don't need to specify tox env with setup-micromamba --- .github/workflows/tox-pytest.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tox-pytest.yml b/.github/workflows/tox-pytest.yml index cf278df..ac346f4 100644 --- a/.github/workflows/tox-pytest.yml +++ b/.github/workflows/tox-pytest.yml @@ -70,7 +70,7 @@ jobs: - name: Run PyTest with Tox run: | - conda run -n mozilla-sec-eia tox + tox - name: Upload test coverage report to CodeCov uses: codecov/codecov-action@v5 From 8e8beeeee11071344ef766dd9153a985e5387965 Mon Sep 17 00:00:00 2001 From: Dazhong Xia Date: Tue, 3 Dec 2024 11:45:33 -0500 Subject: [PATCH 145/161] Install GDAL version via conda since we rely on PUDL now --- test_environment.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/test_environment.yml b/test_environment.yml index c3d51d5..f54968d 100644 --- a/test_environment.yml +++ b/test_environment.yml @@ -29,6 +29,10 @@ dependencies: - pytorch>=2.2,<3 - torchvision + # GDAL is a transitive dependency whose binaries must match those installed by the + # pudl-dev conda environment, so we also install it with conda here. + - gdal==3.9.3 # pinned to ensure it matches pudl-dev environment exactly. 
+ # Use pip to install the package defined by this repo for development: - pip: - --editable ./[dev,docs,tests,types] From 50e7b7edcd9f8612fde43fc7576d8e72e6e82dc5 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Tue, 3 Dec 2024 15:02:04 -0800 Subject: [PATCH 146/161] notebook has cells for SEC and EIA hook up --- notebooks/18-kl-splink-sec-eia.ipynb | 1267 ++++++++------------------ 1 file changed, 378 insertions(+), 889 deletions(-) diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb index 19ab082..2fdeb79 100644 --- a/notebooks/18-kl-splink-sec-eia.ipynb +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -40,10 +40,7 @@ "id": "9b8224d4-7596-45b7-bfb5-028f29a96f3d", "metadata": {}, "source": [ - "# Inputs\n", - "\n", - "Questions:\n", - "* What's the best way to dagsterize this to get EIA data from PUDL?" + "# Inputs" ] }, { @@ -55,140 +52,157 @@ ] }, { - "cell_type": "code", - "execution_count": 3, - "id": "4ab5594d-7d1f-425d-80e1-92c30be73011", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "raw_eia_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2edc29d4-6c85-4b31-aae6-0de38c846e44", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "mergers_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_mergers.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "eaa37762-9f94-4927-9341-0ab09be3c8ab", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "raw_eia861_df = pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__assn_utility.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "3fb7895f-10c5-4450-96f9-77b36471b53e", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "eia_df = raw_eia_df.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": 
"06c76b82-1aad-47b2-aecc-6225a286cc40", - "metadata": { - "tags": [] - }, - "outputs": [], + "cell_type": "markdown", + "id": "13d543e7-334c-4606-849b-c8d60ad668d2", + "metadata": {}, "source": [ - "harvested_df = pd.concat([\n", - " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_utility_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", - " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_operational_data_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", - " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_demand_side_management_misc.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", - " pd.read_parquet(\"s3://pudl.catalyst.coop/stable/core_eia861__yearly_energy_efficiency.parquet\")[[\"report_date\", \"utility_id_eia\", \"utility_name_eia\"]],\n", - "])" + "TODO: materialize asset and read in from Dagster GCS storage" ] }, { "cell_type": "code", - "execution_count": 8, - "id": "d95acde9-1640-4c26-a5d1-c50b6666ccf4", - "metadata": { - "tags": [] - }, + "execution_count": 13, + "id": "7f3e5fdd-2c16-4dc0-8ad1-cf4516fbee33", + "metadata": {}, "outputs": [], "source": [ - "eia861_df = raw_eia861_df.merge(harvested_df, on=[\"report_date\", \"utility_id_eia\"], how=\"left\").drop_duplicates(subset=[\"report_date\", \"utility_id_eia\"])" + "from mozilla_sec_eia.models.sec_eia_record_linkage.create_eia_input import get_eia_utilities_table" ] }, { "cell_type": "code", - "execution_count": 9, - "id": "3b7484de-bbc7-47ba-b408-a1af1183018c", - "metadata": { - "tags": [] - }, + "execution_count": 14, + "id": "70ebf6dc-ed00-4f78-bbaf-2805860a1b63", + "metadata": {}, "outputs": [], "source": [ - "mergers_df = mergers_df[mergers_df[\"new_parent\"].notna()]\n", - "eia861_df = eia861_df.merge(mergers_df[[\"report_date\", \"new_parent\", \"merge_address\", \"merge_city\", \"merge_state\"]], \n", - " how=\"left\", \n", - " 
left_on=[\"report_date\", \"utility_name_eia\"],\n", - " right_on=[\"report_date\", \"new_parent\"]\n", - " )\n", - "eia861_df = eia861_df.rename(columns={\"merge_address\": \"street_address\", \"merge_city\": \"city\"})\n", - "eia861_df = eia861_df.groupby([\"report_date\", \"utility_id_eia\"]).first().reset_index()" + "eia_df = get_eia_utilities_table()" ] }, { "cell_type": "code", - "execution_count": 10, - "id": "d3d39fc0-130f-4bbd-9cc9-bbaf58808109", - "metadata": { - "tags": [] - }, - "outputs": [], + "execution_count": 28, + "id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
utility_id_eiautility_id_pudlutility_name_eiareport_datestreet_addresscitystatezip_codeplants_reported_ownerplants_reported_operatorplants_reported_asset_managerplants_reported_other_relationshipentity_typeattention_lineaddress_2zip_code_4contact_firstnamecontact_lastnamecontact_titlephone_numberphone_extensioncontact_firstname_2contact_lastname_2contact_title_2phone_number_2phone_extension_2data_maturity
06655016573.0Telyon AMZ Windsor LLC2024-01-01NoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNonemonthly_update
\n", + "
" + ], + "text/plain": [ + " utility_id_eia utility_id_pudl utility_name_eia report_date street_address city state zip_code plants_reported_owner plants_reported_operator plants_reported_asset_manager plants_reported_other_relationship entity_type attention_line address_2 zip_code_4 contact_firstname contact_lastname contact_title phone_number phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 phone_number_2 phone_extension_2 data_maturity\n", + "0 66550 16573.0 Telyon AMZ Windsor LLC 2024-01-01 None None None None None None None None None None None None None None None None None None None None None None monthly_update" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "eia861_df[\"state\"] = eia861_df[\"state\"].where(eia861_df[\"merge_state\"].isnull(), eia861_df[\"merge_state\"])\n", - "eia861_df = eia861_df.drop(columns=[\"new_parent\", \"merge_state\"])" + "eia_df.head(1)" ] }, { - "cell_type": "code", - "execution_count": 11, - "id": "04b6b682-91f4-49e2-9f74-2861548d1dd4", + "cell_type": "markdown", + "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec", "metadata": {}, - "outputs": [], "source": [ - "eia_df = pd.concat([eia_df, eia861_df])\n", - "eia_df = eia_df.drop_duplicates(subset=[\"utility_id_eia\", \"report_date\"], keep=\"first\")\n", - "# not sure at what point this stops being a datetime\n", - "eia_df[\"report_date\"] = eia_df[\"report_date\"].astype(\"datetime64[ns]\")\n", - "# there are nulls from non harvested 861 utilities\n", - "eia_df = eia_df.dropna(subset=\"utility_name_eia\")" + "### SEC 10K Basic Info" ] }, { "cell_type": "markdown", - "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec", + "id": "012db270-d944-464c-9d30-c5995ab491a4", "metadata": {}, "source": [ - "### SEC 10K Basic Info" + "TODO: read in asset from Dagster GCS storage" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 24, "id": "d4e950a6-ee6c-414c-b5b9-52a4175bf0b7", "metadata": {}, 
"outputs": [], @@ -198,7 +212,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 25, "id": "14eb7f24-7f7b-43aa-a0df-85e888e43821", "metadata": {}, "outputs": [], @@ -211,7 +225,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 26, "id": "1be3364e-9887-42b2-b303-0a24e8681acf", "metadata": { "tags": [] @@ -222,6 +236,127 @@ "raw_sec_df.columns.name = None" ] }, + { + "cell_type": "code", + "execution_count": 29, + "id": "5fcb05e5-6a57-439f-802f-527242f8f223", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
]fiscal_year_end]irs_number]state_of_incorporationbusiness_phonecentral_index_keycitycompany_conformed_namedate_of_name_changefilm_numberfiscal_year_endform_typeformer_conformed_nameirs_numberorganization_namesec_actsec_file_numberstandard_industrial_classificationstatestate_of_incorporationstreet_1street_2zip
filename
edgar/data/1000015/0000912057-00-014793.txtNaNNaNNaN20397367000001000015stamfordmeta group incNaN585471123110-kNaN060971675NaNNaN000-27280services-engineering, accounting, research, ma...ctde208 harbor drNaN06912-0061
\n", + "
" + ], + "text/plain": [ + " ]fiscal_year_end ]irs_number ]state_of_incorporation business_phone central_index_key city company_conformed_name date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number standard_industrial_classification state state_of_incorporation street_1 street_2 zip\n", + "filename \n", + "edgar/data/1000015/0000912057-00-014793.txt NaN NaN NaN 2039736700 0001000015 stamford meta group inc NaN 585471 1231 10-k NaN 060971675 NaN NaN 000-27280 services-engineering, accounting, research, ma... ct de 208 harbor dr NaN 06912-0061" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "raw_sec_df.head(1)" + ] + }, { "cell_type": "code", "execution_count": 15, @@ -232,6 +367,16 @@ "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)" ] }, + { + "cell_type": "code", + "execution_count": 21, + "id": "329e5d07-4eb4-4ba2-968e-aabf9be4937b", + "metadata": {}, + "outputs": [], + "source": [ + "sec_asset_df = pd.read_parquet(UPath(\"gs://sec10k-outputs/v2/out_sec_10k__parents_and_subsidiaries/2023q1.parquet\"))" + ] + }, { "cell_type": "markdown", "id": "3bac9280-1183-4aba-b78f-84bcf37ef1e2", @@ -240,6 +385,14 @@ "### Ex. 21" ] }, + { + "cell_type": "markdown", + "id": "ae57370a-36bb-40cf-b9f1-8ffdf373fa22", + "metadata": {}, + "source": [ + "TODO: get rid of this section" + ] + }, { "cell_type": "code", "execution_count": 17, @@ -277,6 +430,14 @@ "# Preprocess Ex. 
21" ] }, + { + "cell_type": "markdown", + "id": "917c79d4-9250-46a7-855a-14e526bbce6c", + "metadata": {}, + "source": [ + "TODO: get rid of this section" + ] + }, { "cell_type": "code", "execution_count": 19, @@ -527,32 +688,6 @@ "unmatched_ex21_df = ex21_with_cik[ex21_with_cik.subsidiary_cik.isnull()]" ] }, - { - "cell_type": "code", - "execution_count": 53, - "id": "56f41505-421e-4bf7-bfc4-93500e0c5e71", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "0 a_1\n", - "1 b_2\n", - "2 c_3\n", - "dtype: object" - ] - }, - "execution_count": 53, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df1 = pd.DataFrame({\"text1\": [\"a\", \"b\", \"c\"]})\n", - "df2 = pd.DataFrame({\"text2\": [\"1\", \"2\", \"3\"]})\n", - "df1[\"text1\"] + \"_\" + df2[\"text2\"]" - ] - }, { "cell_type": "markdown", "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db", @@ -565,6 +700,22 @@ "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?" ] }, + { + "cell_type": "markdown", + "id": "dd3b1335-6ffc-4c8d-b45e-5bee9f3f48da", + "metadata": {}, + "source": [ + "TODO: get rid of these cells" + ] + }, + { + "cell_type": "markdown", + "id": "aaf6c9f9-6fe6-4259-bbc4-d8a18e55984c", + "metadata": {}, + "source": [ + "TODO: filter for only \"files_10k\" filers" + ] + }, { "cell_type": "code", "execution_count": 67, @@ -706,6 +857,14 @@ "sec_clean_df.loc[:, \"street_address_list\"] = sec_clean_df[\"street_address\"].str.split()" ] }, + { + "cell_type": "markdown", + "id": "9f7bebc3-8e79-48e9-9178-68c112bb8ee9", + "metadata": {}, + "source": [ + "TODO: import from config file" + ] + }, { "cell_type": "code", "execution_count": 36, @@ -731,14 +890,6 @@ "]" ] }, - { - "cell_type": "markdown", - "id": "21b697b0-7d9e-452c-9b8b-ee40fd6bb7bd", - "metadata": {}, - "source": [ - "create list column for address information as well?" 
- ] - }, { "cell_type": "code", "execution_count": 55, @@ -759,6 +910,14 @@ "sec_match_df = sec_clean_df[SHARED_COLS]" ] }, + { + "cell_type": "markdown", + "id": "13bda908-2007-4bca-86ad-1bcf74b1b1ef", + "metadata": {}, + "source": [ + "TODO: import from config" + ] + }, { "cell_type": "code", "execution_count": 43, @@ -789,770 +948,79 @@ } ], "source": [ - "# duplicates exist because of differing report years\n", - "eia_match_df.duplicated(subset=match_cols).value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "b53e6244-f0ca-4256-bc09-9c3264675389", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True 168445\n", - "False 64515\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sec_match_df.duplicated(subset=match_cols).value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "baa742ae-1b49-4d0a-84c8-5f864398c8ed", - "metadata": {}, - "outputs": [], - "source": [ - "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "63e47f5f-e142-48fa-9ffa-e14d27ee1476", - "metadata": {}, - "outputs": [], - "source": [ - "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "5cf7ca17-b42b-40c6-b6f7-9077acdb1220", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "standard_industrial_classification\n", - "asset-backed securities [6189] 20311\n", - "pharmaceutical preparations [2834] 8530\n", - "state commercial banks [6022] 7886\n", - "real estate investment trusts [6798] 7706\n", - "services-prepackaged software [7372] 6007\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 39, - 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# could try to use keywords like gas, electricity, utility etc.\n", - "sec_clean_df[\"standard_industrial_classification\"].value_counts().head(5)" - ] - }, - { - "cell_type": "code", - "execution_count": 165, - "id": "c1500344-ff7f-450e-90dd-1105d8e7c637", - "metadata": {}, - "outputs": [], - "source": [ - "# run the Ex.21 to SEC model\n", - "filepath = Path(\"../sec_ex21_model_settings/2023_model.json\")\n", - "with open(filepath, 'r') as file:\n", - " sec_ex21_settings = json.load(file)" - ] - }, - { - "cell_type": "code", - "execution_count": 192, - "id": "172ea84f-a0b7-4e9c-b746-322a47663171", - "metadata": {}, - "outputs": [], - "source": [ - "sec_test_df = sec_match_df[sec_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 193, - "id": "3f8ba4ee-b1e7-4e05-982e-43d8e446eea9", - "metadata": {}, - "outputs": [], - "source": [ - "ex21_test_df = ex21_match_df[ex21_match_df.report_year.isin([2016, 2017])][[\"record_id\", \"report_year\", \"company_name\", \"loc_of_incorporation\", \"company_name_mphone\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 194, - "id": "2c715d7a-3d6d-4970-8ae3-5a6e1a12e937", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "14125" - ] - }, - "execution_count": 194, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(sec_test_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 195, - "id": "ec13db12-3664-4e00-aa83-7c372039b230", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "233101" - ] - }, - "execution_count": 195, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "len(ex21_test_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 196, - "id": "d2fcc1da-4435-4b17-8be7-cb34a6917522", 
- "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
record_idreport_yearcompany_nameloc_of_incorporationcompany_name_mphone
23232016nicholas financial incorporatedfloridaNXLS FNNXL INKRPRTT
24242017nicholas financial incorporatedfloridaNXLS FNNXL INKRPRTT
68682016sandisk corporationdelawareSNTSK KRPRXN
\n", - "
" - ], - "text/plain": [ - " record_id report_year company_name loc_of_incorporation company_name_mphone\n", - "23 23 2016 nicholas financial incorporated florida NXLS FNNXL INKRPRTT\n", - "24 24 2017 nicholas financial incorporated florida NXLS FNNXL INKRPRTT\n", - "68 68 2016 sandisk corporation delaware SNTSK KRPRXN" - ] - }, - "execution_count": 196, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sec_test_df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 197, - "id": "e24e2c8f-1124-4e87-b77d-55fca14a7d3c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
record_idreport_yearcompany_nameloc_of_incorporationcompany_name_mphone
283274602016capstone turbine singapore pte., limitedsingaporeKPSTN TRBN SNKPR PT LMTT
283274712016capstone turbine international, incorporateddelawareKPSTN TRBN INTRNXNL INKRPRTT
283274822016capstone turbine financial services, limited l...delawareKPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN
\n", - "
" - ], - "text/plain": [ - " record_id report_year company_name loc_of_incorporation company_name_mphone\n", - "2832746 0 2016 capstone turbine singapore pte., limited singapore KPSTN TRBN SNKPR PT LMTT\n", - "2832747 1 2016 capstone turbine international, incorporated delaware KPSTN TRBN INTRNXNL INKRPRTT\n", - "2832748 2 2016 capstone turbine financial services, limited l... delaware KPSTN TRBN FNNXL SRFSS LMTT LBLT KMPN" - ] - }, - "execution_count": 197, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "ex21_test_df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 198, - "id": "c531657f-5a0a-4ff5-b680-c6a1806feb75", - "metadata": {}, - "outputs": [], - "source": [ - "# can we just load this linker and make predictions? what happens with blocking?\n", - "sec_ex21_linker = Linker([sec_test_df, ex21_test_df], sec_ex21_settings, db_api=DuckDBAPI())" + "# duplicates exist because of differing report years\n", + "eia_match_df.duplicated(subset=match_cols).value_counts()" ] }, { "cell_type": "code", - "execution_count": 199, - "id": "14b239db-a816-428c-a132-dca0ed0998c4", + "execution_count": 52, + "id": "b53e6244-f0ca-4256-bc09-9c3264675389", "metadata": {}, "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Blocking time: 0.44 seconds\n" - ] - }, { "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "661a74c00c7e41f59787cad30a26ec78", - "version_major": 2, - "version_minor": 0 - }, "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + "True 168445\n", + "False 64515\n", + "Name: count, dtype: int64" ] }, + "execution_count": 52, "metadata": {}, - "output_type": "display_data" - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Predict time: 115.79 seconds\n" - ] + "output_type": "execute_result" } ], "source": [ - "sec_ex21_preds = 
sec_ex21_linker.inference.predict(threshold_match_probability=0.6)" + "sec_match_df.duplicated(subset=match_cols).value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "id": "baa742ae-1b49-4d0a-84c8-5f864398c8ed", + "metadata": {}, + "outputs": [], + "source": [ + "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")" ] }, { "cell_type": "code", - "execution_count": 200, - "id": "08167db9-9d9c-4b09-a839-847f85842324", + "execution_count": 57, + "id": "63e47f5f-e142-48fa-9ffa-e14d27ee1476", "metadata": {}, "outputs": [], "source": [ - "sec_ex21_preds_df = sec_ex21_preds.as_pandas_dataframe()" + "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")" ] }, { "cell_type": "code", - "execution_count": 201, - "id": "3f349a0a-269a-4f34-95e8-54a8c96c57f8", + "execution_count": 39, + "id": "5cf7ca17-b42b-40c6-b6f7-9077acdb1220", "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_lcompany_name_rgamma_company_nametf_company_name_ltf_company_name_rbf_company_namebf_tf_adj_company_nameloc_of_incorporation_lloc_of_incorporation_rgamma_loc_of_incorporationtf_loc_of_incorporation_ltf_loc_of_incorporation_rbf_loc_of_incorporationbf_tf_adj_loc_of_incorporationcompany_name_mphone_lcompany_name_mphone_rreport_year_lreport_year_r
011.7269540.999705__splink__input_table_0__splink__input_table_1955515939pendrell corporationpentzer corporation30.0000080.00000435295.4377531.0washingtonwashington30.0034270.0034272.32178060.034545PNTRL KRPRXNPNTSR KRPRXN20172017
10.9817200.663845__splink__input_table_0__splink__input_table_1800411485spok holdings, incorporatedautohaus holdings, incorporated20.0000080.0000042126.9805721.0delawaredelaware30.3545130.3545132.3217800.580388SPK HLTNKS INKRPRTTATHS HLTNKS INKRPRTT20172017
24.6040020.960504__splink__input_table_0__splink__input_table_1720682731ashford hospitality trust incorporatedashford hospitality trust, incorporated30.0000080.00000435295.4377531.0marylandNone-10.010087NaN1.0000001.000000AXFRT HSPTLT TRST INKRPRTTAXFRT HSPTLT TRST INKRPRTT20172017
33.9010620.937263__splink__input_table_0__splink__input_table_1586521115tx holdings, incorporatedtex holdings, incorporated30.0000080.00000435295.4377531.0georgiadelaware00.0055960.3545130.6143191.000000TKS HLTNKS INKRPRTTTKS HLTNKS INKRPRTT20172017
44.6040020.960504__splink__input_table_0__splink__input_table_1829461757pharma bio serv, incorporatedpharma bio serv us, incorporated30.0000080.00000435295.4377531.0Nonedelaware-1NaN0.3545131.0000001.000000FRM B SRF INKRPRTTFRM B SRF US INKRPRTT20172017
...........................................................................
93430.9817200.663845__splink__input_table_0__splink__input_table_12486881135transenterix incorporatedtrane brands, incorporated20.0000080.0000042126.9805721.0delawaredelaware30.3545130.3545132.3217800.580388TRNSNTRKS INKRPRTTTRN BRNTS INKRPRTT20172017
93443.9010620.937263__splink__input_table_0__splink__input_table_12602833506cree incorporatedj.crew incorporated30.0000080.00000435295.4377531.0north carolinadelaware00.0049260.3545130.6143191.000000KR INKRPRTTJKR INKRPRTT20172017
93450.9817200.663845__splink__input_table_0__splink__input_table_12322583973applied minerals, incorporatedapplied materials spv2, incorporated20.0000080.0000082126.9805721.0delawaredelaware30.3545130.3545132.3217800.580388APLT MNRLS INKRPRTTAPLT MTRLS SPF INKRPRTT20172016
93463.9010620.937263__splink__input_table_0__splink__input_table_12322583970applied minerals, incorporatedapplied materials japan, incorporated30.0000080.00000835295.4377531.0delawarejapan00.3545130.0057950.6143191.000000APLT MNRLS INKRPRTTAPLT MTRLS JPN INKRPRTT20172016
93472.7249340.868616__splink__input_table_0__splink__input_table_1267563285guess incorporatedaquesys, incorporated20.0000080.0000082126.9805721.0delawareus delaware20.3545130.0004624.5112761.000000KS INKRPRTTAKSS INKRPRTT20172016
\n", - "

9348 rows × 24 columns

\n", - "
" - ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name tf_company_name_l tf_company_name_r bf_company_name bf_tf_adj_company_name loc_of_incorporation_l loc_of_incorporation_r gamma_loc_of_incorporation tf_loc_of_incorporation_l tf_loc_of_incorporation_r bf_loc_of_incorporation bf_tf_adj_loc_of_incorporation company_name_mphone_l company_name_mphone_r report_year_l report_year_r\n", - "0 11.726954 0.999705 __splink__input_table_0 __splink__input_table_1 95551 5939 pendrell corporation pentzer corporation 3 0.000008 0.000004 35295.437753 1.0 washington washington 3 0.003427 0.003427 2.321780 60.034545 PNTRL KRPRXN PNTSR KRPRXN 2017 2017\n", - "1 0.981720 0.663845 __splink__input_table_0 __splink__input_table_1 80041 1485 spok holdings, incorporated autohaus holdings, incorporated 2 0.000008 0.000004 2126.980572 1.0 delaware delaware 3 0.354513 0.354513 2.321780 0.580388 SPK HLTNKS INKRPRTT ATHS HLTNKS INKRPRTT 2017 2017\n", - "2 4.604002 0.960504 __splink__input_table_0 __splink__input_table_1 72068 2731 ashford hospitality trust incorporated ashford hospitality trust, incorporated 3 0.000008 0.000004 35295.437753 1.0 maryland None -1 0.010087 NaN 1.000000 1.000000 AXFRT HSPTLT TRST INKRPRTT AXFRT HSPTLT TRST INKRPRTT 2017 2017\n", - "3 3.901062 0.937263 __splink__input_table_0 __splink__input_table_1 58652 1115 tx holdings, incorporated tex holdings, incorporated 3 0.000008 0.000004 35295.437753 1.0 georgia delaware 0 0.005596 0.354513 0.614319 1.000000 TKS HLTNKS INKRPRTT TKS HLTNKS INKRPRTT 2017 2017\n", - "4 4.604002 0.960504 __splink__input_table_0 __splink__input_table_1 82946 1757 pharma bio serv, incorporated pharma bio serv us, incorporated 3 0.000008 0.000004 35295.437753 1.0 None delaware -1 NaN 0.354513 1.000000 1.000000 FRM B SRF INKRPRTT FRM B SRF US INKRPRTT 2017 2017\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
... ... ... ... ... ... ... ...\n", - "9343 0.981720 0.663845 __splink__input_table_0 __splink__input_table_1 248688 1135 transenterix incorporated trane brands, incorporated 2 0.000008 0.000004 2126.980572 1.0 delaware delaware 3 0.354513 0.354513 2.321780 0.580388 TRNSNTRKS INKRPRTT TRN BRNTS INKRPRTT 2017 2017\n", - "9344 3.901062 0.937263 __splink__input_table_0 __splink__input_table_1 260283 3506 cree incorporated j.crew incorporated 3 0.000008 0.000004 35295.437753 1.0 north carolina delaware 0 0.004926 0.354513 0.614319 1.000000 KR INKRPRTT JKR INKRPRTT 2017 2017\n", - "9345 0.981720 0.663845 __splink__input_table_0 __splink__input_table_1 232258 3973 applied minerals, incorporated applied materials spv2, incorporated 2 0.000008 0.000008 2126.980572 1.0 delaware delaware 3 0.354513 0.354513 2.321780 0.580388 APLT MNRLS INKRPRTT APLT MTRLS SPF INKRPRTT 2017 2016\n", - "9346 3.901062 0.937263 __splink__input_table_0 __splink__input_table_1 232258 3970 applied minerals, incorporated applied materials japan, incorporated 3 0.000008 0.000008 35295.437753 1.0 delaware japan 0 0.354513 0.005795 0.614319 1.000000 APLT MNRLS INKRPRTT APLT MTRLS JPN INKRPRTT 2017 2016\n", - "9347 2.724934 0.868616 __splink__input_table_0 __splink__input_table_1 267563 285 guess incorporated aquesys, incorporated 2 0.000008 0.000008 2126.980572 1.0 delaware us delaware 2 0.354513 0.000462 4.511276 1.000000 KS INKRPRTT AKSS INKRPRTT 2017 2016\n", - "\n", - "[9348 rows x 24 columns]" + "standard_industrial_classification\n", + "asset-backed securities [6189] 20311\n", + "pharmaceutical preparations [2834] 8530\n", + "state commercial banks [6022] 7886\n", + "real estate investment trusts [6798] 7706\n", + "services-prepackaged software [7372] 6007\n", + "Name: count, dtype: int64" ] }, - "execution_count": 201, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# TODO: this needs to be improved, maybe just do a fuzzy match on string name?\n", - 
"sec_ex21_preds_df" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "defdf953-4af7-4d43-b7cf-5ae95360d70f", - "metadata": {}, - "outputs": [], - "source": [ - "# add the Ex. 21 subsidiaries that don't get a matching CIK to the SEC side\n", - "# run on all the data\n", - "# save the mapping of subsidiaries that are greater than a certain threshold (unclear why the blocking isn't working)\n", - "# get the subsidiaries that are less than a certain threshold\n", - "# transform them to have columns that match with the SEC df\n", - "# add them to the SEC side" + "# could try to use keywords like gas, electricity, utility etc.\n", + "sec_clean_df[\"standard_industrial_classification\"].value_counts().head(5)" ] }, { @@ -1947,20 +1415,11 @@ ] }, { - "cell_type": "code", - "execution_count": 66, - "id": "6402e556-b87c-47ca-bc30-ced2b42e6626", + "cell_type": "markdown", + "id": "5d0b403f-8a1a-4ee2-89db-f274f6a55bbd", "metadata": {}, - "outputs": [], "source": [ - "# probably shouldn't be blocking on report year, because we don't care that much \n", - "# about report year lining up\n", - "# try overlap between tokens in address or company name\n", - "br0 = \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)\"\n", - "br1 = \"l.street_address = r.street_address\"\n", - "br2 = \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city\"\n", - "# br3 = \"substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.zip_code = r.zip_code\"\n", - "br3 = \"substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2\"" + "TODO: import BLOCKING RULES from config" ] }, { @@ -1987,7 +1446,7 @@ "source": [ "counts = count_comparisons_from_blocking_rule(\n", " table_or_tables=[sec_match_df, eia_match_df],\n", - " blocking_rule=br0,\n", + " blocking_rule=BLOCKING_RULES[0],\n", " link_type=\"link_only\",\n", " 
unique_id_column_name='record_id',\n", " db_api=db_api,\n", @@ -2070,7 +1529,7 @@ "source": [ "result = n_largest_blocks(\n", " table_or_tables=[sec_match_df, eia_match_df],\n", - " blocking_rule=br0,\n", + " blocking_rule=BLOCKING_RULES[0],\n", " link_type=\"link_only\",\n", " db_api=db_api,\n", " n_largest=3\n", @@ -2179,14 +1638,9 @@ } ], "source": [ - "blocking_rules_for_analysis = [\n", - " br0, br1, br2, br3\n", - "]\n", - "\n", - "\n", "cumulative_comparisons_to_be_scored_from_blocking_rules_chart(\n", " table_or_tables=[sec_match_df, eia_match_df],\n", - " blocking_rules=blocking_rules_for_analysis,\n", + " blocking_rules=BLOCKING_RULES,\n", " db_api=db_api,\n", " unique_id_column_name='record_id',\n", " link_type=\"link_only\",\n", @@ -2201,6 +1655,14 @@ "## Create Model" ] }, + { + "cell_type": "markdown", + "id": "d35162e9-f671-4e99-a261-e1bd4d16717e", + "metadata": {}, + "source": [ + "TODO: import comparisons from config" + ] + }, { "cell_type": "code", "execution_count": 334, @@ -2373,7 +1835,7 @@ " city_comparison\n", " ],\n", " blocking_rules_to_generate_predictions=[\n", - " br0, br1, br2, br3\n", + " BLOCKING_RULES\n", " ],\n", " retain_intermediate_calculation_columns=True,\n", ")\n", @@ -2381,6 +1843,14 @@ "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())" ] }, + { + "cell_type": "markdown", + "id": "04fda31f-fcea-446e-813a-08617d7a43bf", + "metadata": {}, + "source": [ + "TODO: import deterministic rules" + ] + }, { "cell_type": "code", "execution_count": 453, @@ -5292,10 +4762,29 @@ "preds_df[preds_df.match_probability >= .95].sort_values(by=\"match_probability\")" ] }, + { + "cell_type": "markdown", + "id": "ad4d3859-81d1-4fa8-98cc-ff7c9fd038f6", + "metadata": {}, + "source": [ + "# Match to Ex. 
21 subsidiaries" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "id": "d1c56b09-80c7-4bfe-b1ec-c0220cadafbf", + "metadata": {}, + "outputs": [], + "source": [ + "# match EIA records that don't have a prediction to EIA subsidiaries\n", + "# can reuse code from SEC module?" + ] + }, { "cell_type": "code", "execution_count": null, - "id": "288ffe20-c69e-4c96-8835-765c06303bf2", + "id": "a5599b7a-ea9a-40fd-9ce1-cb79a8d4dc35", "metadata": {}, "outputs": [], "source": [] From 7dc78e19b9cda4eda7625bd2433bd06cd39aa05e Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 4 Dec 2024 15:35:58 -0500 Subject: [PATCH 147/161] Fix dagster setup for record linkage inputs --- .pre-commit-config.yaml | 1 - environment.yml | 1 - .../models/sec_eia_record_linkage/__init__.py | 19 +++++++++++++++++-- .../transform_sec_input.py | 14 +++++--------- workspace.yaml | 1 + 5 files changed, 23 insertions(+), 13 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 2aaf16a..7516290 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -35,7 +35,6 @@ repos: rev: 24.10.0 hooks: - id: black - language_version: python3.11 - repo: https://github.com/pre-commit/mirrors-prettier rev: v4.0.0-alpha.8 diff --git a/environment.yml b/environment.yml index a902ea3..33b1e04 100644 --- a/environment.yml +++ b/environment.yml @@ -30,5 +30,4 @@ dependencies: # Use pip to install the package defined by this repo for development: - pip: # - git+https://github.com/catalyst-cooperative/pudl.git@main - - -e /Users/katielamb/CatalystCoop/pudl - --editable ./[dev,docs,tests,types] diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py index c87c0cb..932b5f8 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py @@ -1,6 +1,6 @@ """Implement record linkage model between SEC companies and EIA 
utilities.""" -from dagster import Definitions, load_assets_from_modules +from dagster import AssetKey, AssetSpec, Definitions, load_assets_from_modules from dagstermill import ( ConfigurableLocalOutputNotebookIOManager, ) @@ -18,6 +18,7 @@ ) from mozilla_sec_eia.models.sec10k.utils.cloud import cloud_interface_resource +from ..sec10k.extract import year_quarter_partitions from . import transform_eia_input, transform_sec_input eia_assets = load_assets_from_modules([transform_eia_input]) @@ -30,8 +31,22 @@ "sec_input_table_creation", transform_sec_input.production_assets ) +basic_10k_company_info = AssetSpec( + key=AssetKey("basic_10k_company_info") +).with_io_manager_key("pandas_parquet_io_manager") + +ex21_company_ownership_info = AssetSpec( + key=AssetKey("ex21_company_ownership_info"), partitions_def=year_quarter_partitions +).with_io_manager_key("pandas_parquet_io_manager") + +sec10k_filing_metadata = AssetSpec( + key=AssetKey("sec10k_filing_metadata"), partitions_def=year_quarter_partitions +).with_io_manager_key("io_manager") + defs = Definitions( - sec_assets, + sec_assets + + eia_assets + + [basic_10k_company_info, ex21_company_ownership_info, sec10k_filing_metadata], jobs=[eia_input_table_production_job, sec_input_table_production_job], resources={ "cloud_interface": cloud_interface_resource, diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 7c4aab0..82c891e 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from dagster import AssetIn, AssetOut, multi_asset +from dagster import AllPartitionMapping, AssetIn, AssetOut, multi_asset from mozilla_sec_eia.library.record_linkage_utils import ( fill_street_address_nulls, @@ -18,11 +18,6 @@ ) from 
mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS -from ..sec10k.extract import ( - sec10k_filing_metadata, - year_quarter_partitions, -) - logger = logging.getLogger(f"catalystcoop.{__name__}") @@ -264,13 +259,14 @@ def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame: @multi_asset( ins={ "ex21_df": AssetIn("ex21_company_ownership_info"), + "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"), }, outs={ "transformed_ex21_subsidiary_table": AssetOut( io_manager_key="pandas_parquet_io_manager", ) }, - partitions_def=year_quarter_partitions, + partitions_def=AllPartitionMapping(), ) def transform_ex21_table( ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame @@ -323,6 +319,7 @@ def transform_basic10k_table( ins={ "basic_10k_df": AssetIn("basic_10k_company_info"), "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), + "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"), # specify an io_manager_key? }, outs={ @@ -331,7 +328,6 @@ def transform_basic10k_table( # specify a dagster_type? 
), }, - partitions_def=year_quarter_partitions, ) def sec_rl_input_table( basic_10k_df: pd.DataFrame, @@ -368,4 +364,4 @@ def sec_rl_input_table( return out_df -production_assets = [sec_rl_input_table, sec10k_filing_metadata] +production_assets = [sec_rl_input_table, transform_ex21_table] diff --git a/workspace.yaml b/workspace.yaml index 144aada..a208373 100644 --- a/workspace.yaml +++ b/workspace.yaml @@ -1,2 +1,3 @@ load_from: - python_module: mozilla_sec_eia.models.sec10k + - python_module: mozilla_sec_eia.models.sec_eia_record_linkage From daa8f0aafa624c9bf6ad55443738c265f3f11ba3 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Mon, 9 Dec 2024 14:08:34 -0800 Subject: [PATCH 148/161] fix util functions --- src/mozilla_sec_eia/library/record_linkage_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py index 9a33392..f899bf3 100644 --- a/src/mozilla_sec_eia/library/record_linkage_utils.py +++ b/src/mozilla_sec_eia/library/record_linkage_utils.py @@ -88,11 +88,13 @@ def get_metaphone_col(col: pd.Series) -> pd.Series: return col.apply(jellyfish.metaphone) -def transform_company_name(df: pd.DataFrame) -> pd.DataFrame: +def transform_company_name( + df: pd.DataFrame, col_name: str = "company_name" +) -> pd.DataFrame: """Apply cleaning, get metaphone col, drop invalid rows.""" - df = clean_company_name(df) - df.loc[:, "company_name_mphone"] = get_metaphone_col(df["company_name_no_legal"]) - df = drop_invalid_names(df, "company_name_clean") + df = clean_company_name(df, col_name=col_name) + df.loc[:, f"{col_name}_mphone"] = get_metaphone_col(df[f"{col_name}_no_legal"]) + df = drop_invalid_names(df, col_name) return df @@ -102,7 +104,7 @@ def fill_street_address_nulls( secondary_address_col: str = "street_address_2", ) -> pd.DataFrame: """Fill null street address with value from secondary address column.""" - df[address_col] = 
pd.where( + df[address_col] = df[address_col].where( (~df[address_col].isnull()) | (df[secondary_address_col].isnull()), df[secondary_address_col], ) From dbefe3426af2c867459bfb02c9ef393570a90616 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 10 Dec 2024 13:46:41 -0500 Subject: [PATCH 149/161] Handle missing partitions in extracted data --- .../models/sec_eia_record_linkage/__init__.py | 22 +++++++++-- .../transform_sec_input.py | 37 ++++++++----------- 2 files changed, 35 insertions(+), 24 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py index 932b5f8..bbcfa2f 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py @@ -1,6 +1,12 @@ """Implement record linkage model between SEC companies and EIA utilities.""" -from dagster import AssetKey, AssetSpec, Definitions, load_assets_from_modules +from dagster import ( + AssetKey, + AssetSpec, + Definitions, + StaticPartitionsDefinition, + load_assets_from_modules, +) from dagstermill import ( ConfigurableLocalOutputNotebookIOManager, ) @@ -35,12 +41,22 @@ key=AssetKey("basic_10k_company_info") ).with_io_manager_key("pandas_parquet_io_manager") +# Create year_quarter partitions +completed_partitions = StaticPartitionsDefinition( + [ + year_quarter + for year_quarter in year_quarter_partitions.get_partition_keys() + if year_quarter + not in ["2018q1", "2018q2", "2019q1", "2020q1", "2021q1", "2022q1"] + ] +) + ex21_company_ownership_info = AssetSpec( - key=AssetKey("ex21_company_ownership_info"), partitions_def=year_quarter_partitions + key=AssetKey("ex21_company_ownership_info"), partitions_def=completed_partitions ).with_io_manager_key("pandas_parquet_io_manager") sec10k_filing_metadata = AssetSpec( - key=AssetKey("sec10k_filing_metadata"), partitions_def=year_quarter_partitions + key=AssetKey("sec10k_filing_metadata"), 
partitions_def=completed_partitions ).with_io_manager_key("io_manager") defs = Definitions( diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 82c891e..2e8fa96 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd -from dagster import AllPartitionMapping, AssetIn, AssetOut, multi_asset +from dagster import AssetIn, asset from mozilla_sec_eia.library.record_linkage_utils import ( fill_street_address_nulls, @@ -256,22 +256,20 @@ def create_sec_company_id_for_ex21_subs(ex21_df: pd.DataFrame) -> pd.DataFrame: return ex21_df -@multi_asset( +@asset( ins={ - "ex21_df": AssetIn("ex21_company_ownership_info"), - "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"), - }, - outs={ - "transformed_ex21_subsidiary_table": AssetOut( - io_manager_key="pandas_parquet_io_manager", - ) + "ex21_dfs": AssetIn("ex21_company_ownership_info"), + "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"), }, - partitions_def=AllPartitionMapping(), ) -def transform_ex21_table( - ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame +def transformed_ex21_subsidiary_table( + ex21_dfs: dict[str, pd.DataFrame], + sec10k_filing_metadata_dfs: dict[str, pd.DataFrame], ) -> pd.DataFrame: """Transform Ex. 
21 table of subsidiaries before combining with basic 10k table.""" + ex21_df = pd.concat(ex21_dfs.values()) + sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values()) + ex21_df.loc[:, "filename"] = convert_ex21_id_to_filename(ex21_df) ex21_df = ex21_df.drop(columns=["id"]) ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata) @@ -315,21 +313,15 @@ def transform_basic10k_table( return basic_10k_df -@multi_asset( +@asset( ins={ "basic_10k_df": AssetIn("basic_10k_company_info"), "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"), # specify an io_manager_key? }, - outs={ - "core_sec_10k__parents_and_subsidiaries": AssetOut( - io_manager_key="pandas_parquet_io_manager", - # specify a dagster_type? - ), - }, ) -def sec_rl_input_table( +def core_sec_10k__parents_and_subsidiaries( basic_10k_df: pd.DataFrame, clean_ex21_df: pd.DataFrame, sec10k_filing_metadata: pd.DataFrame, @@ -364,4 +356,7 @@ def sec_rl_input_table( return out_df -production_assets = [sec_rl_input_table, transform_ex21_table] +production_assets = [ + core_sec_10k__parents_and_subsidiaries, + transformed_ex21_subsidiary_table, +] From 97f5d68af4faff270192fcc067d59ee6bdec4eb1 Mon Sep 17 00:00:00 2001 From: zschira Date: Wed, 11 Dec 2024 15:57:52 -0500 Subject: [PATCH 150/161] Fix basic_10k partitions --- .../models/sec_eia_record_linkage/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py index bbcfa2f..3350449 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/__init__.py @@ -37,10 +37,6 @@ "sec_input_table_creation", transform_sec_input.production_assets ) -basic_10k_company_info = AssetSpec( - key=AssetKey("basic_10k_company_info") 
-).with_io_manager_key("pandas_parquet_io_manager") - # Create year_quarter partitions completed_partitions = StaticPartitionsDefinition( [ @@ -51,6 +47,10 @@ ] ) +basic_10k_company_info = AssetSpec( + key=AssetKey("basic_10k_company_info"), partitions_def=completed_partitions +).with_io_manager_key("pandas_parquet_io_manager") + ex21_company_ownership_info = AssetSpec( key=AssetKey("ex21_company_ownership_info"), partitions_def=completed_partitions ).with_io_manager_key("pandas_parquet_io_manager") From b26f1f8c907d41c0a643a3cfc41f876cb1d23bd9 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Wed, 11 Dec 2024 15:15:07 -0800 Subject: [PATCH 151/161] debug materialization of rl input assets --- .../library/record_linkage_utils.py | 36 +++++++++++++++---- .../transform_sec_input.py | 12 ++++--- 2 files changed, 37 insertions(+), 11 deletions(-) diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py index f899bf3..924b6a4 100644 --- a/src/mozilla_sec_eia/library/record_linkage_utils.py +++ b/src/mozilla_sec_eia/library/record_linkage_utils.py @@ -1,5 +1,7 @@ """Utility functions for cleaning strings during modeling preprocessing steps.""" +from enum import StrEnum + import jellyfish import pandas as pd @@ -75,11 +77,18 @@ def clean_company_name( return df -def drop_invalid_names( - df: pd.DataFrame, col_name: str = "company_name" +def handle_invalid_names( + df: pd.DataFrame, col_name: str = "company_name", drop_invalid: bool = True ) -> pd.DataFrame: - """Drop rows that have invalid company names, like just 'llc', or 'partnership'.""" - return df[(~df[col_name].isin(INVALID_NAMES))] + """Drop rows that have invalid company names, like just 'llc', or 'partnership'. + + Either drop invalid company name values or fill with the empty string. Invalid + values are contained in `INVALID_NAMES`. 
+ """ + if drop_invalid: + return df[(~df[col_name].isin(INVALID_NAMES))] + df[col_name] = df[col_name].where(~df[col_name].isin(INVALID_NAMES), "") + return df # TODO: this is in PUDL, deduplicate @@ -88,13 +97,28 @@ def get_metaphone_col(col: pd.Series) -> pd.Series: return col.apply(jellyfish.metaphone) +class HandleNulls(StrEnum): + """Enum for handling null values in company name transform.""" + + DROP = "drop" + FILL_EMPTY_STR = "fill_empty_str" + + def transform_company_name( - df: pd.DataFrame, col_name: str = "company_name" + df: pd.DataFrame, + col_name: str = "company_name", + handle_nulls: HandleNulls = HandleNulls.DROP, ) -> pd.DataFrame: """Apply cleaning, get metaphone col, drop invalid rows.""" df = clean_company_name(df, col_name=col_name) + if handle_nulls == HandleNulls.DROP: + df = handle_invalid_names(df, col_name, drop_invalid=True) + df = df[~df[col_name].isnull()] + elif handle_nulls == HandleNulls.FILL_EMPTY_STR: + df = handle_invalid_names(df, col_name, drop_invalid=False) + df = df.fillna({col_name: ""}) df.loc[:, f"{col_name}_mphone"] = get_metaphone_col(df[f"{col_name}_no_legal"]) - df = drop_invalid_names(df, col_name) + return df diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 2e8fa96..ff88151 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -21,7 +21,7 @@ logger = logging.getLogger(f"catalystcoop.{__name__}") -EX21_COL_MAP = {"subsidiary": "company_name", "loc": "loc_of_incorporation"} +EX21_COL_MAP = {"subsidiary": "company_name", "loc": "location_of_inc"} SEC_COL_MAP = { "company_conformed_name": "company_name", "street_1": "street_address", @@ -315,16 +315,16 @@ def transform_basic10k_table( @asset( ins={ - "basic_10k_df": AssetIn("basic_10k_company_info"), + "basic_10k_dfs": 
AssetIn("basic_10k_company_info"), "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), - "sec10k_filing_metadata": AssetIn("sec10k_filing_metadata"), + "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"), # specify an io_manager_key? }, ) def core_sec_10k__parents_and_subsidiaries( - basic_10k_df: pd.DataFrame, + basic_10k_dfs: dict[str, pd.DataFrame], clean_ex21_df: pd.DataFrame, - sec10k_filing_metadata: pd.DataFrame, + sec10k_filing_metadata_dfs: dict[str, pd.DataFrame], ) -> pd.DataFrame: """Asset for creating an SEC 10K output table. @@ -333,6 +333,8 @@ def core_sec_10k__parents_and_subsidiaries( filing companies. Create an sec_company_id for subsidiaries that aren't linked to a CIK. """ + basic_10k_df = pd.concat(basic_10k_dfs.values()) + sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values()) basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata) ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company( basic10k_df=basic_10k_df, ex21_df=clean_ex21_df From acaf3d1866a6db34731c9c855a4baa34476af955 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Mon, 16 Dec 2024 11:59:26 -0800 Subject: [PATCH 152/161] clean up notebook to work with dagster assets --- notebooks/18-kl-splink-sec-eia.ipynb | 4000 +++++++---------- .../library/record_linkage_utils.py | 35 + .../sec_eia_record_linkage/preprocessing.py | 167 - .../sec_eia_splink_config.py | 5 +- .../transform_eia_input.py | 34 +- .../transform_sec_input.py | 105 +- .../street_suffix_abbreviations.json | 203 + 7 files changed, 1875 insertions(+), 2674 deletions(-) delete mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py create mode 100644 src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb index 2fdeb79..8de5812 100644 --- a/notebooks/18-kl-splink-sec-eia.ipynb +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -31,8 +31,16 @@ 
"from splink.exploratory import completeness_chart, profile_columns\n", "from upath import UPath\n", "\n", - "from mozilla_sec_eia.models.sec10k.utils.cloud import GCSArchive, convert_ex21_id_to_filename\n", - "from mozilla_sec_eia.models.sec_eia_record_linkage.preprocessing import add_sec_company_id_to_subsidiaries, prepare_sec10k_basic_info_df, prepare_eia_df, prepare_ex21_df" + "from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (\n", + " BLOCKING_RULES,\n", + " MATCH_COLS,\n", + " SHARED_COLS,\n", + " address_comparison,\n", + " city_comparison,\n", + " company_name_comparison,\n", + " deterministic_blocking_rules,\n", + " state_comparison\n", + ")" ] }, { @@ -51,37 +59,19 @@ "### EIA" ] }, - { - "cell_type": "markdown", - "id": "13d543e7-334c-4606-849b-c8d60ad668d2", - "metadata": {}, - "source": [ - "TODO: materialize asset and read in from Dagster GCS storage" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "7f3e5fdd-2c16-4dc0-8ad1-cf4516fbee33", - "metadata": {}, - "outputs": [], - "source": [ - "from mozilla_sec_eia.models.sec_eia_record_linkage.create_eia_input import get_eia_utilities_table" - ] - }, { "cell_type": "code", - "execution_count": 14, - "id": "70ebf6dc-ed00-4f78-bbaf-2805860a1b63", + "execution_count": 3, + "id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb", "metadata": {}, "outputs": [], "source": [ - "eia_df = get_eia_utilities_table()" + "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")" ] }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 4, "id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a", "metadata": {}, "outputs": [ @@ -106,11 +96,13 @@ " \n", " \n", " \n", + " record_id\n", + " company_name\n", + " street_address\n", " utility_id_eia\n", " utility_id_pudl\n", - " utility_name_eia\n", + " company_name_raw\n", " report_date\n", - " street_address\n", " city\n", " state\n", " zip_code\n", @@ -120,7 +112,7 @@ " 
plants_reported_other_relationship\n", " entity_type\n", " attention_line\n", - " address_2\n", + " street_address_2\n", " zip_code_4\n", " contact_firstname\n", " contact_lastname\n", @@ -133,15 +125,70 @@ " phone_number_2\n", " phone_extension_2\n", " data_maturity\n", + " report_year\n", + " company_name_no_legal\n", + " company_name_mphone\n", " \n", " \n", " \n", " \n", " 0\n", - " 66550\n", - " 16573.0\n", - " Telyon AMZ Windsor LLC\n", - " 2024-01-01\n", + " 0\n", + " 0ham wham8 solar limited liability company\n", + " 100 california st suite 400\n", + " 64380\n", + " 8321.0\n", + " 0ham wham8 solar, llc\n", + " 2023-01-01\n", + " san francisco\n", + " ca\n", + " 94118\n", + " True\n", + " None\n", + " None\n", + " None\n", + " Q\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " final\n", + " 2023\n", + " 0ham wham8 solar\n", + " HM HM SLR\n", + " \n", + " \n", + " 1\n", + " 1\n", + " 10 briggs solar ng limited liability company\n", + " 267 water st 2nd floor\n", + " 62685\n", + " 8502.0\n", + " 10 briggs solar ng, llc\n", + " 2020-01-01\n", + " warren\n", + " ri\n", + " 02885\n", + " True\n", + " True\n", + " None\n", + " None\n", + " Q\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", + " None\n", " None\n", " None\n", " None\n", @@ -149,9 +196,28 @@ " None\n", " None\n", " None\n", + " final\n", + " 2020\n", + " 10 briggs solar ng\n", + " BRKS SLR NK\n", + " \n", + " \n", + " 2\n", + " 2\n", + " 1001 ebenezer church solar limited liability c...\n", + " 176 ebenezer church rd\n", + " 63186\n", + " 8567.0\n", + " 1001 ebenezer church solar, llc\n", + " 2020-01-01\n", + " state road\n", + " nc\n", + " 28676\n", + " True\n", " None\n", " None\n", " None\n", + " Q\n", " None\n", " None\n", " None\n", @@ -164,82 +230,75 @@ " None\n", " None\n", " None\n", - " monthly_update\n", + " None\n", + " final\n", + " 2020\n", + 
" 1001 ebenezer church solar\n", + " EBNSR XRX SLR\n", " \n", " \n", "\n", "" ], "text/plain": [ - " utility_id_eia utility_id_pudl utility_name_eia report_date street_address city state zip_code plants_reported_owner plants_reported_operator plants_reported_asset_manager plants_reported_other_relationship entity_type attention_line address_2 zip_code_4 contact_firstname contact_lastname contact_title phone_number phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 phone_number_2 phone_extension_2 data_maturity\n", - "0 66550 16573.0 Telyon AMZ Windsor LLC 2024-01-01 None None None None None None None None None None None None None None None None None None None None None None monthly_update" + " record_id company_name street_address utility_id_eia utility_id_pudl company_name_raw report_date city state zip_code plants_reported_owner plants_reported_operator plants_reported_asset_manager plants_reported_other_relationship entity_type attention_line street_address_2 zip_code_4 contact_firstname contact_lastname contact_title phone_number phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 phone_number_2 phone_extension_2 data_maturity report_year company_name_no_legal company_name_mphone\n", + "0 0 0ham wham8 solar limited liability company 100 california st suite 400 64380 8321.0 0ham wham8 solar, llc 2023-01-01 san francisco ca 94118 True None None None Q None None None None None None None None None None None None None final 2023 0ham wham8 solar HM HM SLR\n", + "1 1 10 briggs solar ng limited liability company 267 water st 2nd floor 62685 8502.0 10 briggs solar ng, llc 2020-01-01 warren ri 02885 True True None None Q None None None None None None None None None None None None None final 2020 10 briggs solar ng BRKS SLR NK\n", + "2 2 1001 ebenezer church solar limited liability c... 
176 ebenezer church rd 63186 8567.0 1001 ebenezer church solar, llc 2020-01-01 state road nc 28676 True None None None Q None None None None None None None None None None None None None final 2020 1001 ebenezer church solar EBNSR XRX SLR" ] }, - "execution_count": 28, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "eia_df.head(1)" + "eia_df.head(3)" ] }, { - "cell_type": "markdown", - "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec", + "cell_type": "code", + "execution_count": 5, + "id": "755ab2a3-a32b-4ac1-81a5-0fb3a85dcdb3", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "20821" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "### SEC 10K Basic Info" + "len(eia_df)" ] }, { "cell_type": "markdown", - "id": "012db270-d944-464c-9d30-c5995ab491a4", - "metadata": {}, - "source": [ - "TODO: read in asset from Dagster GCS storage" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "d4e950a6-ee6c-414c-b5b9-52a4175bf0b7", + "id": "ae7706d6-3f32-4c99-aeb6-dac6c6529eec", "metadata": {}, - "outputs": [], "source": [ - "sec_path = UPath(\"gs://sec10k-outputs/v2/basic_10k_company_info\")" + "### SEC 10K Basic Info" ] }, { "cell_type": "code", - "execution_count": 25, - "id": "14eb7f24-7f7b-43aa-a0df-85e888e43821", + "execution_count": 100, + "id": "3f5f9e6c-0725-48e1-920f-3d516b4388a6", "metadata": {}, "outputs": [], "source": [ - "raw_sec_df = pd.DataFrame()\n", - "for file in sec_path.iterdir():\n", - " if file.name.split(\".\")[-1] == \"parquet\":\n", - " raw_sec_df = pd.concat([raw_sec_df, pd.read_parquet(sec_path / file.name)])" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "id": "1be3364e-9887-42b2-b303-0a24e8681acf", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "raw_sec_df = raw_sec_df.reset_index().pivot_table(values=\"value\", index=\"filename\", columns=\"key\", aggfunc=\"first\")\n", - 
"raw_sec_df.columns.name = None" + "sec_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/core_sec_10k__parents_and_subsidiaries\")" ] }, { "cell_type": "code", - "execution_count": 29, - "id": "5fcb05e5-6a57-439f-802f-527242f8f223", + "execution_count": 101, + "id": "a5ea9e1d-3afd-466f-a506-ecb3f23605c9", "metadata": {}, "outputs": [ { @@ -263,13 +322,14 @@ " \n", " \n", " \n", - " ]fiscal_year_end\n", - " ]irs_number\n", - " ]state_of_incorporation\n", - " business_phone\n", + " record_id\n", + " company_name\n", + " street_address\n", + " filename\n", + " phone_number\n", " central_index_key\n", " city\n", - " company_conformed_name\n", + " company_name_raw\n", " date_of_name_change\n", " film_number\n", " fiscal_year_end\n", @@ -282,777 +342,344 @@ " standard_industrial_classification\n", " state\n", " state_of_incorporation\n", - " street_1\n", - " street_2\n", - " zip\n", - " \n", - " \n", - " filename\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", + " street_address_2\n", + " zip_code\n", + " report_date\n", + " report_year\n", + " location_of_inc\n", + " company_name_no_legal\n", + " company_name_mphone\n", + " files_10k\n", + " sec_company_id\n", " \n", " \n", " \n", " \n", - " edgar/data/1000015/0000912057-00-014793.txt\n", - " NaN\n", + " 0\n", + " 0\n", + " 024 pharma incorporated\n", + " 224 datura st\n", + " edgar/data/1307969/0001683168-17-000653.txt\n", + " (732) 696-9333\n", + " 0001307969\n", + " west palm beach\n", + " 024 pharma, inc.\n", + " 20091202\n", + " 17711535\n", + " 1231\n", + " 10-k\n", + " b green innovations, inc.\n", + " 201862731\n", " NaN\n", + " 1934 act\n", + " 333-120490\n", + " plastics products, nec [3089]\n", + " fl\n", + " nj\n", " NaN\n", - " 2039736700\n", - " 0001000015\n", - " stamford\n", - " meta group inc\n", + " 33401\n", + " 2017-03-24\n", 
+ " 2017\n", + " new jersey\n", + " 024 pharma\n", + " FRM\n", + " True\n", + " 0001307969\n", + " \n", + " \n", + " 1\n", + " 1\n", + " 1 800 contacts incorporated\n", + " 13751 s wadsworth park dr suite d140\n", + " edgar/data/1050122/0001104659-06-017311.txt\n", + " 8015728225\n", + " 0001050122\n", + " draper\n", + " 1 800 contacts inc\n", " NaN\n", - " 585471\n", + " 06691791\n", " 1231\n", " 10-k\n", " NaN\n", - " 060971675\n", + " 870571643\n", " NaN\n", + " 1934 act\n", + " 000-23633\n", + " retail-catalog & mail-order houses [5961]\n", + " ut\n", + " de\n", " NaN\n", - " 000-27280\n", - " services-engineering, accounting, research, ma...\n", - " ct\n", + " 84020\n", + " 2006-03-16\n", + " 2006\n", + " delaware\n", + " 1 800 contacts\n", + " KNTKTS\n", + " True\n", + " 0001050122\n", + " \n", + " \n", + " 2\n", + " 2\n", + " 1 800 contacts incorporated\n", + " 66 e wadsworth park dr\n", + " edgar/data/1050122/0001104659-07-019474.txt\n", + " 801-316-5000\n", + " 0001050122\n", + " draper\n", + " 1 800 contacts inc\n", + " NaN\n", + " 07696033\n", + " 1231\n", + " 10-k\n", + " NaN\n", + " 870571643\n", + " NaN\n", + " 1934 act\n", + " 000-23633\n", + " retail-catalog & mail-order houses [5961]\n", + " ut\n", " de\n", - " 208 harbor dr\n", " NaN\n", - " 06912-0061\n", + " 84020\n", + " 2007-03-15\n", + " 2007\n", + " delaware\n", + " 1 800 contacts\n", + " KNTKTS\n", + " True\n", + " 0001050122\n", " \n", " \n", "\n", "" ], "text/plain": [ - " ]fiscal_year_end ]irs_number ]state_of_incorporation business_phone central_index_key city company_conformed_name date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number standard_industrial_classification state state_of_incorporation street_1 street_2 zip\n", - "filename \n", - "edgar/data/1000015/0000912057-00-014793.txt NaN NaN NaN 2039736700 0001000015 stamford meta group inc NaN 585471 1231 10-k NaN 060971675 NaN NaN 000-27280 
services-engineering, accounting, research, ma... ct de 208 harbor dr NaN 06912-0061" + " record_id company_name street_address filename phone_number central_index_key city company_name_raw date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number standard_industrial_classification state state_of_incorporation street_address_2 zip_code report_date report_year location_of_inc company_name_no_legal company_name_mphone files_10k sec_company_id\n", + "0 0 024 pharma incorporated 224 datura st edgar/data/1307969/0001683168-17-000653.txt (732) 696-9333 0001307969 west palm beach 024 pharma, inc. 20091202 17711535 1231 10-k b green innovations, inc. 201862731 NaN 1934 act 333-120490 plastics products, nec [3089] fl nj NaN 33401 2017-03-24 2017 new jersey 024 pharma FRM True 0001307969\n", + "1 1 1 800 contacts incorporated 13751 s wadsworth park dr suite d140 edgar/data/1050122/0001104659-06-017311.txt 8015728225 0001050122 draper 1 800 contacts inc NaN 06691791 1231 10-k NaN 870571643 NaN 1934 act 000-23633 retail-catalog & mail-order houses [5961] ut de NaN 84020 2006-03-16 2006 delaware 1 800 contacts KNTKTS True 0001050122\n", + "2 2 1 800 contacts incorporated 66 e wadsworth park dr edgar/data/1050122/0001104659-07-019474.txt 801-316-5000 0001050122 draper 1 800 contacts inc NaN 07696033 1231 10-k NaN 870571643 NaN 1934 act 000-23633 retail-catalog & mail-order houses [5961] ut de NaN 84020 2007-03-15 2007 delaware 1 800 contacts KNTKTS True 0001050122" ] }, - "execution_count": 29, + "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "raw_sec_df.head(1)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "f6f76c8b-ffbf-4e2b-870b-57f1260ba522", - "metadata": {}, - "outputs": [], - "source": [ - "sec_clean_df = prepare_sec10k_basic_info_df(raw_sec_df)" + "sec_df.head(3)" ] }, { "cell_type": "code", - "execution_count": 21, - "id": 
"329e5d07-4eb4-4ba2-968e-aabf9be4937b", - "metadata": {}, - "outputs": [], - "source": [ - "sec_asset_df = pd.read_parquet(UPath(\"gs://sec10k-outputs/v2/out_sec_10k__parents_and_subsidiaries/2023q1.parquet\"))" - ] - }, - { - "cell_type": "markdown", - "id": "3bac9280-1183-4aba-b78f-84bcf37ef1e2", + "execution_count": 102, + "id": "63d97f0d-df22-4c27-b3e7-1035166b4011", "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "61026" + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "### Ex. 21" + "len(sec_df)" ] }, { "cell_type": "markdown", - "id": "ae57370a-36bb-40cf-b9f1-8ffdf373fa22", + "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db", "metadata": {}, "source": [ - "TODO: get rid of this section" + "# Preprocess SEC and EIA\n", + "\n", + "Does it make more sense to do a direct match on company name after\n", + "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?" 
] }, { "cell_type": "code", - "execution_count": 17, - "id": "611da616-45ef-40ae-bc06-8bfbc871274d", + "execution_count": 103, + "id": "7d2d103a-2bbd-4974-b770-44626bdc5111", "metadata": {}, "outputs": [], "source": [ - "ex21_path = UPath(\"gs://sec10k-outputs/v2/ex21_company_ownership_info\")" + "sec_match_df = sec_df[sec_df.files_10k][SHARED_COLS]" ] }, { "cell_type": "code", - "execution_count": 18, - "id": "1d6272f2-b6f3-4497-9251-cbeedf794a0b", + "execution_count": 104, + "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27", "metadata": {}, "outputs": [], "source": [ - "raw_ex21_df = pd.DataFrame()\n", - "for file in ex21_path.iterdir():\n", - " if file.name.split(\".\")[-1] == \"parquet\":\n", - " year_quarter_df = pd.read_parquet(ex21_path / file.name)\n", - " report_year = file.name[:4]\n", - " year_quarter_df.loc[:, \"report_year\"] = report_year\n", - " year_quarter_df.loc[:, \"report_year\"] = pd.to_datetime(year_quarter_df[\"report_year\"], format=\"%Y\").dt.year\n", - " raw_ex21_df = pd.concat([raw_ex21_df, year_quarter_df])" - ] - }, - { - "cell_type": "markdown", - "id": "b636d438-ed71-426c-8c2a-9e550fe99958", - "metadata": { - "tags": [] - }, - "source": [ - "# Preprocess Ex. 21" - ] - }, - { - "cell_type": "markdown", - "id": "917c79d4-9250-46a7-855a-14e526bbce6c", - "metadata": {}, - "source": [ - "TODO: get rid of this section" + "eia_match_df = eia_df[SHARED_COLS]" ] }, { "cell_type": "code", - "execution_count": 19, - "id": "84e26751-663b-45a5-bb4d-fbfbbdca447e", + "execution_count": 105, + "id": "e754b2ef-5a0d-4582-8694-047528dfd339", "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/katielamb/CatalystCoop/mozilla-sec-eia/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py:168: FutureWarning: Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. 
To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`\n", - " df = df.fillna(np.nan)\n" - ] + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" } ], "source": [ - "ex21_clean_df = prepare_ex21_df(raw_ex21_df)" + "sec_match_df.record_id.is_unique" ] }, { "cell_type": "code", - "execution_count": 34, - "id": "027191c4-82fa-491b-8c73-54551c7fa4e6", - "metadata": {}, - "outputs": [], - "source": [ - "sec_match_df = sec_clean_df.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation\", \"report_year\"])\n", - "merged_df = sec_match_df.merge(ex21_clean_df, how=\"inner\", on=\"company_name\", suffixes=(\"_sec\", \"_ex21\"))\n", - "merged_df.loc[:, \"loc_tokens_sec\"] = merged_df[\"loc_of_incorporation_sec\"].fillna(\"\").str.lower().str.split()\n", - "merged_df.loc[:, \"loc_tokens_ex21\"] = merged_df[\"loc_of_incorporation_ex21\"].fillna(\"\").str.lower().str.split()\n", - "merged_df[\"loc_overlap\"] = merged_df.apply(\n", - " lambda row: len(set(row[\"loc_tokens_sec\"]) & set(row[\"loc_tokens_ex21\"])), axis=1\n", - ")\n", - "merged_df[\"report_year_diff\"] = merged_df.apply(\n", - " lambda row: abs(int(row[\"report_year_sec\"]) - int(row[\"report_year_ex21\"])), axis=1\n", - ")\n", - "# Sort by CIK, company_name, loc_overlap, and report_year_diff\n", - "# so that we can then choose the first record in each CIK, company_name group\n", - "merged_df = merged_df.sort_values(by=[\"central_index_key\", \"company_name\", \"loc_overlap\", \"report_year_diff\"],\n", - " ascending=[True, True, False, True]\n", - " )\n", - "# Select the row with the highest loc overlap and nearest report years for each CIK and company name\n", - "cik_and_company_pairs = merged_df.groupby([\"central_index_key\", \"company_name\"], as_index=False).first()\n", - "# We now have the closest matching CIK and company name pairs\n", - "# We want to get the best 
matching CIK for each company name and loc of incorporation\n", - "# Select the row with the highest loc overlap and nearest report years for each company name and loc pair\n", - "cik_and_company_pairs = cik_and_company_pairs.sort_values(by=[\"company_name\", \"loc_of_incorporation_ex21\", \"loc_overlap\", \"report_year_diff\"],\n", - " ascending=[True, True, False, True]\n", - " )\n", - "closest_match = cik_and_company_pairs.groupby([\"company_name\", \"loc_of_incorporation_ex21\"], as_index=False).first()\n", - "closest_match = closest_match.drop_duplicates(subset=[\"central_index_key\", \"company_name\", \"loc_of_incorporation_ex21\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "bd9e9f44-7ff8-4615-a5c3-ee8f32439e26", + "execution_count": 106, + "id": "38ad3504-2cde-455f-8896-6a435677541c", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "False 5808\n", - "Name: count, dtype: int64" + "True" ] }, - "execution_count": 35, + "execution_count": 106, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# a company name and location of incorporation should match to only one CIK\n", - "closest_match.duplicated(subset=[\"company_name\", \"loc_of_incorporation_ex21\"]).value_counts()" + "eia_match_df.record_id.is_unique" ] }, { "cell_type": "code", - "execution_count": 36, - "id": "64572f77-0a64-48a9-83fd-1c0179202010", + "execution_count": 107, + "id": "856c14d8-3250-4650-a2db-3808b4718f19", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "central_index_key\n", - "False 5532\n", - "True 276\n", - "Name: count, dtype: int64" + "False" ] }, - "execution_count": 36, + "execution_count": 107, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# it's okay if there's duplication here\n", - "# multiple subsidiaries can point to the same CIK\n", - "# and company names can change and they still keep the same CIK\n", - "closest_match.central_index_key.duplicated().value_counts()" + "# Note that 
sec_company_id isn't unique here because we are keeping each unique company name and address pair\n", + "# later we'll flatten on sec_company_id and utility_id_eia\n", + "sec_df.sec_company_id.is_unique" ] }, { - "cell_type": "code", - "execution_count": 37, - "id": "a669e0b7-c7fb-4c12-9121-0282e616286a", + "cell_type": "markdown", + "id": "b18fef7e-c316-4c90-b2bc-04706401135e", "metadata": {}, - "outputs": [], "source": [ - "ex21_with_cik = ex21_clean_df.merge(\n", - " closest_match[[\"company_name\", \"central_index_key\", \"loc_of_incorporation_ex21\"]].rename(columns={\"loc_of_incorporation_ex21\": \"loc_of_incorporation\"}),\n", - " how=\"left\",\n", - " on=[\"company_name\", \"loc_of_incorporation\"],\n", - ").rename(columns={\"central_index_key\": \"subsidiary_cik\"})" + "There can be duplicate records because sometimes a company changes utility ID or central index key over time. Keep the most recent version of that record." ] }, { "cell_type": "code", - "execution_count": 38, - "id": "245697ec-9451-47e7-953b-eba65062ee93", + "execution_count": 108, + "id": "842fa02e-5202-445c-b728-72bce42e740d", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "subsidiary_cik\n", - "True 2900030\n", - "False 21674\n", + "False 20821\n", "Name: count, dtype: int64" ] }, - "execution_count": 38, + "execution_count": 108, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "ex21_with_cik.subsidiary_cik.isnull().value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "1382a2e4-e88e-47bb-93ed-dafc576ec2f4", - "metadata": {}, - "outputs": [], - "source": [ - "ex21_with_cik = ex21_with_cik.merge(closest_match[[\"company_name\", \"central_index_key\"]],\n", - " how=\"left\",\n", - " on=\"company_name\"\n", - " ).rename(columns={\"central_index_key\": \"company_name_merge_cik\"})" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "id": "5f70e3ff-2494-4eda-bfa2-6989bcf442bb", - "metadata": {}, - "outputs": [], - 
"source": [ - "# if a subsidiary doesn't have a CIK and has a null location\n", - "# but its company name was assigned a CIK (with a different location)\n", - "# then assign that CIK to the subsidiary\n", - "ex21_with_cik[\"subsidiary_cik\"] = ex21_with_cik[\"subsidiary_cik\"].where(\n", - " ~(ex21_with_cik.subsidiary_cik.isnull()) | ~(ex21_with_cik.loc_of_incorporation.isnull()), \n", - " ex21_with_cik[\"company_name_merge_cik\"]\n", - ")" + "eia_match_df.duplicated(subset=MATCH_COLS).value_counts()" ] }, { "cell_type": "code", - "execution_count": 41, - "id": "63d4cc13-a4bf-4473-99bb-6d8fcf9a1174", + "execution_count": 109, + "id": "b53e6244-f0ca-4256-bc09-9c3264675389", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "subsidiary_cik\n", - "True 2897527\n", - "False 24221\n", + "False 61026\n", "Name: count, dtype: int64" ] }, - "execution_count": 41, + "execution_count": 109, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# there should be fewer null CIKs now\n", - "ex21_with_cik.subsidiary_cik.isnull().value_counts()" + "sec_match_df.duplicated(subset=MATCH_COLS).value_counts()" ] }, { "cell_type": "code", - "execution_count": 42, - "id": "e25cf09f-8bbd-4dcd-b308-71bc5a357bf5", + "execution_count": 253, + "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669", "metadata": {}, "outputs": [], "source": [ - "archive = GCSArchive()\n", - "md = archive.get_metadata()" + "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")\n", + "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=MATCH_COLS, keep=\"first\")" ] }, { - "cell_type": "code", - "execution_count": 43, - "id": "d17ed466-74d6-44e5-aaca-8dc6793712d4", + "cell_type": "markdown", + "id": "46d967d4-3722-437d-b2f0-37cbac17624f", "metadata": {}, - "outputs": [], "source": [ - "ex21_with_cik.loc[:, \"filename\"] = convert_ex21_id_to_filename(ex21_with_cik)" + "# Link SEC 
and EIA" ] }, { - "cell_type": "code", - "execution_count": 44, - "id": "6303051b-74bf-4043-885e-aaaf6593852d", + "cell_type": "markdown", + "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f", "metadata": {}, - "outputs": [], "source": [ - "ex21_with_cik = ex21_with_cik.merge(md[\"cik\"],\n", - " how=\"left\",\n", - " left_on=\"filename\",\n", - " right_index=True).rename(columns={\"cik\": \"parent_cik\"})" + "## Exploratory Analysis" ] }, { "cell_type": "code", - "execution_count": 45, - "id": "da72f2d4-54a8-487a-82ec-92d9e8df091f", + "execution_count": 112, + "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05", "metadata": {}, "outputs": [], "source": [ - "ex21_with_cik = add_sec_company_id_to_subsidiaries(ex21_with_cik)" + "db_api = DuckDBAPI()" ] }, { "cell_type": "code", - "execution_count": 46, - "id": "eff49691-d17c-4a55-817d-8eeaf83900e4", - "metadata": {}, - "outputs": [], - "source": [ - "# remove the Ex. 21 subsidiaries who were matched to a filing company\n", - "unmatched_ex21_df = ex21_with_cik[ex21_with_cik.subsidiary_cik.isnull()]" - ] - }, - { - "cell_type": "markdown", - "id": "381e4a8f-d8da-4802-b86e-1ac4fc3094db", - "metadata": {}, - "source": [ - "# Preprocess SEC and EIA\n", - "\n", - "Does it actually make sense to add in the Ex. 21 subsidiaries when we only have company name?\n", - "Does it make more sense to do a direct match on company name after\n", - "the SEC basic info to EIA match is done? And if there's a conflicting SEC match (one basic info and one Ex. 21) then review it manually?" 
- ] - }, - { - "cell_type": "markdown", - "id": "dd3b1335-6ffc-4c8d-b45e-5bee9f3f48da", - "metadata": {}, - "source": [ - "TODO: get rid of these cells" - ] - }, - { - "cell_type": "markdown", - "id": "aaf6c9f9-6fe6-4259-bbc4-d8a18e55984c", - "metadata": {}, - "source": [ - "TODO: filter for only \"files_10k\" filers" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "8453d55d-a3ac-422d-9cef-e7f13d582efe", - "metadata": {}, - "outputs": [], - "source": [ - "# find a way to use state of incorporation even though it's not on the EIA side?\n", - "sec_full_clean_df = pd.concat([sec_clean_df, \n", - " unmatched_ex21_df[[\"sec_company_id\", \"report_year\", \"company_name\", \"company_name_no_legal\", \"company_name_mphone\", \"state_of_incorporation\"]]\n", - " ])" - ] - }, - { - "cell_type": "code", - "execution_count": 71, - "id": "2bc79d7d-b756-47d5-a61d-a3a761160250", - "metadata": {}, - "outputs": [], - "source": [ - "sec_full_clean_df = sec_full_clean_df.reset_index(drop=True).reset_index(names=\"record_id\")" - ] - }, - { - "cell_type": "code", - "execution_count": 24, - "id": "166d3c96-93d6-4a22-afbf-8d94dc9ecfb9", - "metadata": {}, - "outputs": [], - "source": [ - "# for now, just use sec_clean_df without Ex. 
21 subsidiaries\n", - "sec_clean_df = sec_clean_df.reset_index(drop=True).reset_index(names=\"record_id\")" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "24defbd5-ccfe-4844-ab87-3adb1b4df2d9", - "metadata": {}, - "outputs": [], - "source": [ - "eia_clean_df = prepare_eia_df(eia_df)" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "id": "e754b2ef-5a0d-4582-8694-047528dfd339", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 27, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sec_clean_df.record_id.is_unique" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "id": "38ad3504-2cde-455f-8896-6a435677541c", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 28, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eia_clean_df.record_id.is_unique" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - "id": "e90de0d3-3220-4869-80a3-fc7dd381d393", - "metadata": {}, - "outputs": [], - "source": [ - "# TODO: move this into preprocessing\n", - "# strip legal terms and then make a list column from company name\n", - "# use this for blocking and comnparison levels\n", - "eia_clean_df.loc[:, \"company_name_mphone_list\"] = eia_clean_df[\"company_name_mphone\"].str.split()" - ] - }, - { - "cell_type": "code", - "execution_count": 31, - "id": "b71a24f2-51b5-444f-a645-054cc3e25cf8", - "metadata": {}, - "outputs": [], - "source": [ - "sec_clean_df.loc[:, \"company_name_mphone_list\"] = sec_clean_df[\"company_name_mphone\"].str.split()" - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "id": "eb9c00dc-50a5-49cc-9589-0bf4df917ab3", - "metadata": {}, - "outputs": [], - "source": [ - "eia_clean_df.loc[:, \"zip_code\"] = eia_clean_df[\"zip_code\"].str[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 33, - "id": 
"edead864-7004-4081-ab78-313c14ff81a3", - "metadata": {}, - "outputs": [], - "source": [ - "sec_clean_df.loc[:, \"zip_code\"] = sec_clean_df[\"zip_code\"].str[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "a5af13b2-9d43-42e6-9477-1fb7d52412cf", - "metadata": {}, - "outputs": [], - "source": [ - "# I think we don't need this column\n", - "eia_clean_df.loc[:, \"street_address_list\"] = eia_clean_df[\"street_address\"].str.split()\n", - "sec_clean_df.loc[:, \"street_address_list\"] = sec_clean_df[\"street_address\"].str.split()" - ] - }, - { - "cell_type": "markdown", - "id": "9f7bebc3-8e79-48e9-9178-68c112bb8ee9", - "metadata": {}, - "source": [ - "TODO: import from config file" - ] - }, - { - "cell_type": "code", - "execution_count": 36, - "id": "a284b2c9-8edf-4b3f-ab08-5b2cff65ed19", - "metadata": {}, - "outputs": [], - "source": [ - "SHARED_COLS = [\n", - " \"record_id\",\n", - " \"report_date\",\n", - " \"report_year\",\n", - " \"company_name\",\n", - " \"company_name_no_legal\",\n", - " \"street_address\",\n", - " \"street_address_list\",\n", - " \"street_address_2\",\n", - " \"city\",\n", - " \"state\", # could use state of incorporation from SEC\n", - " \"zip_code\",\n", - " \"phone_number\",\n", - " \"company_name_mphone\",\n", - " \"company_name_mphone_list\"\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": 55, - "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27", - "metadata": {}, - "outputs": [], - "source": [ - "eia_match_df = eia_clean_df[SHARED_COLS]" - ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "2b8b6313-abf0-4233-8bad-43b8b9cc1e0b", - "metadata": {}, - "outputs": [], - "source": [ - "sec_match_df = sec_clean_df[SHARED_COLS]" - ] - }, - { - "cell_type": "markdown", - "id": "13bda908-2007-4bca-86ad-1bcf74b1b1ef", - "metadata": {}, - "source": [ - "TODO: import from config" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "a4a15b86-71cf-4d8d-9c09-f82a70f10273", - 
"metadata": {}, - "outputs": [], - "source": [ - "match_cols = [\"company_name\", \"state\", \"city\", \"street_address\", \"zip_code\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 49, - "id": "842fa02e-5202-445c-b728-72bce42e740d", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True 138441\n", - "False 39407\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 49, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# duplicates exist because of differing report years\n", - "eia_match_df.duplicated(subset=match_cols).value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 52, - "id": "b53e6244-f0ca-4256-bc09-9c3264675389", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "True 168445\n", - "False 64515\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 52, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sec_match_df.duplicated(subset=match_cols).value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 56, - "id": "baa742ae-1b49-4d0a-84c8-5f864398c8ed", - "metadata": {}, - "outputs": [], - "source": [ - "sec_match_df = sec_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")" - ] - }, - { - "cell_type": "code", - "execution_count": 57, - "id": "63e47f5f-e142-48fa-9ffa-e14d27ee1476", - "metadata": {}, - "outputs": [], - "source": [ - "eia_match_df = eia_match_df.sort_values(by=\"report_year\", ascending=False).drop_duplicates(subset=match_cols, keep=\"first\")" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "id": "5cf7ca17-b42b-40c6-b6f7-9077acdb1220", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "standard_industrial_classification\n", - "asset-backed securities [6189] 20311\n", - "pharmaceutical preparations [2834] 8530\n", - "state commercial banks [6022] 7886\n", - "real estate investment trusts [6798] 
7706\n", - "services-prepackaged software [7372] 6007\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# could try to use keywords like gas, electricity, utility etc.\n", - "sec_clean_df[\"standard_industrial_classification\"].value_counts().head(5)" - ] - }, - { - "cell_type": "markdown", - "id": "46d967d4-3722-437d-b2f0-37cbac17624f", - "metadata": {}, - "source": [ - "# Link SEC and EIA" - ] - }, - { - "cell_type": "markdown", - "id": "509988b1-ed2c-41b3-9334-f44ae599cf4f", - "metadata": {}, - "source": [ - "## Exploratory Analysis" - ] - }, - { - "cell_type": "code", - "execution_count": 128, - "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05", - "metadata": {}, - "outputs": [], - "source": [ - "db_api = DuckDBAPI()" - ] - }, - { - "cell_type": "code", - "execution_count": 129, - "id": "ac4e560b-6946-4cc7-b2bc-6d5f4b154da6", + "execution_count": 113, + "id": "4bab1568-6a55-427c-9a78-e44db8b0584d", "metadata": {}, "outputs": [ { @@ -1060,23 +687,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 129, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "# this goes way down when we start matching in the Ex. 21 subsidiaries\n", "completeness_chart(sec_match_df, db_api=db_api)" ] }, { "cell_type": "code", - "execution_count": 130, - "id": "02063bcd-8301-4a70-aab1-0bbf6119cf8b", + "execution_count": 114, + "id": "6b9479e3-e836-4407-a2b6-926c185065a8", "metadata": {}, "outputs": [ { @@ -1150,23 +776,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 130, + "execution_count": 114, "metadata": {}, "output_type": "execute_result" } @@ -1230,7 +856,7 @@ }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 115, "id": "95bd4db7-c447-4a0e-ab37-59d4beac0b11", "metadata": {}, "outputs": [ @@ -1239,23 +865,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 131, + "execution_count": 115, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "profile_columns(sec_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + "profile_columns(sec_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" ] }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 116, "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7", "metadata": {}, "outputs": [ @@ -1328,23 +954,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 132, + "execution_count": 116, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "profile_columns(eia_match_df[match_cols], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" + "profile_columns(eia_match_df[MATCH_COLS], db_api=DuckDBAPI(), top_n=10, bottom_n=5)" ] }, { @@ -1415,35 +1041,28 @@ ] }, { - "cell_type": "markdown", - "id": "5d0b403f-8a1a-4ee2-89db-f274f6a55bbd", - "metadata": {}, - "source": [ - "TODO: import BLOCKING RULES from config" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237", + "cell_type": "code", + "execution_count": 117, + "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "{'number_of_comparisons_generated_pre_filter_conditions': 988101,\n", - " 'number_of_comparisons_to_be_scored_post_filter_conditions': 988101,\n", + "{'number_of_comparisons_generated_pre_filter_conditions': 487944,\n", + " 'number_of_comparisons_to_be_scored_post_filter_conditions': 487944,\n", " 'filter_conditions_identified': '',\n", " 'equi_join_conditions_identified': 'SUBSTRING(l.company_name_mphone, 1, 4) = SUBSTRING(r.company_name_mphone, 1, 4)',\n", " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" ] }, - "execution_count": 67, + "execution_count": 117, "metadata": {}, "output_type": "execute_result" } ], "source": [ + "# useful for experimenting with a new blocking rule\n", "counts = count_comparisons_from_blocking_rule(\n", " table_or_tables=[sec_match_df, eia_match_df],\n", " blocking_rule=BLOCKING_RULES[0],\n", @@ -1457,7 +1076,7 @@ }, { "cell_type": "code", - "execution_count": 68, + "execution_count": 118, "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3", "metadata": {}, "outputs": [ @@ -1491,24 +1110,24 @@ " \n", " 
\n", " 0\n", - " AMRK\n", - " 888\n", - " 85\n", - " 75480\n", + " INTR\n", + " 445\n", + " 76\n", + " 33820\n", " \n", " \n", " 1\n", - " INTR\n", - " 468\n", - " 157\n", - " 73476\n", + " AMRK\n", + " 851\n", + " 38\n", + " 32338\n", " \n", " \n", " 2\n", " FRST\n", - " 836\n", - " 82\n", - " 68552\n", + " 816\n", + " 36\n", + " 29376\n", " \n", " \n", "\n", @@ -1516,12 +1135,12 @@ ], "text/plain": [ " key_0 count_l count_r block_count\n", - "0 AMRK 888 85 75480\n", - "1 INTR 468 157 73476\n", - "2 FRST 836 82 68552" + "0 INTR 445 76 33820\n", + "1 AMRK 851 38 32338\n", + "2 FRST 816 36 29376" ] }, - "execution_count": 68, + "execution_count": 118, "metadata": {}, "output_type": "execute_result" } @@ -1540,46 +1159,32 @@ }, { "cell_type": "code", - "execution_count": 69, + "execution_count": 121, "id": "4e1a9844-5d98-4cac-a083-eef134f083ce", "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "bf1ed000055946dcbdc2d64e635de891", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "data": { "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.Chart(...)" ] }, - "execution_count": 69, + "execution_count": 121, "metadata": {}, "output_type": "execute_result" } @@ -1655,14 +1260,6 @@ "## Create Model" ] }, - { - "cell_type": "markdown", - "id": "d35162e9-f671-4e99-a261-e1bd4d16717e", - "metadata": {}, - "source": [ - "TODO: import comparisons from config" - ] - }, { "cell_type": "code", "execution_count": 334, @@ -1688,7 +1285,27 @@ }, { "cell_type": "code", - "execution_count": 386, + "execution_count": 422, + "id": "d2e043ed-7f64-4547-992d-7f947a63db6d", + "metadata": {}, + "outputs": [], + "source": [ + "# NOT USED\n", + "address_comparison = cl.CustomComparison(\n", + " comparison_levels = [\n", + " cll.NullLevel(\"street_address\"),\n", + " cll.ExactMatchLevel(\"street_address\"),\n", + " cll.LevenshteinLevel(\"street_address\", distance_threshold=1),\n", + " cll.ArraySubsetLevel(\"street_address_list\"),\n", + " ],\n", + " output_column_name=\"street_address\",\n", + " comparison_description=None\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 122, "id": "f72f546c-c338-4c9f-aab1-ba03c3169e18", "metadata": {}, "outputs": [ @@ -1707,22 +1324,12 @@ } ], "source": [ - "company_name_comparison = cl.NameComparison(\n", - " \"company_name_no_legal\",\n", - " jaro_winkler_thresholds=[.95],\n", - ")\n", - "\"\"\"\n", - "company_name_comparison = cl.JaccardAtThresholds(\n", - " \"company_name\",\n", - " # dmeta_col_name=\"company_name_mphone_list\" # this was breaking it for some reason\n", - ")\n", - "\"\"\"\n", "print(company_name_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 449, + "execution_count": 123, "id": "4298a288-c306-4d75-9d72-e5b8f87774ce", "metadata": {}, "outputs": [ @@ -1741,58 +1348,36 @@ } ], "source": [ - "address_comparison = cl.LevenshteinAtThresholds(\n", - " \"street_address\",\n", - " distance_threshold_or_thresholds=[1]\n", - 
").configure(term_frequency_adjustments=True)\n", "print(address_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 422, - "id": "d2e043ed-7f64-4547-992d-7f947a63db6d", - "metadata": {}, - "outputs": [], - "source": [ - "# NOT USED\n", - "address_comparison = cl.CustomComparison(\n", - " comparison_levels = [\n", - " cll.NullLevel(\"street_address\"),\n", - " cll.ExactMatchLevel(\"street_address\"),\n", - " cll.LevenshteinLevel(\"street_address\", distance_threshold=1),\n", - " cll.ArraySubsetLevel(\"street_address_list\"),\n", - " ],\n", - " output_column_name=\"street_address\",\n", - " comparison_description=None\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 388, - "id": "63ed7cd2-d803-4d17-b730-c9fc17df0607", - "metadata": {}, - "outputs": [], - "source": [ - "# Use state and city instead of zip code\n", - "zip_code_comparison = cl.ExactMatch(\"zip_code\").configure(term_frequency_adjustments=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 450, - "id": "974a3982-38a1-45cb-9875-b8d4584c808d", + "execution_count": 124, + "id": "afdd5872-bc29-406f-bd0a-d5f4436f6794", "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Comparison 'ExactMatch' of \"state\".\n", + "Similarity is assessed using the following ComparisonLevels:\n", + " - 'state is NULL' with SQL rule: \"state_l\" IS NULL OR \"state_r\" IS NULL\n", + " - 'Exact match on state' with SQL rule: \"state_l\" = \"state_r\"\n", + " - 'All other comparisons' with SQL rule: ELSE\n", + "\n" + ] + } + ], "source": [ - "state_comparison = cl.ExactMatch(\"state\").configure(term_frequency_adjustments=True)" + "print(state_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 451, - "id": "7592619b-340a-4496-8195-9ce932cae699", + "execution_count": 125, + "id": 
"90596d17-edb4-4ed1-9306-ea6c33ad00c6", "metadata": {}, "outputs": [ { @@ -1810,16 +1395,12 @@ } ], "source": [ - "city_comparison = cl.NameComparison(\n", - " \"city\",\n", - " jaro_winkler_thresholds=[0.9]\n", - ")\n", "print(city_comparison.get_comparison(\"duckdb\").human_readable_description)" ] }, { "cell_type": "code", - "execution_count": 452, + "execution_count": 126, "id": "a946c820-9b93-41f1-9fc2-6da47d0cf407", "metadata": {}, "outputs": [], @@ -1830,30 +1411,19 @@ " comparisons=[\n", " company_name_comparison,\n", " address_comparison,\n", - " # zip_code_comparison,\n", " state_comparison,\n", " city_comparison\n", " ],\n", - " blocking_rules_to_generate_predictions=[\n", - " BLOCKING_RULES\n", - " ],\n", + " blocking_rules_to_generate_predictions=BLOCKING_RULES,\n", " retain_intermediate_calculation_columns=True,\n", ")\n", "\n", "linker = Linker([sec_match_df, eia_match_df], settings, db_api=DuckDBAPI())" ] }, - { - "cell_type": "markdown", - "id": "04fda31f-fcea-446e-813a-08617d7a43bf", - "metadata": {}, - "source": [ - "TODO: import deterministic rules" - ] - }, { "cell_type": "code", - "execution_count": 453, + "execution_count": 127, "id": "36cae876-783d-4bff-89df-9d30cc5e60d6", "metadata": {}, "outputs": [ @@ -1861,26 +1431,18 @@ "name": "stderr", "output_type": "stream", "text": [ - "Probability two random records match is estimated to be 1.78e-06.\n", - "This means that amongst all possible pairwise record comparisons, one in 562,858.42 are expected to match. With 2,542,342,605 total possible comparisons, we expect a total of around 4,516.84 matching pairs\n" + "Probability two random records match is estimated to be 2.37e-06.\n", + "This means that amongst all possible pairwise record comparisons, one in 421,176.28 are expected to match. 
With 1,270,622,346 total possible comparisons, we expect a total of around 3,016.84 matching pairs\n" ] } ], "source": [ - "deterministic_rules = [\n", - " block_on(\"company_name_mphone\", \"company_name_mphone\"),\n", - " # block_on(\"street_address\"),\n", - " \"jaro_winkler_similarity(r.company_name, l.company_name) >= .95 and l.city = r.city\",\n", - " # \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and jaccard(r.street_address, l.street_address) >= .9\",\n", - " \"substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4) and l.city = r.city and l.street_address = r.street_address\",\n", - "]\n", - "\n", - "linker.training.estimate_probability_two_random_records_match(deterministic_rules, recall=0.95)" + "linker.training.estimate_probability_two_random_records_match(deterministic_blocking_rules, recall=0.95)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 128, "id": "a5190bda-070d-4d8e-a6db-be7c65b04db3", "metadata": {}, "outputs": [ @@ -1890,6 +1452,48 @@ "text": [ "----- Estimating u probabilities using random sampling -----\n" ] + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "f4e8733639644336a9a29f9b599af513", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "2fc66d179b9a430795b4ec68a164c22e", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "\n", + "Estimated u probabilities using random sampling\n", + "\n", + "Your model is not yet fully 
trained. Missing estimates for:\n", + " - company_name_no_legal (no m values are trained).\n", + " - street_address (no m values are trained).\n", + " - state (no m values are trained).\n", + " - city (no m values are trained).\n" + ] } ], "source": [ @@ -1898,7 +1502,7 @@ }, { "cell_type": "code", - "execution_count": 427, + "execution_count": 129, "id": "f8dd1336-8a5b-49d2-a054-df0c0a0e953f", "metadata": {}, "outputs": [ @@ -1926,12 +1530,13 @@ "WARNING:\n", "Level All other comparisons on comparison company_name_no_legal not observed in dataset, unable to train m value\n", "\n", - "Iteration 1: Largest change in params was -0.347 in the m_probability of city, level `All other comparisons`\n", - "Iteration 2: Largest change in params was 0.307 in the m_probability of city, level `All other comparisons`\n", - "Iteration 3: Largest change in params was 0.0403 in the m_probability of city, level `All other comparisons`\n", - "Iteration 4: Largest change in params was 4.46e-05 in the m_probability of city, level `All other comparisons`\n", + "Iteration 1: Largest change in params was 0.702 in the m_probability of street_address, level `All other comparisons`\n", + "Iteration 2: Largest change in params was 0.283 in probability_two_random_records_match\n", + "Iteration 3: Largest change in params was 0.282 in probability_two_random_records_match\n", + "Iteration 4: Largest change in params was 0.000537 in probability_two_random_records_match\n", + "Iteration 5: Largest change in params was 1.09e-07 in probability_two_random_records_match\n", "\n", - "EM converged after 4 iterations\n", + "EM converged after 5 iterations\n", "m probability not trained for company_name_no_legal - Jaro-Winkler distance of company_name_no_legal >= 0.95 (comparison vector value: 1). This usually means the comparison level was never observed in the training data.\n", "m probability not trained for company_name_no_legal - All other comparisons (comparison vector value: 0). 
This usually means the comparison level was never observed in the training data.\n", "\n", @@ -1949,7 +1554,7 @@ }, { "cell_type": "code", - "execution_count": 428, + "execution_count": 130, "id": "9581aa18-3352-429a-86c4-6078bcf13a55", "metadata": {}, "outputs": [ @@ -1971,21 +1576,19 @@ "Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: \n", " - street_address\n", "\n", - "Iteration 1: Largest change in params was -0.395 in the m_probability of city, level `All other comparisons`\n", - "Iteration 2: Largest change in params was 0.889 in the m_probability of company_name_no_legal, level `All other comparisons`\n", - "Iteration 3: Largest change in params was 0.285 in probability_two_random_records_match\n", - "Iteration 4: Largest change in params was 0.0152 in probability_two_random_records_match\n", - "Iteration 5: Largest change in params was 0.048 in the m_probability of city, level `All other comparisons`\n", - "Iteration 6: Largest change in params was 0.0559 in the m_probability of city, level `All other comparisons`\n", - "Iteration 7: Largest change in params was 0.0205 in probability_two_random_records_match\n", - "Iteration 8: Largest change in params was 0.00696 in probability_two_random_records_match\n", - "Iteration 9: Largest change in params was 0.0024 in probability_two_random_records_match\n", - "Iteration 10: Largest change in params was 0.000849 in probability_two_random_records_match\n", - "Iteration 11: Largest change in params was 0.000305 in probability_two_random_records_match\n", - "Iteration 12: Largest change in params was 0.00011 in probability_two_random_records_match\n", - "Iteration 13: Largest change in params was 3.98e-05 in probability_two_random_records_match\n", + "Iteration 1: Largest change in params was -0.967 in the m_probability of company_name_no_legal, level `Exact match on company_name_no_legal`\n", + "Iteration 2: Largest change in params was 0.477 in 
probability_two_random_records_match\n", + "Iteration 3: Largest change in params was 0.0395 in probability_two_random_records_match\n", + "Iteration 4: Largest change in params was 0.0443 in the m_probability of city, level `All other comparisons`\n", + "Iteration 5: Largest change in params was 0.0195 in probability_two_random_records_match\n", + "Iteration 6: Largest change in params was 0.00733 in probability_two_random_records_match\n", + "Iteration 7: Largest change in params was 0.00275 in probability_two_random_records_match\n", + "Iteration 8: Largest change in params was 0.00105 in probability_two_random_records_match\n", + "Iteration 9: Largest change in params was 0.0004 in probability_two_random_records_match\n", + "Iteration 10: Largest change in params was 0.000153 in probability_two_random_records_match\n", + "Iteration 11: Largest change in params was 5.9e-05 in probability_two_random_records_match\n", "\n", - "EM converged after 13 iterations\n", + "EM converged after 11 iterations\n", "\n", "Your model is fully trained. All comparisons have at least one estimate for their m and u values\n" ] @@ -2000,8 +1603,8 @@ }, { "cell_type": "code", - "execution_count": 429, - "id": "8ad317ed-1db9-4932-9815-6e9e0efa9580", + "execution_count": 131, + "id": "61298aa2-dbd4-4f2a-9c25-5f831d226d13", "metadata": {}, "outputs": [ { @@ -2009,23 +1612,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 429, + "execution_count": 131, "metadata": {}, "output_type": "execute_result" } @@ -2089,8 +1692,8 @@ }, { "cell_type": "code", - "execution_count": 430, - "id": "5e21bf55-64ac-4f4b-8f1c-d7507b5e7af6", + "execution_count": 132, + "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3", "metadata": {}, "outputs": [ { @@ -2098,23 +1701,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.HConcatChart(...)" ] }, - "execution_count": 430, + "execution_count": 132, "metadata": {}, "output_type": "execute_result" } @@ -2198,30 +1801,16 @@ }, { "cell_type": "code", - "execution_count": 431, + "execution_count": 133, "id": "94e96441-89b6-4516-aa6a-4d1593ce03be", "metadata": {}, "outputs": [ - { - "data": { - "application/vnd.jupyter.widget-view+json": { - "model_id": "3ce1c0af73694400974ca6253619dd5b", - "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - "FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, { "name": "stderr", "output_type": "stream", "text": [ - "Blocking time: 9.73 seconds\n", - "Predict time: 0.52 seconds\n" + "Blocking time: 0.16 seconds\n", + "Predict time: 0.31 seconds\n" ] } ], @@ -2233,7 +1822,7 @@ }, { "cell_type": "code", - "execution_count": 432, + "execution_count": 134, "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0", "metadata": {}, "outputs": [], @@ -2243,7 +1832,7 @@ }, { "cell_type": "code", - "execution_count": 433, + "execution_count": 135, "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e", "metadata": {}, "outputs": [ @@ -2283,10 +1872,11 @@ " bf_tf_adj_company_name_no_legal\n", " street_address_l\n", " street_address_r\n", - " street_address_list_l\n", - " street_address_list_r\n", " gamma_street_address\n", + " tf_street_address_l\n", + " tf_street_address_r\n", " bf_street_address\n", + " bf_tf_adj_street_address\n", " state_l\n", " state_r\n", " gamma_state\n", @@ -2308,199 +1898,204 @@ " \n", " \n", " \n", - " 32260\n", - " -24.047823\n", - " 5.766122e-08\n", + " 295287\n", + " -22.970759\n", + " 1.216501e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 82087\n", - " 113663\n", - " sutro biopharma\n", - " stirling energy systems solar one\n", + " 9829\n", + " 3043\n", + " capitol bancorp\n", + " capital power\n", " 0\n", - " 0.000019\n", - " 
0.000029\n", - " 0.985981\n", - " 1.0\n", - " 310 utah ave., suite 150\n", - " suite 150\n", - " [310, utah, ave.,, suite, 150]\n", - " [suite, 150]\n", - " 0.0\n", - " 0.265921\n", - " ca\n", - " az\n", + " 0.000024\n", + " 0.000012\n", + " 0.986045\n", + " 1.000000\n", + " capitol bancorp ctr\n", + " 120010423 101 st nw\n", " 0\n", - " 0.149142\n", - " 0.012950\n", - " 0.310698\n", - " 1.0\n", - " south san francisco\n", - " phoenix\n", + " 0.000012\n", + " 0.000110\n", + " 0.881657\n", + " 1.000000\n", + " mi\n", + " ab\n", + " 0\n", + " 0.015147\n", + " 0.000197\n", + " 0.198711\n", + " 1.000000\n", + " lansing\n", + " edmonton\n", + " 0\n", + " 0.000293\n", + " 0.000428\n", + " 0.296590\n", + " 1.000000\n", + " KPTL BNKRP\n", + " KPTL PWR\n", " 0\n", - " 0.001438\n", - " 0.003511\n", - " 0.398403\n", - " 1.0\n", - " STR BFRM\n", - " STRLNK ENRJ SSTMS SLR ON\n", - " 3\n", " \n", " \n", - " 27875\n", - " -24.047823\n", - " 5.766122e-08\n", + " 383898\n", + " -22.970759\n", + " 1.216501e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 126035\n", - " 113797\n", - " corner growth acquisition 2\n", - " grubb and ellis management services\n", + " 51783\n", + " 17550\n", + " state bancorp\n", + " state street bank and trust\n", " 0\n", - " 0.000010\n", - " 0.000019\n", - " 0.985981\n", - " 1.0\n", - " 251 lytton avenue, suite 200\n", - " suite 200\n", - " [251, lytton, avenue,, suite, 200]\n", - " [suite, 200]\n", - " 0.0\n", - " 0.265921\n", - " ca\n", - " pa\n", + " 0.000024\n", + " 0.000024\n", + " 0.986045\n", + " 1.000000\n", + " 2 jericho plz\n", + " 100 summer st\n", " 0\n", - " 0.149142\n", - " 0.030197\n", - " 0.310698\n", - " 1.0\n", - " palo alto\n", - " pittsburgh\n", + " 0.000012\n", + " 0.000024\n", + " 0.881657\n", + " 1.000000\n", + " ny\n", + " ma\n", + " 0\n", + " 0.120228\n", + " 0.041765\n", + " 0.198711\n", + " 1.000000\n", + " jericho\n", + " boston\n", + " 0\n", + " 0.000306\n", + " 0.014319\n", + " 0.296590\n", + " 
1.000000\n", + " STT BNKRP\n", + " STT STRT BNK ANT TRST\n", " 0\n", - " 0.001850\n", - " 0.003656\n", - " 0.398403\n", - " 1.0\n", - " KRNR KR0 AKKSXN\n", - " KRB ANT ELS MNJMNT SRFSS\n", - " 3\n", " \n", " \n", - " 27993\n", - " -24.047823\n", - " 5.766122e-08\n", + " 383897\n", + " -22.970759\n", + " 1.216501e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 125096\n", - " 97905\n", - " altus power\n", - " allegheny ridge wind farm\n", + " 51782\n", + " 17550\n", + " state auto financial\n", + " state street bank and trust\n", " 0\n", - " 0.000010\n", - " 0.000038\n", - " 0.985981\n", - " 1.0\n", - " 2200 atlantic street, 6th floor\n", - " 6th floor\n", - " [2200, atlantic, street,, 6th, floor]\n", - " [6th, floor]\n", - " 0.0\n", - " 0.265921\n", - " ct\n", - " ca\n", + " 0.000024\n", + " 0.000024\n", + " 0.986045\n", + " 1.000000\n", + " 518 east broad st\n", + " 100 summer st\n", " 0\n", - " 0.020325\n", - " 0.149142\n", - " 0.310698\n", - " 1.0\n", - " stamford\n", - " san francisco\n", + " 0.000012\n", + " 0.000024\n", + " 0.881657\n", + " 1.000000\n", + " oh\n", + " ma\n", + " 0\n", + " 0.016991\n", + " 0.041765\n", + " 0.198711\n", + " 1.000000\n", + " columbus\n", + " boston\n", + " 0\n", + " 0.002788\n", + " 0.014319\n", + " 0.296590\n", + " 1.000000\n", + " STT AT FNNXL\n", + " STT STRT BNK ANT TRST\n", " 0\n", - " 0.003789\n", - " 0.013374\n", - " 0.398403\n", - " 1.0\n", - " ALTS PWR\n", - " ALKHN RJ WNT FRM\n", - " 3\n", " \n", " \n", - " 28003\n", - " -24.047823\n", - " 5.766122e-08\n", + " 383896\n", + " -22.970759\n", + " 1.216501e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 115402\n", - " 91508\n", - " clearway energy\n", - " clipper windpower\n", + " 51781\n", + " 17550\n", + " state auto financial\n", + " state street bank and trust\n", " 0\n", - " 0.000038\n", - " 0.000029\n", - " 0.985981\n", - " 1.0\n", - " 300 carnegie center, suite 300\n", - " suite 300\n", - " [300, carnegie, center,, 
suite, 300]\n", - " [suite, 300]\n", - " 0.0\n", - " 0.265921\n", - " nj\n", - " ca\n", + " 0.000024\n", + " 0.000024\n", + " 0.986045\n", + " 1.000000\n", + " 518 e broad st\n", + " 100 summer st\n", " 0\n", - " 0.031159\n", - " 0.149142\n", - " 0.310698\n", - " 1.0\n", - " princeton\n", - " carpinteria\n", + " 0.000012\n", + " 0.000024\n", + " 0.881657\n", + " 1.000000\n", + " oh\n", + " ma\n", + " 0\n", + " 0.016991\n", + " 0.041765\n", + " 0.198711\n", + " 1.000000\n", + " columbus\n", + " boston\n", + " 0\n", + " 0.002788\n", + " 0.014319\n", + " 0.296590\n", + " 1.000000\n", + " STT AT FNNXL\n", + " STT STRT BNK ANT TRST\n", " 0\n", - " 0.002118\n", - " 0.000189\n", - " 0.398403\n", - " 1.0\n", - " KLRW ENRJ\n", - " KLPR WNTPWR\n", - " 3\n", " \n", " \n", - " 28024\n", - " -24.047823\n", - " 5.766122e-08\n", + " 383895\n", + " -22.970759\n", + " 1.216501e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 125009\n", - " 77758\n", - " benchmark 2020 b21 mortgage trust\n", - " bountiful city city of\n", + " 51780\n", + " 3805\n", + " starz\n", + " citrus world\n", " 0\n", - " 0.000010\n", - " 0.000048\n", - " 0.985981\n", - " 1.0\n", - " 200 west street\n", - " 198 south 200 west street\n", - " [200, west, street]\n", - " [198, south, 200, west, street]\n", - " 0.0\n", - " 0.265921\n", - " ny\n", - " ut\n", + " 0.000024\n", + " 0.000049\n", + " 0.986045\n", + " 1.000000\n", + " 8900 liberty cir\n", + " 20205 hwy 2720205 hwy 27\n", " 0\n", - " 0.113010\n", - " 0.010475\n", - " 0.310698\n", - " 1.0\n", - " new york\n", - " bountiful city\n", + " 0.000024\n", + " 0.000012\n", + " 0.881657\n", + " 1.000000\n", + " co\n", + " fl\n", + " 0\n", + " 0.023802\n", + " 0.048477\n", + " 0.198711\n", + " 1.000000\n", + " englewood\n", + " lake wales\n", + " 0\n", + " 0.002947\n", + " 0.000049\n", + " 0.296590\n", + " 1.000000\n", + " STRS\n", + " STRS WRLT\n", " 0\n", - " 0.086944\n", - " 0.000022\n", - " 0.398403\n", - " 1.0\n", - " BNXMRK B MRTKJ 
TRST\n", - " BNTFL ST ST OF\n", - " 3\n", " \n", " \n", " ...\n", @@ -2540,225 +2135,231 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", - " 1038434\n", - " NaN\n", - " NaN\n", + " 186872\n", + " 27.519625\n", + " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 137784\n", - " 70294\n", - " farmer brothers\n", - " farmers electric ia\n", - " 0\n", - " 0.000029\n", - " 0.000038\n", - " 0.985981\n", - " 1.0\n", - " 20333 s normandie ave\n", - " 1959 yoder ave,sw\n", - " [20333, s, normandie, ave]\n", - " [1959, yoder, ave,sw]\n", - " NaN\n", - " NaN\n", - " ca\n", - " ia\n", - " 0\n", - " 0.149142\n", - " 0.016527\n", - " 0.310698\n", - " 1.0\n", - " torrance\n", - " kalona\n", - " 0\n", - " 0.002485\n", - " 0.000011\n", - " 0.398403\n", - " 1.0\n", - " FRMR BR0RS\n", - " FRMRS ELKTRK I\n", + " 39816\n", + " 13109\n", + " northwestern public service\n", + " northwestern public service\n", + " 2\n", + " 0.000073\n", + " 0.000073\n", + " 652179.111493\n", + " 0.010580\n", + " 33 third st se\n", + " 33 third st se\n", + " 2\n", + " 0.000037\n", + " 0.000037\n", + " 9450.378101\n", + " 0.317122\n", + " sd\n", + " sd\n", + " 1\n", + " 0.001930\n", + " 0.001930\n", + " 15.873789\n", + " 26.483035\n", + " huron\n", + " huron\n", + " 2\n", + " 0.000073\n", + " 0.000073\n", + " 108.031428\n", + " 86.293486\n", + " NR0WSTRN PBLK SRFS\n", + " NR0WSTRN PBLK SRFS\n", " 0\n", " \n", " \n", - " 1038441\n", - " NaN\n", - " NaN\n", + " 580681\n", + " 27.526533\n", + " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 139631\n", - " 137540\n", - " international game technology\n", - " intergen north america\n", - " 0\n", - " 0.000048\n", - " 0.000029\n", - " 0.985981\n", - " 1.0\n", - " 6355 south buffalo drive\n", - " 4th floor\n", - " [6355, south, buffalo, drive]\n", - " [4th, floor]\n", - " NaN\n", - " NaN\n", - " nv\n", - " ma\n", - " 0\n", - " 0.019288\n", - " 0.041401\n", - " 0.310698\n", - " 1.0\n", - " 
las vegas\n", - " burlington\n", - " 0\n", - " 0.010477\n", - " 0.001415\n", - " 0.398403\n", - " 1.0\n", - " INTRNXNL KM TXNLJ\n", - " INTRJN NR0 AMRK\n", + " 24650\n", + " 8047\n", + " green mountain power\n", + " green mountain power\n", + " 2\n", + " 0.000037\n", + " 0.000037\n", + " 652179.111493\n", + " 0.021160\n", + " 163 acorn ln\n", + " 163 acorn ln\n", + " 2\n", + " 0.000037\n", + " 0.000037\n", + " 9450.378101\n", + " 0.317122\n", + " vt\n", + " vt\n", + " 1\n", + " 0.001537\n", + " 0.001537\n", + " 15.873789\n", + " 33.262692\n", + " colchester\n", + " colchester\n", + " 2\n", + " 0.000183\n", + " 0.000183\n", + " 108.031428\n", + " 34.517394\n", + " KRN MNTN PWR\n", + " KRN MNTN PWR\n", " 0\n", " \n", " \n", - " 1038443\n", - " NaN\n", - " NaN\n", + " 438193\n", + " 27.757357\n", + " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 90853\n", - " 13424\n", - " monster arts\n", - " minnesota solar csg 4\n", - " 0\n", - " 0.000010\n", - " 0.000029\n", - " 0.985981\n", - " 1.0\n", - " 806 east avenida pico\n", - " 200 wellington street west, su\n", - " [806, east, avenida, pico]\n", - " [200, wellington, street, west,, su]\n", - " NaN\n", - " NaN\n", - " ca\n", - " None\n", - " -1\n", - " 0.149142\n", - " NaN\n", - " 1.000000\n", - " 1.0\n", - " san clemente\n", - " toronto\n", - " 0\n", - " 0.000346\n", - " 0.002129\n", - " 0.398403\n", - " 1.0\n", - " MNSTR ARTS\n", - " MNST SLR KSK\n", + " 58842\n", + " 19906\n", + " wausau paper mills\n", + " wausau paper mills\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 652179.111493\n", + " 0.031739\n", + " one clarks is\n", + " one clarks is\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 9450.378101\n", + " 0.475683\n", + " wi\n", + " wi\n", + " 1\n", + " 0.008840\n", + " 0.008840\n", + " 15.873789\n", + " 5.782805\n", + " wausau\n", + " wausau\n", + " 2\n", + " 0.000061\n", + " 0.000061\n", + " 108.031428\n", + " 103.552183\n", + " WS PPR MLS\n", + " WS PPR MLS\n", " 
0\n", " \n", " \n", - " 1038454\n", - " NaN\n", - " NaN\n", + " 385934\n", + " 27.884385\n", + " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 108136\n", - " 1959\n", - " nxt id\n", - " nextgrid mastic\n", - " 0\n", - " 0.000038\n", - " 0.000029\n", - " 0.985981\n", - " 1.0\n", - " 4 research drive, #402\n", - " 879 sanchez street\n", - " [4, research, drive,, #402]\n", - " [879, sanchez, street]\n", - " NaN\n", - " NaN\n", - " ct\n", - " ca\n", - " 0\n", - " 0.020325\n", - " 0.149142\n", - " 0.310698\n", - " 1.0\n", - " shelton\n", - " san francisco\n", - " 0\n", - " 0.000390\n", - " 0.013374\n", - " 0.398403\n", - " 1.0\n", - " NKST IT\n", - " NKSTKRT MSTK\n", + " 51567\n", + " 17450\n", + " st joseph light and power\n", + " st joseph light and power\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 652179.111493\n", + " 0.031739\n", + " 520 francis st\n", + " 520 francis st\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 9450.378101\n", + " 0.475683\n", + " mo\n", + " mo\n", + " 1\n", + " 0.010118\n", + " 0.010118\n", + " 15.873789\n", + " 5.052049\n", + " st joseph\n", + " st joseph\n", + " 2\n", + " 0.000049\n", + " 0.000049\n", + " 108.031428\n", + " 129.440229\n", + " ST JSF LT ANT PWR\n", + " ST JSF LT ANT PWR\n", " 0\n", " \n", " \n", - " 1038456\n", - " NaN\n", - " NaN\n", + " 503816\n", + " 29.211031\n", + " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 91657\n", - " 105602\n", - " coronado biosciences\n", - " garnet energy\n", - " 0\n", - " 0.000019\n", - " 0.000038\n", - " 0.985981\n", - " 1.0\n", - " 24 new england executive park\n", - " suite 102\n", - " [24, new, england, executive, park]\n", - " [suite, 102]\n", - " NaN\n", - " NaN\n", - " ma\n", - " ca\n", - " 0\n", - " 0.041401\n", - " 0.149142\n", - " 0.310698\n", - " 1.0\n", - " burlington\n", - " westlake village\n", - " 0\n", - " 0.001415\n", - " 0.000691\n", - " 0.398403\n", - " 1.0\n", - " KRNT BSSNSS\n", - " KRNT 
ENRJ\n", + " 20588\n", + " 6741\n", + " fibermark\n", + " fibermark\n", + " 2\n", + " 0.000037\n", + " 0.000037\n", + " 652179.111493\n", + " 0.021160\n", + " 161 wellington rd\n", + " 161 wellington rd\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 9450.378101\n", + " 0.475683\n", + " vt\n", + " vt\n", + " 1\n", + " 0.001537\n", + " 0.001537\n", + " 15.873789\n", + " 33.262692\n", + " brattleboro\n", + " brattleboro\n", + " 2\n", + " 0.000086\n", + " 0.000086\n", + " 108.031428\n", + " 73.965845\n", + " FBRMRK\n", + " FBRMRK\n", " 0\n", " \n", " \n", "\n", - "

1038457 rows × 36 columns

\n", + "

590575 rows × 37 columns

\n", "" ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r street_address_list_l street_address_list_r gamma_street_address bf_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", - "32260 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 82087 113663 sutro biopharma stirling energy systems solar one 0 0.000019 0.000029 0.985981 1.0 310 utah ave., suite 150 suite 150 [310, utah, ave.,, suite, 150] [suite, 150] 0.0 0.265921 ca az 0 0.149142 0.012950 0.310698 1.0 south san francisco phoenix 0 0.001438 0.003511 0.398403 1.0 STR BFRM STRLNK ENRJ SSTMS SLR ON 3\n", - "27875 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 126035 113797 corner growth acquisition 2 grubb and ellis management services 0 0.000010 0.000019 0.985981 1.0 251 lytton avenue, suite 200 suite 200 [251, lytton, avenue,, suite, 200] [suite, 200] 0.0 0.265921 ca pa 0 0.149142 0.030197 0.310698 1.0 palo alto pittsburgh 0 0.001850 0.003656 0.398403 1.0 KRNR KR0 AKKSXN KRB ANT ELS MNJMNT SRFSS 3\n", - "27993 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 125096 97905 altus power allegheny ridge wind farm 0 0.000010 0.000038 0.985981 1.0 2200 atlantic street, 6th floor 6th floor [2200, atlantic, street,, 6th, floor] [6th, floor] 0.0 0.265921 ct ca 0 0.020325 0.149142 0.310698 1.0 stamford san francisco 0 0.003789 0.013374 0.398403 1.0 ALTS PWR ALKHN RJ WNT FRM 3\n", - "28003 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 115402 91508 clearway energy clipper windpower 0 
0.000038 0.000029 0.985981 1.0 300 carnegie center, suite 300 suite 300 [300, carnegie, center,, suite, 300] [suite, 300] 0.0 0.265921 nj ca 0 0.031159 0.149142 0.310698 1.0 princeton carpinteria 0 0.002118 0.000189 0.398403 1.0 KLRW ENRJ KLPR WNTPWR 3\n", - "28024 -24.047823 5.766122e-08 __splink__input_table_0 __splink__input_table_1 125009 77758 benchmark 2020 b21 mortgage trust bountiful city city of 0 0.000010 0.000048 0.985981 1.0 200 west street 198 south 200 west street [200, west, street] [198, south, 200, west, street] 0.0 0.265921 ny ut 0 0.113010 0.010475 0.310698 1.0 new york bountiful city 0 0.086944 0.000022 0.398403 1.0 BNXMRK B MRTKJ TRST BNTFL ST ST OF 3\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "1038434 NaN NaN __splink__input_table_0 __splink__input_table_1 137784 70294 farmer brothers farmers electric ia 0 0.000029 0.000038 0.985981 1.0 20333 s normandie ave 1959 yoder ave,sw [20333, s, normandie, ave] [1959, yoder, ave,sw] NaN NaN ca ia 0 0.149142 0.016527 0.310698 1.0 torrance kalona 0 0.002485 0.000011 0.398403 1.0 FRMR BR0RS FRMRS ELKTRK I 0\n", - "1038441 NaN NaN __splink__input_table_0 __splink__input_table_1 139631 137540 international game technology intergen north america 0 0.000048 0.000029 0.985981 1.0 6355 south buffalo drive 4th floor [6355, south, buffalo, drive] [4th, floor] NaN NaN nv ma 0 0.019288 0.041401 0.310698 1.0 las vegas burlington 0 0.010477 0.001415 0.398403 1.0 INTRNXNL KM TXNLJ INTRJN NR0 AMRK 0\n", - "1038443 NaN NaN __splink__input_table_0 __splink__input_table_1 90853 13424 monster arts minnesota solar csg 4 0 0.000010 0.000029 0.985981 1.0 806 east avenida pico 200 wellington street west, su [806, east, avenida, pico] [200, wellington, street, west,, su] NaN NaN ca None -1 0.149142 NaN 1.000000 1.0 san clemente toronto 0 0.000346 0.002129 0.398403 1.0 MNSTR ARTS MNST SLR KSK 0\n", - "1038454 NaN 
NaN __splink__input_table_0 __splink__input_table_1 108136 1959 nxt id nextgrid mastic 0 0.000038 0.000029 0.985981 1.0 4 research drive, #402 879 sanchez street [4, research, drive,, #402] [879, sanchez, street] NaN NaN ct ca 0 0.020325 0.149142 0.310698 1.0 shelton san francisco 0 0.000390 0.013374 0.398403 1.0 NKST IT NKSTKRT MSTK 0\n", - "1038456 NaN NaN __splink__input_table_0 __splink__input_table_1 91657 105602 coronado biosciences garnet energy 0 0.000019 0.000038 0.985981 1.0 24 new england executive park suite 102 [24, new, england, executive, park] [suite, 102] NaN NaN ma ca 0 0.041401 0.149142 0.310698 1.0 burlington westlake village 0 0.001415 0.000691 0.398403 1.0 KRNT BSSNSS KRNT ENRJ 0\n", - "\n", - "[1038457 rows x 36 columns]" + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", + "295287 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 9829 3043 capitol bancorp capital power 0 0.000024 0.000012 0.986045 1.000000 capitol bancorp ctr 120010423 101 st nw 0 0.000012 0.000110 0.881657 1.000000 mi ab 0 0.015147 0.000197 0.198711 1.000000 lansing edmonton 0 0.000293 0.000428 0.296590 1.000000 KPTL BNKRP KPTL PWR 0\n", + "383898 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 51783 17550 state bancorp state street bank and trust 0 0.000024 0.000024 0.986045 1.000000 2 jericho plz 100 summer st 0 0.000012 0.000024 0.881657 1.000000 ny ma 0 0.120228 
0.041765 0.198711 1.000000 jericho boston 0 0.000306 0.014319 0.296590 1.000000 STT BNKRP STT STRT BNK ANT TRST 0\n", + "383897 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 51782 17550 state auto financial state street bank and trust 0 0.000024 0.000024 0.986045 1.000000 518 east broad st 100 summer st 0 0.000012 0.000024 0.881657 1.000000 oh ma 0 0.016991 0.041765 0.198711 1.000000 columbus boston 0 0.002788 0.014319 0.296590 1.000000 STT AT FNNXL STT STRT BNK ANT TRST 0\n", + "383896 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 51781 17550 state auto financial state street bank and trust 0 0.000024 0.000024 0.986045 1.000000 518 e broad st 100 summer st 0 0.000012 0.000024 0.881657 1.000000 oh ma 0 0.016991 0.041765 0.198711 1.000000 columbus boston 0 0.002788 0.014319 0.296590 1.000000 STT AT FNNXL STT STRT BNK ANT TRST 0\n", + "383895 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 51780 3805 starz citrus world 0 0.000024 0.000049 0.986045 1.000000 8900 liberty cir 20205 hwy 2720205 hwy 27 0 0.000024 0.000012 0.881657 1.000000 co fl 0 0.023802 0.048477 0.198711 1.000000 englewood lake wales 0 0.002947 0.000049 0.296590 1.000000 STRS STRS WRLT 0\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "186872 27.519625 1.000000e+00 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 652179.111493 0.010580 33 third st se 33 third st se 2 0.000037 0.000037 9450.378101 0.317122 sd sd 1 0.001930 0.001930 15.873789 26.483035 huron huron 2 0.000073 0.000073 108.031428 86.293486 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0\n", + "580681 27.526533 1.000000e+00 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 652179.111493 0.021160 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9450.378101 0.317122 vt vt 1 0.001537 0.001537 15.873789 33.262692 colchester colchester 2 0.000183 0.000183 108.031428 34.517394 KRN MNTN PWR KRN MNTN PWR 0\n", + "438193 27.757357 1.000000e+00 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 652179.111493 0.031739 one clarks is one clarks is 2 0.000024 0.000024 9450.378101 0.475683 wi wi 1 0.008840 0.008840 15.873789 5.782805 wausau wausau 2 0.000061 0.000061 108.031428 103.552183 WS PPR MLS WS PPR MLS 0\n", + "385934 27.884385 1.000000e+00 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 652179.111493 0.031739 520 francis st 520 francis st 2 0.000024 0.000024 9450.378101 0.475683 mo mo 1 0.010118 0.010118 15.873789 5.052049 st joseph st joseph 2 0.000049 0.000049 108.031428 129.440229 ST JSF LT ANT PWR ST JSF LT ANT PWR 0\n", + "503816 29.211031 1.000000e+00 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 652179.111493 0.021160 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9450.378101 0.475683 vt vt 1 0.001537 0.001537 15.873789 33.262692 brattleboro brattleboro 2 0.000086 0.000086 108.031428 73.965845 FBRMRK FBRMRK 0\n", + "\n", + "[590575 rows x 37 columns]" ] }, - 
"execution_count": 433, + "execution_count": 135, "metadata": {}, "output_type": "execute_result" } @@ -2769,13 +2370,13 @@ }, { "cell_type": "code", - "execution_count": 434, + "execution_count": 136, "id": "c0b292c8-26ed-407a-866e-75851577d567", "metadata": {}, "outputs": [], "source": [ "# join on utility_id_eia and CIK\n", - "preds_validation_df = preds_df.merge(sec_clean_df[[\"record_id\", \"central_index_key\", \"company_name_raw\"]],\n", + "preds_validation_df = preds_df.merge(sec_df[[\"record_id\", \"sec_company_id\", \"central_index_key\", \"company_name_raw\"]],\n", " how=\"left\",\n", " left_on=\"record_id_l\",\n", " right_on=\"record_id\")" @@ -2783,12 +2384,12 @@ }, { "cell_type": "code", - "execution_count": 435, + "execution_count": 137, "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0", "metadata": {}, "outputs": [], "source": [ - "preds_validation_df = preds_validation_df.merge(eia_clean_df[[\"record_id\", \"utility_id_eia\"]],\n", + "preds_validation_df = preds_validation_df.merge(eia_df[[\"record_id\", \"utility_id_eia\"]],\n", " how=\"left\",\n", " left_on=\"record_id_r\",\n", " right_on=\"record_id\")" @@ -2796,19 +2397,19 @@ }, { "cell_type": "code", - "execution_count": 436, + "execution_count": 138, "id": "5103190c-3775-427f-a8f2-cc8a8f79892b", "metadata": {}, "outputs": [], "source": [ "preds_validation_df = preds_validation_df.sort_values(\n", - " by=[\"central_index_key\", \"utility_id_eia\", \"match_probability\"], ascending=False\n", - ").drop_duplicates(subset=[\"central_index_key\", \"utility_id_eia\"], keep=\"first\")" + " by=[\"sec_company_id\", \"utility_id_eia\", \"match_probability\"], ascending=False\n", + ").drop_duplicates(subset=[\"sec_company_id\", \"utility_id_eia\"], keep=\"first\")" ] }, { "cell_type": "code", - "execution_count": 437, + "execution_count": 139, "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8", "metadata": {}, "outputs": [ @@ -2848,10 +2449,11 @@ " bf_tf_adj_company_name_no_legal\n", " street_address_l\n", " 
street_address_r\n", - " street_address_list_l\n", - " street_address_list_r\n", " gamma_street_address\n", + " tf_street_address_l\n", + " tf_street_address_r\n", " bf_street_address\n", + " bf_tf_adj_street_address\n", " state_l\n", " state_r\n", " gamma_state\n", @@ -2870,6 +2472,7 @@ " company_name_mphone_r\n", " match_key\n", " record_id_x\n", + " sec_company_id\n", " central_index_key\n", " company_name_raw\n", " record_id_y\n", @@ -2878,224 +2481,234 @@ " \n", " \n", " \n", - " 889845\n", - " 5.679807\n", - " 0.980865\n", + " 466134\n", + " 3.824596\n", + " 0.934073\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 51956\n", - " 22658\n", - " constellation energy\n", - " constellation newenergy\n", - " 1\n", - " 0.000029\n", - " 0.000077\n", - " 6085.754919\n", - " 1.000000\n", - " 1310 point street\n", - " None\n", - " [1310, point, street]\n", - " NaN\n", - " -1.0\n", + " 14692\n", + " 6293\n", + " crane\n", + " entergy nuclear power marketing\n", + " 0\n", + " 0.000012\n", + " 0.000012\n", + " 0.986045\n", " 1.000000\n", - " md\n", - " md\n", + " 100 first stamford pl\n", + " 100 first stamford pl\n", + " 2\n", + " 0.000122\n", + " 0.000122\n", + " 9450.378101\n", + " 0.095137\n", + " ct\n", + " ct\n", " 1\n", - " 0.023298\n", - " 0.023298\n", - " 14.856341\n", - " 2.034020\n", - " baltimore\n", - " baltimore\n", + " 0.020876\n", + " 0.020876\n", + " 15.873789\n", + " 2.448667\n", + " stamford\n", + " stamford\n", " 2\n", - " 0.003678\n", - " 0.003678\n", - " 94.80739\n", - " 1.654881\n", - " KNSTLXN ENRJ\n", - " KNSTLXN NWNRJ\n", - " 0\n", - " 51956\n", - " 0001868275\n", - " constellation energy corp\n", - " 22658\n", - " 58491\n", + " 0.003950\n", + " 0.003950\n", + " 108.031428\n", + " 1.602975\n", + " KRN\n", + " ENTRJ NKLR PWR MRKTNK\n", + " 1\n", + " 14692\n", + " 0001944013\n", + " 0001944013\n", + " crane co\n", + " 6293\n", + " 55243\n", " \n", " \n", - " 884109\n", - " 13.095633\n", - " 0.999886\n", + " 466594\n", + " 
4.620005\n", + " 0.960922\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 120267\n", - " 96849\n", - " evergy\n", - " evergy\n", + " 17752\n", + " 5535\n", + " dte electric securitization funding i\n", + " dte sustainable generation\n", + " 0\n", + " 0.000012\n", + " 0.000012\n", + " 0.986045\n", + " 1.000000\n", + " one energy plz\n", + " one energy plz\n", " 2\n", - " 0.000019\n", - " 0.000019\n", - " 872345.689655\n", - " 0.059564\n", - " 1200 main street\n", - " 1200 main street\n", - " [1200, main, street]\n", - " [1200, main, street]\n", - " 2.0\n", - " 5.407499\n", - " mo\n", - " mo\n", + " 0.000330\n", + " 0.000330\n", + " 9450.378101\n", + " 0.035236\n", + " mi\n", + " mi\n", " 1\n", - " 0.011744\n", - " 0.011744\n", - " 14.856341\n", - " 4.035057\n", - " kansas city\n", - " kansas city\n", + " 0.015147\n", + " 0.015147\n", + " 15.873789\n", + " 3.374867\n", + " detroit\n", + " detroit\n", " 2\n", - " 0.001973\n", - " 0.001973\n", - " 94.80739\n", - " 3.085372\n", - " EFRJ\n", - " EFRJ\n", - " 0\n", - " 120267\n", - " 0001711269\n", - " evergy, inc.\n", - " 96849\n", - " 64428\n", + " 0.001162\n", + " 0.001162\n", + " 108.031428\n", + " 5.450115\n", + " TT ELKTRK SKRTSXN FNTNK I\n", + " TT SSTNBL JNRXN\n", + " 1\n", + " 17752\n", + " 0001876068\n", + " 0001876068\n", + " dte electric securitization funding i llc\n", + " 5535\n", + " 64331\n", " \n", " \n", - " 893941\n", - " 12.486567\n", - " 0.999826\n", + " 480747\n", + " 4.620005\n", + " 0.960922\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 120222\n", - " 96211\n", - " consol energy\n", - " consol energy\n", + " 17752\n", + " 5522\n", + " dte electric securitization funding i\n", + " dte electric\n", + " 0\n", + " 0.000012\n", + " 0.000037\n", + " 0.986045\n", + " 1.000000\n", + " one energy plz\n", + " one energy plz\n", " 2\n", - " 0.000058\n", - " 0.000058\n", - " 872345.689655\n", - " 0.019855\n", - " 275 technology drive\n", - " 275 technology drive\n", - 
" [275, technology, drive]\n", - " [275, technology, drive]\n", - " 2.0\n", - " 5.407499\n", - " pa\n", - " pa\n", + " 0.000330\n", + " 0.000330\n", + " 9450.378101\n", + " 0.035236\n", + " mi\n", + " mi\n", " 1\n", - " 0.030197\n", - " 0.030197\n", - " 14.856341\n", - " 1.569346\n", - " canonsburg\n", - " canonsburg\n", + " 0.015147\n", + " 0.015147\n", + " 15.873789\n", + " 3.374867\n", + " detroit\n", + " detroit\n", " 2\n", - " 0.000390\n", - " 0.000390\n", - " 94.80739\n", - " 15.603165\n", - " KNSL ENRJ\n", - " KNSL ENRJ\n", - " 0\n", - " 120222\n", - " 0001710366\n", - " consol energy inc.\n", - " 96211\n", - " 4299\n", - " \n", - " \n", - " 943594\n", - " 9.161274\n", - " 0.998256\n", + " 0.001162\n", + " 0.001162\n", + " 108.031428\n", + " 5.450115\n", + " TT ELKTRK SKRTSXN FNTNK I\n", + " TT ELKTRK\n", + " 0\n", + " 17752\n", + " 0001876068\n", + " 0001876068\n", + " dte electric securitization funding i llc\n", + " 5522\n", + " 5109\n", + " \n", + " \n", + " 464506\n", + " 6.019599\n", + " 0.984820\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 119271\n", - " 83669\n", - " vistra energy\n", - " vistra energy\n", + " 14051\n", + " 10935\n", + " constellation energy\n", + " luminace solar rhode island\n", + " 0\n", + " 0.000024\n", + " 0.000024\n", + " 0.986045\n", + " 1.000000\n", + " 1310 pt st\n", + " 1310 pt st\n", " 2\n", - " 0.000019\n", - " 0.000019\n", - " 872345.689655\n", - " 0.059564\n", - " 6555 sierra drive\n", - " 6555 sierra drive\n", - " [6555, sierra, drive]\n", - " [6555, sierra, drive]\n", - " 2.0\n", - " 5.407499\n", - " tx\n", - " tx\n", + " 0.000024\n", + " 0.000024\n", + " 9450.378101\n", + " 0.475683\n", + " md\n", + " md\n", " 1\n", - " 0.080866\n", - " 0.080866\n", - " 14.856341\n", - " 0.586015\n", - " irving\n", - " irving\n", + " 0.025130\n", + " 0.025130\n", + " 15.873789\n", + " 2.034167\n", + " baltimore\n", + " baltimore\n", " 2\n", - " 0.004380\n", - " 0.004380\n", - " 94.80739\n", - " 1.389595\n", - " 
FSTR ENRJ\n", - " FSTR ENRJ\n", - " 0\n", - " 119271\n", - " 0001692819\n", - " vistra energy corp.\n", - " 83669\n", - " 62723\n", + " 0.003583\n", + " 0.003583\n", + " 108.031428\n", + " 1.767102\n", + " KNSTLXN ENRJ\n", + " LMNS SLR RHT ISLNT\n", + " 1\n", + " 14051\n", + " 0001868275\n", + " 0001868275\n", + " constellation energy corp\n", + " 10935\n", + " 62679\n", " \n", " \n", - " 860414\n", - " 7.576311\n", - " 0.994788\n", + " 340973\n", + " 6.201744\n", + " 0.986596\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 119274\n", - " 71441\n", - " vistra\n", - " vistra\n", - " 2\n", - " 0.000058\n", - " 0.000058\n", - " 872345.689655\n", - " 0.019855\n", - " 6555 sierra drive\n", - " 6555 sierra drive\n", - " [6555, sierra, drive]\n", - " [6555, sierra, drive]\n", - " 2.0\n", - " 5.407499\n", - " tx\n", - " tx\n", + " 14051\n", + " 4420\n", + " constellation energy\n", + " constellation newenergy\n", + " 1\n", + " 0.000024\n", + " 0.000024\n", + " 5704.210475\n", + " 1.000000\n", + " 1310 pt st\n", + " 100 constellation way\n", + " 0\n", + " 0.000024\n", + " 0.000183\n", + " 0.881657\n", + " 1.000000\n", + " md\n", + " md\n", " 1\n", - " 0.080866\n", - " 0.080866\n", - " 14.856341\n", - " 0.586015\n", - " irving\n", - " irving\n", + " 0.025130\n", + " 0.025130\n", + " 15.873789\n", + " 2.034167\n", + " baltimore\n", + " baltimore\n", " 2\n", - " 0.004380\n", - " 0.004380\n", - " 94.80739\n", - " 1.389595\n", - " FSTR\n", - " FSTR\n", + " 0.003583\n", + " 0.003583\n", + " 108.031428\n", + " 1.767102\n", + " KNSTLXN ENRJ\n", + " KNSTLXN NWNRJ\n", " 0\n", - " 119274\n", - " 0001692819\n", - " vistra corp.\n", - " 71441\n", - " 5504\n", + " 14051\n", + " 0001868275\n", + " 0001868275\n", + " constellation energy corp\n", + " 4420\n", + " 58491\n", " \n", " \n", " ...\n", @@ -3140,250 +2753,262 @@ " ...\n", " ...\n", " ...\n", + " ...\n", + " ...\n", " \n", " \n", - " 1026765\n", - " 12.087133\n", - " 0.999770\n", + " 464642\n", + " 5.308053\n", 
+ " 0.975380\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 153106\n", - " 79761\n", - " archer daniels midland\n", - " archer daniels midland\n", + " 1585\n", + " 6561\n", + " air products and chemicals /de/\n", + " exelon gen extexlaporte\n", + " 0\n", + " 0.000024\n", + " 0.000012\n", + " 0.986045\n", + " 1.000000\n", + " 7201 hamilton blvd\n", + " 7201 hamilton blvd\n", " 2\n", - " 0.000058\n", - " 0.000058\n", - " 872345.689655\n", - " 0.019855\n", - " 4666 faries pkwy\n", - " 4666 faries pkwy\n", - " [4666, faries, pkwy]\n", - " [4666, faries, pkwy]\n", - " 2.0\n", - " 5.407499\n", - " il\n", - " il\n", + " 0.000122\n", + " 0.000122\n", + " 9450.378101\n", + " 0.095137\n", + " pa\n", + " pa\n", " 1\n", - " 0.033191\n", - " 0.033191\n", - " 14.856341\n", - " 1.427770\n", - " decatur\n", - " decatur\n", + " 0.029409\n", + " 0.029409\n", + " 15.873789\n", + " 1.738226\n", + " allentown\n", + " allentown\n", " 2\n", - " 0.000468\n", - " 0.000468\n", - " 94.80739\n", - " 13.002638\n", - " ARXR TNLS MTLNT\n", - " ARXR TNLS MTLNT\n", - " 0\n", - " 153106\n", - " 0000007084\n", - " archer daniels midland co\n", - " 79761\n", - " 772\n", + " 0.001003\n", + " 0.001003\n", + " 108.031428\n", + " 6.314158\n", + " AR PRTKTS ANT XMKLS T\n", + " EKSLN JN EKSTKSLPRT\n", + " 1\n", + " 1585\n", + " 0000002969\n", + " 0000002969\n", + " air products & chemicals inc /de/\n", + " 6561\n", + " 6081\n", " \n", " \n", - " 656833\n", - " 9.809977\n", - " 0.998887\n", + " 227094\n", + " 20.402617\n", + " 0.999999\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 150546\n", - " 79913\n", - " appalachian power\n", - " appalachian power\n", + " 1586\n", + " 430\n", + " air products and chemicals\n", + " air products and chemicals\n", " 2\n", - " 0.000077\n", - " 0.000077\n", - " 872345.689655\n", - " 0.014891\n", - " 1 riverside plaza\n", - " 1 riverside plaza\n", - " [1, riverside, plaza]\n", - " [1, riverside, plaza]\n", - " 2.0\n", - " 
5.407499\n", - " oh\n", - " oh\n", - " 1\n", - " 0.018770\n", - " 0.018770\n", - " 14.856341\n", - " 2.524754\n", - " columbus\n", - " columbus\n", + " 0.000037\n", + " 0.000037\n", + " 652179.111493\n", + " 0.021160\n", + " 1940 air products blvd\n", + " 1940 air products blvd\n", " 2\n", - " 0.003009\n", - " 0.003009\n", - " 94.80739\n", - " 2.022633\n", - " APLXN PWR\n", - " APLXN PWR\n", - " 0\n", - " 150546\n", - " 0000006879\n", - " appalachian power co\n", - " 79913\n", - " 733\n", - " \n", - " \n", - " 640747\n", - " 10.888046\n", - " 0.999473\n", - " __splink__input_table_0\n", - " __splink__input_table_1\n", - " 144743\n", - " 80319\n", - " american crystal sugar /mn/\n", - " american crystal sugar\n", - " 1\n", - " 0.000010\n", - " 0.000029\n", - " 6085.754919\n", - " 1.000000\n", - " 101 n 3rd st\n", - " None\n", - " [101, n, 3rd, st]\n", - " NaN\n", - " -1.0\n", - " 1.000000\n", - " mn\n", - " mn\n", + " 0.000049\n", + " 0.000049\n", + " 9450.378101\n", + " 0.237842\n", + " pa\n", + " pa\n", " 1\n", - " 0.025996\n", - " 0.025996\n", - " 14.856341\n", - " 1.822919\n", - " moorhead\n", - " moorhead\n", + " 0.029409\n", + " 0.029409\n", + " 15.873789\n", + " 1.738226\n", + " allentown\n", + " allentown\n", " 2\n", - " 0.000089\n", - " 0.000089\n", - " 94.80739\n", - " 68.263848\n", - " AMRKN KRSTL SKR MN\n", - " AMRKN KRSTL SKR\n", + " 0.001003\n", + " 0.001003\n", + " 108.031428\n", + " 6.314158\n", + " AR PRTKTS ANT XMKLS\n", + " AR PRTKTS ANT XMKLS\n", " 0\n", - " 144743\n", - " 0000004828\n", - " american crystal sugar co /mn/\n", - " 80319\n", - " 491\n", + " 1586\n", + " 0000002969\n", + " 0000002969\n", + " air products & chemicals, inc.\n", + " 430\n", + " 991\n", " \n", " \n", - " 998578\n", - " 9.990554\n", - " 0.999018\n", + " 224504\n", + " 5.308053\n", + " 0.975380\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 2575\n", - " 80977\n", - " alabama power\n", - " alabama power\n", - " 2\n", - " 0.000067\n", - " 0.000067\n", - 
" 872345.689655\n", - " 0.017018\n", - " 600 n 18th st\n", - " None\n", - " [600, n, 18th, st]\n", - " NaN\n", - " -1.0\n", + " 1585\n", + " 435\n", + " air products and chemicals /de/\n", + " air products\n", + " 0\n", + " 0.000024\n", + " 0.000037\n", + " 0.986045\n", " 1.000000\n", - " al\n", - " al\n", + " 7201 hamilton blvd\n", + " 7201 hamilton blvd\n", + " 2\n", + " 0.000122\n", + " 0.000122\n", + " 9450.378101\n", + " 0.095137\n", + " pa\n", + " pa\n", " 1\n", - " 0.005280\n", - " 0.005280\n", - " 14.856341\n", - " 8.975778\n", - " birmingham\n", - " birmingham\n", + " 0.029409\n", + " 0.029409\n", + " 15.873789\n", + " 1.738226\n", + " allentown\n", + " allentown\n", " 2\n", - " 0.001995\n", - " 0.001995\n", - " 94.80739\n", - " 3.050898\n", - " ALBM PWR\n", - " ALBM PWR\n", + " 0.001003\n", + " 0.001003\n", + " 108.031428\n", + " 6.314158\n", + " AR PRTKTS ANT XMKLS T\n", + " AR PRTKTS\n", " 0\n", - " 2575\n", - " 0000003153\n", - " alabama power co\n", - " 80977\n", - " 195\n", + " 1585\n", + " 0000002969\n", + " 0000002969\n", + " air products & chemicals inc /de/\n", + " 435\n", + " 980\n", " \n", " \n", - " 912914\n", - " 9.434494\n", - " 0.998557\n", + " 225982\n", + " 5.308053\n", + " 0.975380\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 132976\n", - " 79317\n", + " 1585\n", + " 432\n", " air products and chemicals /de/\n", - " air products and chemicals\n", - " 1\n", - " 0.000019\n", - " 0.000048\n", - " 6085.754919\n", + " air products energy enterprises\n", + " 0\n", + " 0.000024\n", + " 0.000012\n", + " 0.986045\n", " 1.000000\n", " 7201 hamilton blvd\n", " 7201 hamilton blvd\n", - " [7201, hamilton, blvd]\n", - " [7201, hamilton, blvd]\n", - " 2.0\n", - " 5.407499\n", + " 2\n", + " 0.000122\n", + " 0.000122\n", + " 9450.378101\n", + " 0.095137\n", " pa\n", " pa\n", " 1\n", - " 0.030197\n", - " 0.030197\n", - " 14.856341\n", - " 1.569346\n", + " 0.029409\n", + " 0.029409\n", + " 15.873789\n", + " 1.738226\n", " 
allentown\n", " allentown\n", " 2\n", - " 0.001137\n", - " 0.001137\n", - " 94.80739\n", - " 5.354027\n", + " 0.001003\n", + " 0.001003\n", + " 108.031428\n", + " 6.314158\n", " AR PRTKTS ANT XMKLS T\n", - " AR PRTKTS ANT XMKLS\n", + " AR PRTKTS ENRJ ENTRPRSS\n", " 0\n", - " 132976\n", + " 1585\n", + " 0000002969\n", " 0000002969\n", " air products & chemicals inc /de/\n", - " 79317\n", - " 991\n", + " 432\n", + " 353\n", " \n", - " \n", - "\n", - "

197 rows × 41 columns

\n", - "" - ], - "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r street_address_list_l street_address_list_r gamma_street_address bf_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x central_index_key company_name_raw record_id_y utility_id_eia\n", - "889845 5.679807 0.980865 __splink__input_table_0 __splink__input_table_1 51956 22658 constellation energy constellation newenergy 1 0.000029 0.000077 6085.754919 1.000000 1310 point street None [1310, point, street] NaN -1.0 1.000000 md md 1 0.023298 0.023298 14.856341 2.034020 baltimore baltimore 2 0.003678 0.003678 94.80739 1.654881 KNSTLXN ENRJ KNSTLXN NWNRJ 0 51956 0001868275 constellation energy corp 22658 58491\n", - "884109 13.095633 0.999886 __splink__input_table_0 __splink__input_table_1 120267 96849 evergy evergy 2 0.000019 0.000019 872345.689655 0.059564 1200 main street 1200 main street [1200, main, street] [1200, main, street] 2.0 5.407499 mo mo 1 0.011744 0.011744 14.856341 4.035057 kansas city kansas city 2 0.001973 0.001973 94.80739 3.085372 EFRJ EFRJ 0 120267 0001711269 evergy, inc. 96849 64428\n", - "893941 12.486567 0.999826 __splink__input_table_0 __splink__input_table_1 120222 96211 consol energy consol energy 2 0.000058 0.000058 872345.689655 0.019855 275 technology drive 275 technology drive [275, technology, drive] [275, technology, drive] 2.0 5.407499 pa pa 1 0.030197 0.030197 14.856341 1.569346 canonsburg canonsburg 2 0.000390 0.000390 94.80739 15.603165 KNSL ENRJ KNSL ENRJ 0 120222 0001710366 consol energy inc. 
96211 4299\n", - "943594 9.161274 0.998256 __splink__input_table_0 __splink__input_table_1 119271 83669 vistra energy vistra energy 2 0.000019 0.000019 872345.689655 0.059564 6555 sierra drive 6555 sierra drive [6555, sierra, drive] [6555, sierra, drive] 2.0 5.407499 tx tx 1 0.080866 0.080866 14.856341 0.586015 irving irving 2 0.004380 0.004380 94.80739 1.389595 FSTR ENRJ FSTR ENRJ 0 119271 0001692819 vistra energy corp. 83669 62723\n", - "860414 7.576311 0.994788 __splink__input_table_0 __splink__input_table_1 119274 71441 vistra vistra 2 0.000058 0.000058 872345.689655 0.019855 6555 sierra drive 6555 sierra drive [6555, sierra, drive] [6555, sierra, drive] 2.0 5.407499 tx tx 1 0.080866 0.080866 14.856341 0.586015 irving irving 2 0.004380 0.004380 94.80739 1.389595 FSTR FSTR 0 119274 0001692819 vistra corp. 71441 5504\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", - "1026765 12.087133 0.999770 __splink__input_table_0 __splink__input_table_1 153106 79761 archer daniels midland archer daniels midland 2 0.000058 0.000058 872345.689655 0.019855 4666 faries pkwy 4666 faries pkwy [4666, faries, pkwy] [4666, faries, pkwy] 2.0 5.407499 il il 1 0.033191 0.033191 14.856341 1.427770 decatur decatur 2 0.000468 0.000468 94.80739 13.002638 ARXR TNLS MTLNT ARXR TNLS MTLNT 0 153106 0000007084 archer daniels midland co 79761 772\n", - "656833 9.809977 0.998887 __splink__input_table_0 __splink__input_table_1 150546 79913 appalachian power appalachian power 2 0.000077 0.000077 872345.689655 0.014891 1 riverside plaza 1 riverside plaza [1, riverside, plaza] [1, riverside, plaza] 2.0 5.407499 oh oh 1 0.018770 0.018770 14.856341 2.524754 columbus columbus 2 0.003009 0.003009 94.80739 2.022633 APLXN PWR APLXN PWR 0 150546 0000006879 appalachian power co 79913 733\n", - "640747 10.888046 0.999473 __splink__input_table_0 __splink__input_table_1 144743 80319 american crystal sugar /mn/ american crystal sugar 1 0.000010 0.000029 6085.754919 1.000000 101 n 3rd st None [101, n, 3rd, st] NaN -1.0 1.000000 mn mn 1 0.025996 0.025996 14.856341 1.822919 moorhead moorhead 2 0.000089 0.000089 94.80739 68.263848 AMRKN KRSTL SKR MN AMRKN KRSTL SKR 0 144743 0000004828 american crystal sugar co /mn/ 80319 491\n", - "998578 9.990554 0.999018 __splink__input_table_0 __splink__input_table_1 2575 80977 alabama power alabama power 2 0.000067 0.000067 872345.689655 0.017018 600 n 18th st None [600, n, 18th, st] NaN -1.0 1.000000 al al 1 0.005280 0.005280 14.856341 8.975778 birmingham birmingham 2 0.001995 0.001995 94.80739 3.050898 ALBM PWR ALBM PWR 0 2575 0000003153 alabama power co 80977 195\n", - "912914 9.434494 0.998557 __splink__input_table_0 __splink__input_table_1 132976 79317 air products and chemicals /de/ air products and chemicals 1 0.000019 0.000048 6085.754919 1.000000 7201 hamilton blvd 7201 hamilton blvd [7201, hamilton, blvd] [7201, hamilton, 
blvd] 2.0 5.407499 pa pa 1 0.030197 0.030197 14.856341 1.569346 allentown allentown 2 0.001137 0.001137 94.80739 5.354027 AR PRTKTS ANT XMKLS T AR PRTKTS ANT XMKLS 0 132976 0000002969 air products & chemicals inc /de/ 79317 991\n", - "\n", - "[197 rows x 41 columns]" - ] - }, - "execution_count": 437, + " \n", + " 224473\n", + " 20.054878\n", + " 0.999999\n", + " __splink__input_table_0\n", + " __splink__input_table_1\n", + " 1348\n", + " 376\n", + " aetna life and casualty\n", + " aetna life and casualty\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 652179.111493\n", + " 0.031739\n", + " 151 farmington ave\n", + " 151 farmington ave\n", + " 2\n", + " 0.000110\n", + " 0.000110\n", + " 9450.378101\n", + " 0.105707\n", + " ct\n", + " ct\n", + " 1\n", + " 0.020876\n", + " 0.020876\n", + " 15.873789\n", + " 2.448667\n", + " hartford\n", + " hartford\n", + " 2\n", + " 0.001198\n", + " 0.001198\n", + " 108.031428\n", + " 5.283275\n", + " ETN LF ANT KSLT\n", + " ETN LF ANT KSLT\n", + " 0\n", + " 1348\n", + " 0000002648\n", + " 0000002648\n", + " aetna life & casualty co\n", + " 376\n", + " 211\n", + " \n", + " \n", + "\n", + "

2085 rows × 43 columns

\n", + "" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", + "466134 3.824596 0.934073 __splink__input_table_0 __splink__input_table_1 14692 6293 crane entergy nuclear power marketing 0 0.000012 0.000012 0.986045 1.000000 100 first stamford pl 100 first stamford pl 2 0.000122 0.000122 9450.378101 0.095137 ct ct 1 0.020876 0.020876 15.873789 2.448667 stamford stamford 2 0.003950 0.003950 108.031428 1.602975 KRN ENTRJ NKLR PWR MRKTNK 1 14692 0001944013 0001944013 crane co 6293 55243\n", + "466594 4.620005 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5535 dte electric securitization funding i dte sustainable generation 0 0.000012 0.000012 0.986045 1.000000 one energy plz one energy plz 2 0.000330 0.000330 9450.378101 0.035236 mi mi 1 0.015147 0.015147 15.873789 3.374867 detroit detroit 2 0.001162 0.001162 108.031428 5.450115 TT ELKTRK SKRTSXN FNTNK I TT SSTNBL JNRXN 1 17752 0001876068 0001876068 dte electric securitization funding i llc 5535 64331\n", + "480747 4.620005 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5522 dte electric securitization funding i dte electric 0 0.000012 0.000037 0.986045 1.000000 one energy plz one energy plz 2 0.000330 0.000330 9450.378101 0.035236 mi mi 1 0.015147 0.015147 15.873789 3.374867 detroit detroit 2 0.001162 0.001162 
108.031428 5.450115 TT ELKTRK SKRTSXN FNTNK I TT ELKTRK 0 17752 0001876068 0001876068 dte electric securitization funding i llc 5522 5109\n", + "464506 6.019599 0.984820 __splink__input_table_0 __splink__input_table_1 14051 10935 constellation energy luminace solar rhode island 0 0.000024 0.000024 0.986045 1.000000 1310 pt st 1310 pt st 2 0.000024 0.000024 9450.378101 0.475683 md md 1 0.025130 0.025130 15.873789 2.034167 baltimore baltimore 2 0.003583 0.003583 108.031428 1.767102 KNSTLXN ENRJ LMNS SLR RHT ISLNT 1 14051 0001868275 0001868275 constellation energy corp 10935 62679\n", + "340973 6.201744 0.986596 __splink__input_table_0 __splink__input_table_1 14051 4420 constellation energy constellation newenergy 1 0.000024 0.000024 5704.210475 1.000000 1310 pt st 100 constellation way 0 0.000024 0.000183 0.881657 1.000000 md md 1 0.025130 0.025130 15.873789 2.034167 baltimore baltimore 2 0.003583 0.003583 108.031428 1.767102 KNSTLXN ENRJ KNSTLXN NWNRJ 0 14051 0001868275 0001868275 constellation energy corp 4420 58491\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "464642 5.308053 0.975380 __splink__input_table_0 __splink__input_table_1 1585 6561 air products and chemicals /de/ exelon gen extexlaporte 0 0.000024 0.000012 0.986045 1.000000 7201 hamilton blvd 7201 hamilton blvd 2 0.000122 0.000122 9450.378101 0.095137 pa pa 1 0.029409 0.029409 15.873789 1.738226 allentown allentown 2 0.001003 0.001003 108.031428 6.314158 AR PRTKTS ANT XMKLS T EKSLN JN EKSTKSLPRT 1 1585 0000002969 0000002969 air products & chemicals inc /de/ 6561 6081\n", + "227094 20.402617 0.999999 __splink__input_table_0 __splink__input_table_1 1586 430 air products and chemicals air products and chemicals 2 0.000037 0.000037 652179.111493 0.021160 1940 air products blvd 1940 air products blvd 2 0.000049 0.000049 9450.378101 0.237842 pa pa 1 0.029409 0.029409 15.873789 1.738226 allentown allentown 2 0.001003 0.001003 108.031428 6.314158 AR PRTKTS ANT XMKLS AR PRTKTS ANT XMKLS 0 1586 0000002969 0000002969 air products & chemicals, inc. 430 991\n", + "224504 5.308053 0.975380 __splink__input_table_0 __splink__input_table_1 1585 435 air products and chemicals /de/ air products 0 0.000024 0.000037 0.986045 1.000000 7201 hamilton blvd 7201 hamilton blvd 2 0.000122 0.000122 9450.378101 0.095137 pa pa 1 0.029409 0.029409 15.873789 1.738226 allentown allentown 2 0.001003 0.001003 108.031428 6.314158 AR PRTKTS ANT XMKLS T AR PRTKTS 0 1585 0000002969 0000002969 air products & chemicals inc /de/ 435 980\n", + "225982 5.308053 0.975380 __splink__input_table_0 __splink__input_table_1 1585 432 air products and chemicals /de/ air products energy enterprises 0 0.000024 0.000012 0.986045 1.000000 7201 hamilton blvd 7201 hamilton blvd 2 0.000122 0.000122 9450.378101 0.095137 pa pa 1 0.029409 0.029409 15.873789 1.738226 allentown allentown 2 0.001003 0.001003 108.031428 6.314158 AR PRTKTS ANT XMKLS T AR PRTKTS ENRJ ENTRPRSS 0 1585 0000002969 0000002969 air products & chemicals inc /de/ 432 353\n", + "224473 20.054878 0.999999 __splink__input_table_0 
__splink__input_table_1 1348 376 aetna life and casualty aetna life and casualty 2 0.000024 0.000024 652179.111493 0.031739 151 farmington ave 151 farmington ave 2 0.000110 0.000110 9450.378101 0.105707 ct ct 1 0.020876 0.020876 15.873789 2.448667 hartford hartford 2 0.001198 0.001198 108.031428 5.283275 ETN LF ANT KSLT ETN LF ANT KSLT 0 1348 0000002648 0000002648 aetna life & casualty co 376 211\n", + "\n", + "[2085 rows x 43 columns]" + ] + }, + "execution_count": 139, "metadata": {}, "output_type": "execute_result" } @@ -3394,7 +3019,7 @@ }, { "cell_type": "code", - "execution_count": 89, + "execution_count": 140, "id": "11190456-12a9-49df-b863-7a6f674e39eb", "metadata": {}, "outputs": [], @@ -3404,7 +3029,7 @@ }, { "cell_type": "code", - "execution_count": 90, + "execution_count": 141, "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81", "metadata": {}, "outputs": [], @@ -3414,7 +3039,7 @@ }, { "cell_type": "code", - "execution_count": 438, + "execution_count": 142, "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b", "metadata": {}, "outputs": [], @@ -3429,7 +3054,7 @@ }, { "cell_type": "code", - "execution_count": 439, + "execution_count": 143, "id": "4d45f339-7a5b-466a-81f5-c71e425a77df", "metadata": {}, "outputs": [], @@ -3439,7 +3064,7 @@ }, { "cell_type": "code", - "execution_count": 440, + "execution_count": 144, "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5", "metadata": {}, "outputs": [], @@ -3452,7 +3077,7 @@ }, { "cell_type": "code", - "execution_count": 441, + "execution_count": 145, "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5", "metadata": {}, "outputs": [ @@ -3498,10 +3123,10 @@ " alabama power co\n", " NaN\n", " 1\n", - " 2575\n", - " 80977\n", - " 0.999018\n", - " 2\n", + " 1701.0\n", + " 478.0\n", + " 1.000000\n", + " 2.0\n", " both\n", " 1.0\n", " \n", @@ -3512,10 +3137,10 @@ " fluence energy, inc.\n", " Fluence\n", " 0\n", - " 126809\n", - " 21615\n", - " 0.000002\n", - " 0\n", + " 21792.0\n", + " 6889.0\n", + " 0.016529\n", + " 0.0\n", " both\n", " 0.0\n", 
" \n", @@ -3526,12 +3151,12 @@ " georgia power co\n", " NaN\n", " 1\n", - " 50428\n", - " 68242\n", - " 0.029853\n", - " 2\n", + " 23416.0\n", + " 7653.0\n", + " 0.999997\n", + " 2.0\n", " both\n", - " 0.0\n", + " 1.0\n", " \n", " \n", " 3\n", @@ -3540,10 +3165,10 @@ " columbus southern power co /oh/\n", " Columbus Southern Power Co\n", " 1\n", - " 129635\n", - " 96300\n", - " 0.997628\n", - " 1\n", + " 13310.0\n", + " 4281.0\n", + " 0.999981\n", + " 1.0\n", " both\n", " 1.0\n", " \n", @@ -3554,10 +3179,10 @@ " duke energy corp\n", " NaN\n", " 1\n", - " 37661\n", - " 71555\n", - " 0.926352\n", - " 2\n", + " 17793.0\n", + " 5564.0\n", + " 0.927294\n", + " 2.0\n", " both\n", " 0.0\n", " \n", @@ -3568,10 +3193,10 @@ " duke energy carolinas, llc\n", " Duke Energy Carolinas LLC\n", " 1\n", - " 133261\n", - " 118543\n", - " 0.987916\n", - " 2\n", + " 17790.0\n", + " 5558.0\n", + " 0.999987\n", + " 2.0\n", " both\n", " 1.0\n", " \n", @@ -3582,10 +3207,10 @@ " berkshire realty co inc /de\n", " Berkshire Wind Power Cooperative Corp\n", " 0\n", - " 198821\n", - " 89415\n", - " 0.000030\n", - " 0\n", + " 7449.0\n", + " 1712.0\n", + " 0.001912\n", + " 0.0\n", " both\n", " 0.0\n", " \n", @@ -3596,10 +3221,10 @@ " southern co\n", " southern co services inc\n", " 0\n", - " 50417\n", - " 111824\n", - " 0.000063\n", - " 0\n", + " 50962.0\n", + " 17068.0\n", + " 0.007216\n", + " 0.0\n", " both\n", " 0.0\n", " \n", @@ -3610,10 +3235,10 @@ " southern co\n", " Southern Power Co\n", " 0\n", - " 50417\n", - " 49613\n", - " 0.004315\n", - " 0\n", + " 50963.0\n", + " 17089.0\n", + " 0.034232\n", + " 0.0\n", " both\n", " 0.0\n", " \n", @@ -3624,12 +3249,12 @@ " pacific gas & electric co\n", " NaN\n", " 1\n", - " 2898\n", - " 55480\n", - " 0.624991\n", - " 2\n", + " 41598.0\n", + " 13933.0\n", + " 0.999948\n", + " 2.0\n", " both\n", - " 0.0\n", + " 1.0\n", " \n", " \n", " 10\n", @@ -3638,10 +3263,10 @@ " firstenergy corp\n", " FirstEnergy\n", " 0\n", - " 14192\n", - " 69716\n", - " 
0.999707\n", - " 2\n", + " 21579.0\n", + " 6776.0\n", + " 0.999998\n", + " 2.0\n", " both\n", " 1.0\n", " \n", @@ -3652,12 +3277,12 @@ " firstenergy corp\n", " FirstEnergy Nuclear Generation Corp\n", " 0\n", - " 14192\n", - " 102163\n", - " 0.000066\n", - " 0\n", - " both\n", + " 21579.0\n", + " 6780.0\n", + " 0.986543\n", " 0.0\n", + " both\n", + " 1.0\n", " \n", " \n", " 12\n", @@ -3666,10 +3291,10 @@ " firstenergy corp\n", " First Energy Services\n", " 0\n", - " 14192\n", - " 162033\n", - " 0.000066\n", - " 0\n", + " 21579.0\n", + " 6763.0\n", + " 0.085467\n", + " 0.0\n", " both\n", " 0.0\n", " \n", @@ -3680,11 +3305,11 @@ " firstenergy corp\n", " First Energy Corp\n", " 1\n", - " 14192\n", - " 121855\n", - " 0.010697\n", - " 1\n", - " both\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " left_only\n", " 0.0\n", " \n", " \n", @@ -3694,10 +3319,10 @@ " tucson electric power co\n", " NaN\n", " 1\n", - " 715\n", - " 41507\n", - " 0.999798\n", - " 2\n", + " 55725.0\n", + " 18901.0\n", + " 1.000000\n", + " 2.0\n", " both\n", " 1.0\n", " \n", @@ -3708,10 +3333,10 @@ " tampa electric co\n", " NaN\n", " 1\n", - " 231716\n", - " 47982\n", - " 0.989228\n", - " 2\n", + " 53604.0\n", + " 18180.0\n", + " 0.991059\n", + " 2.0\n", " both\n", " 1.0\n", " \n", @@ -3722,10 +3347,10 @@ " dominion energy, inc\n", " NaN\n", " 1\n", - " 15937\n", - " 71878\n", - " 0.998282\n", - " 2\n", + " 17484.0\n", + " 5386.0\n", + " 0.999985\n", + " 2.0\n", " both\n", " 1.0\n", " \n", @@ -3736,10 +3361,10 @@ " nrg energy, inc\n", " NRG Energy Gas & Wind Holdings Inc\n", " 0\n", - " 7168\n", - " 17454\n", - " 0.002575\n", - " 0\n", + " 40084.0\n", + " 13240.0\n", + " 0.300167\n", + " 0.0\n", " both\n", " 0.0\n", " \n", @@ -3750,10 +3375,10 @@ " nrg energy inc\n", " NRG Energy Inc\n", " 1\n", - " 7173\n", - " 95029\n", - " 0.988801\n", - " 2\n", + " 40084.0\n", + " 13243.0\n", + " 0.999820\n", + " 2.0\n", " both\n", " 1.0\n", " \n", @@ -3764,10 +3389,10 @@ " oglethorpe power corp\n", " 
NaN\n", " 1\n", - " 172902\n", - " 56478\n", - " 0.999768\n", - " 2\n", + " 40576.0\n", + " 13515.0\n", + " 1.000000\n", + " 2.0\n", " both\n", " 1.0\n", " \n", @@ -3778,43 +3403,88 @@ " central maine power co\n", " NaN\n", " 1\n", - " 126771\n", - " 176663\n", - " 0.897700\n", - " 2\n", + " 10876.0\n", + " 3424.0\n", + " 1.000000\n", + " 2.0\n", + " both\n", + " 1.0\n", + " \n", + " \n", + " 21\n", + " 0001032208\n", + " 61296\n", + " sempra energy\n", + " Sempra Generation\n", + " 1\n", + " 49303.0\n", + " 16270.0\n", + " 0.559074\n", + " 0.0\n", " both\n", " 0.0\n", " \n", + " \n", + " 22\n", + " 0000004904\n", + " 488\n", + " american electric power co inc\n", + " American Electric Power Inc\n", + " 1\n", + " 2926.0\n", + " 793.0\n", + " 0.996076\n", + " 2.0\n", + " both\n", + " 1.0\n", + " \n", + " \n", + " 23\n", + " 0000715957\n", + " 5248\n", + " dominion energy, inc\n", + " Dominion Energy Inc.\n", + " 1\n", + " 17484.0\n", + " 5386.0\n", + " 0.999985\n", + " 2.0\n", + " both\n", + " 1.0\n", + " \n", " \n", "\n", "" ], "text/plain": [ - " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", - "0 0000003153 195 alabama power co NaN 1 2575 80977 0.999018 2 both 1.0\n", - "1 0001868941 58702 fluence energy, inc. 
Fluence 0 126809 21615 0.000002 0 both 0.0\n", - "2 0000041091 7140 georgia power co NaN 1 50428 68242 0.029853 2 both 0.0\n", - "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 129635 96300 0.997628 1 both 1.0\n", - "4 0001326160 5416 duke energy corp NaN 1 37661 71555 0.926352 2 both 0.0\n", - "5 0000030371 54905 duke energy carolinas, llc Duke Energy Carolinas LLC 1 133261 118543 0.987916 2 both 1.0\n", - "6 0000869446 57140 berkshire realty co inc /de Berkshire Wind Power Cooperative Corp 0 198821 89415 0.000030 0 both 0.0\n", - "7 0000092122 18195 southern co southern co services inc 0 50417 111824 0.000063 0 both 0.0\n", - "8 0000092122 17650 southern co Southern Power Co 0 50417 49613 0.004315 0 both 0.0\n", - "9 0000075488 14328 pacific gas & electric co NaN 1 2898 55480 0.624991 2 both 0.0\n", - "10 0001031296 6526 firstenergy corp FirstEnergy 0 14192 69716 0.999707 2 both 1.0\n", - "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 14192 102163 0.000066 0 both 0.0\n", - "12 0001031296 6458 firstenergy corp First Energy Services 0 14192 162033 0.000066 0 both 0.0\n", - "13 0001031296 32208 firstenergy corp First Energy Corp 1 14192 121855 0.010697 1 both 0.0\n", - "14 0000100122 24211 tucson electric power co NaN 1 715 41507 0.999798 2 both 1.0\n", - "15 0000096271 18454 tampa electric co NaN 1 231716 47982 0.989228 2 both 1.0\n", - "16 0000715957 5248 dominion energy, inc NaN 1 15937 71878 0.998282 2 both 1.0\n", - "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 7168 17454 0.002575 0 both 0.0\n", - "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 7173 95029 0.988801 2 both 1.0\n", - "19 0000788816 13994 oglethorpe power corp NaN 1 172902 56478 0.999768 2 both 1.0\n", - "20 0000018675 3266 central maine power co NaN 1 126771 176663 0.897700 2 both 0.0" + " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability 
gamma_company_name_no_legal _merge predicted_match\n", + "0 0000003153 195 alabama power co NaN 1 1701.0 478.0 1.000000 2.0 both 1.0\n", + "1 0001868941 58702 fluence energy, inc. Fluence 0 21792.0 6889.0 0.016529 0.0 both 0.0\n", + "2 0000041091 7140 georgia power co NaN 1 23416.0 7653.0 0.999997 2.0 both 1.0\n", + "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 13310.0 4281.0 0.999981 1.0 both 1.0\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927294 2.0 both 0.0\n", + "5 0000030371 54905 duke energy carolinas, llc Duke Energy Carolinas LLC 1 17790.0 5558.0 0.999987 2.0 both 1.0\n", + "6 0000869446 57140 berkshire realty co inc /de Berkshire Wind Power Cooperative Corp 0 7449.0 1712.0 0.001912 0.0 both 0.0\n", + "7 0000092122 18195 southern co southern co services inc 0 50962.0 17068.0 0.007216 0.0 both 0.0\n", + "8 0000092122 17650 southern co Southern Power Co 0 50963.0 17089.0 0.034232 0.0 both 0.0\n", + "9 0000075488 14328 pacific gas & electric co NaN 1 41598.0 13933.0 0.999948 2.0 both 1.0\n", + "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986543 0.0 both 1.0\n", + "12 0001031296 6458 firstenergy corp First Energy Services 0 21579.0 6763.0 0.085467 0.0 both 0.0\n", + "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", + "14 0000100122 24211 tucson electric power co NaN 1 55725.0 18901.0 1.000000 2.0 both 1.0\n", + "15 0000096271 18454 tampa electric co NaN 1 53604.0 18180.0 0.991059 2.0 both 1.0\n", + "16 0000715957 5248 dominion energy, inc NaN 1 17484.0 5386.0 0.999985 2.0 both 1.0\n", + "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 40084.0 13240.0 0.300167 0.0 both 0.0\n", + "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 40084.0 13243.0 0.999820 2.0 both 1.0\n", + "19 0000788816 13994 
oglethorpe power corp NaN 1 40576.0 13515.0 1.000000 2.0 both 1.0\n", + "20 0000018675 3266 central maine power co NaN 1 10876.0 3424.0 1.000000 2.0 both 1.0\n", + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559074 0.0 both 0.0\n", + "22 0000004904 488 american electric power co inc American Electric Power Inc 1 2926.0 793.0 0.996076 2.0 both 1.0\n", + "23 0000715957 5248 dominion energy, inc Dominion Energy Inc. 1 17484.0 5386.0 0.999985 2.0 both 1.0" ] }, - "execution_count": 441, + "execution_count": 145, "metadata": {}, "output_type": "execute_result" } @@ -3825,7 +3495,7 @@ }, { "cell_type": "code", - "execution_count": 442, + "execution_count": 146, "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea", "metadata": {}, "outputs": [], @@ -3841,19 +3511,17 @@ }, { "cell_type": "code", - "execution_count": 443, + "execution_count": 147, "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "(np.float64(0.8888888888888888),\n", - " np.float64(0.6153846153846154),\n", - " 0.7142857142857143)" + "(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)" ] }, - "execution_count": 443, + "execution_count": 147, "metadata": {}, "output_type": "execute_result" } @@ -3864,7 +3532,7 @@ }, { "cell_type": "code", - "execution_count": 444, + "execution_count": 148, "id": "08932be5-b90c-440d-9efb-156cb4d63c93", "metadata": {}, "outputs": [ @@ -3896,13 +3564,13 @@ " \n", " \n", " Negative\n", - " 7\n", - " 1\n", + " 6\n", + " 2\n", " \n", " \n", " Positive\n", - " 5\n", - " 8\n", + " 3\n", + " 13\n", " \n", " \n", "\n", @@ -3910,11 +3578,11 @@ ], "text/plain": [ " Predicted Negative Predicted Positive\n", - "Negative 7 1\n", - "Positive 5 8" + "Negative 6 2\n", + "Positive 3 13" ] }, - "execution_count": 444, + "execution_count": 148, "metadata": {}, "output_type": "execute_result" } @@ -3929,7 +3597,7 @@ }, { "cell_type": "code", - "execution_count": 445, + "execution_count": 149, "id": 
"025c80e9-5055-4eaa-a873-38b910cd7f94", "metadata": {}, "outputs": [], @@ -3939,7 +3607,7 @@ }, { "cell_type": "code", - "execution_count": 446, + "execution_count": 150, "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768", "metadata": {}, "outputs": [ @@ -3979,44 +3647,16 @@ " \n", " \n", " \n", - " 2\n", - " 0000041091\n", - " 7140\n", - " georgia power co\n", - " NaN\n", - " 1\n", - " 50428\n", - " 68242\n", - " 0.029853\n", - " 2\n", - " both\n", - " 0.0\n", - " \n", - " \n", " 4\n", " 0001326160\n", " 5416\n", " duke energy corp\n", " NaN\n", " 1\n", - " 37661\n", - " 71555\n", - " 0.926352\n", - " 2\n", - " both\n", - " 0.0\n", - " \n", - " \n", - " 9\n", - " 0000075488\n", - " 14328\n", - " pacific gas & electric co\n", - " NaN\n", - " 1\n", - " 2898\n", - " 55480\n", - " 0.624991\n", - " 2\n", + " 17793.0\n", + " 5564.0\n", + " 0.927294\n", + " 2.0\n", " both\n", " 0.0\n", " \n", @@ -4027,10 +3667,24 @@ " firstenergy corp\n", " FirstEnergy\n", " 0\n", - " 14192\n", - " 69716\n", - " 0.999707\n", - " 2\n", + " 21579.0\n", + " 6776.0\n", + " 0.999998\n", + " 2.0\n", + " both\n", + " 1.0\n", + " \n", + " \n", + " 11\n", + " 0001031296\n", + " 54776\n", + " firstenergy corp\n", + " FirstEnergy Nuclear Generation Corp\n", + " 0\n", + " 21579.0\n", + " 6780.0\n", + " 0.986543\n", + " 0.0\n", " both\n", " 1.0\n", " \n", @@ -4041,24 +3695,24 @@ " firstenergy corp\n", " First Energy Corp\n", " 1\n", - " 14192\n", - " 121855\n", - " 0.010697\n", - " 1\n", - " both\n", + " NaN\n", + " NaN\n", + " NaN\n", + " NaN\n", + " left_only\n", " 0.0\n", " \n", " \n", - " 20\n", - " 0000018675\n", - " 3266\n", - " central maine power co\n", - " NaN\n", + " 21\n", + " 0001032208\n", + " 61296\n", + " sempra energy\n", + " Sempra Generation\n", " 1\n", - " 126771\n", - " 176663\n", - " 0.897700\n", - " 2\n", + " 49303.0\n", + " 16270.0\n", + " 0.559074\n", + " 0.0\n", " both\n", " 0.0\n", " \n", @@ -4067,16 +3721,15 @@ "" ], "text/plain": [ - " central_index_key utility_id_eia 
sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", - "2 0000041091 7140 georgia power co NaN 1 50428 68242 0.029853 2 both 0.0\n", - "4 0001326160 5416 duke energy corp NaN 1 37661 71555 0.926352 2 both 0.0\n", - "9 0000075488 14328 pacific gas & electric co NaN 1 2898 55480 0.624991 2 both 0.0\n", - "10 0001031296 6526 firstenergy corp FirstEnergy 0 14192 69716 0.999707 2 both 1.0\n", - "13 0001031296 32208 firstenergy corp First Energy Corp 1 14192 121855 0.010697 1 both 0.0\n", - "20 0000018675 3266 central maine power co NaN 1 126771 176663 0.897700 2 both 0.0" + " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927294 2.0 both 0.0\n", + "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986543 0.0 both 1.0\n", + "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559074 0.0 both 0.0" ] }, - "execution_count": 446, + "execution_count": 150, "metadata": {}, "output_type": "execute_result" } @@ -4087,7 +3740,7 @@ }, { "cell_type": "code", - "execution_count": 447, + "execution_count": 151, "id": "c425a676-aa6e-4d8f-b814-931da392c2ff", "metadata": {}, "outputs": [], @@ -4105,7 +3758,7 @@ }, { "cell_type": "code", - "execution_count": 448, + "execution_count": 152, "id": "ff55f2cb-7ce1-4697-99e7-bf22918f7ed1", "metadata": {}, "outputs": [ @@ -4114,23 +3767,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.LayerChart(...)" ] }, - "execution_count": 448, + "execution_count": 152, "metadata": {}, "output_type": "execute_result" } @@ -4197,597 +3850,46 @@ "id": "a2ba43b6-a664-462a-823f-e3f08585bb51", "metadata": {}, "source": [ - "# Save good predictions" + "# Save good predictions\n", + "Make the predictions one to one. First, keep the highest probability EIA utility ID for each SEC company. Then, keep the highest probability SEC company for each EIA utility" ] }, { "cell_type": "code", - "execution_count": 192, + "execution_count": 153, "id": "92172e2f-39ba-49e3-8312-98597256ca4f", "metadata": {}, + "outputs": [], + "source": [ + "one_to_one_preds = preds_validation_df[preds_validation_df.match_probability >= .95].sort_values(\n", + " by=\"match_probability\", ascending=False\n", + ").drop_duplicates(\n", + " subset=\"sec_company_id\", keep=\"first\"\n", + ").drop_duplicates(\n", + " subset=\"utility_id_eia\", keep=\"first\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 154, + "id": "07ca81ae-1b26-4cd3-ade6-75381028028a", + "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_lcompany_name_rgamma_company_nametf_company_name_ltf_company_name_rbf_company_namebf_tf_adj_company_namestreet_address_lstreet_address_rgamma_street_addresstf_street_address_ltf_street_address_rbf_street_addressbf_tf_adj_street_addresszip_code_lzip_code_rgamma_zip_codetf_zip_code_ltf_zip_code_rbf_zip_codebf_tf_adj_zip_codecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rstreet_address_list_lstreet_address_list_rmatch_key
1996074.2654900.950575__splink__input_table_0__splink__input_table_120077117512prt group incorporatedpratt and whitney power systems00.0000190.0000100.9912201.00000080 lamberton rdmail stop 191-1300.0000360.0000120.8659481.000000060950609510.0001910.0001911148.0021893.403266windsorwindsor20.0002790.000279126.99968324.882561PRT KRPPRT ANT HTN PWR SSTMS[80, lamberton, rd][mail, stop, 191-13]0
120414.2774680.950964__splink__input_table_0__splink__input_table_1219453113555cogentrix energy incorporatedgreen country energy limited liability company00.0000190.0000380.9912201.0000009405 arrowpoint blvd9405 arrowpoint blvd20.0005340.00053414580.3906270.015600282732827310.0012560.0012561148.0021890.516567charlottechalotte10.0141550.00002279.9234871.000000KJNTRKS ENRJKRN KNTR ENRJ[9405, arrowpoint, blvd][9405, arrowpoint, blvd]1
128054.2774680.950964__splink__input_table_0__splink__input_table_1219453115755cogentrix energy incorporatedjackson county power limited liability company00.0000190.0000290.9912201.0000009405 arrowpoint blvd9405 arrowpoint blvd20.0005340.00053414580.3906270.015600282732827310.0012560.0012561148.0021890.516567charlottechaarlotte10.0141550.00001179.9234871.000000KJNTRKS ENRJJKSN KNT PWR[9405, arrowpoint, blvd][9405, arrowpoint, blvd]1
81374.2780930.950984__splink__input_table_0__splink__input_table_1648133879rand logistics incorporatednorridgewock river road solar limited liabilit...00.0000290.0000190.9912201.000000333 washington street333 washington street20.0010560.00105614580.3906270.007888073020730210.0023320.0023321148.0021890.278152jersey cityjersey city20.0029980.002998126.9996832.312506RNT LJSTKSNRJWK RFR RT SLR[333, washington, street][333, washington, street]1
81364.2780930.950984__splink__input_table_0__splink__input_table_1648135193rand logistics incorporatedanderson solar farm limited liability company00.0000290.0000290.9912201.000000333 washington street333 washington street20.0010560.00105614580.3906270.007888073020730210.0023320.0023321148.0021890.278152jersey cityjersey city20.0029980.002998126.9996832.312506RNT LJSTKSANTRSN SLR FRM[333, washington, street][333, washington, street]1
........................................................................................................................
19927827.5145841.000000__splink__input_table_0__splink__input_table_127759142183diamond brands incorporateddiamond brands incorporated20.0000290.0000297612.6805960.0379861800 cloquet avenue1800 cloquet avenue20.0000360.00003614580.3906270.233998557205572010.0000780.0000781148.0021898.265075cloquetcloquet20.0000780.000078126.99968388.866289TMNT BRNTSTMNT BRNTS[1800, cloquet, avenue][1800, cloquet, avenue]0
48507027.6553621.000000__splink__input_table_0__splink__input_table_15042095697gulf power companygulf power company20.0000380.0000387612.6805960.028490one energy placeone energy place20.0000240.00002414580.3906270.350997325203252010.0000560.0000561148.00218911.571104pensacolapensacola20.0001110.000111126.99968362.206402KLF PWRKLF PWR[one, energy, place][one, energy, place]0
33156527.9772901.000000__splink__input_table_0__splink__input_table_117077578563berry petroleum companyberry petroleum company20.0000960.0000967612.6805960.01139628700 hovey hills rd28700 hovey hills rd20.0000240.00002414580.3906270.350997932689326810.0000450.0000451148.00218914.463881tafttaft20.0000450.000045126.999683155.516006BR PTRLMBR PTRLM[28700, hovey, hills, rd][28700, hovey, hills, rd]0
86934128.9772901.000000__splink__input_table_0__splink__input_table_139609141382eme homer city generation limited partnershipeme homer city generation limited partnership20.0000380.0000387612.6805960.0284901750 power plant road1750 power plant road20.0000240.00002414580.3906270.350997157481574810.0000450.0000451148.00218914.463881homer cityhomer city20.0000560.000056126.999683124.412805EM HMR ST JNRXNEM HMR ST JNRXN[1750, power, plant, road][1750, power, plant, road]0
7321229.5443311.000000__splink__input_table_0__splink__input_table_122468150859selkirk cogen partners limited partnershipselkirk cogen partners limited partnership20.0000580.0000587612.6805960.01899324 power park drive24 power park drive20.0000240.00002414580.3906270.350997121581215810.0000340.0000341148.00218919.285174selkirkselkirk20.0000330.000033126.999683207.354675SLKRK KJN PRTNRSSLKRK KJN PRTNRS[24, power, park, drive][24, power, park, drive]0
\n", - "

3014 rows × 39 columns

\n", - "
" - ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_l company_name_r gamma_company_name tf_company_name_l tf_company_name_r bf_company_name bf_tf_adj_company_name street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address zip_code_l zip_code_r gamma_zip_code tf_zip_code_l tf_zip_code_r bf_zip_code bf_tf_adj_zip_code city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r street_address_list_l street_address_list_r match_key\n", - "199607 4.265490 0.950575 __splink__input_table_0 __splink__input_table_1 20077 117512 prt group incorporated pratt and whitney power systems 0 0.000019 0.000010 0.991220 1.000000 80 lamberton rd mail stop 191-13 0 0.000036 0.000012 0.865948 1.000000 06095 06095 1 0.000191 0.000191 1148.002189 3.403266 windsor windsor 2 0.000279 0.000279 126.999683 24.882561 PRT KRP PRT ANT HTN PWR SSTMS [80, lamberton, rd] [mail, stop, 191-13] 0\n", - "12041 4.277468 0.950964 __splink__input_table_0 __splink__input_table_1 219453 113555 cogentrix energy incorporated green country energy limited liability company 0 0.000019 0.000038 0.991220 1.000000 9405 arrowpoint blvd 9405 arrowpoint blvd 2 0.000534 0.000534 14580.390627 0.015600 28273 28273 1 0.001256 0.001256 1148.002189 0.516567 charlotte chalotte 1 0.014155 0.000022 79.923487 1.000000 KJNTRKS ENRJ KRN KNTR ENRJ [9405, arrowpoint, blvd] [9405, arrowpoint, blvd] 1\n", - "12805 4.277468 0.950964 __splink__input_table_0 __splink__input_table_1 219453 115755 cogentrix energy incorporated jackson county power limited liability company 0 0.000019 0.000029 0.991220 1.000000 9405 arrowpoint blvd 9405 arrowpoint blvd 2 0.000534 0.000534 14580.390627 0.015600 28273 28273 1 0.001256 0.001256 1148.002189 0.516567 charlotte chaarlotte 1 0.014155 0.000011 79.923487 1.000000 KJNTRKS ENRJ JKSN KNT PWR [9405, 
arrowpoint, blvd] [9405, arrowpoint, blvd] 1\n", - "8137 4.278093 0.950984 __splink__input_table_0 __splink__input_table_1 64813 3879 rand logistics incorporated norridgewock river road solar limited liabilit... 0 0.000029 0.000019 0.991220 1.000000 333 washington street 333 washington street 2 0.001056 0.001056 14580.390627 0.007888 07302 07302 1 0.002332 0.002332 1148.002189 0.278152 jersey city jersey city 2 0.002998 0.002998 126.999683 2.312506 RNT LJSTKS NRJWK RFR RT SLR [333, washington, street] [333, washington, street] 1\n", - "8136 4.278093 0.950984 __splink__input_table_0 __splink__input_table_1 64813 5193 rand logistics incorporated anderson solar farm limited liability company 0 0.000029 0.000029 0.991220 1.000000 333 washington street 333 washington street 2 0.001056 0.001056 14580.390627 0.007888 07302 07302 1 0.002332 0.002332 1148.002189 0.278152 jersey city jersey city 2 0.002998 0.002998 126.999683 2.312506 RNT LJSTKS ANTRSN SLR FRM [333, washington, street] [333, washington, street] 1\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", - "199278 27.514584 1.000000 __splink__input_table_0 __splink__input_table_1 27759 142183 diamond brands incorporated diamond brands incorporated 2 0.000029 0.000029 7612.680596 0.037986 1800 cloquet avenue 1800 cloquet avenue 2 0.000036 0.000036 14580.390627 0.233998 55720 55720 1 0.000078 0.000078 1148.002189 8.265075 cloquet cloquet 2 0.000078 0.000078 126.999683 88.866289 TMNT BRNTS TMNT BRNTS [1800, cloquet, avenue] [1800, cloquet, avenue] 0\n", - "485070 27.655362 1.000000 __splink__input_table_0 __splink__input_table_1 50420 95697 gulf power company gulf power company 2 0.000038 0.000038 7612.680596 0.028490 one energy place one energy place 2 0.000024 0.000024 14580.390627 0.350997 32520 32520 1 0.000056 0.000056 1148.002189 11.571104 pensacola pensacola 2 0.000111 0.000111 126.999683 62.206402 KLF PWR KLF PWR [one, energy, place] [one, energy, place] 0\n", - "331565 27.977290 1.000000 __splink__input_table_0 __splink__input_table_1 170775 78563 berry petroleum company berry petroleum company 2 0.000096 0.000096 7612.680596 0.011396 28700 hovey hills rd 28700 hovey hills rd 2 0.000024 0.000024 14580.390627 0.350997 93268 93268 1 0.000045 0.000045 1148.002189 14.463881 taft taft 2 0.000045 0.000045 126.999683 155.516006 BR PTRLM BR PTRLM [28700, hovey, hills, rd] [28700, hovey, hills, rd] 0\n", - "869341 28.977290 1.000000 __splink__input_table_0 __splink__input_table_1 39609 141382 eme homer city generation limited partnership eme homer city generation limited partnership 2 0.000038 0.000038 7612.680596 0.028490 1750 power plant road 1750 power plant road 2 0.000024 0.000024 14580.390627 0.350997 15748 15748 1 0.000045 0.000045 1148.002189 14.463881 homer city homer city 2 0.000056 0.000056 126.999683 124.412805 EM HMR ST JNRXN EM HMR ST JNRXN [1750, power, plant, road] [1750, power, plant, road] 0\n", - "73212 29.544331 1.000000 __splink__input_table_0 __splink__input_table_1 224681 50859 selkirk cogen partners limited partnership selkirk cogen 
partners limited partnership 2 0.000058 0.000058 7612.680596 0.018993 24 power park drive 24 power park drive 2 0.000024 0.000024 14580.390627 0.350997 12158 12158 1 0.000034 0.000034 1148.002189 19.285174 selkirk selkirk 2 0.000033 0.000033 126.999683 207.354675 SLKRK KJN PRTNRS SLKRK KJN PRTNRS [24, power, park, drive] [24, power, park, drive] 0\n", - "\n", - "[3014 rows x 39 columns]" + "525" ] }, - "execution_count": 192, + "execution_count": 154, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preds_df[preds_df.match_probability >= .95].sort_values(by=\"match_probability\")" - ] - }, - { - "cell_type": "markdown", - "id": "ad4d3859-81d1-4fa8-98cc-ff7c9fd038f6", - "metadata": {}, - "source": [ - "# Match to Ex. 21 subsidiaries" + "len(one_to_one_preds)" ] - }, - { - "cell_type": "code", - "execution_count": 38, - "id": "d1c56b09-80c7-4bfe-b1ec-c0220cadafbf", - "metadata": {}, - "outputs": [], - "source": [ - "# match EIA records that don't have a prediction to EIA subsidiaries\n", - "# can reuse code from SEC module?" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "a5599b7a-ea9a-40fd-9ce1-cb79a8d4dc35", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/src/mozilla_sec_eia/library/record_linkage_utils.py b/src/mozilla_sec_eia/library/record_linkage_utils.py index 924b6a4..217fb9b 100644 --- a/src/mozilla_sec_eia/library/record_linkage_utils.py +++ b/src/mozilla_sec_eia/library/record_linkage_utils.py @@ -1,6 +1,8 @@ """Utility functions for cleaning strings during modeling preprocessing steps.""" +import json from enum import StrEnum +from importlib import resources import jellyfish import pandas as pd @@ -91,6 +93,19 @@ def handle_invalid_names( return df +def flatten_companies_across_time( + df: pd.DataFrame, key_cols: list[str], date_col: str = "report_date" +) -> pd.DataFrame: + """Keep only the most recent record for each group of `key_cols`. 
+ + Dataframe must have all of `key_cols` and `date_col`. + """ + df = ( + df.sort_values(by=date_col, ascending=False).groupby(key_cols).first() + ).reset_index() + return df + + # TODO: this is in PUDL, deduplicate def get_metaphone_col(col: pd.Series) -> pd.Series: """Get the metaphones of the strings in a column.""" @@ -133,3 +148,23 @@ def fill_street_address_nulls( df[secondary_address_col], ) return df + + +def expand_street_name_abbreviations(col: pd.Series) -> pd.Series: + """Standardize street address suffixes, like street to st. + + Expects lower case strings in column. + """ + # remove punctuation from column first + col = col.str.replace(r"[^\w\s]", "", regex=True) + + json_source = ( + resources.files("mozilla_sec_eia.package_data") + / "street_suffix_abbreviations.json" + ) + with json_source.open() as f: + address_expansions = json.load(f) + for standard_abbr, suffix_list in address_expansions.items(): + pattern = r"\b(" + "|".join(suffix_list) + r")\b" + col = col.str.replace(pattern, standard_abbr, regex=True) + return col diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py deleted file mode 100644 index 3caa182..0000000 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/preprocessing.py +++ /dev/null @@ -1,167 +0,0 @@ -"""Preprocessing for EIA and SEC input data before record linkage.""" - -import jellyfish -import numpy as np -import pandas as pd - -from pudl.analysis.record_linkage import name_cleaner - -EIA_COL_MAP = { - "utility_name_eia": "company_name", # TODO: should be linking to owner or operator name? 
- "address_2": "street_address_2", -} - -SEC_COL_MAP = { - "company_conformed_name": "company_name", - "street_1": "street_address", - "street_2": "street_address_2", - "zip": "zip_code", - "business_phone": "phone_number", - "date_filed": "report_date", -} - -SHARED_COLS = [ - "report_date", - "report_year", - "company_name", - "street_address", - "street_address_2", - "city", - "state", # could use state of incorporation from SEC - "zip_code", - "phone_number", -] - -STR_COLS = [ - "company_name", - "street_address", - "street_address_2", - "city", - "state", - "zip_code", -] - -INVALID_NAMES = [ - "llc", - "limited liability company", - "limited", - "ltd", - "iiii", - "inc", - "incorporated", - "partnership", - "i", - "name", - "company", - "&", - "", -] - - -company_name_cleaner = name_cleaner.CompanyNameCleaner( - cleaning_rules_list=[ - "remove_word_the_from_the_end", - "remove_word_the_from_the_beginning", - "replace_ampersand_by_AND", - "replace_hyphen_by_space", - "replace_underscore_by_space", - "remove_text_punctuation", - "remove_parentheses", - "remove_brackets", - "remove_curly_brackets", - "enforce_single_space_between_words", - ] -) - -legal_term_remover = name_cleaner.CompanyNameCleaner( - cleaning_rules_list=[], handle_legal_terms=2 -) - - -# TODO: this is in PUDL, pull out into helper function -def _get_metaphone(row, col_name): - if pd.isnull(row[col_name]): - return None - return jellyfish.metaphone(row[col_name]) - - -# TODO: delete -def _clean_company_name(df): - df.loc[:, "company_name_clean"] = company_name_cleaner.apply_name_cleaning( - df[["company_name"]] - ).str.strip() - df = df[df["company_name_clean"] != ""] - df = df.rename(columns={"company_name": "company_name_raw"}).rename( - columns={"company_name_clean": "company_name"} - ) - df.loc[:, "company_name_no_legal"] = legal_term_remover.apply_name_cleaning( - df[["company_name"]] - ) - return df - - -# TODO: delete -def clean_sec_df(df): - """Shared cleaning for SEC 10K and Ex. 
21 dataframes. - - Arguments: - df: Ex. 21 or SEC 10K basic info dataframe with columns - company_name, loc_of_incorporation, and report_year. - """ - df = _clean_company_name(df) - df.loc[:, "company_name_mphone"] = df.apply( - _get_metaphone, axis=1, args=("company_name_no_legal",) - ) - df = df[ - (~df["company_name"].isin(INVALID_NAMES)) - & (~df["company_name_raw"].isin(INVALID_NAMES)) - ] - df = df.fillna(np.nan) - - return df - - -# TODO: delete -def prepare_sec10k_basic_info_df(sec_df): - """Preprocess SEC 10k basic information dataframe for record linkage.""" - sec_df = sec_df.rename(columns=SEC_COL_MAP).reset_index() - sec_df = clean_sec_df(sec_df) - sec_df[STR_COLS] = sec_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) - # TODO: does this actually drop anything? - sec_df = sec_df.drop_duplicates( - subset=[ - "central_index_key", - "report_year", - "company_name", - "standard_industrial_classification", - "city", - "state", - "street_address", - "zip_code", - ] - ) - return sec_df - - -# TODO: delete -def prepare_ex21_df(ex21_df): - """Preprocess Ex. 
21 extracted dataframe for record linkage.""" - ex21_df = clean_sec_df(ex21_df) - return ex21_df - - -# TODO: delete -def prepare_eia_df(eia_df): - """Preprocess EIA utility dataframe for record linkage.""" - eia_df = eia_df.rename(columns=EIA_COL_MAP) - eia_df.loc[:, "report_year"] = ( - eia_df["report_date"].astype("datetime64[ns]").dt.year - ) - eia_df = eia_df.fillna(np.nan) - eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) - eia_df = _clean_company_name(eia_df) - eia_df.loc[:, "company_name_mphone"] = eia_df.apply( - _get_metaphone, axis=1, args=("company_name_no_legal",) - ) - eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id") - return eia_df diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py index 3a5edae..c8ccfd9 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_eia_splink_config.py @@ -32,8 +32,8 @@ BLOCKING_RULES = [ "substr(l.company_name_mphone,1,4) = substr(r.company_name_mphone,1,4)", "l.street_address = r.street_address", - "substr(l.company_name_mphone,1,3) = substr(r.company_name_mphone,1,3) and l.city = r.city", - "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2", + "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and l.city = r.city", + # "substr(l.company_name_mphone,1,2) = substr(r.company_name_mphone,1,2) and array_length(list_intersect(l.street_address_list, r.street_address_list)) >= 2", ] company_name_comparison = cl.NameComparison( @@ -44,7 +44,6 @@ address_comparison = cl.LevenshteinAtThresholds( "street_address", distance_threshold_or_thresholds=[1] ).configure(term_frequency_adjustments=True) 
-print(address_comparison.get_comparison("duckdb").human_readable_description) state_comparison = cl.ExactMatch("state").configure(term_frequency_adjustments=True) city_comparison = cl.NameComparison("city", jaro_winkler_thresholds=[0.9]) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py index 4da5c1b..c832cf0 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py @@ -5,7 +5,9 @@ from dagster import AssetOut, multi_asset from mozilla_sec_eia.library.record_linkage_utils import ( + expand_street_name_abbreviations, fill_street_address_nulls, + flatten_companies_across_time, transform_company_name, ) from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import STR_COLS @@ -87,19 +89,27 @@ def eia_rl_input_table(): "s3://pudl.catalyst.coop/stable/out_eia__yearly_utilities.parquet" ) eia861_df = harvest_eia861_utilities() - eia_df = pd.concat([raw_eia_df, eia861_df]) - eia_df = eia_df.drop_duplicates( - subset=["utility_id_eia", "report_date"], keep="first" - ).dropna(subset="utility_name_eia") - eia_df = eia_df.rename(columns=EIA_COL_MAP) - eia_df["report_date"] = eia_df["report_date"].astype("datetime64[ns]") - eia_df.loc[:, "report_year"] = eia_df["report_date"].dt.year - eia_df = transform_company_name(eia_df) - eia_df.loc[:, "zip_code"] = eia_df["zip_code"].str[:5] - eia_df = fill_street_address_nulls(eia_df) + eia_df = ( + pd.concat([raw_eia_df, eia861_df]) + .dropna(subset=["utility_name_eia"]) + .rename(columns=EIA_COL_MAP) + .assign( + report_date=lambda df: df["report_date"].astype("datetime64[ns]"), + report_year=lambda df: df["report_date"].dt.year, + zip_code=lambda df: df["zip_code"].str[:5], + ) + .pipe(transform_company_name) + .pipe(fill_street_address_nulls) + .pipe(lambda df: df.fillna(np.nan)) + 
.reset_index(drop=True) + ) eia_df[STR_COLS] = eia_df[STR_COLS].apply(lambda x: x.str.strip().str.lower()) - eia_df = eia_df.fillna(np.nan) - eia_df = eia_df.reset_index(drop=True).reset_index(names="record_id") + eia_df["street_address"] = expand_street_name_abbreviations( + eia_df["street_address"] + ) + eia_df = flatten_companies_across_time( + df=eia_df, key_cols=["company_name", "street_address"] + ).reset_index(names="record_id") return eia_df diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index ff88151..0a51f23 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -10,7 +10,9 @@ from dagster import AssetIn, asset from mozilla_sec_eia.library.record_linkage_utils import ( + expand_street_name_abbreviations, fill_street_address_nulls, + flatten_companies_across_time, transform_company_name, ) from mozilla_sec_eia.models.sec10k.utils.cloud import ( @@ -73,24 +75,6 @@ def _add_report_year_to_sec(sec_df: pd.DataFrame, md: pd.DataFrame) -> pd.DataFr return sec_df -def _flatten_sec_companies_across_time(sec_df: pd.DataFrame) -> pd.DataFrame: - """Keep only the most recent record for each unique SEC company. - - Note that this drops old records for companies that have changed - names or addresses across time. Also, we group by sec_company_id not - CIK, so filer companies and subsidiary companies are unique in the - output dataframe. - TODO: create an asset that tracks name and address chnages across - time. - """ - sec_df = ( - sec_df.sort_values(by="report_year", ascending=False) - .groupby("sec_company_id") - .first() - ).reset_index() - return sec_df - - def get_sec_state_code_dict() -> dict[str, str]: """Create a dictionary mapping state codes to their names. 
@@ -236,6 +220,8 @@ def match_ex21_subsidiaries_to_filer_company( ex21_with_cik_df = ex21_with_cik_df.rename( columns={"subsidiary_cik": "central_index_key"} ) + ex21_with_cik_df = ex21_with_cik_df.drop_duplicates() + return ex21_with_cik_df @@ -281,7 +267,9 @@ def transformed_ex21_subsidiary_table( # add an sec_company_id, ultimately this ID become the subsidiary's CIK # if the subsidiary is matched to an SEC filer ex21_df = create_sec_company_id_for_ex21_subs(ex21_df=ex21_df) - ex21_df = _flatten_sec_companies_across_time(ex21_df) + ex21_df = flatten_companies_across_time( + df=ex21_df, key_cols=["sec_company_id"], date_col="report_year" + ) ex21_df = ex21_df.fillna(np.nan) return ex21_df @@ -295,35 +283,45 @@ def transform_basic10k_table( values="value", index="filename", columns="key", aggfunc="first" ) basic_10k_df.columns.name = None - # TODO: chain these function calls together - basic_10k_df = basic_10k_df.reset_index() - basic_10k_df = _remove_weird_sec_cols(basic_10k_df) - basic_10k_df = _add_report_year_to_sec(basic_10k_df, sec10k_filing_metadata) - basic_10k_df = basic_10k_df.rename(columns=SEC_COL_MAP) - # add a location of incorporation to better match it to Ex. 
21 subsidiaries - basic_10k_df = clean_location_of_inc(basic_10k_df) - basic_10k_df = transform_company_name(basic_10k_df) - basic_10k_df.loc[:, "zip_code"] = basic_10k_df["zip_code"].str[:5] - basic_10k_df = fill_street_address_nulls(basic_10k_df) - basic_10k_df.loc[:, "files_10k"] = True - basic_10k_df.loc[:, "sec_company_id"] = basic_10k_df["central_index_key"] + basic_10k_df = ( + basic_10k_df.reset_index() + .pipe(_remove_weird_sec_cols) + .pipe(_add_report_year_to_sec, sec10k_filing_metadata) + .rename(columns=SEC_COL_MAP) + .pipe(clean_location_of_inc) + .pipe(transform_company_name) + .assign( + zip_code=lambda df: df["zip_code"].str[:5], + files_10k=True, + sec_company_id=lambda df: df["central_index_key"], + ) + .pipe(fill_street_address_nulls) + ) basic_10k_df[STR_COLS] = basic_10k_df[STR_COLS].apply( lambda x: x.str.strip().str.lower() ) + basic_10k_df["street_address"] = expand_street_name_abbreviations( + basic_10k_df["street_address"] + ) + # flatten across time on unique company name and address pair + basic_10k_df = flatten_companies_across_time( + df=basic_10k_df, key_cols=["company_name", "street_address"] + ) + return basic_10k_df @asset( ins={ "basic_10k_dfs": AssetIn("basic_10k_company_info"), - "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), + # "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"), # specify an io_manager_key? }, ) -def core_sec_10k__parents_and_subsidiaries( +def core_sec_10k__filers( basic_10k_dfs: dict[str, pd.DataFrame], - clean_ex21_df: pd.DataFrame, + # clean_ex21_df: pd.DataFrame, sec10k_filing_metadata_dfs: dict[str, pd.DataFrame], ) -> pd.DataFrame: """Asset for creating an SEC 10K output table. 
@@ -336,10 +334,34 @@ def core_sec_10k__parents_and_subsidiaries( basic_10k_df = pd.concat(basic_10k_dfs.values()) sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values()) basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata) + # exclude Ex. 21 subs and just match to filers + # once the match has been conducted, add back in the Ex. 21 subs + out_df = basic_10k_df.fillna(np.nan).reset_index(names="record_id") + # TODO: Here we conduct the match to EIA and add on a column with utility_id_eia + return out_df + + +@asset( + ins={ + "sec10k_filers_matched_df": AssetIn("core_sec_10k__filers"), + "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), + }, +) +def out_sec_10k__parents_and_subsidiaries( + sec10k_filers_matched_df: pd.DataFrame, + clean_ex21_df: pd.DataFrame, +) -> pd.DataFrame: + """Asset for creating an SEC 10K output table. + + Flatten the table across time to only keep the most recent record + for each CIK. Add in Ex. 21 subsidiaries and link them to already present + filing companies. Create an sec_company_id for subsidiaries that aren't linked + to a CIK. 
+ """ ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company( - basic10k_df=basic_10k_df, ex21_df=clean_ex21_df + basic10k_df=sec10k_filers_matched_df, ex21_df=clean_ex21_df ) - basic_10k_df = basic_10k_df.merge( + sec10k_filers_matched_df = sec10k_filers_matched_df.merge( ex21_df_with_cik[["central_index_key", "parent_company_cik", "own_per"]], how="left", on="central_index_key", @@ -349,16 +371,13 @@ def core_sec_10k__parents_and_subsidiaries( ex21_df_with_cik["central_index_key"].isnull() ] ex21_non_filing_subs_df.loc[:, "files_10k"] = False - out_df = pd.concat([basic_10k_df, ex21_non_filing_subs_df]) - out_df = out_df.fillna(np.nan) - # this drops records for earlier company names and addresses - # that have since changed, so we lose some information - out_df = _flatten_sec_companies_across_time(out_df) - + out_df = pd.concat([sec10k_filers_matched_df, ex21_non_filing_subs_df]) + # TODO: match the EIA utilities to the Ex. 21 subs? return out_df production_assets = [ - core_sec_10k__parents_and_subsidiaries, + core_sec_10k__filers, transformed_ex21_subsidiary_table, + out_sec_10k__parents_and_subsidiaries, ] diff --git a/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json b/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json new file mode 100644 index 0000000..e305113 --- /dev/null +++ b/src/mozilla_sec_eia/package_data/street_suffix_abbreviations.json @@ -0,0 +1,203 @@ +{ + "aly": ["alley", "allee", "ally"], + "anx": ["anex", "annex", "annx"], + "arc": ["arcade"], + "ave": ["avenue", "av", "aven", "avenu", "avn", "avnue"], + "byu": ["bayou", "bayoo"], + "bch": ["beach"], + "bnd": ["bend"], + "blf": ["bluff", "bluf"], + "blfs": ["bluffs"], + "btm": ["bottom", "bot", "bottm"], + "blvd": ["boulevard", "boul", "boulv"], + "br": ["branch", "brnch"], + "brg": ["bridge", "brdge"], + "brk": ["brook"], + "brks": ["brooks"], + "bg": ["burg"], + "bgs": ["burgs"], + "byp": ["bypass", "bypa", "bypas", "byps"], + "cp": ["camp", "cmp"], + 
"cyn": ["canyon", "canyn", "cnyn"], + "cpe": ["cape"], + "cswy": ["causeway", "causwa"], + "ctr": ["center", "cen", "cent", "centr", "centre", "cnter", "cntr"], + "ctrs": ["centers"], + "cir": ["circle", "circ", "circl", "crcl", "crcle"], + "cirs": ["circles"], + "clf": ["cliff"], + "clfs": ["cliffs"], + "clb": ["club"], + "cmn": ["common"], + "cmns": ["commons"], + "cor": ["corner"], + "cors": ["corners"], + "crse": ["course"], + "ct": ["court"], + "cts": ["courts"], + "cv": ["cove"], + "cvs": ["coves"], + "crk": ["creek"], + "cres": ["crescent", "crsent", "crsnt"], + "crst": ["crest"], + "xing": ["crossing", "crssng"], + "xrd": ["crossroad"], + "xrds": ["crossroads"], + "curv": ["curve"], + "dl": ["dale"], + "dm": ["dam"], + "dv": ["divide", "div", "dvd"], + "dr": ["drive", "driv", "drv"], + "drs": ["drives"], + "est": ["estate"], + "ests": ["estates"], + "expy": ["expressway", "exp", "expr", "express", "expw"], + "ext": ["extension", "extn", "extnsn"], + "exts": ["extensions"], + "fls": ["falls"], + "fry": ["ferry", "frry"], + "fld": ["field"], + "flds": ["fields"], + "flt": ["flat"], + "flts": ["flats"], + "frd": ["ford"], + "frds": ["fords"], + "frst": ["forest", "forests"], + "frg": ["forge", "forg"], + "frgs": ["forges"], + "frk": ["fork"], + "frks": ["forks"], + "ft": ["fort", "frt"], + "fwy": ["freeway", "freewy", "frway", "frwy"], + "gdn": ["garden", "gardn", "grden", "grdn"], + "gdns": ["gardens", "grdns"], + "gtwy": ["gateway", "gatewy", "gatway", "gtway"], + "gln": ["glen"], + "glns": ["glens"], + "grn": ["green"], + "grns": ["greens"], + "grv": ["grove", "grov"], + "grvs": ["groves"], + "hbr": ["harbor", "harb", "harbr", "hrbor"], + "hbrs": ["harbors"], + "hvn": ["haven"], + "hts": ["heights", "ht"], + "hwy": ["highway", "highwy", "hiway", "hiwy", "hway"], + "hl": ["hill"], + "hls": ["hills"], + "holw": ["hollow", "hllw", "hollows", "holws"], + "inlt": ["inlet"], + "is": ["island", "islnd"], + "iss": ["islands", "islnds"], + "isle": ["isles"], + 
"jct": ["junction", "jction", "jctn", "junctn", "juncton"], + "jcts": ["junctions", "jctns"], + "ky": ["key"], + "kys": ["keys"], + "knl": ["knoll", "knol"], + "knls": ["knolls"], + "lk": ["lake"], + "lks": ["lakes"], + "land": ["land"], + "lndg": ["landing", "lndng"], + "ln": ["lane"], + "lgt": ["light"], + "lgts": ["lights"], + "lf": ["loaf"], + "lck": ["lock"], + "lcks": ["locks"], + "ldg": ["lodge", "ldge", "lodg"], + "loop": ["loops"], + "mall": ["mall"], + "mnr": ["manor"], + "mnrs": ["manors"], + "mdw": ["meadow"], + "mdws": ["meadows", "mdw", "medows"], + "mews": ["mews"], + "ml": ["mill"], + "mls": ["mills"], + "msn": ["mission", "missn", "mssn"], + "mtwy": ["motorway"], + "mt": ["mount", "mnt"], + "mtn": ["mountain", "mntain", "mntn", "mountin", "mtin"], + "mtns": ["mountains", "mntns"], + "nck": ["neck"], + "orch": ["orchard", "orchrd"], + "oval": ["ovl"], + "opas": ["overpass"], + "park": ["parks"], + "pkwy": ["parkway", "parkwy", "pkway", "pky", "parkways", "pkwys"], + "pass": ["pass"], + "psge": ["passage"], + "path": ["paths"], + "pike": ["pikes"], + "pne": ["pine"], + "pnes": ["pines"], + "pl": ["place"], + "pln": ["plain"], + "plns": ["plains"], + "plz": ["plaza", "plza"], + "pt": ["point"], + "pts": ["points"], + "prt": ["port"], + "prts": ["ports"], + "pr": ["prairie", "prr"], + "radl": ["radial", "rad", "radiel"], + "ramp": ["ramp"], + "rnch": ["ranch", "ranches", "rnchs"], + "rpd": ["rapid"], + "rpds": ["rapids"], + "rst": ["rest"], + "rdg": ["ridge", "rdge"], + "rdgs": ["ridges"], + "riv": ["river", "rvr", "rivr"], + "rd": ["road"], + "rds": ["roads"], + "rte": ["route"], + "row": ["row"], + "rue": ["rue"], + "run": ["run"], + "shl": ["shoal"], + "shls": ["shoals"], + "shr": ["shore", "shoar"], + "shrs": ["shores", "shoars"], + "skwy": ["skyway"], + "spg": ["spring", "spng", "sprng"], + "spgs": ["springs", "spngs", "sprngs"], + "spur": ["spurs"], + "sq": ["square", "sqr", "sqre", "squ"], + "sqs": ["squares", "sqrs"], + "sta": ["station", 
"statn", "stn"], + "stra": ["stravenue", "strav", "straven", "stravn", "strvn", "strvnue"], + "strm": ["stream", "streme"], + "st": ["street", "strt", "str"], + "sts": ["streets"], + "smt": ["summit", "sumit", "sumitt"], + "ter": ["terrace", "terr"], + "trwy": ["throughway"], + "trce": ["trace", "traces"], + "trak": ["track", "tracks", "trk", "trks"], + "trfy": ["trafficway"], + "trl": ["trail", "trails", "trls"], + "trlr": ["trailer", "trlrs"], + "tunl": ["tunnel", "tunel", "tunls", "tunnels", "tunnl"], + "tpke": ["turnpike", "trnpk", "turnpk"], + "upas": ["underpass"], + "un": ["union"], + "uns": ["unions"], + "vly": ["valley", "vally", "vlly"], + "vlys": ["valleys"], + "via": ["viaduct", "vdct", "viadct"], + "vw": ["view"], + "vws": ["views"], + "vlg": ["village", "vill", "villag", "villg", "villiage"], + "vlgs": ["villages"], + "vl": ["ville"], + "vis": ["vista", "vist", "vst", "vsta"], + "walk": ["walks"], + "wall": ["wall"], + "way": ["wy"], + "ways": ["ways"], + "wl": ["well"], + "wls": ["wells"] +} From f4cceb7d983a9e17a2b16581aa150f5698672c79 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Mon, 16 Dec 2024 22:13:16 -0800 Subject: [PATCH 153/161] clean up new structure of sec assets --- .../transform_sec_input.py | 32 +++++++------------ 1 file changed, 12 insertions(+), 20 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 0a51f23..cf21059 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -261,7 +261,6 @@ def transformed_ex21_subsidiary_table( ex21_df = _add_report_year_to_sec(ex21_df, sec10k_filing_metadata) ex21_df = ex21_df.rename(columns=EX21_COL_MAP) ex21_df = clean_location_of_inc(ex21_df) - # TODO: what to do with the clean company name? 
ex21_df = transform_company_name(ex21_df) ex21_df = _add_parent_company_cik(ex21_df, sec10k_filing_metadata) # add an sec_company_id, ultimately this ID become the subsidiary's CIK @@ -314,54 +313,48 @@ def transform_basic10k_table( @asset( ins={ "basic_10k_dfs": AssetIn("basic_10k_company_info"), - # "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), "sec10k_filing_metadata_dfs": AssetIn("sec10k_filing_metadata"), - # specify an io_manager_key? }, ) def core_sec_10k__filers( basic_10k_dfs: dict[str, pd.DataFrame], - # clean_ex21_df: pd.DataFrame, sec10k_filing_metadata_dfs: dict[str, pd.DataFrame], ) -> pd.DataFrame: - """Asset for creating an SEC 10K output table. + """Asset for creating a cleaned basic 10k table with EIA utility matched. Flatten the table across time to only keep the most recent record - for each CIK. Add in Ex. 21 subsidiaries and link them to already present - filing companies. Create an sec_company_id for subsidiaries that aren't linked - to a CIK. + for each unique company name and address pair. Clean table and link filers + to EIA utilities. """ basic_10k_df = pd.concat(basic_10k_dfs.values()) sec10k_filing_metadata = pd.concat(sec10k_filing_metadata_dfs.values()) basic_10k_df = transform_basic10k_table(basic_10k_df, sec10k_filing_metadata) - # exclude Ex. 21 subs and just match to filers - # once the match has been conducted, add back in the Ex. 
21 subs out_df = basic_10k_df.fillna(np.nan).reset_index(names="record_id") + # match EIA utilities to filers # TODO: Here we conduct the match to EIA and add on a column with utility_id_eia return out_df @asset( ins={ - "sec10k_filers_matched_df": AssetIn("core_sec_10k__filers"), + "sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"), "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), }, ) def out_sec_10k__parents_and_subsidiaries( - sec10k_filers_matched_df: pd.DataFrame, + sec_10k_filers_matched_df: pd.DataFrame, clean_ex21_df: pd.DataFrame, ) -> pd.DataFrame: """Asset for creating an SEC 10K output table. - Flatten the table across time to only keep the most recent record - for each CIK. Add in Ex. 21 subsidiaries and link them to already present - filing companies. Create an sec_company_id for subsidiaries that aren't linked - to a CIK. + Add in Ex. 21 subsidiaries and link them to already present + filing companies. Create an sec_company_id for subsidiaries + that aren't linked to a CIK. """ ex21_df_with_cik = match_ex21_subsidiaries_to_filer_company( - basic10k_df=sec10k_filers_matched_df, ex21_df=clean_ex21_df + basic10k_df=sec_10k_filers_matched_df, ex21_df=clean_ex21_df ) - sec10k_filers_matched_df = sec10k_filers_matched_df.merge( + sec_10k_filers_matched_df = sec_10k_filers_matched_df.merge( ex21_df_with_cik[["central_index_key", "parent_company_cik", "own_per"]], how="left", on="central_index_key", @@ -371,8 +364,7 @@ def out_sec_10k__parents_and_subsidiaries( ex21_df_with_cik["central_index_key"].isnull() ] ex21_non_filing_subs_df.loc[:, "files_10k"] = False - out_df = pd.concat([sec10k_filers_matched_df, ex21_non_filing_subs_df]) - # TODO: match the EIA utilities to the Ex. 21 subs? 
+ out_df = pd.concat([sec_10k_filers_matched_df, ex21_non_filing_subs_df]) return out_df From fa9e52e527697276ce6fa9c22a775a0484ebd166 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Wed, 18 Dec 2024 15:28:56 -0800 Subject: [PATCH 154/161] add in final match between ex 21 subs and eia utilities --- notebooks/18-kl-splink-sec-eia.ipynb | 17 +-- .../20-kl-validate-sec-output-table.ipynb | 114 ++++++++++++++++++ .../transform_sec_input.py | 17 +++ 3 files changed, 137 insertions(+), 11 deletions(-) diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb index 8de5812..8299b9f 100644 --- a/notebooks/18-kl-splink-sec-eia.ipynb +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -13,15 +13,11 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "id": "1107fe42-197c-4fea-9c48-06d08699af0b", "metadata": {}, "outputs": [], "source": [ - "import json\n", - "import os\n", - "from pathlib import Path\n", - "\n", "import pandas as pd\n", "from sklearn.metrics import precision_score, recall_score, accuracy_score, roc_auc_score, confusion_matrix\n", "from splink import block_on, DuckDBAPI, Linker, SettingsCreator\n", @@ -29,7 +25,6 @@ "import splink.comparison_library as cl\n", "import splink.comparison_level_library as cll\n", "from splink.exploratory import completeness_chart, profile_columns\n", - "from upath import UPath\n", "\n", "from mozilla_sec_eia.models.sec_eia_record_linkage.sec_eia_splink_config import (\n", " BLOCKING_RULES,\n", @@ -61,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb", "metadata": {}, "outputs": [], @@ -71,7 +66,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 3, "id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a", "metadata": {}, "outputs": [ @@ -247,7 +242,7 @@ "2 2 1001 ebenezer church solar limited liability c... 
176 ebenezer church rd 63186 8567.0 1001 ebenezer church solar, llc 2020-01-01 state road nc 28676 True None None None Q None None None None None None None None None None None None None final 2020 1001 ebenezer church solar EBNSR XRX SLR" ] }, - "execution_count": 4, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -258,7 +253,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "id": "755ab2a3-a32b-4ac1-81a5-0fb3a85dcdb3", "metadata": {}, "outputs": [ @@ -268,7 +263,7 @@ "20821" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } diff --git a/notebooks/20-kl-validate-sec-output-table.ipynb b/notebooks/20-kl-validate-sec-output-table.ipynb index 2b28fb9..d6045f9 100644 --- a/notebooks/20-kl-validate-sec-output-table.ipynb +++ b/notebooks/20-kl-validate-sec-output-table.ipynb @@ -22,6 +22,120 @@ "from upath import UPath" ] }, + { + "cell_type": "markdown", + "id": "511b2c77-ebd2-43b0-8e45-1d1c76fb321d", + "metadata": {}, + "source": [ + "### EIA" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "4907820f-2552-4a3b-866a-30c3181af91b", + "metadata": {}, + "outputs": [], + "source": [ + "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")" + ] + }, + { + "cell_type": "markdown", + "id": "5f488f86-4b34-4a94-985f-588f991ba86b", + "metadata": {}, + "source": [ + "### Ex. 
21" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "c1795acc-8005-4b6d-be4d-27c722b634f1", + "metadata": {}, + "outputs": [], + "source": [ + "ex21_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/transformed_ex21_subsidiary_table\")" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "304d929b-ce6c-4508-b511-475f287a6b37", + "metadata": {}, + "outputs": [], + "source": [ + "merged_df = ex21_df.merge(\n", + " eia_df.drop_duplicates(subset=\"company_name\")[[\"company_name\", \"utility_id_eia\"]], how=\"left\", on=\"company_name\", suffixes=(\"_ex21\", \"_eia\")\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "d315f8d5-7166-4161-bc4e-79c45ed3ad59", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(1055987, 20821)" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(ex21_df), len(eia_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "id": "3aae6d2c-a941-478e-8178-84cf1321e0b3", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "utility_id_eia\n", + "True 1050887\n", + "False 5100\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "merged_df.utility_id_eia.isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "id": "6aba0ae8-a8ee-47ef-8eb9-a0ef9f283b51", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1675" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "len(merged_df.utility_id_eia.unique())" + ] + }, { "cell_type": "markdown", "id": "8d178634-b494-4769-93e3-c0213e4a0326", diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index 
cf21059..a825e2b 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -339,11 +339,13 @@ def core_sec_10k__filers( ins={ "sec_10k_filers_matched_df": AssetIn("core_sec_10k__filers"), "clean_ex21_df": AssetIn("transformed_ex21_subsidiary_table"), + "clean_eia_df": AssetIn("core_eia__parents_and_subsidiaries"), }, ) def out_sec_10k__parents_and_subsidiaries( sec_10k_filers_matched_df: pd.DataFrame, clean_ex21_df: pd.DataFrame, + clean_eia_df: pd.DataFrame, ) -> pd.DataFrame: """Asset for creating an SEC 10K output table. @@ -364,6 +366,21 @@ def out_sec_10k__parents_and_subsidiaries( ex21_df_with_cik["central_index_key"].isnull() ] ex21_non_filing_subs_df.loc[:, "files_10k"] = False + # the last step is to take the EIA utilities that haven't been matched + # to a filer company, and merge them by company name onto the Ex. 21 subs + unmatched_eia_df = clean_eia_df[ + ~clean_eia_df["utility_id_eia"].isin( + sec_10k_filers_matched_df.utility_id_eia.unique() + ) + ].drop_duplicates(subset="company_name") + ex21_non_filing_subs_df = ex21_non_filing_subs_df.merge( + unmatched_eia_df[["utility_id_eia", "company_name"]], + how="left", + on="company_name", + ) + logger.info( + f"Ex. 
21 subsidiary names matched to an EIA utility name: {len(ex21_non_filing_subs_df["utility_id_eia"].unique())}" + ) out_df = pd.concat([sec_10k_filers_matched_df, ex21_non_filing_subs_df]) return out_df From 599ae877f26b9357a542aaa28ddedd58e9291d6a Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Wed, 18 Dec 2024 15:29:53 -0800 Subject: [PATCH 155/161] remove sec output table module --- .../models/sec_eia_record_linkage/sec_output_table.py | 10 ---------- 1 file changed, 10 deletions(-) delete mode 100644 src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py deleted file mode 100644 index 7f974ad..0000000 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/sec_output_table.py +++ /dev/null @@ -1,10 +0,0 @@ -"""Module for creating the SEC company output table which connects to EIA company data.""" - - -# the input to this method is "core_sec_10k__parents_and_subsidiaries" -def sec_output_table(): - """Connect SEC to EIA and format an output table.""" - # run record linkage to connect SEC to EIA? 
- # add a utility_id_eia column onto the core table - # drop the following columns: company_name_no_legal, company_name_mphone, any other intermediate columns - pass From c340718b8feb500f0ea3bac3cf4056346f248639 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Wed, 18 Dec 2024 15:31:53 -0800 Subject: [PATCH 156/161] add drop duplicates on sec company id --- .../models/sec_eia_record_linkage/transform_sec_input.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py index a825e2b..666f010 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_sec_input.py @@ -377,7 +377,7 @@ def out_sec_10k__parents_and_subsidiaries( unmatched_eia_df[["utility_id_eia", "company_name"]], how="left", on="company_name", - ) + ).drop_duplicates(subset="sec_company_id") logger.info( f"Ex. 
21 subsidiary names matched to an EIA utility name: {len(ex21_non_filing_subs_df["utility_id_eia"].unique())}" ) From 24de7d61127458c6b4972d24bfc1f1dafdcab74f Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Thu, 19 Dec 2024 11:17:44 -0800 Subject: [PATCH 157/161] clean up notbook --- environment.yml | 1 - notebooks/18-kl-splink-sec-eia.ipynb | 2216 ++++++++--------- .../20-kl-validate-sec-output-table.ipynb | 280 ++- src/mozilla_sec_eia/models/sec10k/__init__.py | 1 - 4 files changed, 1327 insertions(+), 1171 deletions(-) diff --git a/environment.yml b/environment.yml index 33b1e04..89eadc8 100644 --- a/environment.yml +++ b/environment.yml @@ -29,5 +29,4 @@ dependencies: # Use pip to install the package defined by this repo for development: - pip: - # - git+https://github.com/catalyst-cooperative/pudl.git@main - --editable ./[dev,docs,tests,types] diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb index 8299b9f..a105e3b 100644 --- a/notebooks/18-kl-splink-sec-eia.ipynb +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -13,7 +13,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "id": "1107fe42-197c-4fea-9c48-06d08699af0b", "metadata": {}, "outputs": [], @@ -56,7 +56,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "id": "8b1add80-34d7-44a8-a7b4-181a770bb2cb", "metadata": {}, "outputs": [], @@ -64,193 +64,6 @@ "eia_df = pd.read_parquet(\"gs://sec10k-outputs/v2/core_eia__parents_and_subsidiaries.parquet\")" ] }, - { - "cell_type": "code", - "execution_count": 3, - "id": "9547a0ca-39f7-46c3-9a02-dcb08b75181a", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
record_idcompany_namestreet_addressutility_id_eiautility_id_pudlcompany_name_rawreport_datecitystatezip_codeplants_reported_ownerplants_reported_operatorplants_reported_asset_managerplants_reported_other_relationshipentity_typeattention_linestreet_address_2zip_code_4contact_firstnamecontact_lastnamecontact_titlephone_numberphone_extensioncontact_firstname_2contact_lastname_2contact_title_2phone_number_2phone_extension_2data_maturityreport_yearcompany_name_no_legalcompany_name_mphone
000ham wham8 solar limited liability company100 california st suite 400643808321.00ham wham8 solar, llc2023-01-01san franciscoca94118TrueNoneNoneNoneQNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNonefinal20230ham wham8 solarHM HM SLR
1110 briggs solar ng limited liability company267 water st 2nd floor626858502.010 briggs solar ng, llc2020-01-01warrenri02885TrueTrueNoneNoneQNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNonefinal202010 briggs solar ngBRKS SLR NK
221001 ebenezer church solar limited liability c...176 ebenezer church rd631868567.01001 ebenezer church solar, llc2020-01-01state roadnc28676TrueNoneNoneNoneQNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNoneNonefinal20201001 ebenezer church solarEBNSR XRX SLR
\n", - "
" - ], - "text/plain": [ - " record_id company_name street_address utility_id_eia utility_id_pudl company_name_raw report_date city state zip_code plants_reported_owner plants_reported_operator plants_reported_asset_manager plants_reported_other_relationship entity_type attention_line street_address_2 zip_code_4 contact_firstname contact_lastname contact_title phone_number phone_extension contact_firstname_2 contact_lastname_2 contact_title_2 phone_number_2 phone_extension_2 data_maturity report_year company_name_no_legal company_name_mphone\n", - "0 0 0ham wham8 solar limited liability company 100 california st suite 400 64380 8321.0 0ham wham8 solar, llc 2023-01-01 san francisco ca 94118 True None None None Q None None None None None None None None None None None None None final 2023 0ham wham8 solar HM HM SLR\n", - "1 1 10 briggs solar ng limited liability company 267 water st 2nd floor 62685 8502.0 10 briggs solar ng, llc 2020-01-01 warren ri 02885 True True None None Q None None None None None None None None None None None None None final 2020 10 briggs solar ng BRKS SLR NK\n", - "2 2 1001 ebenezer church solar limited liability c... 176 ebenezer church rd 63186 8567.0 1001 ebenezer church solar, llc 2020-01-01 state road nc 28676 True None None None Q None None None None None None None None None None None None None final 2020 1001 ebenezer church solar EBNSR XRX SLR" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "eia_df.head(3)" - ] - }, { "cell_type": "code", "execution_count": 4, @@ -282,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 100, + "execution_count": 5, "id": "3f5f9e6c-0725-48e1-920f-3d516b4388a6", "metadata": {}, "outputs": [], @@ -292,182 +105,7 @@ }, { "cell_type": "code", - "execution_count": 101, - "id": "a5ea9e1d-3afd-466f-a506-ecb3f23605c9", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
record_idcompany_namestreet_addressfilenamephone_numbercentral_index_keycitycompany_name_rawdate_of_name_changefilm_numberfiscal_year_endform_typeformer_conformed_nameirs_numberorganization_namesec_actsec_file_numberstandard_industrial_classificationstatestate_of_incorporationstreet_address_2zip_codereport_datereport_yearlocation_of_inccompany_name_no_legalcompany_name_mphonefiles_10ksec_company_id
00024 pharma incorporated224 datura stedgar/data/1307969/0001683168-17-000653.txt(732) 696-93330001307969west palm beach024 pharma, inc.2009120217711535123110-kb green innovations, inc.201862731NaN1934 act333-120490plastics products, nec [3089]flnjNaN334012017-03-242017new jersey024 pharmaFRMTrue0001307969
111 800 contacts incorporated13751 s wadsworth park dr suite d140edgar/data/1050122/0001104659-06-017311.txt80157282250001050122draper1 800 contacts incNaN06691791123110-kNaN870571643NaN1934 act000-23633retail-catalog & mail-order houses [5961]utdeNaN840202006-03-162006delaware1 800 contactsKNTKTSTrue0001050122
221 800 contacts incorporated66 e wadsworth park dredgar/data/1050122/0001104659-07-019474.txt801-316-50000001050122draper1 800 contacts incNaN07696033123110-kNaN870571643NaN1934 act000-23633retail-catalog & mail-order houses [5961]utdeNaN840202007-03-152007delaware1 800 contactsKNTKTSTrue0001050122
\n", - "
" - ], - "text/plain": [ - " record_id company_name street_address filename phone_number central_index_key city company_name_raw date_of_name_change film_number fiscal_year_end form_type former_conformed_name irs_number organization_name sec_act sec_file_number standard_industrial_classification state state_of_incorporation street_address_2 zip_code report_date report_year location_of_inc company_name_no_legal company_name_mphone files_10k sec_company_id\n", - "0 0 024 pharma incorporated 224 datura st edgar/data/1307969/0001683168-17-000653.txt (732) 696-9333 0001307969 west palm beach 024 pharma, inc. 20091202 17711535 1231 10-k b green innovations, inc. 201862731 NaN 1934 act 333-120490 plastics products, nec [3089] fl nj NaN 33401 2017-03-24 2017 new jersey 024 pharma FRM True 0001307969\n", - "1 1 1 800 contacts incorporated 13751 s wadsworth park dr suite d140 edgar/data/1050122/0001104659-06-017311.txt 8015728225 0001050122 draper 1 800 contacts inc NaN 06691791 1231 10-k NaN 870571643 NaN 1934 act 000-23633 retail-catalog & mail-order houses [5961] ut de NaN 84020 2006-03-16 2006 delaware 1 800 contacts KNTKTS True 0001050122\n", - "2 2 1 800 contacts incorporated 66 e wadsworth park dr edgar/data/1050122/0001104659-07-019474.txt 801-316-5000 0001050122 draper 1 800 contacts inc NaN 07696033 1231 10-k NaN 870571643 NaN 1934 act 000-23633 retail-catalog & mail-order houses [5961] ut de NaN 84020 2007-03-15 2007 delaware 1 800 contacts KNTKTS True 0001050122" - ] - }, - "execution_count": 101, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "sec_df.head(3)" - ] - }, - { - "cell_type": "code", - "execution_count": 102, + "execution_count": 6, "id": "63d97f0d-df22-4c27-b3e7-1035166b4011", "metadata": {}, "outputs": [ @@ -477,7 +115,7 @@ "61026" ] }, - "execution_count": 102, + "execution_count": 6, "metadata": {}, "output_type": "execute_result" } @@ -499,7 +137,7 @@ }, { "cell_type": "code", - "execution_count": 103, + 
"execution_count": 7, "id": "7d2d103a-2bbd-4974-b770-44626bdc5111", "metadata": {}, "outputs": [], @@ -509,7 +147,7 @@ }, { "cell_type": "code", - "execution_count": 104, + "execution_count": 8, "id": "c734137b-fd76-4f6a-a828-23cd12d4ac27", "metadata": {}, "outputs": [], @@ -519,7 +157,7 @@ }, { "cell_type": "code", - "execution_count": 105, + "execution_count": 9, "id": "e754b2ef-5a0d-4582-8694-047528dfd339", "metadata": {}, "outputs": [ @@ -529,7 +167,7 @@ "True" ] }, - "execution_count": 105, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -540,7 +178,7 @@ }, { "cell_type": "code", - "execution_count": 106, + "execution_count": 10, "id": "38ad3504-2cde-455f-8896-6a435677541c", "metadata": {}, "outputs": [ @@ -550,7 +188,7 @@ "True" ] }, - "execution_count": 106, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -561,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 107, + "execution_count": 12, "id": "856c14d8-3250-4650-a2db-3808b4718f19", "metadata": {}, "outputs": [ @@ -571,14 +209,13 @@ "False" ] }, - "execution_count": 107, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Note that sec_company_id isn't unique here because we are keeping each unique company name and address pair\n", - "# later we'll flatten on sec_company_id and utility_id_eia\n", "sec_df.sec_company_id.is_unique" ] }, @@ -587,12 +224,12 @@ "id": "b18fef7e-c316-4c90-b2bc-04706401135e", "metadata": {}, "source": [ - "There can be duplicate records because sometimes a company changes utility ID or central index key over time. Keep the most recent version of that record." + "There should probably be no duplicate record, but if there are, keep the most recent version of that record." 
] }, { "cell_type": "code", - "execution_count": 108, + "execution_count": 19, "id": "842fa02e-5202-445c-b728-72bce42e740d", "metadata": {}, "outputs": [ @@ -603,7 +240,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 108, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -614,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 109, + "execution_count": 20, "id": "b53e6244-f0ca-4256-bc09-9c3264675389", "metadata": {}, "outputs": [ @@ -625,7 +262,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 109, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -636,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 253, + "execution_count": 18, "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669", "metadata": {}, "outputs": [], @@ -663,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 112, + "execution_count": 21, "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05", "metadata": {}, "outputs": [], @@ -673,7 +310,7 @@ }, { "cell_type": "code", - "execution_count": 113, + "execution_count": 22, "id": "4bab1568-6a55-427c-9a78-e44db8b0584d", "metadata": {}, "outputs": [ @@ -682,23 +319,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 115, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -940,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 116, + "execution_count": 25, "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7", "metadata": {}, "outputs": [ @@ -949,23 +586,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 116, + "execution_count": 25, "metadata": {}, "output_type": "execute_result" } @@ -1037,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 117, + "execution_count": 26, "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237", "metadata": {}, "outputs": [ @@ -1051,7 +688,7 @@ " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" ] }, - "execution_count": 117, + "execution_count": 26, "metadata": {}, "output_type": "execute_result" } @@ -1071,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": 118, + "execution_count": 27, "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3", "metadata": {}, "outputs": [ @@ -1135,7 +772,7 @@ "2 FRST 816 36 29376" ] }, - "execution_count": 118, + "execution_count": 27, "metadata": {}, "output_type": "execute_result" } @@ -1154,7 +791,7 @@ }, { "cell_type": "code", - "execution_count": 121, + "execution_count": 28, "id": "4e1a9844-5d98-4cac-a083-eef134f083ce", "metadata": {}, "outputs": [ @@ -1163,23 +800,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 131, + "execution_count": 38, "metadata": {}, "output_type": "execute_result" } @@ -1687,7 +1281,7 @@ }, { "cell_type": "code", - "execution_count": 132, + "execution_count": 39, "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3", "metadata": {}, "outputs": [ @@ -1696,23 +1290,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.HConcatChart(...)" ] }, - "execution_count": 132, + "execution_count": 39, "metadata": {}, "output_type": "execute_result" } @@ -1781,8 +1375,9 @@ "metadata": {}, "outputs": [], "source": [ + "# you could save the model weights like this\n", "settings = linker.misc.save_model_to_json(\n", - " \"model_unsupervised_city_state_0.json\", overwrite=True\n", + " \"model_unsupervised_0.json\", overwrite=True\n", ")" ] }, @@ -1796,7 +1391,7 @@ }, { "cell_type": "code", - "execution_count": 133, + "execution_count": 40, "id": "94e96441-89b6-4516-aa6a-4d1593ce03be", "metadata": {}, "outputs": [ @@ -1805,19 +1400,17 @@ "output_type": "stream", "text": [ "Blocking time: 0.16 seconds\n", - "Predict time: 0.31 seconds\n" + "Predict time: 0.26 seconds\n" ] } ], "source": [ - "# it's helpful to keep threshold at .5 just to see what makes it into blocking\n", - "# df_predictions = linker.inference.predict(threshold_match_probability=0.5)\n", "df_predictions = linker.inference.predict()" ] }, { "cell_type": "code", - "execution_count": 134, + "execution_count": 41, "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0", "metadata": {}, "outputs": [], @@ -1827,7 +1420,7 @@ }, { "cell_type": "code", - "execution_count": 135, + "execution_count": 42, "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e", "metadata": {}, "outputs": [ @@ -1894,202 +1487,202 @@ " \n", " \n", " 295287\n", - " -22.970759\n", - " 1.216501e-07\n", + " -22.967975\n", + " 1.218850e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 9829\n", - " 3043\n", - " capitol bancorp\n", - " capital power\n", + " 56230\n", + " 19078\n", + " union pacific\n", + " union electric\n", " 0\n", - " 0.000024\n", - " 0.000012\n", - " 0.986045\n", + " 0.000049\n", + " 0.000098\n", + " 0.986046\n", " 1.000000\n", - " capitol bancorp ctr\n", - " 120010423 101 st nw\n", + " 1416 dodge st\n", + " mc 1400\n", " 0\n", - " 0.000012\n", - " 0.000110\n", - " 0.881657\n", + " 0.000049\n", + " 0.000049\n", + 
" 0.881658\n", " 1.000000\n", - " mi\n", - " ab\n", + " ne\n", + " mo\n", " 0\n", - " 0.015147\n", - " 0.000197\n", - " 0.198711\n", + " 0.006455\n", + " 0.010118\n", + " 0.199012\n", " 1.000000\n", - " lansing\n", - " edmonton\n", + " omaha\n", + " st louis\n", " 0\n", - " 0.000293\n", - " 0.000428\n", - " 0.296590\n", + " 0.003448\n", + " 0.002764\n", + " 0.296714\n", " 1.000000\n", - " KPTL BNKRP\n", - " KPTL PWR\n", + " UNN PSFK\n", + " UNN ELKTRK\n", " 0\n", " \n", " \n", - " 383898\n", - " -22.970759\n", - " 1.216501e-07\n", + " 384509\n", + " -22.967975\n", + " 1.218850e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 51783\n", - " 17550\n", - " state bancorp\n", - " state street bank and trust\n", + " 56484\n", + " 19138\n", + " united states lime and minerals\n", + " united water conservation\n", " 0\n", + " 0.000037\n", " 0.000024\n", - " 0.000024\n", - " 0.986045\n", + " 0.986046\n", " 1.000000\n", - " 2 jericho plz\n", - " 100 summer st\n", + " 5429 lbj fwy\n", + " 1701 north lombard st\n", " 0\n", - " 0.000012\n", " 0.000024\n", - " 0.881657\n", + " 0.000012\n", + " 0.881658\n", " 1.000000\n", - " ny\n", - " ma\n", + " tx\n", + " ca\n", " 0\n", - " 0.120228\n", - " 0.041765\n", - " 0.198711\n", + " 0.079841\n", + " 0.157960\n", + " 0.199012\n", " 1.000000\n", - " jericho\n", - " boston\n", + " dallas\n", + " oxnard\n", " 0\n", - " 0.000306\n", - " 0.014319\n", - " 0.296590\n", + " 0.013855\n", + " 0.000257\n", + " 0.296714\n", " 1.000000\n", - " STT BNKRP\n", - " STT STRT BNK ANT TRST\n", + " UNTT STTS LM ANT MNRLS\n", + " UNTT WTR KNSRFXN\n", " 0\n", " \n", " \n", - " 383897\n", - " -22.970759\n", - " 1.216501e-07\n", + " 384504\n", + " -22.967975\n", + " 1.218850e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 51782\n", - " 17550\n", - " state auto financial\n", - " state street bank and trust\n", + " 56436\n", + " 19138\n", + " united rentals\n", + " united water conservation\n", " 0\n", " 0.000024\n", 
" 0.000024\n", - " 0.986045\n", + " 0.986046\n", " 1.000000\n", - " 518 east broad st\n", - " 100 summer st\n", + " 100 first stamford pl\n", + " 1701 north lombard st\n", " 0\n", + " 0.000122\n", " 0.000012\n", - " 0.000024\n", - " 0.881657\n", + " 0.881658\n", " 1.000000\n", - " oh\n", - " ma\n", + " ct\n", + " ca\n", " 0\n", - " 0.016991\n", - " 0.041765\n", - " 0.198711\n", + " 0.020876\n", + " 0.157960\n", + " 0.199012\n", " 1.000000\n", - " columbus\n", - " boston\n", + " stamford\n", + " oxnard\n", " 0\n", - " 0.002788\n", - " 0.014319\n", - " 0.296590\n", + " 0.003950\n", + " 0.000257\n", + " 0.296714\n", " 1.000000\n", - " STT AT FNNXL\n", - " STT STRT BNK ANT TRST\n", + " UNTT RNTLS\n", + " UNTT WTR KNSRFXN\n", " 0\n", " \n", " \n", - " 383896\n", - " -22.970759\n", - " 1.216501e-07\n", + " 384503\n", + " -22.967975\n", + " 1.218850e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 51781\n", - " 17550\n", - " state auto financial\n", - " state street bank and trust\n", + " 56424\n", + " 19138\n", + " united parcel service\n", + " united water conservation\n", " 0\n", " 0.000024\n", " 0.000024\n", - " 0.986045\n", + " 0.986046\n", " 1.000000\n", - " 518 e broad st\n", - " 100 summer st\n", + " 55 glenlake pkwy ne\n", + " 1701 north lombard st\n", " 0\n", " 0.000012\n", - " 0.000024\n", - " 0.881657\n", + " 0.000012\n", + " 0.881658\n", " 1.000000\n", - " oh\n", - " ma\n", + " ga\n", + " ca\n", " 0\n", - " 0.016991\n", - " 0.041765\n", - " 0.198711\n", + " 0.018626\n", + " 0.157960\n", + " 0.199012\n", " 1.000000\n", - " columbus\n", - " boston\n", + " atlanta\n", + " oxnard\n", " 0\n", - " 0.002788\n", - " 0.014319\n", - " 0.296590\n", + " 0.008462\n", + " 0.000257\n", + " 0.296714\n", " 1.000000\n", - " STT AT FNNXL\n", - " STT STRT BNK ANT TRST\n", + " UNTT PRSL SRFS\n", + " UNTT WTR KNSRFXN\n", " 0\n", " \n", " \n", - " 383895\n", - " -22.970759\n", - " 1.216501e-07\n", + " 384502\n", + " -22.967975\n", + " 1.218850e-07\n", " 
__splink__input_table_0\n", " __splink__input_table_1\n", - " 51780\n", - " 3805\n", - " starz\n", - " citrus world\n", + " 56312\n", + " 19138\n", + " united bancorp /oh/\n", + " united water conservation\n", " 0\n", " 0.000024\n", - " 0.000049\n", - " 0.986045\n", + " 0.000024\n", + " 0.986046\n", " 1.000000\n", - " 8900 liberty cir\n", - " 20205 hwy 2720205 hwy 27\n", + " 201 south fourth st\n", + " 1701 north lombard st\n", " 0\n", - " 0.000024\n", " 0.000012\n", - " 0.881657\n", + " 0.000012\n", + " 0.881658\n", " 1.000000\n", - " co\n", - " fl\n", + " oh\n", + " ca\n", " 0\n", - " 0.023802\n", - " 0.048477\n", - " 0.198711\n", + " 0.016991\n", + " 0.157960\n", + " 0.199012\n", " 1.000000\n", - " englewood\n", - " lake wales\n", + " martins ferry\n", + " oxnard\n", " 0\n", - " 0.002947\n", - " 0.000049\n", - " 0.296590\n", + " 0.000024\n", + " 0.000257\n", + " 0.296714\n", " 1.000000\n", - " STRS\n", - " STRS WRLT\n", + " UNTT BNKRP\n", + " UNTT WTR KNSRFXN\n", " 0\n", " \n", " \n", @@ -2133,8 +1726,8 @@ " ...\n", " \n", " \n", - " 186872\n", - " 27.519625\n", + " 163815\n", + " 27.519606\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2145,36 +1738,36 @@ " 2\n", " 0.000073\n", " 0.000073\n", - " 652179.111493\n", - " 0.010580\n", + " 415263.133269\n", + " 0.016616\n", " 33 third st se\n", " 33 third st se\n", " 2\n", " 0.000037\n", " 0.000037\n", - " 9450.378101\n", - " 0.317122\n", + " 9605.781694\n", + " 0.311992\n", " sd\n", " sd\n", " 1\n", " 0.001930\n", " 0.001930\n", - " 15.873789\n", - " 26.483035\n", + " 15.445559\n", + " 27.217182\n", " huron\n", " huron\n", " 2\n", " 0.000073\n", " 0.000073\n", - " 108.031428\n", - " 86.293486\n", + " 102.014123\n", + " 91.382644\n", " NR0WSTRN PBLK SRFS\n", " NR0WSTRN PBLK SRFS\n", " 0\n", " \n", " \n", - " 580681\n", - " 27.526533\n", + " 241593\n", + " 27.526514\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2185,36 +1778,36 @@ " 
2\n", " 0.000037\n", " 0.000037\n", - " 652179.111493\n", - " 0.021160\n", + " 415263.133269\n", + " 0.033231\n", " 163 acorn ln\n", " 163 acorn ln\n", " 2\n", " 0.000037\n", " 0.000037\n", - " 9450.378101\n", - " 0.317122\n", + " 9605.781694\n", + " 0.311992\n", " vt\n", " vt\n", " 1\n", " 0.001537\n", " 0.001537\n", - " 15.873789\n", - " 33.262692\n", + " 15.445559\n", + " 34.184780\n", " colchester\n", " colchester\n", " 2\n", " 0.000183\n", " 0.000183\n", - " 108.031428\n", - " 34.517394\n", + " 102.014123\n", + " 36.553058\n", " KRN MNTN PWR\n", " KRN MNTN PWR\n", " 0\n", " \n", " \n", - " 438193\n", - " 27.757357\n", + " 165487\n", + " 27.757338\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2225,36 +1818,36 @@ " 2\n", " 0.000024\n", " 0.000024\n", - " 652179.111493\n", - " 0.031739\n", + " 415263.133269\n", + " 0.049847\n", " one clarks is\n", " one clarks is\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9450.378101\n", - " 0.475683\n", + " 9605.781694\n", + " 0.467987\n", " wi\n", " wi\n", " 1\n", " 0.008840\n", " 0.008840\n", - " 15.873789\n", - " 5.782805\n", + " 15.445559\n", + " 5.943112\n", " wausau\n", " wausau\n", " 2\n", " 0.000061\n", " 0.000061\n", - " 108.031428\n", - " 103.552183\n", + " 102.014123\n", + " 109.659173\n", " WS PPR MLS\n", " WS PPR MLS\n", " 0\n", " \n", " \n", - " 385934\n", - " 27.884385\n", + " 340414\n", + " 27.884365\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2265,36 +1858,36 @@ " 2\n", " 0.000024\n", " 0.000024\n", - " 652179.111493\n", - " 0.031739\n", + " 415263.133269\n", + " 0.049847\n", " 520 francis st\n", " 520 francis st\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9450.378101\n", - " 0.475683\n", + " 9605.781694\n", + " 0.467987\n", " mo\n", " mo\n", " 1\n", " 0.010118\n", " 0.010118\n", - " 15.873789\n", - " 5.052049\n", + " 15.445559\n", + " 5.192099\n", " st joseph\n", " st joseph\n", " 2\n", " 0.000049\n", " 0.000049\n", - " 
108.031428\n", - " 129.440229\n", + " 102.014123\n", + " 137.073967\n", " ST JSF LT ANT PWR\n", " ST JSF LT ANT PWR\n", " 0\n", " \n", " \n", - " 503816\n", - " 29.211031\n", + " 274760\n", + " 29.211012\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2305,29 +1898,29 @@ " 2\n", " 0.000037\n", " 0.000037\n", - " 652179.111493\n", - " 0.021160\n", + " 415263.133269\n", + " 0.033231\n", " 161 wellington rd\n", " 161 wellington rd\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9450.378101\n", - " 0.475683\n", + " 9605.781694\n", + " 0.467987\n", " vt\n", " vt\n", " 1\n", " 0.001537\n", " 0.001537\n", - " 15.873789\n", - " 33.262692\n", + " 15.445559\n", + " 34.184780\n", " brattleboro\n", " brattleboro\n", " 2\n", " 0.000086\n", " 0.000086\n", - " 108.031428\n", - " 73.965845\n", + " 102.014123\n", + " 78.327981\n", " FBRMRK\n", " FBRMRK\n", " 0\n", @@ -2338,23 +1931,23 @@ "" ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", - "295287 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 9829 3043 capitol bancorp capital power 0 0.000024 0.000012 0.986045 1.000000 capitol bancorp ctr 120010423 101 st nw 0 0.000012 0.000110 0.881657 1.000000 mi ab 0 0.015147 0.000197 0.198711 1.000000 lansing edmonton 0 0.000293 0.000428 0.296590 1.000000 KPTL BNKRP KPTL PWR 0\n", - "383898 -22.970759 1.216501e-07 __splink__input_table_0 
__splink__input_table_1 51783 17550 state bancorp state street bank and trust 0 0.000024 0.000024 0.986045 1.000000 2 jericho plz 100 summer st 0 0.000012 0.000024 0.881657 1.000000 ny ma 0 0.120228 0.041765 0.198711 1.000000 jericho boston 0 0.000306 0.014319 0.296590 1.000000 STT BNKRP STT STRT BNK ANT TRST 0\n", - "383897 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 51782 17550 state auto financial state street bank and trust 0 0.000024 0.000024 0.986045 1.000000 518 east broad st 100 summer st 0 0.000012 0.000024 0.881657 1.000000 oh ma 0 0.016991 0.041765 0.198711 1.000000 columbus boston 0 0.002788 0.014319 0.296590 1.000000 STT AT FNNXL STT STRT BNK ANT TRST 0\n", - "383896 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 51781 17550 state auto financial state street bank and trust 0 0.000024 0.000024 0.986045 1.000000 518 e broad st 100 summer st 0 0.000012 0.000024 0.881657 1.000000 oh ma 0 0.016991 0.041765 0.198711 1.000000 columbus boston 0 0.002788 0.014319 0.296590 1.000000 STT AT FNNXL STT STRT BNK ANT TRST 0\n", - "383895 -22.970759 1.216501e-07 __splink__input_table_0 __splink__input_table_1 51780 3805 starz citrus world 0 0.000024 0.000049 0.986045 1.000000 8900 liberty cir 20205 hwy 2720205 hwy 27 0 0.000024 0.000012 0.881657 1.000000 co fl 0 0.023802 0.048477 0.198711 1.000000 englewood lake wales 0 0.002947 0.000049 0.296590 1.000000 STRS STRS WRLT 0\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", - "186872 27.519625 1.000000e+00 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 652179.111493 0.010580 33 third st se 33 third st se 2 0.000037 0.000037 9450.378101 0.317122 sd sd 1 0.001930 0.001930 15.873789 26.483035 huron huron 2 0.000073 0.000073 108.031428 86.293486 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0\n", - "580681 27.526533 1.000000e+00 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 652179.111493 0.021160 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9450.378101 0.317122 vt vt 1 0.001537 0.001537 15.873789 33.262692 colchester colchester 2 0.000183 0.000183 108.031428 34.517394 KRN MNTN PWR KRN MNTN PWR 0\n", - "438193 27.757357 1.000000e+00 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 652179.111493 0.031739 one clarks is one clarks is 2 0.000024 0.000024 9450.378101 0.475683 wi wi 1 0.008840 0.008840 15.873789 5.782805 wausau wausau 2 0.000061 0.000061 108.031428 103.552183 WS PPR MLS WS PPR MLS 0\n", - "385934 27.884385 1.000000e+00 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 652179.111493 0.031739 520 francis st 520 francis st 2 0.000024 0.000024 9450.378101 0.475683 mo mo 1 0.010118 0.010118 15.873789 5.052049 st joseph st joseph 2 0.000049 0.000049 108.031428 129.440229 ST JSF LT ANT PWR ST JSF LT ANT PWR 0\n", - "503816 29.211031 1.000000e+00 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 652179.111493 0.021160 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9450.378101 0.475683 vt vt 1 0.001537 0.001537 15.873789 33.262692 brattleboro brattleboro 2 0.000086 0.000086 108.031428 73.965845 FBRMRK FBRMRK 0\n", + " match_weight match_probability source_dataset_l 
source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", + "295287 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56230 19078 union pacific union electric 0 0.000049 0.000098 0.986046 1.000000 1416 dodge st mc 1400 0 0.000049 0.000049 0.881658 1.000000 ne mo 0 0.006455 0.010118 0.199012 1.000000 omaha st louis 0 0.003448 0.002764 0.296714 1.000000 UNN PSFK UNN ELKTRK 0\n", + "384509 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56484 19138 united states lime and minerals united water conservation 0 0.000037 0.000024 0.986046 1.000000 5429 lbj fwy 1701 north lombard st 0 0.000024 0.000012 0.881658 1.000000 tx ca 0 0.079841 0.157960 0.199012 1.000000 dallas oxnard 0 0.013855 0.000257 0.296714 1.000000 UNTT STTS LM ANT MNRLS UNTT WTR KNSRFXN 0\n", + "384504 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56436 19138 united rentals united water conservation 0 0.000024 0.000024 0.986046 1.000000 100 first stamford pl 1701 north lombard st 0 0.000122 0.000012 0.881658 1.000000 ct ca 0 0.020876 0.157960 0.199012 1.000000 stamford oxnard 0 0.003950 0.000257 0.296714 1.000000 UNTT RNTLS UNTT WTR KNSRFXN 0\n", + "384503 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56424 19138 united parcel service united water conservation 0 0.000024 0.000024 0.986046 1.000000 55 glenlake pkwy ne 1701 north lombard st 0 0.000012 0.000012 0.881658 1.000000 ga ca 0 0.018626 0.157960 0.199012 
1.000000 atlanta oxnard 0 0.008462 0.000257 0.296714 1.000000 UNTT PRSL SRFS UNTT WTR KNSRFXN 0\n", + "384502 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56312 19138 united bancorp /oh/ united water conservation 0 0.000024 0.000024 0.986046 1.000000 201 south fourth st 1701 north lombard st 0 0.000012 0.000012 0.881658 1.000000 oh ca 0 0.016991 0.157960 0.199012 1.000000 martins ferry oxnard 0 0.000024 0.000257 0.296714 1.000000 UNTT BNKRP UNTT WTR KNSRFXN 0\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "163815 27.519606 1.000000e+00 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 415263.133269 0.016616 33 third st se 33 third st se 2 0.000037 0.000037 9605.781694 0.311992 sd sd 1 0.001930 0.001930 15.445559 27.217182 huron huron 2 0.000073 0.000073 102.014123 91.382644 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0\n", + "241593 27.526514 1.000000e+00 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 415263.133269 0.033231 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9605.781694 0.311992 vt vt 1 0.001537 0.001537 15.445559 34.184780 colchester colchester 2 0.000183 0.000183 102.014123 36.553058 KRN MNTN PWR KRN MNTN PWR 0\n", + "165487 27.757338 1.000000e+00 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 415263.133269 0.049847 one clarks is one clarks is 2 0.000024 0.000024 9605.781694 0.467987 wi wi 1 0.008840 0.008840 15.445559 5.943112 wausau wausau 2 0.000061 0.000061 102.014123 109.659173 WS PPR MLS WS PPR MLS 0\n", + "340414 27.884365 1.000000e+00 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 415263.133269 
0.049847 520 francis st 520 francis st 2 0.000024 0.000024 9605.781694 0.467987 mo mo 1 0.010118 0.010118 15.445559 5.192099 st joseph st joseph 2 0.000049 0.000049 102.014123 137.073967 ST JSF LT ANT PWR ST JSF LT ANT PWR 0\n", + "274760 29.211012 1.000000e+00 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 415263.133269 0.033231 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9605.781694 0.467987 vt vt 1 0.001537 0.001537 15.445559 34.184780 brattleboro brattleboro 2 0.000086 0.000086 102.014123 78.327981 FBRMRK FBRMRK 0\n", "\n", "[590575 rows x 37 columns]" ] }, - "execution_count": 135, + "execution_count": 42, "metadata": {}, "output_type": "execute_result" } @@ -2365,7 +1958,7 @@ }, { "cell_type": "code", - "execution_count": 136, + "execution_count": 43, "id": "c0b292c8-26ed-407a-866e-75851577d567", "metadata": {}, "outputs": [], @@ -2379,7 +1972,7 @@ }, { "cell_type": "code", - "execution_count": 137, + "execution_count": 44, "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0", "metadata": {}, "outputs": [], @@ -2392,7 +1985,7 @@ }, { "cell_type": "code", - "execution_count": 138, + "execution_count": 45, "id": "5103190c-3775-427f-a8f2-cc8a8f79892b", "metadata": {}, "outputs": [], @@ -2404,7 +1997,7 @@ }, { "cell_type": "code", - "execution_count": 139, + "execution_count": 46, "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8", "metadata": {}, "outputs": [ @@ -2476,9 +2069,9 @@ " \n", " \n", " \n", - " 466134\n", - " 3.824596\n", - " 0.934073\n", + " 218797\n", + " 3.824578\n", + " 0.934072\n", " __splink__input_table_0\n", " __splink__input_table_1\n", " 14692\n", @@ -2488,29 +2081,29 @@ " 0\n", " 0.000012\n", " 0.000012\n", - " 0.986045\n", - " 1.000000\n", + " 0.986046\n", + " 1.0\n", " 100 first stamford pl\n", " 100 first stamford pl\n", " 2\n", " 0.000122\n", " 0.000122\n", - " 9450.378101\n", - " 0.095137\n", + " 9605.781694\n", + " 0.093597\n", " ct\n", " ct\n", " 1\n", " 0.020876\n", " 0.020876\n", - 
" 15.873789\n", - " 2.448667\n", + " 15.445559\n", + " 2.516547\n", " stamford\n", " stamford\n", " 2\n", " 0.003950\n", " 0.003950\n", - " 108.031428\n", - " 1.602975\n", + " 102.014123\n", + " 1.697510\n", " KRN\n", " ENTRJ NKLR PWR MRKTNK\n", " 1\n", @@ -2522,8 +2115,8 @@ " 55243\n", " \n", " \n", - " 466594\n", - " 4.620005\n", + " 220036\n", + " 4.619987\n", " 0.960922\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2534,29 +2127,29 @@ " 0\n", " 0.000012\n", " 0.000012\n", - " 0.986045\n", - " 1.000000\n", + " 0.986046\n", + " 1.0\n", " one energy plz\n", " one energy plz\n", " 2\n", " 0.000330\n", " 0.000330\n", - " 9450.378101\n", - " 0.035236\n", + " 9605.781694\n", + " 0.034666\n", " mi\n", " mi\n", " 1\n", " 0.015147\n", " 0.015147\n", - " 15.873789\n", - " 3.374867\n", + " 15.445559\n", + " 3.468423\n", " detroit\n", " detroit\n", " 2\n", " 0.001162\n", " 0.001162\n", - " 108.031428\n", - " 5.450115\n", + " 102.014123\n", + " 5.771535\n", " TT ELKTRK SKRTSXN FNTNK I\n", " TT SSTNBL JNRXN\n", " 1\n", @@ -2568,8 +2161,8 @@ " 64331\n", " \n", " \n", - " 480747\n", - " 4.620005\n", + " 358152\n", + " 4.619987\n", " 0.960922\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2580,29 +2173,29 @@ " 0\n", " 0.000012\n", " 0.000037\n", - " 0.986045\n", - " 1.000000\n", + " 0.986046\n", + " 1.0\n", " one energy plz\n", " one energy plz\n", " 2\n", " 0.000330\n", " 0.000330\n", - " 9450.378101\n", - " 0.035236\n", + " 9605.781694\n", + " 0.034666\n", " mi\n", " mi\n", " 1\n", " 0.015147\n", " 0.015147\n", - " 15.873789\n", - " 3.374867\n", + " 15.445559\n", + " 3.468423\n", " detroit\n", " detroit\n", " 2\n", " 0.001162\n", " 0.001162\n", - " 108.031428\n", - " 5.450115\n", + " 102.014123\n", + " 5.771535\n", " TT ELKTRK SKRTSXN FNTNK I\n", " TT ELKTRK\n", " 0\n", @@ -2613,408 +2206,29 @@ " 5522\n", " 5109\n", " \n", - " \n", - " 464506\n", - " 6.019599\n", - " 0.984820\n", - " __splink__input_table_0\n", - " 
__splink__input_table_1\n", - " 14051\n", - " 10935\n", - " constellation energy\n", - " luminace solar rhode island\n", - " 0\n", - " 0.000024\n", - " 0.000024\n", - " 0.986045\n", - " 1.000000\n", - " 1310 pt st\n", - " 1310 pt st\n", - " 2\n", - " 0.000024\n", - " 0.000024\n", - " 9450.378101\n", - " 0.475683\n", - " md\n", - " md\n", - " 1\n", - " 0.025130\n", - " 0.025130\n", - " 15.873789\n", - " 2.034167\n", - " baltimore\n", - " baltimore\n", - " 2\n", - " 0.003583\n", - " 0.003583\n", - " 108.031428\n", - " 1.767102\n", - " KNSTLXN ENRJ\n", - " LMNS SLR RHT ISLNT\n", - " 1\n", - " 14051\n", - " 0001868275\n", - " 0001868275\n", - " constellation energy corp\n", - " 10935\n", - " 62679\n", - " \n", - " \n", - " 340973\n", - " 6.201744\n", - " 0.986596\n", - " __splink__input_table_0\n", - " __splink__input_table_1\n", - " 14051\n", - " 4420\n", - " constellation energy\n", - " constellation newenergy\n", - " 1\n", - " 0.000024\n", - " 0.000024\n", - " 5704.210475\n", - " 1.000000\n", - " 1310 pt st\n", - " 100 constellation way\n", - " 0\n", - " 0.000024\n", - " 0.000183\n", - " 0.881657\n", - " 1.000000\n", - " md\n", - " md\n", - " 1\n", - " 0.025130\n", - " 0.025130\n", - " 15.873789\n", - " 2.034167\n", - " baltimore\n", - " baltimore\n", - " 2\n", - " 0.003583\n", - " 0.003583\n", - " 108.031428\n", - " 1.767102\n", - " KNSTLXN ENRJ\n", - " KNSTLXN NWNRJ\n", - " 0\n", - " 14051\n", - " 0001868275\n", - " 0001868275\n", - " constellation energy corp\n", - " 4420\n", - " 58491\n", - " \n", - " \n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " 
...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " ...\n", - " \n", - " \n", - " 464642\n", - " 5.308053\n", - " 0.975380\n", - " __splink__input_table_0\n", - " __splink__input_table_1\n", - " 1585\n", - " 6561\n", - " air products and chemicals /de/\n", - " exelon gen extexlaporte\n", - " 0\n", - " 0.000024\n", - " 0.000012\n", - " 0.986045\n", - " 1.000000\n", - " 7201 hamilton blvd\n", - " 7201 hamilton blvd\n", - " 2\n", - " 0.000122\n", - " 0.000122\n", - " 9450.378101\n", - " 0.095137\n", - " pa\n", - " pa\n", - " 1\n", - " 0.029409\n", - " 0.029409\n", - " 15.873789\n", - " 1.738226\n", - " allentown\n", - " allentown\n", - " 2\n", - " 0.001003\n", - " 0.001003\n", - " 108.031428\n", - " 6.314158\n", - " AR PRTKTS ANT XMKLS T\n", - " EKSLN JN EKSTKSLPRT\n", - " 1\n", - " 1585\n", - " 0000002969\n", - " 0000002969\n", - " air products & chemicals inc /de/\n", - " 6561\n", - " 6081\n", - " \n", - " \n", - " 227094\n", - " 20.402617\n", - " 0.999999\n", - " __splink__input_table_0\n", - " __splink__input_table_1\n", - " 1586\n", - " 430\n", - " air products and chemicals\n", - " air products and chemicals\n", - " 2\n", - " 0.000037\n", - " 0.000037\n", - " 652179.111493\n", - " 0.021160\n", - " 1940 air products blvd\n", - " 1940 air products blvd\n", - " 2\n", - " 0.000049\n", - " 0.000049\n", - " 9450.378101\n", - " 0.237842\n", - " pa\n", - " pa\n", - " 1\n", - " 0.029409\n", - " 0.029409\n", - " 15.873789\n", - " 1.738226\n", - " allentown\n", - " allentown\n", - " 2\n", - " 0.001003\n", - " 0.001003\n", - " 108.031428\n", - " 6.314158\n", - " AR PRTKTS ANT XMKLS\n", - " AR PRTKTS ANT XMKLS\n", - " 0\n", - " 1586\n", - " 0000002969\n", - " 0000002969\n", - " air products & chemicals, inc.\n", - " 430\n", - " 991\n", - " \n", - " \n", - " 224504\n", - " 5.308053\n", - " 0.975380\n", - " __splink__input_table_0\n", - " __splink__input_table_1\n", - " 1585\n", - " 435\n", - " air products and chemicals /de/\n", - " air products\n", - " 0\n", - " 
0.000024\n", - " 0.000037\n", - " 0.986045\n", - " 1.000000\n", - " 7201 hamilton blvd\n", - " 7201 hamilton blvd\n", - " 2\n", - " 0.000122\n", - " 0.000122\n", - " 9450.378101\n", - " 0.095137\n", - " pa\n", - " pa\n", - " 1\n", - " 0.029409\n", - " 0.029409\n", - " 15.873789\n", - " 1.738226\n", - " allentown\n", - " allentown\n", - " 2\n", - " 0.001003\n", - " 0.001003\n", - " 108.031428\n", - " 6.314158\n", - " AR PRTKTS ANT XMKLS T\n", - " AR PRTKTS\n", - " 0\n", - " 1585\n", - " 0000002969\n", - " 0000002969\n", - " air products & chemicals inc /de/\n", - " 435\n", - " 980\n", - " \n", - " \n", - " 225982\n", - " 5.308053\n", - " 0.975380\n", - " __splink__input_table_0\n", - " __splink__input_table_1\n", - " 1585\n", - " 432\n", - " air products and chemicals /de/\n", - " air products energy enterprises\n", - " 0\n", - " 0.000024\n", - " 0.000012\n", - " 0.986045\n", - " 1.000000\n", - " 7201 hamilton blvd\n", - " 7201 hamilton blvd\n", - " 2\n", - " 0.000122\n", - " 0.000122\n", - " 9450.378101\n", - " 0.095137\n", - " pa\n", - " pa\n", - " 1\n", - " 0.029409\n", - " 0.029409\n", - " 15.873789\n", - " 1.738226\n", - " allentown\n", - " allentown\n", - " 2\n", - " 0.001003\n", - " 0.001003\n", - " 108.031428\n", - " 6.314158\n", - " AR PRTKTS ANT XMKLS T\n", - " AR PRTKTS ENRJ ENTRPRSS\n", - " 0\n", - " 1585\n", - " 0000002969\n", - " 0000002969\n", - " air products & chemicals inc /de/\n", - " 432\n", - " 353\n", - " \n", - " \n", - " 224473\n", - " 20.054878\n", - " 0.999999\n", - " __splink__input_table_0\n", - " __splink__input_table_1\n", - " 1348\n", - " 376\n", - " aetna life and casualty\n", - " aetna life and casualty\n", - " 2\n", - " 0.000024\n", - " 0.000024\n", - " 652179.111493\n", - " 0.031739\n", - " 151 farmington ave\n", - " 151 farmington ave\n", - " 2\n", - " 0.000110\n", - " 0.000110\n", - " 9450.378101\n", - " 0.105707\n", - " ct\n", - " ct\n", - " 1\n", - " 0.020876\n", - " 0.020876\n", - " 15.873789\n", - " 2.448667\n", - " 
hartford\n", - " hartford\n", - " 2\n", - " 0.001198\n", - " 0.001198\n", - " 108.031428\n", - " 5.283275\n", - " ETN LF ANT KSLT\n", - " ETN LF ANT KSLT\n", - " 0\n", - " 1348\n", - " 0000002648\n", - " 0000002648\n", - " aetna life & casualty co\n", - " 376\n", - " 211\n", - " \n", " \n", "\n", - "

2085 rows × 43 columns

\n", "" ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", - "466134 3.824596 0.934073 __splink__input_table_0 __splink__input_table_1 14692 6293 crane entergy nuclear power marketing 0 0.000012 0.000012 0.986045 1.000000 100 first stamford pl 100 first stamford pl 2 0.000122 0.000122 9450.378101 0.095137 ct ct 1 0.020876 0.020876 15.873789 2.448667 stamford stamford 2 0.003950 0.003950 108.031428 1.602975 KRN ENTRJ NKLR PWR MRKTNK 1 14692 0001944013 0001944013 crane co 6293 55243\n", - "466594 4.620005 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5535 dte electric securitization funding i dte sustainable generation 0 0.000012 0.000012 0.986045 1.000000 one energy plz one energy plz 2 0.000330 0.000330 9450.378101 0.035236 mi mi 1 0.015147 0.015147 15.873789 3.374867 detroit detroit 2 0.001162 0.001162 108.031428 5.450115 TT ELKTRK SKRTSXN FNTNK I TT SSTNBL JNRXN 1 17752 0001876068 0001876068 dte electric securitization funding i llc 5535 64331\n", - "480747 4.620005 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5522 dte electric securitization funding i dte electric 0 0.000012 0.000037 0.986045 1.000000 one energy plz one energy plz 2 0.000330 0.000330 9450.378101 0.035236 mi mi 1 0.015147 0.015147 15.873789 3.374867 detroit detroit 2 0.001162 0.001162 108.031428 
5.450115 TT ELKTRK SKRTSXN FNTNK I TT ELKTRK 0 17752 0001876068 0001876068 dte electric securitization funding i llc 5522 5109\n", - "464506 6.019599 0.984820 __splink__input_table_0 __splink__input_table_1 14051 10935 constellation energy luminace solar rhode island 0 0.000024 0.000024 0.986045 1.000000 1310 pt st 1310 pt st 2 0.000024 0.000024 9450.378101 0.475683 md md 1 0.025130 0.025130 15.873789 2.034167 baltimore baltimore 2 0.003583 0.003583 108.031428 1.767102 KNSTLXN ENRJ LMNS SLR RHT ISLNT 1 14051 0001868275 0001868275 constellation energy corp 10935 62679\n", - "340973 6.201744 0.986596 __splink__input_table_0 __splink__input_table_1 14051 4420 constellation energy constellation newenergy 1 0.000024 0.000024 5704.210475 1.000000 1310 pt st 100 constellation way 0 0.000024 0.000183 0.881657 1.000000 md md 1 0.025130 0.025130 15.873789 2.034167 baltimore baltimore 2 0.003583 0.003583 108.031428 1.767102 KNSTLXN ENRJ KNSTLXN NWNRJ 0 14051 0001868275 0001868275 constellation energy corp 4420 58491\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", - "464642 5.308053 0.975380 __splink__input_table_0 __splink__input_table_1 1585 6561 air products and chemicals /de/ exelon gen extexlaporte 0 0.000024 0.000012 0.986045 1.000000 7201 hamilton blvd 7201 hamilton blvd 2 0.000122 0.000122 9450.378101 0.095137 pa pa 1 0.029409 0.029409 15.873789 1.738226 allentown allentown 2 0.001003 0.001003 108.031428 6.314158 AR PRTKTS ANT XMKLS T EKSLN JN EKSTKSLPRT 1 1585 0000002969 0000002969 air products & chemicals inc /de/ 6561 6081\n", - "227094 20.402617 0.999999 __splink__input_table_0 __splink__input_table_1 1586 430 air products and chemicals air products and chemicals 2 0.000037 0.000037 652179.111493 0.021160 1940 air products blvd 1940 air products blvd 2 0.000049 0.000049 9450.378101 0.237842 pa pa 1 0.029409 0.029409 15.873789 1.738226 allentown allentown 2 0.001003 0.001003 108.031428 6.314158 AR PRTKTS ANT XMKLS AR PRTKTS ANT XMKLS 0 1586 0000002969 0000002969 air products & chemicals, inc. 430 991\n", - "224504 5.308053 0.975380 __splink__input_table_0 __splink__input_table_1 1585 435 air products and chemicals /de/ air products 0 0.000024 0.000037 0.986045 1.000000 7201 hamilton blvd 7201 hamilton blvd 2 0.000122 0.000122 9450.378101 0.095137 pa pa 1 0.029409 0.029409 15.873789 1.738226 allentown allentown 2 0.001003 0.001003 108.031428 6.314158 AR PRTKTS ANT XMKLS T AR PRTKTS 0 1585 0000002969 0000002969 air products & chemicals inc /de/ 435 980\n", - "225982 5.308053 0.975380 __splink__input_table_0 __splink__input_table_1 1585 432 air products and chemicals /de/ air products energy enterprises 0 0.000024 0.000012 0.986045 1.000000 7201 hamilton blvd 7201 hamilton blvd 2 0.000122 0.000122 9450.378101 0.095137 pa pa 1 0.029409 0.029409 15.873789 1.738226 allentown allentown 2 0.001003 0.001003 108.031428 6.314158 AR PRTKTS ANT XMKLS T AR PRTKTS ENRJ ENTRPRSS 0 1585 0000002969 0000002969 air products & chemicals inc /de/ 432 353\n", - "224473 20.054878 0.999999 __splink__input_table_0 
__splink__input_table_1 1348 376 aetna life and casualty aetna life and casualty 2 0.000024 0.000024 652179.111493 0.031739 151 farmington ave 151 farmington ave 2 0.000110 0.000110 9450.378101 0.105707 ct ct 1 0.020876 0.020876 15.873789 2.448667 hartford hartford 2 0.001198 0.001198 108.031428 5.283275 ETN LF ANT KSLT ETN LF ANT KSLT 0 1348 0000002648 0000002648 aetna life & casualty co 376 211\n", - "\n", - "[2085 rows x 43 columns]" + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", + "218797 3.824578 0.934072 __splink__input_table_0 __splink__input_table_1 14692 6293 crane entergy nuclear power marketing 0 0.000012 0.000012 0.986046 1.0 100 first stamford pl 100 first stamford pl 2 0.000122 0.000122 9605.781694 0.093597 ct ct 1 0.020876 0.020876 15.445559 2.516547 stamford stamford 2 0.003950 0.003950 102.014123 1.697510 KRN ENTRJ NKLR PWR MRKTNK 1 14692 0001944013 0001944013 crane co 6293 55243\n", + "220036 4.619987 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5535 dte electric securitization funding i dte sustainable generation 0 0.000012 0.000012 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9605.781694 0.034666 mi mi 1 0.015147 0.015147 15.445559 3.468423 detroit detroit 2 0.001162 0.001162 102.014123 5.771535 TT ELKTRK SKRTSXN FNTNK I TT SSTNBL JNRXN 1 17752 0001876068 
0001876068 dte electric securitization funding i llc 5535 64331\n", + "358152 4.619987 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5522 dte electric securitization funding i dte electric 0 0.000012 0.000037 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9605.781694 0.034666 mi mi 1 0.015147 0.015147 15.445559 3.468423 detroit detroit 2 0.001162 0.001162 102.014123 5.771535 TT ELKTRK SKRTSXN FNTNK I TT ELKTRK 0 17752 0001876068 0001876068 dte electric securitization funding i llc 5522 5109" ] }, - "execution_count": 139, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preds_validation_df[preds_validation_df.match_probability > .9]" + "preds_validation_df[preds_validation_df.match_probability > .9].head(3)" ] }, { "cell_type": "code", - "execution_count": 140, + "execution_count": 47, "id": "11190456-12a9-49df-b863-7a6f674e39eb", "metadata": {}, "outputs": [], @@ -3024,7 +2238,7 @@ }, { "cell_type": "code", - "execution_count": 141, + "execution_count": 48, "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81", "metadata": {}, "outputs": [], @@ -3034,7 +2248,7 @@ }, { "cell_type": "code", - "execution_count": 142, + "execution_count": 49, "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b", "metadata": {}, "outputs": [], @@ -3049,7 +2263,7 @@ }, { "cell_type": "code", - "execution_count": 143, + "execution_count": 50, "id": "4d45f339-7a5b-466a-81f5-c71e425a77df", "metadata": {}, "outputs": [], @@ -3059,7 +2273,7 @@ }, { "cell_type": "code", - "execution_count": 144, + "execution_count": 51, "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5", "metadata": {}, "outputs": [], @@ -3072,7 +2286,7 @@ }, { "cell_type": "code", - "execution_count": 145, + "execution_count": 52, "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5", "metadata": {}, "outputs": [ @@ -3162,7 +2376,7 @@ " 1\n", " 13310.0\n", " 4281.0\n", - " 0.999981\n", + " 0.999982\n", " 1.0\n", " both\n", " 1.0\n", @@ -3176,7 +2390,7 @@ " 1\n", " 17793.0\n", " 
5564.0\n", - " 0.927294\n", + " 0.927293\n", " 2.0\n", " both\n", " 0.0\n", @@ -3216,7 +2430,7 @@ " southern co\n", " southern co services inc\n", " 0\n", - " 50962.0\n", + " 50964.0\n", " 17068.0\n", " 0.007216\n", " 0.0\n", @@ -3274,7 +2488,7 @@ " 0\n", " 21579.0\n", " 6780.0\n", - " 0.986543\n", + " 0.986542\n", " 0.0\n", " both\n", " 1.0\n", @@ -3288,7 +2502,7 @@ " 0\n", " 21579.0\n", " 6763.0\n", - " 0.085467\n", + " 0.085466\n", " 0.0\n", " both\n", " 0.0\n", @@ -3358,7 +2572,7 @@ " 0\n", " 40084.0\n", " 13240.0\n", - " 0.300167\n", + " 0.300165\n", " 0.0\n", " both\n", " 0.0\n", @@ -3372,7 +2586,7 @@ " 1\n", " 40084.0\n", " 13243.0\n", - " 0.999820\n", + " 0.999813\n", " 2.0\n", " both\n", " 1.0\n", @@ -3414,7 +2628,7 @@ " 1\n", " 49303.0\n", " 16270.0\n", - " 0.559074\n", + " 0.559071\n", " 0.0\n", " both\n", " 0.0\n", @@ -3426,7 +2640,7 @@ " american electric power co inc\n", " American Electric Power Inc\n", " 1\n", - " 2926.0\n", + " 2927.0\n", " 793.0\n", " 0.996076\n", " 2.0\n", @@ -3456,30 +2670,30 @@ "0 0000003153 195 alabama power co NaN 1 1701.0 478.0 1.000000 2.0 both 1.0\n", "1 0001868941 58702 fluence energy, inc. 
Fluence 0 21792.0 6889.0 0.016529 0.0 both 0.0\n", "2 0000041091 7140 georgia power co NaN 1 23416.0 7653.0 0.999997 2.0 both 1.0\n", - "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 13310.0 4281.0 0.999981 1.0 both 1.0\n", - "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927294 2.0 both 0.0\n", + "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 13310.0 4281.0 0.999982 1.0 both 1.0\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927293 2.0 both 0.0\n", "5 0000030371 54905 duke energy carolinas, llc Duke Energy Carolinas LLC 1 17790.0 5558.0 0.999987 2.0 both 1.0\n", "6 0000869446 57140 berkshire realty co inc /de Berkshire Wind Power Cooperative Corp 0 7449.0 1712.0 0.001912 0.0 both 0.0\n", - "7 0000092122 18195 southern co southern co services inc 0 50962.0 17068.0 0.007216 0.0 both 0.0\n", + "7 0000092122 18195 southern co southern co services inc 0 50964.0 17068.0 0.007216 0.0 both 0.0\n", "8 0000092122 17650 southern co Southern Power Co 0 50963.0 17089.0 0.034232 0.0 both 0.0\n", "9 0000075488 14328 pacific gas & electric co NaN 1 41598.0 13933.0 0.999948 2.0 both 1.0\n", "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", - "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986543 0.0 both 1.0\n", - "12 0001031296 6458 firstenergy corp First Energy Services 0 21579.0 6763.0 0.085467 0.0 both 0.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986542 0.0 both 1.0\n", + "12 0001031296 6458 firstenergy corp First Energy Services 0 21579.0 6763.0 0.085466 0.0 both 0.0\n", "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", "14 0000100122 24211 tucson electric power co NaN 1 55725.0 18901.0 1.000000 2.0 both 1.0\n", "15 0000096271 18454 tampa electric co NaN 1 53604.0 18180.0 0.991059 2.0 both 1.0\n", 
"16 0000715957 5248 dominion energy, inc NaN 1 17484.0 5386.0 0.999985 2.0 both 1.0\n", - "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 40084.0 13240.0 0.300167 0.0 both 0.0\n", - "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 40084.0 13243.0 0.999820 2.0 both 1.0\n", + "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 40084.0 13240.0 0.300165 0.0 both 0.0\n", + "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 40084.0 13243.0 0.999813 2.0 both 1.0\n", "19 0000788816 13994 oglethorpe power corp NaN 1 40576.0 13515.0 1.000000 2.0 both 1.0\n", "20 0000018675 3266 central maine power co NaN 1 10876.0 3424.0 1.000000 2.0 both 1.0\n", - "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559074 0.0 both 0.0\n", - "22 0000004904 488 american electric power co inc American Electric Power Inc 1 2926.0 793.0 0.996076 2.0 both 1.0\n", + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559071 0.0 both 0.0\n", + "22 0000004904 488 american electric power co inc American Electric Power Inc 1 2927.0 793.0 0.996076 2.0 both 1.0\n", "23 0000715957 5248 dominion energy, inc Dominion Energy Inc. 
1 17484.0 5386.0 0.999985 2.0 both 1.0" ] }, - "execution_count": 145, + "execution_count": 52, "metadata": {}, "output_type": "execute_result" } @@ -3490,7 +2704,7 @@ }, { "cell_type": "code", - "execution_count": 146, + "execution_count": 53, "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea", "metadata": {}, "outputs": [], @@ -3506,7 +2720,7 @@ }, { "cell_type": "code", - "execution_count": 147, + "execution_count": 54, "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe", "metadata": {}, "outputs": [ @@ -3516,7 +2730,7 @@ "(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)" ] }, - "execution_count": 147, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -3527,7 +2741,7 @@ }, { "cell_type": "code", - "execution_count": 148, + "execution_count": 55, "id": "08932be5-b90c-440d-9efb-156cb4d63c93", "metadata": {}, "outputs": [ @@ -3577,7 +2791,7 @@ "Positive 3 13" ] }, - "execution_count": 148, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -3592,7 +2806,7 @@ }, { "cell_type": "code", - "execution_count": 149, + "execution_count": 56, "id": "025c80e9-5055-4eaa-a873-38b910cd7f94", "metadata": {}, "outputs": [], @@ -3602,7 +2816,7 @@ }, { "cell_type": "code", - "execution_count": 150, + "execution_count": 57, "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768", "metadata": {}, "outputs": [ @@ -3650,7 +2864,7 @@ " 1\n", " 17793.0\n", " 5564.0\n", - " 0.927294\n", + " 0.927293\n", " 2.0\n", " both\n", " 0.0\n", @@ -3678,7 +2892,7 @@ " 0\n", " 21579.0\n", " 6780.0\n", - " 0.986543\n", + " 0.986542\n", " 0.0\n", " both\n", " 1.0\n", @@ -3706,7 +2920,7 @@ " 1\n", " 49303.0\n", " 16270.0\n", - " 0.559074\n", + " 0.559071\n", " 0.0\n", " both\n", " 0.0\n", @@ -3717,14 +2931,14 @@ ], "text/plain": [ " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", - "4 0001326160 5416 duke energy corp NaN 1 
17793.0 5564.0 0.927294 2.0 both 0.0\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927293 2.0 both 0.0\n", "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", - "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986543 0.0 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986542 0.0 both 1.0\n", "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", - "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559074 0.0 both 0.0" + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559071 0.0 both 0.0" ] }, - "execution_count": 150, + "execution_count": 57, "metadata": {}, "output_type": "execute_result" } @@ -3851,7 +3065,7 @@ }, { "cell_type": "code", - "execution_count": 153, + "execution_count": 58, "id": "92172e2f-39ba-49e3-8312-98597256ca4f", "metadata": {}, "outputs": [], @@ -3867,17 +3081,17 @@ }, { "cell_type": "code", - "execution_count": 154, + "execution_count": 59, "id": "07ca81ae-1b26-4cd3-ade6-75381028028a", "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "525" + "534" ] }, - "execution_count": 154, + "execution_count": 59, "metadata": {}, "output_type": "execute_result" } @@ -3885,6 +3099,672 @@ "source": [ "len(one_to_one_preds)" ] + }, + { + "cell_type": "markdown", + "id": "c3db3175-7cf3-497c-8f22-e68a6c9c6af2", + "metadata": {}, + "source": [ + "# Add `utility_id_eia` onto the SEC table to create output table" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "id": "361b3e30-e823-4137-9062-6a00eae537fe", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
match_weightmatch_probabilitysource_dataset_lsource_dataset_rrecord_id_lrecord_id_rcompany_name_no_legal_lcompany_name_no_legal_rgamma_company_name_no_legaltf_company_name_no_legal_ltf_company_name_no_legal_rbf_company_name_no_legalbf_tf_adj_company_name_no_legalstreet_address_lstreet_address_rgamma_street_addresstf_street_address_ltf_street_address_rbf_street_addressbf_tf_adj_street_addressstate_lstate_rgamma_statetf_state_ltf_state_rbf_statebf_tf_adj_statecity_lcity_rgamma_citytf_city_ltf_city_rbf_citybf_tf_adj_citycompany_name_mphone_lcompany_name_mphone_rmatch_keyrecord_id_xsec_company_idcentral_index_keycompany_name_rawrecord_id_yutility_id_eia
27476029.2110121.000000__splink__input_table_0__splink__input_table_1205886741fibermarkfibermark20.0000370.000037415263.1332690.033231161 wellington rd161 wellington rd20.0000240.0000249605.7816940.467987vtvt10.0015370.00153715.44555934.184780brattleborobrattleboro20.0000860.000086102.01412378.327981FBRMRKFBRMRK02058800008875910000887591fibermark inc67416309
34041427.8843651.000000__splink__input_table_0__splink__input_table_15156717450st joseph light and powerst joseph light and power20.0000240.000024415263.1332690.049847520 francis st520 francis st20.0000240.0000249605.7816940.467987momo10.0101180.01011815.4455595.192099st josephst joseph20.0000490.000049102.014123137.073967ST JSF LT ANT PWRST JSF LT ANT PWR05156700000862510000086251st joseph light & power co1745017881
16548727.7573381.000000__splink__input_table_0__splink__input_table_15884219906wausau paper millswausau paper mills20.0000240.000024415263.1332690.049847one clarks isone clarks is20.0000240.0000249605.7816940.467987wiwi10.0088400.00884015.4455595.943112wausauwausau20.0000610.000061102.014123109.659173WS PPR MLSWS PPR MLS05884200001050760000105076wausau paper mills co1990620190
24159327.5265141.000000__splink__input_table_0__splink__input_table_1246508047green mountain powergreen mountain power20.0000370.000037415263.1332690.033231163 acorn ln163 acorn ln20.0000370.0000379605.7816940.311992vtvt10.0015370.00153715.44555934.184780colchestercolchester20.0001830.000183102.01412336.553058KRN MNTN PWRKRN MNTN PWR02465000000437040000043704green mountain power corp80477601
16381527.5196061.000000__splink__input_table_0__splink__input_table_13981613109northwestern public servicenorthwestern public service20.0000730.000073415263.1332690.01661633 third st se33 third st se20.0000370.0000379605.7816940.311992sdsd10.0019300.00193015.44555927.217182huronhuron20.0000730.000073102.01412391.382644NR0WSTRN PBLK SRFSNR0WSTRN PBLK SRFS03981600000730880000073088northwestern public service co1310913809
....................................................................................................................................
14834.3371210.952856__splink__input_table_0__splink__input_table_15800417611vistacarestirling energy systems solar three00.0000240.0000370.9860461.0000004800 n scottsdale rd4800 n scottsdale rd20.0001100.0001109605.7816940.103997azaz10.0128720.01287215.4455594.081277scottsdalescottsdale20.0049890.004989102.0141231.343862FSTKRSTRLNK ENRJ SSTMS SLR 0R15800400007870300000787030vistacare, inc.1761156168
2184534.2721570.950792__splink__input_table_0__splink__input_table_1191747605enovisgenon sabine delaware00.0000120.0000120.9860461.0000002711 centerville rd2711 centerville rd20.0000610.0000619605.7816940.187195dede10.0117170.01171715.4455594.483838wilmingtonwilmington20.0103210.010321102.0141230.649640ENFSJNN SBN TLWR11917400014208000001420800enovis corp760556922
10554.2721570.950792__splink__input_table_0__splink__input_table_1165016368aisystemsshannon wind00.0000240.0000240.9860461.0000002711 centerville rd2711 centerville rd20.0000610.0000619605.7816940.187195dede10.0117170.01171715.4455594.483838wilmingtonwilmington20.0103210.010321102.0141230.649640ASSTMSXNN WNT1165000013287690001328769aisystems, inc.1636858872
72164.2721570.950792__splink__input_table_0__splink__input_table_13240314089lease investment flight trustpasadena statutory trust00.0000120.0000120.9860461.0000001100 north market st1100 north market st20.0000610.0000619605.7816940.187195dede10.0117170.01171715.4455594.483838wilmingtonwilmington20.0103210.010321102.0141230.649640LS INFSTMNT FLT TRSTPSTN STTTR TRST13240300011583890001158389lease investment flight trust1408961235
61134.2721570.950792__splink__input_table_0__splink__input_table_1162616195airplanes us trustse solar trust v c00.0000120.0000120.9860461.0000001100 north market st1100 north market st20.0000610.0000619605.7816940.187195dede10.0117170.01171715.4455594.483838wilmingtonwilmington20.0103210.010321102.0141230.649640ARPLNS US TRSTS SLR TRST F K1162600010045400001004540airplanes us trust1619556900
\n", + "

534 rows × 43 columns

\n", + "
" + ], + "text/plain": [ + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", + "274760 29.211012 1.000000 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 415263.133269 0.033231 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9605.781694 0.467987 vt vt 1 0.001537 0.001537 15.445559 34.184780 brattleboro brattleboro 2 0.000086 0.000086 102.014123 78.327981 FBRMRK FBRMRK 0 20588 0000887591 0000887591 fibermark inc 6741 6309\n", + "340414 27.884365 1.000000 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 415263.133269 0.049847 520 francis st 520 francis st 2 0.000024 0.000024 9605.781694 0.467987 mo mo 1 0.010118 0.010118 15.445559 5.192099 st joseph st joseph 2 0.000049 0.000049 102.014123 137.073967 ST JSF LT ANT PWR ST JSF LT ANT PWR 0 51567 0000086251 0000086251 st joseph light & power co 17450 17881\n", + "165487 27.757338 1.000000 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 415263.133269 0.049847 one clarks is one clarks is 2 0.000024 0.000024 9605.781694 0.467987 wi wi 1 0.008840 0.008840 15.445559 5.943112 wausau wausau 2 0.000061 0.000061 102.014123 109.659173 WS PPR MLS WS PPR MLS 0 58842 0000105076 
0000105076 wausau paper mills co 19906 20190\n", + "241593 27.526514 1.000000 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 415263.133269 0.033231 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9605.781694 0.311992 vt vt 1 0.001537 0.001537 15.445559 34.184780 colchester colchester 2 0.000183 0.000183 102.014123 36.553058 KRN MNTN PWR KRN MNTN PWR 0 24650 0000043704 0000043704 green mountain power corp 8047 7601\n", + "163815 27.519606 1.000000 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 415263.133269 0.016616 33 third st se 33 third st se 2 0.000037 0.000037 9605.781694 0.311992 sd sd 1 0.001930 0.001930 15.445559 27.217182 huron huron 2 0.000073 0.000073 102.014123 91.382644 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0 39816 0000073088 0000073088 northwestern public service co 13109 13809\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "1483 4.337121 0.952856 __splink__input_table_0 __splink__input_table_1 58004 17611 vistacare stirling energy systems solar three 0 0.000024 0.000037 0.986046 1.000000 4800 n scottsdale rd 4800 n scottsdale rd 2 0.000110 0.000110 9605.781694 0.103997 az az 1 0.012872 0.012872 15.445559 4.081277 scottsdale scottsdale 2 0.004989 0.004989 102.014123 1.343862 FSTKR STRLNK ENRJ SSTMS SLR 0R 1 58004 0000787030 0000787030 vistacare, inc. 
17611 56168\n", + "218453 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 19174 7605 enovis genon sabine delaware 0 0.000012 0.000012 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ENFS JNN SBN TLWR 1 19174 0001420800 0001420800 enovis corp 7605 56922\n", + "1055 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 1650 16368 aisystems shannon wind 0 0.000024 0.000024 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ASSTMS XNN WNT 1 1650 0001328769 0001328769 aisystems, inc. 16368 58872\n", + "7216 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 32403 14089 lease investment flight trust pasadena statutory trust 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 LS INFSTMNT FLT TRST PSTN STTTR TRST 1 32403 0001158389 0001158389 lease investment flight trust 14089 61235\n", + "6113 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 1626 16195 airplanes us trust se solar trust v c 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ARPLNS US TRST S SLR TRST F K 1 1626 0001004540 0001004540 airplanes us trust 16195 56900\n", + "\n", + "[534 rows x 43 columns]" + ] + }, + "execution_count": 60, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "one_to_one_preds" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + 
"id": "1d3e41bd-f92a-4f77-a0a7-0bd24f7ea70c", + "metadata": {}, + "outputs": [], + "source": [ + "out_df = sec_df.merge(\n", + " one_to_one_preds[[\"sec_company_id\", \"utility_id_eia\"]],\n", + " how=\"left\",\n", + " on=\"sec_company_id\",\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 66, + "id": "cce2b383-48b3-4efd-977a-0c734b0e3ec2", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "utility_id_eia\n", + "True 59895\n", + "False 1131\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 66, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "out_df.utility_id_eia.isnull().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "1cf0be2e-b1ef-4eb1-a07a-28e977c40252", + "metadata": {}, + "outputs": [], + "source": [ + "len(one_to_one_preds" + ] } ], "metadata": { diff --git a/notebooks/20-kl-validate-sec-output-table.ipynb b/notebooks/20-kl-validate-sec-output-table.ipynb index d6045f9..061a227 100644 --- a/notebooks/20-kl-validate-sec-output-table.ipynb +++ b/notebooks/20-kl-validate-sec-output-table.ipynb @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 3, "id": "c1795acc-8005-4b6d-be4d-27c722b634f1", "metadata": {}, "outputs": [], @@ -58,6 +58,284 @@ "ex21_df = pd.read_pickle(\"/Users/katielamb/CatalystCoop/dagster_home/storage/transformed_ex21_subsidiary_table\")" ] }, + { + "cell_type": "code", + "execution_count": 6, + "id": "291ce873-4971-4e03-985a-65dbdd8b0850", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
sec_company_idcompany_name_rawlocation_of_incown_perfilenamereport_datereport_yearcompany_namecompany_name_no_legalcompany_name_mphoneparent_company_cik
00000000020_colormax limited_united kingdomcolormax limitedunited kingdomNaNedgar/data/20/0000893220-06-000650.txt2006-03-232006colormax limitedcolormaxKLRMKS0000000020
10000000020_gundlach equipment corporation_dela...gundlach equipment corporationdelawareNaNedgar/data/20/0000950123-10-024631.txt2010-03-152010gundlach equipment corporationgundlach equipmentKNTLX EKPMNT0000000020
20000000020_jeffrey rader ab_swedenjeffrey rader abswedenNaNedgar/data/20/0000950123-10-024631.txt2010-03-152010jeffrey rader abjeffrey rader abJFR RTR AB0000000020
30000000020_jeffrey rader canada company_canadajeffrey rader canada companycanadaNaNedgar/data/20/0000950123-10-024631.txt2010-03-152010jeffrey rader canada companyjeffrey rader canadaJFR RTR KNT0000000020
40000000020_jeffrey rader corporation_delawarejeffrey rader corporationdelawareNaNedgar/data/20/0000950123-10-024631.txt2010-03-152010jeffrey rader corporationjeffrey raderJFR RTR0000000020
....................................
10559820001967649_vestis supply chain limited liabili...vestis (supply chain), llcdelawareNaNedgar/data/1967649/0001967649-23-000025.txt2023-12-212023vestis supply chain limited liability companyvestis supply chainFSTS SPL XN0001967649
10559830001967649_vestis syracuse limited liability c...vestis (syracuse), llcdelawareNaNedgar/data/1967649/0001967649-23-000025.txt2023-12-212023vestis syracuse limited liability companyvestis syracuseFSTS SRKS0001967649
10559840001967649_vestis texas limited liability comp...vestis (texas), llcdelawareNaNedgar/data/1967649/0001967649-23-000025.txt2023-12-212023vestis texas limited liability companyvestis texasFSTS TKSS0001967649
10559850001967649_vestis west adams limited liability...vestis (west adams), llcdelawareNaNedgar/data/1967649/0001967649-23-000025.txt2023-12-212023vestis west adams limited liability companyvestis west adamsFSTS WST ATMS0001967649
10559860001978811_gouverneur savings and loan associa...gouverneur savings and loan associationnew york100.0edgar/data/1978811/0001558370-23-020009.txt2023-12-262023gouverneur savings and loan associationgouverneur savings and loanKFRNR SFNKS ANT LN0001978811
\n", + "

1055987 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " sec_company_id \\\n", + "0 0000000020_colormax limited_united kingdom \n", + "1 0000000020_gundlach equipment corporation_dela... \n", + "2 0000000020_jeffrey rader ab_sweden \n", + "3 0000000020_jeffrey rader canada company_canada \n", + "4 0000000020_jeffrey rader corporation_delaware \n", + "... ... \n", + "1055982 0001967649_vestis supply chain limited liabili... \n", + "1055983 0001967649_vestis syracuse limited liability c... \n", + "1055984 0001967649_vestis texas limited liability comp... \n", + "1055985 0001967649_vestis west adams limited liability... \n", + "1055986 0001978811_gouverneur savings and loan associa... \n", + "\n", + " company_name_raw location_of_inc own_per \\\n", + "0 colormax limited united kingdom NaN \n", + "1 gundlach equipment corporation delaware NaN \n", + "2 jeffrey rader ab sweden NaN \n", + "3 jeffrey rader canada company canada NaN \n", + "4 jeffrey rader corporation delaware NaN \n", + "... ... ... ... \n", + "1055982 vestis (supply chain), llc delaware NaN \n", + "1055983 vestis (syracuse), llc delaware NaN \n", + "1055984 vestis (texas), llc delaware NaN \n", + "1055985 vestis (west adams), llc delaware NaN \n", + "1055986 gouverneur savings and loan association new york 100.0 \n", + "\n", + " filename report_date report_year \\\n", + "0 edgar/data/20/0000893220-06-000650.txt 2006-03-23 2006 \n", + "1 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n", + "2 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n", + "3 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n", + "4 edgar/data/20/0000950123-10-024631.txt 2010-03-15 2010 \n", + "... ... ... ... 
\n", + "1055982 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n", + "1055983 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n", + "1055984 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n", + "1055985 edgar/data/1967649/0001967649-23-000025.txt 2023-12-21 2023 \n", + "1055986 edgar/data/1978811/0001558370-23-020009.txt 2023-12-26 2023 \n", + "\n", + " company_name \\\n", + "0 colormax limited \n", + "1 gundlach equipment corporation \n", + "2 jeffrey rader ab \n", + "3 jeffrey rader canada company \n", + "4 jeffrey rader corporation \n", + "... ... \n", + "1055982 vestis supply chain limited liability company \n", + "1055983 vestis syracuse limited liability company \n", + "1055984 vestis texas limited liability company \n", + "1055985 vestis west adams limited liability company \n", + "1055986 gouverneur savings and loan association \n", + "\n", + " company_name_no_legal company_name_mphone parent_company_cik \n", + "0 colormax KLRMKS 0000000020 \n", + "1 gundlach equipment KNTLX EKPMNT 0000000020 \n", + "2 jeffrey rader ab JFR RTR AB 0000000020 \n", + "3 jeffrey rader canada JFR RTR KNT 0000000020 \n", + "4 jeffrey rader JFR RTR 0000000020 \n", + "... ... ... ... 
\n", + "1055982 vestis supply chain FSTS SPL XN 0001967649 \n", + "1055983 vestis syracuse FSTS SRKS 0001967649 \n", + "1055984 vestis texas FSTS TKSS 0001967649 \n", + "1055985 vestis west adams FSTS WST ATMS 0001967649 \n", + "1055986 gouverneur savings and loan KFRNR SFNKS ANT LN 0001978811 \n", + "\n", + "[1055987 rows x 11 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "ex21_df" + ] + }, { "cell_type": "code", "execution_count": 25, diff --git a/src/mozilla_sec_eia/models/sec10k/__init__.py b/src/mozilla_sec_eia/models/sec10k/__init__.py index 4fd2f14..cec954d 100644 --- a/src/mozilla_sec_eia/models/sec10k/__init__.py +++ b/src/mozilla_sec_eia/models/sec10k/__init__.py @@ -36,7 +36,6 @@ ex21_data_assets = load_assets_from_modules([ex_21.data]) shared_assets = load_assets_from_modules([extract]) - basic_10k_production_job = model_jobs.create_production_model_job( "basic_10k_extraction", basic_10k.production_assets, From 26b1a72c66f1642f58564583fae9ba7a3ed81673 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Thu, 19 Dec 2024 11:22:23 -0800 Subject: [PATCH 158/161] add markdown cell note --- notebooks/18-kl-splink-sec-eia.ipynb | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb index a105e3b..81f3513 100644 --- a/notebooks/18-kl-splink-sec-eia.ipynb +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -2226,6 +2226,14 @@ "preds_validation_df[preds_validation_df.match_probability > .9].head(3)" ] }, + { + "cell_type": "markdown", + "id": "07fbec17-cef2-4b9c-a005-1623c65c5e20", + "metadata": {}, + "source": [ + "Figure out what to do about this validation CSV, maybe it should be part of package data? It's not a very big sample size and it's imperfect so the metrics gained from it are should be taken with a grain of salt." 
+ ] + }, { "cell_type": "code", "execution_count": 47, From 70427a01f08d1eb5704c4da8ec3cefab0b5d4277 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Thu, 19 Dec 2024 11:23:14 -0800 Subject: [PATCH 159/161] make asset not multi asset --- .../models/sec_eia_record_linkage/transform_eia_input.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py index c832cf0..c8f311c 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from dagster import AssetOut, multi_asset +from dagster import AssetOut, asset from mozilla_sec_eia.library.record_linkage_utils import ( expand_street_name_abbreviations, @@ -74,7 +74,7 @@ def harvest_eia861_utilities(): return eia861_df -@multi_asset( +@asset( outs={ "core_eia__parents_and_subsidiaries": AssetOut( io_manager_key="pandas_parquet_io_manager" From 136709dc99fda13c0c46067eea97bf8f75ac24d9 Mon Sep 17 00:00:00 2001 From: zschira Date: Tue, 7 Jan 2025 14:57:58 -0500 Subject: [PATCH 160/161] Fix asset keywords --- .../sec_eia_record_linkage/transform_eia_input.py | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py index c8f311c..b12ac71 100644 --- a/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py +++ b/src/mozilla_sec_eia/models/sec_eia_record_linkage/transform_eia_input.py @@ -2,7 +2,7 @@ import numpy as np import pandas as pd -from dagster import AssetOut, asset +from dagster import asset from mozilla_sec_eia.library.record_linkage_utils import ( expand_street_name_abbreviations, @@ -75,12 +75,8 @@ def 
harvest_eia861_utilities(): @asset( - outs={ - "core_eia__parents_and_subsidiaries": AssetOut( - io_manager_key="pandas_parquet_io_manager" - ) - # TODO: allow year partitions? - } + name="core_eia__parents_and_subsidiaries", + io_manager_key="pandas_parquet_io_manager", ) # TODO: add Dagster asset inputs for PUDL inputs instead of reading from AWS? def eia_rl_input_table(): From 4da70e06e7b83f432e3324a13b87021a1c3661b3 Mon Sep 17 00:00:00 2001 From: Katie Lamb Date: Wed, 15 Jan 2025 20:53:49 -0800 Subject: [PATCH 161/161] splink notebook update --- notebooks/18-kl-splink-sec-eia.ipynb | 1050 +++++++++++++------------- 1 file changed, 506 insertions(+), 544 deletions(-) diff --git a/notebooks/18-kl-splink-sec-eia.ipynb b/notebooks/18-kl-splink-sec-eia.ipynb index 81f3513..88d3351 100644 --- a/notebooks/18-kl-splink-sec-eia.ipynb +++ b/notebooks/18-kl-splink-sec-eia.ipynb @@ -199,7 +199,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "id": "856c14d8-3250-4650-a2db-3808b4718f19", "metadata": {}, "outputs": [ @@ -209,7 +209,7 @@ "False" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } @@ -229,7 +229,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 12, "id": "842fa02e-5202-445c-b728-72bce42e740d", "metadata": {}, "outputs": [ @@ -240,7 +240,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 19, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } @@ -251,7 +251,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 13, "id": "b53e6244-f0ca-4256-bc09-9c3264675389", "metadata": {}, "outputs": [ @@ -262,7 +262,7 @@ "Name: count, dtype: int64" ] }, - "execution_count": 20, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } @@ -273,7 +273,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 14, "id": "e4d54448-0c2f-452b-931c-ff79a5cc3669", "metadata": {}, 
"outputs": [], @@ -300,7 +300,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 15, "id": "de894ad0-0b72-4486-b737-c3bfb95f8f05", "metadata": {}, "outputs": [], @@ -310,7 +310,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 16, "id": "4bab1568-6a55-427c-9a78-e44db8b0584d", "metadata": {}, "outputs": [ @@ -319,23 +319,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 24, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -577,7 +577,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 19, "id": "d9cbc91b-b061-461b-b1f2-83036c70d7c7", "metadata": {}, "outputs": [ @@ -586,23 +586,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 25, + "execution_count": 19, "metadata": {}, "output_type": "execute_result" } @@ -674,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 20, "id": "1e6d3d24-f8ce-4118-9d31-d4f237d28237", "metadata": {}, "outputs": [ @@ -688,7 +688,7 @@ " 'link_type_join_condition': 'where l.\"source_dataset\" || \\'-__-\\' || l.\"record_id\" < r.\"source_dataset\" || \\'-__-\\' || r.\"record_id\" and l.\"source_dataset\" != r.\"source_dataset\"'}" ] }, - "execution_count": 26, + "execution_count": 20, "metadata": {}, "output_type": "execute_result" } @@ -708,7 +708,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 21, "id": "c37c117a-0fa2-4b8a-9fae-da3569895ca3", "metadata": {}, "outputs": [ @@ -772,7 +772,7 @@ "2 FRST 816 36 29376" ] }, - "execution_count": 27, + "execution_count": 21, "metadata": {}, "output_type": "execute_result" } @@ -791,7 +791,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 22, "id": "4e1a9844-5d98-4cac-a083-eef134f083ce", "metadata": {}, "outputs": [ @@ -800,23 +800,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.VConcatChart(...)" ] }, - "execution_count": 38, + "execution_count": 32, "metadata": {}, "output_type": "execute_result" } @@ -1281,7 +1281,7 @@ }, { "cell_type": "code", - "execution_count": 39, + "execution_count": 33, "id": "f365f59e-e4d0-44f3-a1fb-62e0d63d7ba3", "metadata": {}, "outputs": [ @@ -1290,23 +1290,23 @@ "text/html": [ "\n", "\n", - "
\n", + "
\n", "" ], "text/plain": [ "alt.HConcatChart(...)" ] }, - "execution_count": 39, + "execution_count": 33, "metadata": {}, "output_type": "execute_result" } @@ -1391,7 +1391,7 @@ }, { "cell_type": "code", - "execution_count": 40, + "execution_count": 34, "id": "94e96441-89b6-4516-aa6a-4d1593ce03be", "metadata": {}, "outputs": [ @@ -1399,7 +1399,7 @@ "name": "stderr", "output_type": "stream", "text": [ - "Blocking time: 0.16 seconds\n", + "Blocking time: 0.14 seconds\n", "Predict time: 0.26 seconds\n" ] } @@ -1410,7 +1410,7 @@ }, { "cell_type": "code", - "execution_count": 41, + "execution_count": 35, "id": "722effbc-24e8-45ca-aa64-8cdcd75846e0", "metadata": {}, "outputs": [], @@ -1420,7 +1420,7 @@ }, { "cell_type": "code", - "execution_count": 42, + "execution_count": 36, "id": "21b19a11-15c6-46ab-bd73-7c4752f7e25e", "metadata": {}, "outputs": [ @@ -1487,8 +1487,8 @@ " \n", " \n", " 295287\n", - " -22.967975\n", - " 1.218850e-07\n", + " -22.970354\n", + " 1.216843e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", " 56230\n", @@ -1505,184 +1505,184 @@ " 0\n", " 0.000049\n", " 0.000049\n", - " 0.881658\n", + " 0.881656\n", " 1.000000\n", " ne\n", " mo\n", " 0\n", " 0.006455\n", " 0.010118\n", - " 0.199012\n", + " 0.198718\n", " 1.000000\n", " omaha\n", " st louis\n", " 0\n", " 0.003448\n", " 0.002764\n", - " 0.296714\n", + " 0.296663\n", " 1.000000\n", " UNN PSFK\n", " UNN ELKTRK\n", " 0\n", " \n", " \n", - " 384509\n", - " -22.967975\n", - " 1.218850e-07\n", + " 307206\n", + " -22.970354\n", + " 1.216843e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 56484\n", - " 19138\n", - " united states lime and minerals\n", - " united water conservation\n", + " 29764\n", + " 9337\n", + " international lease finance\n", + " international paper riegel\n", " 0\n", " 0.000037\n", - " 0.000024\n", + " 0.000012\n", " 0.986046\n", " 1.000000\n", - " 5429 lbj fwy\n", - " 1701 north lombard st\n", + " 1999 ave of the stars\n", + " 6400 poplar 
ave\n", " 0\n", - " 0.000024\n", - " 0.000012\n", - " 0.881658\n", + " 0.000110\n", + " 0.000061\n", + " 0.881656\n", " 1.000000\n", - " tx\n", " ca\n", + " tn\n", " 0\n", - " 0.079841\n", " 0.157960\n", - " 0.199012\n", + " 0.010622\n", + " 0.198718\n", " 1.000000\n", - " dallas\n", - " oxnard\n", + " los angeles\n", + " memphis\n", " 0\n", - " 0.013855\n", - " 0.000257\n", - " 0.296714\n", + " 0.008107\n", + " 0.001357\n", + " 0.296663\n", " 1.000000\n", - " UNTT STTS LM ANT MNRLS\n", - " UNTT WTR KNSRFXN\n", + " INTRNXNL LS FNNS\n", + " INTRNXNL PPR RJL\n", " 0\n", " \n", " \n", - " 384504\n", - " -22.967975\n", - " 1.218850e-07\n", + " 307205\n", + " -22.970354\n", + " 1.216843e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 56436\n", - " 19138\n", - " united rentals\n", - " united water conservation\n", + " 29818\n", + " 9337\n", + " international speedway\n", + " international paper riegel\n", " 0\n", - " 0.000024\n", - " 0.000024\n", + " 0.000037\n", + " 0.000012\n", " 0.986046\n", " 1.000000\n", - " 100 first stamford pl\n", - " 1701 north lombard st\n", + " 1801 w international speedway blvd\n", + " 6400 poplar ave\n", " 0\n", - " 0.000122\n", " 0.000012\n", - " 0.881658\n", + " 0.000061\n", + " 0.881656\n", " 1.000000\n", - " ct\n", - " ca\n", + " fl\n", + " tn\n", " 0\n", - " 0.020876\n", - " 0.157960\n", - " 0.199012\n", + " 0.048477\n", + " 0.010622\n", + " 0.198718\n", " 1.000000\n", - " stamford\n", - " oxnard\n", + " daytona beach\n", + " memphis\n", " 0\n", - " 0.003950\n", - " 0.000257\n", - " 0.296714\n", + " 0.000245\n", + " 0.001357\n", + " 0.296663\n", " 1.000000\n", - " UNTT RNTLS\n", - " UNTT WTR KNSRFXN\n", + " INTRNXNL SPTW\n", + " INTRNXNL PPR RJL\n", " 0\n", " \n", " \n", - " 384503\n", - " -22.967975\n", - " 1.218850e-07\n", + " 307204\n", + " -22.970354\n", + " 1.216843e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 56424\n", - " 19138\n", - " united parcel service\n", - " united water 
conservation\n", + " 59433\n", + " 20092\n", + " west penn funding\n", + " west line solar\n", " 0\n", " 0.000024\n", - " 0.000024\n", + " 0.000012\n", " 0.986046\n", " 1.000000\n", - " 55 glenlake pkwy ne\n", - " 1701 north lombard st\n", + " 2325b2 renaissance dr\n", + " 2180 south 1300 east\n", " 0\n", " 0.000012\n", - " 0.000012\n", - " 0.881658\n", + " 0.000110\n", + " 0.881656\n", " 1.000000\n", - " ga\n", - " ca\n", + " nv\n", + " ut\n", " 0\n", - " 0.018626\n", - " 0.157960\n", - " 0.199012\n", + " 0.020458\n", + " 0.010549\n", + " 0.198718\n", " 1.000000\n", - " atlanta\n", - " oxnard\n", + " las vegas\n", + " salt lake city\n", " 0\n", - " 0.008462\n", - " 0.000257\n", - " 0.296714\n", + " 0.010724\n", + " 0.005772\n", + " 0.296663\n", " 1.000000\n", - " UNTT PRSL SRFS\n", - " UNTT WTR KNSRFXN\n", + " WST PN FNTNK\n", + " WST LN SLR\n", " 0\n", " \n", " \n", - " 384502\n", - " -22.967975\n", - " 1.218850e-07\n", + " 307203\n", + " -22.970354\n", + " 1.216843e-07\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 56312\n", - " 19138\n", - " united bancorp /oh/\n", - " united water conservation\n", + " 39648\n", + " 12908\n", + " north country financial\n", + " north american energy services\n", " 0\n", " 0.000024\n", - " 0.000024\n", + " 0.000110\n", " 0.986046\n", " 1.000000\n", - " 201 south fourth st\n", - " 1701 north lombard st\n", + " 3530 north country dr\n", + " 1070 erie ave\n", " 0\n", - " 0.000012\n", - " 0.000012\n", - " 0.881658\n", + " 0.000024\n", + " 0.000037\n", + " 0.881656\n", " 1.000000\n", - " oh\n", - " ca\n", + " mi\n", + " ny\n", " 0\n", - " 0.016991\n", - " 0.157960\n", - " 0.199012\n", + " 0.015147\n", + " 0.120228\n", + " 0.198718\n", " 1.000000\n", - " martins ferry\n", - " oxnard\n", + " traverse city\n", + " north tonawanda\n", " 0\n", - " 0.000024\n", - " 0.000257\n", - " 0.296714\n", + " 0.000269\n", + " 0.000049\n", + " 0.296663\n", " 1.000000\n", - " UNTT BNKRP\n", - " UNTT WTR KNSRFXN\n", + " NR0 KNTR 
FNNXL\n", + " NR0 AMRKN ENRJ SRFSS\n", " 0\n", " \n", " \n", @@ -1727,7 +1727,7 @@ " \n", " \n", " 163815\n", - " 27.519606\n", + " 27.519613\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -1738,36 +1738,36 @@ " 2\n", " 0.000073\n", " 0.000073\n", - " 415263.133269\n", - " 0.016616\n", + " 477874.511191\n", + " 0.014439\n", " 33 third st se\n", " 33 third st se\n", " 2\n", " 0.000037\n", " 0.000037\n", - " 9605.781694\n", - " 0.311992\n", + " 9888.266177\n", + " 0.303079\n", " sd\n", " sd\n", " 1\n", " 0.001930\n", " 0.001930\n", - " 15.445559\n", - " 27.217182\n", + " 15.866015\n", + " 26.495963\n", " huron\n", " huron\n", " 2\n", " 0.000073\n", " 0.000073\n", - " 102.014123\n", - " 91.382644\n", + " 103.554689\n", + " 90.023441\n", " NR0WSTRN PBLK SRFS\n", " NR0WSTRN PBLK SRFS\n", " 0\n", " \n", " \n", " 241593\n", - " 27.526514\n", + " 27.526521\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -1778,36 +1778,36 @@ " 2\n", " 0.000037\n", " 0.000037\n", - " 415263.133269\n", - " 0.033231\n", + " 477874.511191\n", + " 0.028877\n", " 163 acorn ln\n", " 163 acorn ln\n", " 2\n", " 0.000037\n", " 0.000037\n", - " 9605.781694\n", - " 0.311992\n", + " 9888.266177\n", + " 0.303079\n", " vt\n", " vt\n", " 1\n", " 0.001537\n", " 0.001537\n", - " 15.445559\n", - " 34.184780\n", + " 15.866015\n", + " 33.278930\n", " colchester\n", " colchester\n", " 2\n", " 0.000183\n", " 0.000183\n", - " 102.014123\n", - " 36.553058\n", + " 103.554689\n", + " 36.009376\n", " KRN MNTN PWR\n", " KRN MNTN PWR\n", " 0\n", " \n", " \n", " 165487\n", - " 27.757338\n", + " 27.757345\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -1818,36 +1818,36 @@ " 2\n", " 0.000024\n", " 0.000024\n", - " 415263.133269\n", - " 0.049847\n", + " 477874.511191\n", + " 0.043316\n", " one clarks is\n", " one clarks is\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9605.781694\n", - " 0.467987\n", + " 
9888.266177\n", + " 0.454618\n", " wi\n", " wi\n", " 1\n", " 0.008840\n", " 0.008840\n", - " 15.445559\n", - " 5.943112\n", + " 15.866015\n", + " 5.785628\n", " wausau\n", " wausau\n", " 2\n", " 0.000061\n", " 0.000061\n", - " 102.014123\n", - " 109.659173\n", + " 103.554689\n", + " 108.028129\n", " WS PPR MLS\n", " WS PPR MLS\n", " 0\n", " \n", " \n", " 340414\n", - " 27.884365\n", + " 27.884373\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -1858,36 +1858,36 @@ " 2\n", " 0.000024\n", " 0.000024\n", - " 415263.133269\n", - " 0.049847\n", + " 477874.511191\n", + " 0.043316\n", " 520 francis st\n", " 520 francis st\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9605.781694\n", - " 0.467987\n", + " 9888.266177\n", + " 0.454618\n", " mo\n", " mo\n", " 1\n", " 0.010118\n", " 0.010118\n", - " 15.445559\n", - " 5.192099\n", + " 15.866015\n", + " 5.054515\n", " st joseph\n", " st joseph\n", " 2\n", " 0.000049\n", " 0.000049\n", - " 102.014123\n", - " 137.073967\n", + " 103.554689\n", + " 135.035162\n", " ST JSF LT ANT PWR\n", " ST JSF LT ANT PWR\n", " 0\n", " \n", " \n", " 274760\n", - " 29.211012\n", + " 29.211020\n", " 1.000000e+00\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -1898,29 +1898,29 @@ " 2\n", " 0.000037\n", " 0.000037\n", - " 415263.133269\n", - " 0.033231\n", + " 477874.511191\n", + " 0.028877\n", " 161 wellington rd\n", " 161 wellington rd\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9605.781694\n", - " 0.467987\n", + " 9888.266177\n", + " 0.454618\n", " vt\n", " vt\n", " 1\n", " 0.001537\n", " 0.001537\n", - " 15.445559\n", - " 34.184780\n", + " 15.866015\n", + " 33.278930\n", " brattleboro\n", " brattleboro\n", " 2\n", " 0.000086\n", " 0.000086\n", - " 102.014123\n", - " 78.327981\n", + " 103.554689\n", + " 77.162949\n", " FBRMRK\n", " FBRMRK\n", " 0\n", @@ -1931,23 +1931,23 @@ "" ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r 
company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", - "295287 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56230 19078 union pacific union electric 0 0.000049 0.000098 0.986046 1.000000 1416 dodge st mc 1400 0 0.000049 0.000049 0.881658 1.000000 ne mo 0 0.006455 0.010118 0.199012 1.000000 omaha st louis 0 0.003448 0.002764 0.296714 1.000000 UNN PSFK UNN ELKTRK 0\n", - "384509 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56484 19138 united states lime and minerals united water conservation 0 0.000037 0.000024 0.986046 1.000000 5429 lbj fwy 1701 north lombard st 0 0.000024 0.000012 0.881658 1.000000 tx ca 0 0.079841 0.157960 0.199012 1.000000 dallas oxnard 0 0.013855 0.000257 0.296714 1.000000 UNTT STTS LM ANT MNRLS UNTT WTR KNSRFXN 0\n", - "384504 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56436 19138 united rentals united water conservation 0 0.000024 0.000024 0.986046 1.000000 100 first stamford pl 1701 north lombard st 0 0.000122 0.000012 0.881658 1.000000 ct ca 0 0.020876 0.157960 0.199012 1.000000 stamford oxnard 0 0.003950 0.000257 0.296714 1.000000 UNTT RNTLS UNTT WTR KNSRFXN 0\n", - "384503 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56424 19138 united parcel service united water conservation 0 0.000024 0.000024 0.986046 1.000000 55 glenlake pkwy ne 1701 north lombard st 0 0.000012 0.000012 0.881658 1.000000 ga ca 0 0.018626 0.157960 0.199012 1.000000 atlanta oxnard 0 0.008462 
0.000257 0.296714 1.000000 UNTT PRSL SRFS UNTT WTR KNSRFXN 0\n", - "384502 -22.967975 1.218850e-07 __splink__input_table_0 __splink__input_table_1 56312 19138 united bancorp /oh/ united water conservation 0 0.000024 0.000024 0.986046 1.000000 201 south fourth st 1701 north lombard st 0 0.000012 0.000012 0.881658 1.000000 oh ca 0 0.016991 0.157960 0.199012 1.000000 martins ferry oxnard 0 0.000024 0.000257 0.296714 1.000000 UNTT BNKRP UNTT WTR KNSRFXN 0\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "163815 27.519606 1.000000e+00 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 415263.133269 0.016616 33 third st se 33 third st se 2 0.000037 0.000037 9605.781694 0.311992 sd sd 1 0.001930 0.001930 15.445559 27.217182 huron huron 2 0.000073 0.000073 102.014123 91.382644 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0\n", - "241593 27.526514 1.000000e+00 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 415263.133269 0.033231 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9605.781694 0.311992 vt vt 1 0.001537 0.001537 15.445559 34.184780 colchester colchester 2 0.000183 0.000183 102.014123 36.553058 KRN MNTN PWR KRN MNTN PWR 0\n", - "165487 27.757338 1.000000e+00 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 415263.133269 0.049847 one clarks is one clarks is 2 0.000024 0.000024 9605.781694 0.467987 wi wi 1 0.008840 0.008840 15.445559 5.943112 wausau wausau 2 0.000061 0.000061 102.014123 109.659173 WS PPR MLS WS PPR MLS 0\n", - "340414 27.884365 1.000000e+00 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 415263.133269 0.049847 520 francis st 520 francis st 
2 0.000024 0.000024 9605.781694 0.467987 mo mo 1 0.010118 0.010118 15.445559 5.192099 st joseph st joseph 2 0.000049 0.000049 102.014123 137.073967 ST JSF LT ANT PWR ST JSF LT ANT PWR 0\n", - "274760 29.211012 1.000000e+00 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 415263.133269 0.033231 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9605.781694 0.467987 vt vt 1 0.001537 0.001537 15.445559 34.184780 brattleboro brattleboro 2 0.000086 0.000086 102.014123 78.327981 FBRMRK FBRMRK 0\n", + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key\n", + "295287 -22.970354 1.216843e-07 __splink__input_table_0 __splink__input_table_1 56230 19078 union pacific union electric 0 0.000049 0.000098 0.986046 1.000000 1416 dodge st mc 1400 0 0.000049 0.000049 0.881656 1.000000 ne mo 0 0.006455 0.010118 0.198718 1.000000 omaha st louis 0 0.003448 0.002764 0.296663 1.000000 UNN PSFK UNN ELKTRK 0\n", + "307206 -22.970354 1.216843e-07 __splink__input_table_0 __splink__input_table_1 29764 9337 international lease finance international paper riegel 0 0.000037 0.000012 0.986046 1.000000 1999 ave of the stars 6400 poplar ave 0 0.000110 0.000061 0.881656 1.000000 ca tn 0 0.157960 0.010622 0.198718 1.000000 los angeles memphis 0 0.008107 0.001357 0.296663 1.000000 INTRNXNL LS FNNS INTRNXNL PPR RJL 0\n", + "307205 -22.970354 1.216843e-07 __splink__input_table_0 
__splink__input_table_1 29818 9337 international speedway international paper riegel 0 0.000037 0.000012 0.986046 1.000000 1801 w international speedway blvd 6400 poplar ave 0 0.000012 0.000061 0.881656 1.000000 fl tn 0 0.048477 0.010622 0.198718 1.000000 daytona beach memphis 0 0.000245 0.001357 0.296663 1.000000 INTRNXNL SPTW INTRNXNL PPR RJL 0\n", + "307204 -22.970354 1.216843e-07 __splink__input_table_0 __splink__input_table_1 59433 20092 west penn funding west line solar 0 0.000024 0.000012 0.986046 1.000000 2325b2 renaissance dr 2180 south 1300 east 0 0.000012 0.000110 0.881656 1.000000 nv ut 0 0.020458 0.010549 0.198718 1.000000 las vegas salt lake city 0 0.010724 0.005772 0.296663 1.000000 WST PN FNTNK WST LN SLR 0\n", + "307203 -22.970354 1.216843e-07 __splink__input_table_0 __splink__input_table_1 39648 12908 north country financial north american energy services 0 0.000024 0.000110 0.986046 1.000000 3530 north country dr 1070 erie ave 0 0.000024 0.000037 0.881656 1.000000 mi ny 0 0.015147 0.120228 0.198718 1.000000 traverse city north tonawanda 0 0.000269 0.000049 0.296663 1.000000 NR0 KNTR FNNXL NR0 AMRKN ENRJ SRFSS 0\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... 
...\n", + "163815 27.519613 1.000000e+00 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 477874.511191 0.014439 33 third st se 33 third st se 2 0.000037 0.000037 9888.266177 0.303079 sd sd 1 0.001930 0.001930 15.866015 26.495963 huron huron 2 0.000073 0.000073 103.554689 90.023441 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0\n", + "241593 27.526521 1.000000e+00 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 477874.511191 0.028877 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9888.266177 0.303079 vt vt 1 0.001537 0.001537 15.866015 33.278930 colchester colchester 2 0.000183 0.000183 103.554689 36.009376 KRN MNTN PWR KRN MNTN PWR 0\n", + "165487 27.757345 1.000000e+00 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 477874.511191 0.043316 one clarks is one clarks is 2 0.000024 0.000024 9888.266177 0.454618 wi wi 1 0.008840 0.008840 15.866015 5.785628 wausau wausau 2 0.000061 0.000061 103.554689 108.028129 WS PPR MLS WS PPR MLS 0\n", + "340414 27.884373 1.000000e+00 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 477874.511191 0.043316 520 francis st 520 francis st 2 0.000024 0.000024 9888.266177 0.454618 mo mo 1 0.010118 0.010118 15.866015 5.054515 st joseph st joseph 2 0.000049 0.000049 103.554689 135.035162 ST JSF LT ANT PWR ST JSF LT ANT PWR 0\n", + "274760 29.211020 1.000000e+00 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 477874.511191 0.028877 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9888.266177 0.454618 vt vt 1 0.001537 0.001537 15.866015 33.278930 brattleboro brattleboro 2 0.000086 0.000086 103.554689 77.162949 FBRMRK FBRMRK 0\n", "\n", "[590575 rows x 37 columns]" ] }, - 
"execution_count": 42, + "execution_count": 36, "metadata": {}, "output_type": "execute_result" } @@ -1958,7 +1958,7 @@ }, { "cell_type": "code", - "execution_count": 43, + "execution_count": 37, "id": "c0b292c8-26ed-407a-866e-75851577d567", "metadata": {}, "outputs": [], @@ -1972,7 +1972,7 @@ }, { "cell_type": "code", - "execution_count": 44, + "execution_count": 38, "id": "e8744d54-3cc6-434c-bbbe-a10d2d3cd9c0", "metadata": {}, "outputs": [], @@ -1985,7 +1985,7 @@ }, { "cell_type": "code", - "execution_count": 45, + "execution_count": 39, "id": "5103190c-3775-427f-a8f2-cc8a8f79892b", "metadata": {}, "outputs": [], @@ -1997,7 +1997,7 @@ }, { "cell_type": "code", - "execution_count": 46, + "execution_count": 40, "id": "8fa04f18-beff-4bdc-bbbb-705950eab5b8", "metadata": {}, "outputs": [ @@ -2070,8 +2070,8 @@ " \n", " \n", " 218797\n", - " 3.824578\n", - " 0.934072\n", + " 3.824584\n", + " 0.934073\n", " __splink__input_table_0\n", " __splink__input_table_1\n", " 14692\n", @@ -2088,22 +2088,22 @@ " 2\n", " 0.000122\n", " 0.000122\n", - " 9605.781694\n", - " 0.093597\n", + " 9888.266177\n", + " 0.090924\n", " ct\n", " ct\n", " 1\n", " 0.020876\n", " 0.020876\n", - " 15.445559\n", - " 2.516547\n", + " 15.866015\n", + " 2.449862\n", " stamford\n", " stamford\n", " 2\n", " 0.003950\n", " 0.003950\n", - " 102.014123\n", - " 1.697510\n", + " 103.554689\n", + " 1.672262\n", " KRN\n", " ENTRJ NKLR PWR MRKTNK\n", " 1\n", @@ -2116,7 +2116,7 @@ " \n", " \n", " 220036\n", - " 4.619987\n", + " 4.619994\n", " 0.960922\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2134,22 +2134,22 @@ " 2\n", " 0.000330\n", " 0.000330\n", - " 9605.781694\n", - " 0.034666\n", + " 9888.266177\n", + " 0.033675\n", " mi\n", " mi\n", " 1\n", " 0.015147\n", " 0.015147\n", - " 15.445559\n", - " 3.468423\n", + " 15.866015\n", + " 3.376515\n", " detroit\n", " detroit\n", " 2\n", " 0.001162\n", " 0.001162\n", - " 102.014123\n", - " 5.771535\n", + " 103.554689\n", + " 5.685691\n", " TT 
ELKTRK SKRTSXN FNTNK I\n", " TT SSTNBL JNRXN\n", " 1\n", @@ -2161,8 +2161,8 @@ " 64331\n", " \n", " \n", - " 358152\n", - " 4.619987\n", + " 481032\n", + " 4.619994\n", " 0.960922\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -2180,22 +2180,22 @@ " 2\n", " 0.000330\n", " 0.000330\n", - " 9605.781694\n", - " 0.034666\n", + " 9888.266177\n", + " 0.033675\n", " mi\n", " mi\n", " 1\n", " 0.015147\n", " 0.015147\n", - " 15.445559\n", - " 3.468423\n", + " 15.866015\n", + " 3.376515\n", " detroit\n", " detroit\n", " 2\n", " 0.001162\n", " 0.001162\n", - " 102.014123\n", - " 5.771535\n", + " 103.554689\n", + " 5.685691\n", " TT ELKTRK SKRTSXN FNTNK I\n", " TT ELKTRK\n", " 0\n", @@ -2212,12 +2212,12 @@ ], "text/plain": [ " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", - "218797 3.824578 0.934072 __splink__input_table_0 __splink__input_table_1 14692 6293 crane entergy nuclear power marketing 0 0.000012 0.000012 0.986046 1.0 100 first stamford pl 100 first stamford pl 2 0.000122 0.000122 9605.781694 0.093597 ct ct 1 0.020876 0.020876 15.445559 2.516547 stamford stamford 2 0.003950 0.003950 102.014123 1.697510 KRN ENTRJ NKLR PWR MRKTNK 1 14692 0001944013 0001944013 crane co 6293 55243\n", - "220036 4.619987 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5535 dte electric securitization funding i 
dte sustainable generation 0 0.000012 0.000012 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9605.781694 0.034666 mi mi 1 0.015147 0.015147 15.445559 3.468423 detroit detroit 2 0.001162 0.001162 102.014123 5.771535 TT ELKTRK SKRTSXN FNTNK I TT SSTNBL JNRXN 1 17752 0001876068 0001876068 dte electric securitization funding i llc 5535 64331\n", - "358152 4.619987 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5522 dte electric securitization funding i dte electric 0 0.000012 0.000037 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9605.781694 0.034666 mi mi 1 0.015147 0.015147 15.445559 3.468423 detroit detroit 2 0.001162 0.001162 102.014123 5.771535 TT ELKTRK SKRTSXN FNTNK I TT ELKTRK 0 17752 0001876068 0001876068 dte electric securitization funding i llc 5522 5109" + "218797 3.824584 0.934073 __splink__input_table_0 __splink__input_table_1 14692 6293 crane entergy nuclear power marketing 0 0.000012 0.000012 0.986046 1.0 100 first stamford pl 100 first stamford pl 2 0.000122 0.000122 9888.266177 0.090924 ct ct 1 0.020876 0.020876 15.866015 2.449862 stamford stamford 2 0.003950 0.003950 103.554689 1.672262 KRN ENTRJ NKLR PWR MRKTNK 1 14692 0001944013 0001944013 crane co 6293 55243\n", + "220036 4.619994 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5535 dte electric securitization funding i dte sustainable generation 0 0.000012 0.000012 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9888.266177 0.033675 mi mi 1 0.015147 0.015147 15.866015 3.376515 detroit detroit 2 0.001162 0.001162 103.554689 5.685691 TT ELKTRK SKRTSXN FNTNK I TT SSTNBL JNRXN 1 17752 0001876068 0001876068 dte electric securitization funding i llc 5535 64331\n", + "481032 4.619994 0.960922 __splink__input_table_0 __splink__input_table_1 17752 5522 dte electric securitization funding i dte electric 0 0.000012 0.000037 0.986046 1.0 one energy plz one energy plz 2 0.000330 0.000330 9888.266177 0.033675 mi mi 1 0.015147 
0.015147 15.866015 3.376515 detroit detroit 2 0.001162 0.001162 103.554689 5.685691 TT ELKTRK SKRTSXN FNTNK I TT ELKTRK 0 17752 0001876068 0001876068 dte electric securitization funding i llc 5522 5109" ] }, - "execution_count": 46, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } @@ -2236,7 +2236,7 @@ }, { "cell_type": "code", - "execution_count": 47, + "execution_count": 41, "id": "11190456-12a9-49df-b863-7a6f674e39eb", "metadata": {}, "outputs": [], @@ -2246,7 +2246,7 @@ }, { "cell_type": "code", - "execution_count": 48, + "execution_count": 42, "id": "5a57bae5-6188-4a6c-be9a-14a159f82d81", "metadata": {}, "outputs": [], @@ -2256,7 +2256,7 @@ }, { "cell_type": "code", - "execution_count": 49, + "execution_count": 43, "id": "461725d0-28c8-43dd-bcd5-086b7d3f7d4b", "metadata": {}, "outputs": [], @@ -2271,7 +2271,7 @@ }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 44, "id": "4d45f339-7a5b-466a-81f5-c71e425a77df", "metadata": {}, "outputs": [], @@ -2281,7 +2281,7 @@ }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 45, "id": "182732ea-39c5-4fb4-9429-5f7aed781ef5", "metadata": {}, "outputs": [], @@ -2294,7 +2294,7 @@ }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 46, "id": "bc058dbf-6f71-4978-90ce-5405edbbf9e5", "metadata": {}, "outputs": [ @@ -2384,7 +2384,7 @@ " 1\n", " 13310.0\n", " 4281.0\n", - " 0.999982\n", + " 0.999984\n", " 1.0\n", " both\n", " 1.0\n", @@ -2398,7 +2398,7 @@ " 1\n", " 17793.0\n", " 5564.0\n", - " 0.927293\n", + " 0.927294\n", " 2.0\n", " both\n", " 0.0\n", @@ -2496,7 +2496,7 @@ " 0\n", " 21579.0\n", " 6780.0\n", - " 0.986542\n", + " 0.986543\n", " 0.0\n", " both\n", " 1.0\n", @@ -2510,7 +2510,7 @@ " 0\n", " 21579.0\n", " 6763.0\n", - " 0.085466\n", + " 0.085467\n", " 0.0\n", " both\n", " 0.0\n", @@ -2594,7 +2594,7 @@ " 1\n", " 40084.0\n", " 13243.0\n", - " 0.999813\n", + " 0.999834\n", " 2.0\n", " both\n", " 1.0\n", @@ -2636,7 +2636,7 @@ " 1\n", " 
49303.0\n", " 16270.0\n", - " 0.559071\n", + " 0.559072\n", " 0.0\n", " both\n", " 0.0\n", @@ -2678,30 +2678,30 @@ "0 0000003153 195 alabama power co NaN 1 1701.0 478.0 1.000000 2.0 both 1.0\n", "1 0001868941 58702 fluence energy, inc. Fluence 0 21792.0 6889.0 0.016529 0.0 both 0.0\n", "2 0000041091 7140 georgia power co NaN 1 23416.0 7653.0 0.999997 2.0 both 1.0\n", - "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 13310.0 4281.0 0.999982 1.0 both 1.0\n", - "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927293 2.0 both 0.0\n", + "3 0000022198 4062 columbus southern power co /oh/ Columbus Southern Power Co 1 13310.0 4281.0 0.999984 1.0 both 1.0\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927294 2.0 both 0.0\n", "5 0000030371 54905 duke energy carolinas, llc Duke Energy Carolinas LLC 1 17790.0 5558.0 0.999987 2.0 both 1.0\n", "6 0000869446 57140 berkshire realty co inc /de Berkshire Wind Power Cooperative Corp 0 7449.0 1712.0 0.001912 0.0 both 0.0\n", "7 0000092122 18195 southern co southern co services inc 0 50964.0 17068.0 0.007216 0.0 both 0.0\n", "8 0000092122 17650 southern co Southern Power Co 0 50963.0 17089.0 0.034232 0.0 both 0.0\n", "9 0000075488 14328 pacific gas & electric co NaN 1 41598.0 13933.0 0.999948 2.0 both 1.0\n", "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", - "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986542 0.0 both 1.0\n", - "12 0001031296 6458 firstenergy corp First Energy Services 0 21579.0 6763.0 0.085466 0.0 both 0.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986543 0.0 both 1.0\n", + "12 0001031296 6458 firstenergy corp First Energy Services 0 21579.0 6763.0 0.085467 0.0 both 0.0\n", "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", "14 0000100122 24211 tucson electric power co NaN 
1 55725.0 18901.0 1.000000 2.0 both 1.0\n", "15 0000096271 18454 tampa electric co NaN 1 53604.0 18180.0 0.991059 2.0 both 1.0\n", "16 0000715957 5248 dominion energy, inc NaN 1 17484.0 5386.0 0.999985 2.0 both 1.0\n", "17 0001013871 59883 nrg energy, inc NRG Energy Gas & Wind Holdings Inc 0 40084.0 13240.0 0.300165 0.0 both 0.0\n", - "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 40084.0 13243.0 0.999813 2.0 both 1.0\n", + "18 0001013871 13377 nrg energy inc NRG Energy Inc 1 40084.0 13243.0 0.999834 2.0 both 1.0\n", "19 0000788816 13994 oglethorpe power corp NaN 1 40576.0 13515.0 1.000000 2.0 both 1.0\n", "20 0000018675 3266 central maine power co NaN 1 10876.0 3424.0 1.000000 2.0 both 1.0\n", - "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559071 0.0 both 0.0\n", + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559072 0.0 both 0.0\n", "22 0000004904 488 american electric power co inc American Electric Power Inc 1 2927.0 793.0 0.996076 2.0 both 1.0\n", "23 0000715957 5248 dominion energy, inc Dominion Energy Inc. 
1 17484.0 5386.0 0.999985 2.0 both 1.0" ] }, - "execution_count": 52, + "execution_count": 46, "metadata": {}, "output_type": "execute_result" } @@ -2712,7 +2712,7 @@ }, { "cell_type": "code", - "execution_count": 53, + "execution_count": 47, "id": "2fc6ed05-5b35-4856-a668-f16e253e7fea", "metadata": {}, "outputs": [], @@ -2728,7 +2728,7 @@ }, { "cell_type": "code", - "execution_count": 54, + "execution_count": 48, "id": "99a7bc64-9f32-4f53-9a89-63a31ff7debe", "metadata": {}, "outputs": [ @@ -2738,7 +2738,7 @@ "(np.float64(0.8666666666666667), np.float64(0.8125), 0.7916666666666666)" ] }, - "execution_count": 54, + "execution_count": 48, "metadata": {}, "output_type": "execute_result" } @@ -2749,7 +2749,7 @@ }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 49, "id": "08932be5-b90c-440d-9efb-156cb4d63c93", "metadata": {}, "outputs": [ @@ -2799,7 +2799,7 @@ "Positive 3 13" ] }, - "execution_count": 55, + "execution_count": 49, "metadata": {}, "output_type": "execute_result" } @@ -2814,7 +2814,7 @@ }, { "cell_type": "code", - "execution_count": 56, + "execution_count": 50, "id": "025c80e9-5055-4eaa-a873-38b910cd7f94", "metadata": {}, "outputs": [], @@ -2824,7 +2824,7 @@ }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 51, "id": "e4f44fda-1b55-4f9c-a96d-5eec36dac768", "metadata": {}, "outputs": [ @@ -2872,7 +2872,7 @@ " 1\n", " 17793.0\n", " 5564.0\n", - " 0.927293\n", + " 0.927294\n", " 2.0\n", " both\n", " 0.0\n", @@ -2900,7 +2900,7 @@ " 0\n", " 21579.0\n", " 6780.0\n", - " 0.986542\n", + " 0.986543\n", " 0.0\n", " both\n", " 1.0\n", @@ -2928,7 +2928,7 @@ " 1\n", " 49303.0\n", " 16270.0\n", - " 0.559071\n", + " 0.559072\n", " 0.0\n", " both\n", " 0.0\n", @@ -2939,14 +2939,14 @@ ], "text/plain": [ " central_index_key utility_id_eia sec_company_name eia_company_name match record_id_l record_id_r match_probability gamma_company_name_no_legal _merge predicted_match\n", - "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 
0.927293 2.0 both 0.0\n", + "4 0001326160 5416 duke energy corp NaN 1 17793.0 5564.0 0.927294 2.0 both 0.0\n", "10 0001031296 6526 firstenergy corp FirstEnergy 0 21579.0 6776.0 0.999998 2.0 both 1.0\n", - "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986542 0.0 both 1.0\n", + "11 0001031296 54776 firstenergy corp FirstEnergy Nuclear Generation Corp 0 21579.0 6780.0 0.986543 0.0 both 1.0\n", "13 0001031296 32208 firstenergy corp First Energy Corp 1 NaN NaN NaN NaN left_only 0.0\n", - "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559071 0.0 both 0.0" + "21 0001032208 61296 sempra energy Sempra Generation 1 49303.0 16270.0 0.559072 0.0 both 0.0" ] }, - "execution_count": 57, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -2957,7 +2957,7 @@ }, { "cell_type": "code", - "execution_count": 151, + "execution_count": 52, "id": "c425a676-aa6e-4d8f-b814-931da392c2ff", "metadata": {}, "outputs": [], @@ -3073,7 +3073,7 @@ }, { "cell_type": "code", - "execution_count": 58, + "execution_count": 53, "id": "92172e2f-39ba-49e3-8312-98597256ca4f", "metadata": {}, "outputs": [], @@ -3089,7 +3089,7 @@ }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 54, "id": "07ca81ae-1b26-4cd3-ade6-75381028028a", "metadata": {}, "outputs": [ @@ -3099,7 +3099,7 @@ "534" ] }, - "execution_count": 59, + "execution_count": 54, "metadata": {}, "output_type": "execute_result" } @@ -3118,7 +3118,7 @@ }, { "cell_type": "code", - "execution_count": 60, + "execution_count": 55, "id": "361b3e30-e823-4137-9062-6a00eae537fe", "metadata": {}, "outputs": [ @@ -3191,7 +3191,7 @@ " \n", " \n", " 274760\n", - " 29.211012\n", + " 29.211020\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -3202,29 +3202,29 @@ " 2\n", " 0.000037\n", " 0.000037\n", - " 415263.133269\n", - " 0.033231\n", + " 477874.511191\n", + " 0.028877\n", " 161 wellington rd\n", " 161 
wellington rd\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9605.781694\n", - " 0.467987\n", + " 9888.266177\n", + " 0.454618\n", " vt\n", " vt\n", " 1\n", " 0.001537\n", " 0.001537\n", - " 15.445559\n", - " 34.184780\n", + " 15.866015\n", + " 33.278930\n", " brattleboro\n", " brattleboro\n", " 2\n", " 0.000086\n", " 0.000086\n", - " 102.014123\n", - " 78.327981\n", + " 103.554689\n", + " 77.162949\n", " FBRMRK\n", " FBRMRK\n", " 0\n", @@ -3237,7 +3237,7 @@ " \n", " \n", " 340414\n", - " 27.884365\n", + " 27.884373\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -3248,29 +3248,29 @@ " 2\n", " 0.000024\n", " 0.000024\n", - " 415263.133269\n", - " 0.049847\n", + " 477874.511191\n", + " 0.043316\n", " 520 francis st\n", " 520 francis st\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9605.781694\n", - " 0.467987\n", + " 9888.266177\n", + " 0.454618\n", " mo\n", " mo\n", " 1\n", " 0.010118\n", " 0.010118\n", - " 15.445559\n", - " 5.192099\n", + " 15.866015\n", + " 5.054515\n", " st joseph\n", " st joseph\n", " 2\n", " 0.000049\n", " 0.000049\n", - " 102.014123\n", - " 137.073967\n", + " 103.554689\n", + " 135.035162\n", " ST JSF LT ANT PWR\n", " ST JSF LT ANT PWR\n", " 0\n", @@ -3283,7 +3283,7 @@ " \n", " \n", " 165487\n", - " 27.757338\n", + " 27.757345\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -3294,29 +3294,29 @@ " 2\n", " 0.000024\n", " 0.000024\n", - " 415263.133269\n", - " 0.049847\n", + " 477874.511191\n", + " 0.043316\n", " one clarks is\n", " one clarks is\n", " 2\n", " 0.000024\n", " 0.000024\n", - " 9605.781694\n", - " 0.467987\n", + " 9888.266177\n", + " 0.454618\n", " wi\n", " wi\n", " 1\n", " 0.008840\n", " 0.008840\n", - " 15.445559\n", - " 5.943112\n", + " 15.866015\n", + " 5.785628\n", " wausau\n", " wausau\n", " 2\n", " 0.000061\n", " 0.000061\n", - " 102.014123\n", - " 109.659173\n", + " 103.554689\n", + " 108.028129\n", " WS PPR MLS\n", " WS PPR MLS\n", " 0\n", @@ -3329,7 
+3329,7 @@ " \n", " \n", " 241593\n", - " 27.526514\n", + " 27.526521\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -3340,29 +3340,29 @@ " 2\n", " 0.000037\n", " 0.000037\n", - " 415263.133269\n", - " 0.033231\n", + " 477874.511191\n", + " 0.028877\n", " 163 acorn ln\n", " 163 acorn ln\n", " 2\n", " 0.000037\n", " 0.000037\n", - " 9605.781694\n", - " 0.311992\n", + " 9888.266177\n", + " 0.303079\n", " vt\n", " vt\n", " 1\n", " 0.001537\n", " 0.001537\n", - " 15.445559\n", - " 34.184780\n", + " 15.866015\n", + " 33.278930\n", " colchester\n", " colchester\n", " 2\n", " 0.000183\n", " 0.000183\n", - " 102.014123\n", - " 36.553058\n", + " 103.554689\n", + " 36.009376\n", " KRN MNTN PWR\n", " KRN MNTN PWR\n", " 0\n", @@ -3375,7 +3375,7 @@ " \n", " \n", " 163815\n", - " 27.519606\n", + " 27.519613\n", " 1.000000\n", " __splink__input_table_0\n", " __splink__input_table_1\n", @@ -3386,29 +3386,29 @@ " 2\n", " 0.000073\n", " 0.000073\n", - " 415263.133269\n", - " 0.016616\n", + " 477874.511191\n", + " 0.014439\n", " 33 third st se\n", " 33 third st se\n", " 2\n", " 0.000037\n", " 0.000037\n", - " 9605.781694\n", - " 0.311992\n", + " 9888.266177\n", + " 0.303079\n", " sd\n", " sd\n", " 1\n", " 0.001930\n", " 0.001930\n", - " 15.445559\n", - " 27.217182\n", + " 15.866015\n", + " 26.495963\n", " huron\n", " huron\n", " 2\n", " 0.000073\n", " 0.000073\n", - " 102.014123\n", - " 91.382644\n", + " 103.554689\n", + " 90.023441\n", " NR0WSTRN PBLK SRFS\n", " NR0WSTRN PBLK SRFS\n", " 0\n", @@ -3466,17 +3466,17 @@ " ...\n", " \n", " \n", - " 1483\n", - " 4.337121\n", + " 218776\n", + " 4.337127\n", " 0.952856\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 58004\n", - " 17611\n", - " vistacare\n", - " stirling energy systems solar three\n", + " 32941\n", + " 17608\n", + " lifestance health group\n", + " stirling energy systems solar one\n", " 0\n", - " 0.000024\n", + " 0.000012\n", " 0.000037\n", " 0.986046\n", " 1.000000\n", 
@@ -3485,180 +3485,180 @@ " 2\n", " 0.000110\n", " 0.000110\n", - " 9605.781694\n", - " 0.103997\n", + " 9888.266177\n", + " 0.101026\n", " az\n", " az\n", " 1\n", " 0.012872\n", " 0.012872\n", - " 15.445559\n", - " 4.081277\n", + " 15.866015\n", + " 3.973129\n", " scottsdale\n", " scottsdale\n", " 2\n", " 0.004989\n", " 0.004989\n", - " 102.014123\n", - " 1.343862\n", - " FSTKR\n", - " STRLNK ENRJ SSTMS SLR 0R\n", + " 103.554689\n", + " 1.323874\n", + " LFSTNS HL0 KRP\n", + " STRLNK ENRJ SSTMS SLR ON\n", " 1\n", - " 58004\n", - " 0000787030\n", - " 0000787030\n", - " vistacare, inc.\n", - " 17611\n", - " 56168\n", + " 32941\n", + " 0001845257\n", + " 0001845257\n", + " lifestance health group, inc.\n", + " 17608\n", + " 56166\n", " \n", " \n", - " 218453\n", - " 4.272157\n", - " 0.950792\n", + " 145930\n", + " 4.321967\n", + " 0.952382\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 19174\n", - " 7605\n", - " enovis\n", - " genon sabine delaware\n", + " 28535\n", + " 9121\n", + " imperial holly\n", + " imperial holly\n", + " 2\n", + " 0.000024\n", + " 0.000024\n", + " 477874.511191\n", + " 0.043316\n", + " one imperial sq ste 200\n", + " p o box 9\n", " 0\n", - " 0.000012\n", - " 0.000012\n", - " 0.986046\n", + " 0.000024\n", + " 0.000159\n", + " 0.881656\n", " 1.000000\n", - " 2711 centerville rd\n", - " 2711 centerville rd\n", - " 2\n", - " 0.000061\n", - " 0.000061\n", - " 9605.781694\n", - " 0.187195\n", - " de\n", - " de\n", + " tx\n", + " tx\n", " 1\n", - " 0.011717\n", - " 0.011717\n", - " 15.445559\n", - " 4.483838\n", - " wilmington\n", - " wilmington\n", - " 2\n", - " 0.010321\n", - " 0.010321\n", - " 102.014123\n", - " 0.649640\n", - " ENFS\n", - " JNN SBN TLWR\n", + " 0.079841\n", + " 0.079841\n", + " 15.866015\n", + " 0.640571\n", + " sugar land\n", + " sugarland\n", " 1\n", - " 19174\n", - " 0001420800\n", - " 0001420800\n", - " enovis corp\n", - " 7605\n", - " 56922\n", + " 0.000355\n", + " 0.000098\n", + " 45.415672\n", + " 
1.000000\n", + " IMPRL HL\n", + " IMPRL HL\n", + " 0\n", + " 28535\n", + " 0000831327\n", + " 0000831327\n", + " imperial holly corp\n", + " 9121\n", + " 9223\n", " \n", " \n", - " 1055\n", - " 4.272157\n", + " 6194\n", + " 4.272164\n", " 0.950792\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 1650\n", - " 16368\n", - " aisystems\n", - " shannon wind\n", + " 32403\n", + " 16195\n", + " lease investment flight trust\n", + " se solar trust v c\n", " 0\n", - " 0.000024\n", - " 0.000024\n", + " 0.000012\n", + " 0.000012\n", " 0.986046\n", " 1.000000\n", - " 2711 centerville rd\n", - " 2711 centerville rd\n", + " 1100 north market st\n", + " 1100 north market st\n", " 2\n", " 0.000061\n", " 0.000061\n", - " 9605.781694\n", - " 0.187195\n", + " 9888.266177\n", + " 0.181847\n", " de\n", " de\n", " 1\n", " 0.011717\n", " 0.011717\n", - " 15.445559\n", - " 4.483838\n", + " 15.866015\n", + " 4.365022\n", " wilmington\n", " wilmington\n", " 2\n", " 0.010321\n", " 0.010321\n", - " 102.014123\n", - " 0.649640\n", - " ASSTMS\n", - " XNN WNT\n", + " 103.554689\n", + " 0.639977\n", + " LS INFSTMNT FLT TRST\n", + " S SLR TRST F K\n", " 1\n", - " 1650\n", - " 0001328769\n", - " 0001328769\n", - " aisystems, inc.\n", - " 16368\n", - " 58872\n", + " 32403\n", + " 0001158389\n", + " 0001158389\n", + " lease investment flight trust\n", + " 16195\n", + " 56900\n", " \n", " \n", - " 7216\n", - " 4.272157\n", + " 1135\n", + " 4.272164\n", " 0.950792\n", " __splink__input_table_0\n", " __splink__input_table_1\n", - " 32403\n", - " 14089\n", - " lease investment flight trust\n", - " pasadena statutory trust\n", + " 22415\n", + " 7605\n", + " fresenius kabi pharmaceuticals holding\n", + " genon sabine delaware\n", " 0\n", " 0.000012\n", " 0.000012\n", " 0.986046\n", " 1.000000\n", - " 1100 north market st\n", - " 1100 north market st\n", + " 2711 centerville rd\n", + " 2711 centerville rd\n", " 2\n", " 0.000061\n", " 0.000061\n", - " 9605.781694\n", - " 0.187195\n", + " 
9888.266177\n", + " 0.181847\n", " de\n", " de\n", " 1\n", " 0.011717\n", " 0.011717\n", - " 15.445559\n", - " 4.483838\n", + " 15.866015\n", + " 4.365022\n", " wilmington\n", " wilmington\n", " 2\n", " 0.010321\n", " 0.010321\n", - " 102.014123\n", - " 0.649640\n", - " LS INFSTMNT FLT TRST\n", - " PSTN STTTR TRST\n", + " 103.554689\n", + " 0.639977\n", + " FRSNS KB FRMSTKLS HLTNK\n", + " JNN SBN TLWR\n", " 1\n", - " 32403\n", - " 0001158389\n", - " 0001158389\n", - " lease investment flight trust\n", - " 14089\n", - " 61235\n", + " 22415\n", + " 0001439449\n", + " 0001439449\n", + " fresenius kabi pharmaceuticals holding, inc.\n", + " 7605\n", + " 56922\n", " \n", " \n", - " 6113\n", - " 4.272157\n", + " 9350\n", + " 4.272164\n", " 0.950792\n", " __splink__input_table_0\n", " __splink__input_table_1\n", " 1626\n", - " 16195\n", + " 14089\n", " airplanes us trust\n", - " se solar trust v c\n", + " pasadena statutory trust\n", " 0\n", " 0.000012\n", " 0.000012\n", @@ -3669,31 +3669,31 @@ " 2\n", " 0.000061\n", " 0.000061\n", - " 9605.781694\n", - " 0.187195\n", + " 9888.266177\n", + " 0.181847\n", " de\n", " de\n", " 1\n", " 0.011717\n", " 0.011717\n", - " 15.445559\n", - " 4.483838\n", + " 15.866015\n", + " 4.365022\n", " wilmington\n", " wilmington\n", " 2\n", " 0.010321\n", " 0.010321\n", - " 102.014123\n", - " 0.649640\n", + " 103.554689\n", + " 0.639977\n", " ARPLNS US TRST\n", - " S SLR TRST F K\n", + " PSTN STTTR TRST\n", " 1\n", " 1626\n", " 0001004540\n", " 0001004540\n", " airplanes us trust\n", - " 16195\n", - " 56900\n", + " 14089\n", + " 61235\n", " \n", " \n", "\n", @@ -3701,23 +3701,23 @@ "" ], "text/plain": [ - " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address 
tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", - "274760 29.211012 1.000000 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 415263.133269 0.033231 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9605.781694 0.467987 vt vt 1 0.001537 0.001537 15.445559 34.184780 brattleboro brattleboro 2 0.000086 0.000086 102.014123 78.327981 FBRMRK FBRMRK 0 20588 0000887591 0000887591 fibermark inc 6741 6309\n", - "340414 27.884365 1.000000 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 415263.133269 0.049847 520 francis st 520 francis st 2 0.000024 0.000024 9605.781694 0.467987 mo mo 1 0.010118 0.010118 15.445559 5.192099 st joseph st joseph 2 0.000049 0.000049 102.014123 137.073967 ST JSF LT ANT PWR ST JSF LT ANT PWR 0 51567 0000086251 0000086251 st joseph light & power co 17450 17881\n", - "165487 27.757338 1.000000 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 415263.133269 0.049847 one clarks is one clarks is 2 0.000024 0.000024 9605.781694 0.467987 wi wi 1 0.008840 0.008840 15.445559 5.943112 wausau wausau 2 0.000061 0.000061 102.014123 109.659173 WS PPR MLS WS PPR MLS 0 58842 0000105076 0000105076 wausau paper mills co 19906 20190\n", - "241593 27.526514 1.000000 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 415263.133269 0.033231 163 acorn ln 163 acorn ln 2 0.000037 0.000037 9605.781694 0.311992 vt vt 1 0.001537 0.001537 15.445559 34.184780 colchester colchester 2 
0.000183 0.000183 102.014123 36.553058 KRN MNTN PWR KRN MNTN PWR 0 24650 0000043704 0000043704 green mountain power corp 8047 7601\n", - "163815 27.519606 1.000000 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 415263.133269 0.016616 33 third st se 33 third st se 2 0.000037 0.000037 9605.781694 0.311992 sd sd 1 0.001930 0.001930 15.445559 27.217182 huron huron 2 0.000073 0.000073 102.014123 91.382644 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0 39816 0000073088 0000073088 northwestern public service co 13109 13809\n", - "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", - "1483 4.337121 0.952856 __splink__input_table_0 __splink__input_table_1 58004 17611 vistacare stirling energy systems solar three 0 0.000024 0.000037 0.986046 1.000000 4800 n scottsdale rd 4800 n scottsdale rd 2 0.000110 0.000110 9605.781694 0.103997 az az 1 0.012872 0.012872 15.445559 4.081277 scottsdale scottsdale 2 0.004989 0.004989 102.014123 1.343862 FSTKR STRLNK ENRJ SSTMS SLR 0R 1 58004 0000787030 0000787030 vistacare, inc. 
17611 56168\n", - "218453 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 19174 7605 enovis genon sabine delaware 0 0.000012 0.000012 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ENFS JNN SBN TLWR 1 19174 0001420800 0001420800 enovis corp 7605 56922\n", - "1055 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 1650 16368 aisystems shannon wind 0 0.000024 0.000024 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ASSTMS XNN WNT 1 1650 0001328769 0001328769 aisystems, inc. 16368 58872\n", - "7216 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 32403 14089 lease investment flight trust pasadena statutory trust 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 LS INFSTMNT FLT TRST PSTN STTTR TRST 1 32403 0001158389 0001158389 lease investment flight trust 14089 61235\n", - "6113 4.272157 0.950792 __splink__input_table_0 __splink__input_table_1 1626 16195 airplanes us trust se solar trust v c 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9605.781694 0.187195 de de 1 0.011717 0.011717 15.445559 4.483838 wilmington wilmington 2 0.010321 0.010321 102.014123 0.649640 ARPLNS US TRST S SLR TRST F K 1 1626 0001004540 0001004540 airplanes us trust 16195 56900\n", + " match_weight match_probability source_dataset_l source_dataset_r record_id_l record_id_r company_name_no_legal_l company_name_no_legal_r gamma_company_name_no_legal tf_company_name_no_legal_l tf_company_name_no_legal_r 
bf_company_name_no_legal bf_tf_adj_company_name_no_legal street_address_l street_address_r gamma_street_address tf_street_address_l tf_street_address_r bf_street_address bf_tf_adj_street_address state_l state_r gamma_state tf_state_l tf_state_r bf_state bf_tf_adj_state city_l city_r gamma_city tf_city_l tf_city_r bf_city bf_tf_adj_city company_name_mphone_l company_name_mphone_r match_key record_id_x sec_company_id central_index_key company_name_raw record_id_y utility_id_eia\n", + "274760 29.211020 1.000000 __splink__input_table_0 __splink__input_table_1 20588 6741 fibermark fibermark 2 0.000037 0.000037 477874.511191 0.028877 161 wellington rd 161 wellington rd 2 0.000024 0.000024 9888.266177 0.454618 vt vt 1 0.001537 0.001537 15.866015 33.278930 brattleboro brattleboro 2 0.000086 0.000086 103.554689 77.162949 FBRMRK FBRMRK 0 20588 0000887591 0000887591 fibermark inc 6741 6309\n", + "340414 27.884373 1.000000 __splink__input_table_0 __splink__input_table_1 51567 17450 st joseph light and power st joseph light and power 2 0.000024 0.000024 477874.511191 0.043316 520 francis st 520 francis st 2 0.000024 0.000024 9888.266177 0.454618 mo mo 1 0.010118 0.010118 15.866015 5.054515 st joseph st joseph 2 0.000049 0.000049 103.554689 135.035162 ST JSF LT ANT PWR ST JSF LT ANT PWR 0 51567 0000086251 0000086251 st joseph light & power co 17450 17881\n", + "165487 27.757345 1.000000 __splink__input_table_0 __splink__input_table_1 58842 19906 wausau paper mills wausau paper mills 2 0.000024 0.000024 477874.511191 0.043316 one clarks is one clarks is 2 0.000024 0.000024 9888.266177 0.454618 wi wi 1 0.008840 0.008840 15.866015 5.785628 wausau wausau 2 0.000061 0.000061 103.554689 108.028129 WS PPR MLS WS PPR MLS 0 58842 0000105076 0000105076 wausau paper mills co 19906 20190\n", + "241593 27.526521 1.000000 __splink__input_table_0 __splink__input_table_1 24650 8047 green mountain power green mountain power 2 0.000037 0.000037 477874.511191 0.028877 163 acorn ln 163 acorn ln 2 
0.000037 0.000037 9888.266177 0.303079 vt vt 1 0.001537 0.001537 15.866015 33.278930 colchester colchester 2 0.000183 0.000183 103.554689 36.009376 KRN MNTN PWR KRN MNTN PWR 0 24650 0000043704 0000043704 green mountain power corp 8047 7601\n", + "163815 27.519613 1.000000 __splink__input_table_0 __splink__input_table_1 39816 13109 northwestern public service northwestern public service 2 0.000073 0.000073 477874.511191 0.014439 33 third st se 33 third st se 2 0.000037 0.000037 9888.266177 0.303079 sd sd 1 0.001930 0.001930 15.866015 26.495963 huron huron 2 0.000073 0.000073 103.554689 90.023441 NR0WSTRN PBLK SRFS NR0WSTRN PBLK SRFS 0 39816 0000073088 0000073088 northwestern public service co 13109 13809\n", + "... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...\n", + "218776 4.337127 0.952856 __splink__input_table_0 __splink__input_table_1 32941 17608 lifestance health group stirling energy systems solar one 0 0.000012 0.000037 0.986046 1.000000 4800 n scottsdale rd 4800 n scottsdale rd 2 0.000110 0.000110 9888.266177 0.101026 az az 1 0.012872 0.012872 15.866015 3.973129 scottsdale scottsdale 2 0.004989 0.004989 103.554689 1.323874 LFSTNS HL0 KRP STRLNK ENRJ SSTMS SLR ON 1 32941 0001845257 0001845257 lifestance health group, inc. 
17608 56166\n", + "145930 4.321967 0.952382 __splink__input_table_0 __splink__input_table_1 28535 9121 imperial holly imperial holly 2 0.000024 0.000024 477874.511191 0.043316 one imperial sq ste 200 p o box 9 0 0.000024 0.000159 0.881656 1.000000 tx tx 1 0.079841 0.079841 15.866015 0.640571 sugar land sugarland 1 0.000355 0.000098 45.415672 1.000000 IMPRL HL IMPRL HL 0 28535 0000831327 0000831327 imperial holly corp 9121 9223\n", + "6194 4.272164 0.950792 __splink__input_table_0 __splink__input_table_1 32403 16195 lease investment flight trust se solar trust v c 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9888.266177 0.181847 de de 1 0.011717 0.011717 15.866015 4.365022 wilmington wilmington 2 0.010321 0.010321 103.554689 0.639977 LS INFSTMNT FLT TRST S SLR TRST F K 1 32403 0001158389 0001158389 lease investment flight trust 16195 56900\n", + "1135 4.272164 0.950792 __splink__input_table_0 __splink__input_table_1 22415 7605 fresenius kabi pharmaceuticals holding genon sabine delaware 0 0.000012 0.000012 0.986046 1.000000 2711 centerville rd 2711 centerville rd 2 0.000061 0.000061 9888.266177 0.181847 de de 1 0.011717 0.011717 15.866015 4.365022 wilmington wilmington 2 0.010321 0.010321 103.554689 0.639977 FRSNS KB FRMSTKLS HLTNK JNN SBN TLWR 1 22415 0001439449 0001439449 fresenius kabi pharmaceuticals holding, inc. 
7605 56922\n", + "9350 4.272164 0.950792 __splink__input_table_0 __splink__input_table_1 1626 14089 airplanes us trust pasadena statutory trust 0 0.000012 0.000012 0.986046 1.000000 1100 north market st 1100 north market st 2 0.000061 0.000061 9888.266177 0.181847 de de 1 0.011717 0.011717 15.866015 4.365022 wilmington wilmington 2 0.010321 0.010321 103.554689 0.639977 ARPLNS US TRST PSTN STTTR TRST 1 1626 0001004540 0001004540 airplanes us trust 14089 61235\n", "\n", "[534 rows x 43 columns]" ] }, - "execution_count": 60, + "execution_count": 55, "metadata": {}, "output_type": "execute_result" } @@ -3728,50 +3728,12 @@ }, { "cell_type": "code", - "execution_count": 64, - "id": "1d3e41bd-f92a-4f77-a0a7-0bd24f7ea70c", - "metadata": {}, - "outputs": [], - "source": [ - "out_df = sec_df.merge(\n", - " one_to_one_preds[[\"sec_company_id\", \"utility_id_eia\"]],\n", - " how=\"left\",\n", - " on=\"sec_company_id\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 66, - "id": "cce2b383-48b3-4efd-977a-0c734b0e3ec2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "utility_id_eia\n", - "True 59895\n", - "False 1131\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 66, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "out_df.utility_id_eia.isnull().value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "1cf0be2e-b1ef-4eb1-a07a-28e977c40252", + "execution_count": 320, + "id": "4633e3f8-f0a3-4109-ae66-b3e898059ed7", "metadata": {}, "outputs": [], "source": [ - "len(one_to_one_preds" + "one_to_one_preds.to_parquet(\"one_to_one_preds.parquet\")" ] } ],