From 79cd01bdd5666699475143a0a951f999dee33b92 Mon Sep 17 00:00:00 2001 From: Alonso Guevara Date: Thu, 13 Feb 2025 16:22:16 -0600 Subject: [PATCH 1/5] Remove most iterrow usages --- graphrag/index/operations/create_graph.py | 2 +- graphrag/index/operations/snapshot_rows.py | 85 ----- .../summarize_descriptions.py | 12 +- graphrag/index/update/entities.py | 9 +- graphrag/query/input/loaders/dfs.py | 103 +++--- graphrag/query/input/loaders/utils.py | 304 +++++++----------- 6 files changed, 188 insertions(+), 327 deletions(-) delete mode 100644 graphrag/index/operations/snapshot_rows.py diff --git a/graphrag/index/operations/create_graph.py b/graphrag/index/operations/create_graph.py index 54b63b70aa..a28b1ceb5b 100644 --- a/graphrag/index/operations/create_graph.py +++ b/graphrag/index/operations/create_graph.py @@ -18,6 +18,6 @@ def create_graph( if nodes is not None: nodes.set_index(node_id, inplace=True) - graph.add_nodes_from((n, dict(d)) for n, d in nodes.iterrows()) + graph.add_nodes_from(nodes.to_dict("index").items()) return graph diff --git a/graphrag/index/operations/snapshot_rows.py b/graphrag/index/operations/snapshot_rows.py deleted file mode 100644 index 1140ee555b..0000000000 --- a/graphrag/index/operations/snapshot_rows.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2024 Microsoft Corporation. -# Licensed under the MIT License - -"""A module containing 'FormatSpecifier' model.""" - -import json -from dataclasses import dataclass -from typing import Any - -import pandas as pd - -from graphrag.storage.pipeline_storage import PipelineStorage - - -@dataclass -class FormatSpecifier: - """Format specifier class definition.""" - - format: str - extension: str - - -async def snapshot_rows( - input: pd.DataFrame, - column: str | None, - base_name: str, - storage: PipelineStorage, - formats: list[str | dict[str, Any]], - row_name_column: str | None = None, -) -> None: - """Take a by-row snapshot of the tabular data.""" - parsed_formats = _parse_formats(formats) - num_rows = len(input) - - def get_row_name(row: Any, row_idx: Any): - if row_name_column is None: - if num_rows == 1: - return base_name - return f"{base_name}.{row_idx}" - return f"{base_name}.{row[row_name_column]}" - - for row_idx, row in input.iterrows(): - for fmt in parsed_formats: - row_name = get_row_name(row, row_idx) - extension = fmt.extension - if fmt.format == "json": - await storage.set( - f"{row_name}.{extension}", - ( - json.dumps(row[column], ensure_ascii=False) - if column is not None - else json.dumps(row.to_dict(), ensure_ascii=False) - ), - ) - elif fmt.format == "text": - if column is None: - msg = "column must be specified for text format" - raise ValueError(msg) - await storage.set(f"{row_name}.{extension}", str(row[column])) - - -def _parse_formats(formats: list[str | dict[str, Any]]) -> list[FormatSpecifier]: - """Parse the formats into a list of FormatSpecifiers.""" - return [ - ( - FormatSpecifier(**fmt) - if isinstance(fmt, dict) - else FormatSpecifier(format=fmt, extension=_get_format_extension(fmt)) - ) - for fmt in formats - ] - - -def _get_format_extension(fmt: str) -> str: - """Get the file extension for a given format.""" - if fmt == "json": - return "json" - if fmt == "text": - return "txt" - if fmt == "parquet": - return "parquet" - if fmt == "csv": - return "csv" - msg = f"Unknown format: {fmt}" - raise ValueError(msg) diff --git a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py index ca1baa6198..94829949cc 100644 --- a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py +++ b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py @@ -85,12 +85,12 @@ async def get_summarized( node_futures = [ do_summarize_descriptions( - str(row[1]["title"]), - sorted(set(row[1]["description"])), + str(row.title), + sorted(set(row.description)), ticker, semaphore, ) - for row in nodes.iterrows() + for row in nodes.itertuples(index=False) ] node_results = await asyncio.gather(*node_futures) @@ -105,12 +105,12 @@ async def get_summarized( edge_futures = [ do_summarize_descriptions( - (str(row[1]["source"]), str(row[1]["target"])), - sorted(set(row[1]["description"])), + (str(row.source), str(row.target)), + sorted(set(row.description)), ticker, semaphore, ) - for row in edges.iterrows() + for row in edges.itertuples(index=False) ] edge_results = await asyncio.gather(*edge_futures) diff --git a/graphrag/index/update/entities.py b/graphrag/index/update/entities.py index 42be21848b..89ac6712f6 100644 --- a/graphrag/index/update/entities.py +++ b/graphrag/index/update/entities.py @@ -119,11 +119,12 @@ async def _run_entity_summarization( # Prepare tasks for async summarization where needed async def process_row(row): - description = row["description"] + # Accessing attributes directly from the named tuple. + description = row.description if isinstance(description, list) and len(description) > 1: # Run entity summarization asynchronously result = await run_entity_summarization( - row["title"], + row.title, description, callbacks, cache, @@ -134,7 +135,9 @@ async def process_row(row): return description[0] if isinstance(description, list) else description # Create a list of async tasks for summarization - tasks = [process_row(row) for _, row in entities_df.iterrows()] + tasks = [ + process_row(row) for row in entities_df.itertuples(index=False, name="Entity") + ] results = await asyncio.gather(*tasks) # Update the 'description' column in the DataFrame diff --git a/graphrag/query/input/loaders/dfs.py b/graphrag/query/input/loaders/dfs.py index 097a628951..d662832b1e 100644 --- a/graphrag/query/input/loaders/dfs.py +++ b/graphrag/query/input/loaders/dfs.py @@ -21,6 +21,15 @@ ) +def _prepare_records(df: pd.DataFrame) -> list[dict]: + """ + Reset index and convert the DataFrame to a list of dictionaries. + We rename the reset index column to 'Index' for consistency. + """ + df_reset = df.reset_index().rename(columns={"index": "Index"}) + return df_reset.to_dict("records") + + def read_entities( df: pd.DataFrame, id_col: str = "id", @@ -35,12 +44,14 @@ def read_entities( rank_col: str | None = "degree", attributes_cols: list[str] | None = None, ) -> list[Entity]: - """Read entities from a dataframe.""" - entities = [] - for idx, row in df.iterrows(): - entity = Entity( + """Read entities from a dataframe using pre-converted records.""" + records = _prepare_records(df) + return [ + Entity( id=to_str(row, id_col), - short_id=to_optional_str(row, short_id_col) if short_id_col else str(idx), + short_id=to_optional_str(row, short_id_col) + if short_id_col + else str(row["Index"]), title=to_str(row, title_col), type=to_optional_str(row, type_col), description=to_optional_str(row, description_col), @@ -57,8 +68,8 @@ def read_entities( else None ), ) - entities.append(entity) - return entities + for row in records + ] def read_relationships( @@ -74,12 +85,14 @@ def read_relationships( text_unit_ids_col: str | None = "text_unit_ids", attributes_cols: list[str] | None = None, ) -> list[Relationship]: - """Read relationships from a dataframe.""" - relationships = [] - for idx, row in df.iterrows(): - rel = Relationship( + """Read relationships from a dataframe using pre-converted records.""" + records = _prepare_records(df) + return [ + Relationship( id=to_str(row, id_col), - short_id=to_optional_str(row, short_id_col) if short_id_col else str(idx), + short_id=to_optional_str(row, short_id_col) + if short_id_col + else str(row["Index"]), source=to_str(row, source_col), target=to_str(row, target_col), description=to_optional_str(row, description_col), @@ -95,8 +108,8 @@ def read_relationships( else None ), ) - relationships.append(rel) - return relationships + for row in records + ] def read_covariates( @@ -108,12 +121,14 @@ def read_covariates( text_unit_ids_col: str | None = "text_unit_ids", attributes_cols: list[str] | None = None, ) -> list[Covariate]: - """Read covariates from a dataframe.""" - covariates = [] - for idx, row in df.iterrows(): - cov = Covariate( + """Read covariates from a dataframe using pre-converted records.""" + records = _prepare_records(df) + return [ + Covariate( id=to_str(row, id_col), - short_id=to_optional_str(row, short_id_col) if short_id_col else str(idx), + short_id=to_optional_str(row, short_id_col) + if short_id_col + else str(row["Index"]), subject_id=to_str(row, subject_col), covariate_type=( to_str(row, covariate_type_col) if covariate_type_col else "claim" @@ -125,8 +140,8 @@ def read_covariates( else None ), ) - covariates.append(cov) - return covariates + for row in records + ] def read_communities( @@ -141,12 +156,14 @@ def read_communities( sub_communities_col: str | None = "sub_community_ids", attributes_cols: list[str] | None = None, ) -> list[Community]: - """Read communities from a dataframe.""" - communities = [] - for idx, row in df.iterrows(): - comm = Community( + """Read communities from a dataframe using pre-converted records.""" + records = _prepare_records(df) + return [ + Community( id=to_str(row, id_col), - short_id=to_optional_str(row, short_id_col) if short_id_col else str(idx), + short_id=to_optional_str(row, short_id_col) + if short_id_col + else str(row["Index"]), title=to_str(row, title_col), level=to_str(row, level_col), entity_ids=to_optional_list(row, entities_col, item_type=str), @@ -161,8 +178,8 @@ def read_communities( else None ), ) - communities.append(comm) - return communities + for row in records + ] def read_community_reports( @@ -177,12 +194,14 @@ def read_community_reports( content_embedding_col: str | None = "full_content_embedding", attributes_cols: list[str] | None = None, ) -> list[CommunityReport]: - """Read community reports from a dataframe.""" - reports = [] - for idx, row in df.iterrows(): - report = CommunityReport( + """Read community reports from a dataframe using pre-converted records.""" + records = _prepare_records(df) + return [ + CommunityReport( id=to_str(row, id_col), - short_id=to_optional_str(row, short_id_col) if short_id_col else str(idx), + short_id=to_optional_str(row, short_id_col) + if short_id_col + else str(row["Index"]), title=to_str(row, title_col), community_id=to_str(row, community_col), summary=to_str(row, summary_col), @@ -197,8 +216,8 @@ def read_community_reports( else None ), ) - reports.append(report) - return reports + for row in records + ] def read_text_units( @@ -212,12 +231,12 @@ def read_text_units( document_ids_col: str | None = "document_ids", attributes_cols: list[str] | None = None, ) -> list[TextUnit]: - """Read text units from a dataframe.""" - text_units = [] - for idx, row in df.iterrows(): - chunk = TextUnit( + """Read text units from a dataframe using pre-converted records.""" + records = _prepare_records(df) + return [ + TextUnit( id=to_str(row, id_col), - short_id=str(idx), + short_id=str(row["Index"]), text=to_str(row, text_col), entity_ids=to_optional_list(row, entities_col, item_type=str), relationship_ids=to_optional_list(row, relationships_col, item_type=str), @@ -232,5 +251,5 @@ def read_text_units( else None ), ) - text_units.append(chunk) - return text_units + for row in records + ] diff --git a/graphrag/query/input/loaders/utils.py b/graphrag/query/input/loaders/utils.py index a96844ba76..c067576d61 100644 --- a/graphrag/query/input/loaders/utils.py +++ b/graphrag/query/input/loaders/utils.py @@ -3,245 +3,169 @@ """Data load utils.""" -import numpy as np -import pandas as pd +from typing import Any, Mapping +import numpy as np -def to_str(data: pd.Series, column_name: str | None) -> str: - """Convert and validate a value to a string.""" - if column_name is None: - msg = "Column name is None" - raise ValueError(msg) - if column_name in data: - return str(data[column_name]) - msg = f"Column [{column_name}] not found in data" - raise ValueError(msg) +def _get_value( + data: Mapping[str, Any], column_name: str | None, required: bool = True +) -> Any: + """ + Retrieve a column value from data. + If `required` is True, raises a ValueError when: + - column_name is None, or + - column_name is not in data. -def to_optional_str(data: pd.Series, column_name: str | None) -> str | None: - """Convert and validate a value to an optional string.""" + For optional columns (required=False), returns None if column_name is None. + """ if column_name is None: - msg = "Column name is None" - raise ValueError(msg) - + if required: + raise ValueError("Column name is None") + return None if column_name in data: - value = data[column_name] - if value is None: - return None - return str(data[column_name]) - msg = f"Column [{column_name}] not found in data" - raise ValueError(msg) + return data[column_name] + if required: + raise ValueError(f"Column [{column_name}] not found in data") + return None -def to_list( - data: pd.Series, column_name: str | None, item_type: type | None = None -) -> list: - """Convert and validate a value to a list.""" - if column_name is None: - msg = "Column name is None" - raise ValueError(msg) +def to_str(data: Mapping[str, Any], column_name: str | None) -> str: + """Convert and validate a value to a string.""" + value = _get_value(data, column_name, required=True) + return str(value) - if column_name in data: - value = data[column_name] - if isinstance(value, np.ndarray): - value = value.tolist() - if not isinstance(value, list): - msg = f"value is not a list: {value} ({type(value)})" - raise ValueError(msg) +def to_optional_str(data: Mapping[str, Any], column_name: str | None) -> str | None: + """Convert and validate a value to an optional string.""" + value = _get_value(data, column_name, required=True) + return None if value is None else str(value) - if item_type is not None: - for v in value: - if not isinstance(v, item_type): - msg = ( - f"list item has item that is not [{item_type}]: {v} ({type(v)})" - ) - raise TypeError(msg) - return value - msg = f"Column [{column_name}] not found in data" - raise ValueError(msg) +def to_list( + data: Mapping[str, Any], column_name: str | None, item_type: type | None = None +) -> list: + """Convert and validate a value to a list.""" + value = _get_value(data, column_name, required=True) + if isinstance(value, np.ndarray): + value = value.tolist() + if not isinstance(value, list): + raise ValueError(f"value is not a list: {value} ({type(value)})") + if item_type is not None: + for v in value: + if not isinstance(v, item_type): + raise TypeError(f"list item is not [{item_type}]: {v} ({type(v)})") + return value def to_optional_list( - data: pd.Series, column_name: str | None, item_type: type | None = None + data: Mapping[str, Any], column_name: str | None, item_type: type | None = None ) -> list | None: """Convert and validate a value to an optional list.""" - if column_name is None: + if column_name is None or column_name not in data: return None - - if column_name in data: - value = data[column_name] # type: ignore - if value is None: - return None - - if isinstance(value, np.ndarray): - value = value.tolist() - - if isinstance(value, str): - value = [value] - - if not isinstance(value, list): - msg = f"value is not a list: {value} ({type(value)})" - raise ValueError(msg) - - if item_type is not None: - for v in value: - if not isinstance(v, item_type): - msg = ( - f"list item has item that is not [{item_type}]: {v} ({type(v)})" - ) - raise TypeError(msg) - return value - - return None - - -def to_int(data: pd.Series, column_name: str | None) -> int: + value = data[column_name] + if value is None: + return None + if isinstance(value, np.ndarray): + value = value.tolist() + if isinstance(value, str): + value = [value] + if not isinstance(value, list): + raise ValueError(f"value is not a list: {value} ({type(value)})") + if item_type is not None: + for v in value: + if not isinstance(v, item_type): + raise TypeError(f"list item is not [{item_type}]: {v} ({type(v)})") + return value + + +def to_int(data: Mapping[str, Any], column_name: str | None) -> int: """Convert and validate a value to an int.""" - if column_name is None: - msg = "Column name is None" - raise ValueError(msg) - - if column_name in data: - value = data[column_name] - if isinstance(value, float): - value = int(value) - if not isinstance(value, int): - msg = f"value is not an int: {value} ({type(value)})" - raise ValueError(msg) - else: - msg = f"Column [{column_name}] not found in data" - raise ValueError(msg) - + value = _get_value(data, column_name, required=True) + if isinstance(value, float): + value = int(value) + if not isinstance(value, int): + raise ValueError(f"value is not an int: {value} ({type(value)})") return int(value) -def to_optional_int(data: pd.Series, column_name: str | None) -> int | None: +def to_optional_int(data: Mapping[str, Any], column_name: str | None) -> int | None: """Convert and validate a value to an optional int.""" - if column_name is None: + if column_name is None or column_name not in data: return None - - if column_name in data: - value = data[column_name] - - if value is None: - return None - - if isinstance(value, float): - value = int(value) - if not isinstance(value, int): - msg = f"value is not an int: {value} ({type(value)})" - raise ValueError(msg) - else: - msg = f"Column [{column_name}] not found in data" - raise ValueError(msg) - + value = data[column_name] + if value is None: + return None + if isinstance(value, float): + value = int(value) + if not isinstance(value, int): + raise ValueError(f"value is not an int: {value} ({type(value)})") return int(value) -def to_float(data: pd.Series, column_name: str | None) -> float: +def to_float(data: Mapping[str, Any], column_name: str | None) -> float: """Convert and validate a value to a float.""" - if column_name is None: - msg = "Column name is None" - raise ValueError(msg) - - if column_name in data: - value = data[column_name] - if not isinstance(value, float): - msg = f"value is not a float: {value} ({type(value)})" - raise ValueError(msg) - else: - msg = f"Column [{column_name}] not found in data" - raise ValueError(msg) - + value = _get_value(data, column_name, required=True) + if not isinstance(value, float): + raise ValueError(f"value is not a float: {value} ({type(value)})") return float(value) -def to_optional_float(data: pd.Series, column_name: str | None) -> float | None: +def to_optional_float(data: Mapping[str, Any], column_name: str | None) -> float | None: """Convert and validate a value to an optional float.""" - if column_name is None: + if column_name is None or column_name not in data: return None - - if column_name in data: - value = data[column_name] - if value is None: - return None - if not isinstance(value, float): - return float(value) - else: - msg = f"Column {column_name} not found in data" - raise ValueError(msg) - + value = data[column_name] + if value is None: + return None + if not isinstance(value, float): + return float(value) return float(value) def to_dict( - data: pd.Series, + data: Mapping[str, Any], column_name: str | None, key_type: type | None = None, value_type: type | None = None, ) -> dict: """Convert and validate a value to a dict.""" - if column_name is None: - msg = "Column name is None" - raise ValueError(msg) - - if column_name in data: - value = data[column_name] - if not isinstance(value, dict): - msg = f"value is not a dict: {value} ({type(value)})" - raise ValueError(msg) - - if key_type is not None: - for v in value: - if not isinstance(v, key_type): - msg = f"dict key has item that is not [{key_type}]: {v} ({type(v)})" - raise TypeError(msg) - - if value_type is not None: - for v in value.values(): - if not isinstance(v, value_type): - msg = f"dict value has item that is not [{value_type}]: {v} ({type(v)})" - raise TypeError(msg) - return value - - msg = f"Column [{column_name}] not found in data" - raise ValueError(msg) + value = _get_value(data, column_name, required=True) + if not isinstance(value, dict): + raise ValueError(f"value is not a dict: {value} ({type(value)})") + if key_type is not None: + for k in value: + if not isinstance(k, key_type): + raise TypeError(f"dict key is not [{key_type}]: {k} ({type(k)})") + if value_type is not None: + for v in value.values(): + if not isinstance(v, value_type): + raise TypeError(f"dict value is not [{value_type}]: {v} ({type(v)})") + return value def to_optional_dict( - data: pd.Series, + data: Mapping[str, Any], column_name: str | None, key_type: type | None = None, value_type: type | None = None, ) -> dict | None: """Convert and validate a value to an optional dict.""" - if column_name is None: + if column_name is None or column_name not in data: return None - - if column_name in data: - value = data[column_name] - if value is None: - return None - if not isinstance(value, dict): - msg = f"value is not a dict: {value} ({type(value)})" - raise TypeError(msg) - - if key_type is not None: - for v in value: - if not isinstance(v, key_type): - msg = f"dict key has item that is not [{key_type}]: {v} ({type(v)})" - raise TypeError(msg) - - if value_type is not None: - for v in value.values(): - if not isinstance(v, value_type): - msg = f"dict value has item that is not [{value_type}]: {v} ({type(v)})" - raise TypeError(msg) - - return value - - msg = f"Column {column_name} not found in data" - raise ValueError(msg) + value = data[column_name] + if value is None: + return None + if not isinstance(value, dict): + raise TypeError(f"value is not a dict: {value} ({type(value)})") + if key_type is not None: + for k in value: + if not isinstance(k, key_type): + raise TypeError(f"dict key is not [{key_type}]: {k} ({type(k)})") + if value_type is not None: + for v in value.values(): + if not isinstance(v, value_type): + raise TypeError(f"dict value is not [{value_type}]: {v} ({type(v)})") + return value From b8390cc1ca9070469dd85bab8a9765bfd6c1cfb9 Mon Sep 17 00:00:00 2001 From: Alonso Guevara Date: Thu, 13 Feb 2025 16:22:56 -0600 Subject: [PATCH 2/5] Semver --- .semversioner/next-release/patch-20250213222251109897.json | 4 ++++ 1 file changed, 4 insertions(+) create mode 100644 .semversioner/next-release/patch-20250213222251109897.json diff --git a/.semversioner/next-release/patch-20250213222251109897.json b/.semversioner/next-release/patch-20250213222251109897.json new file mode 100644 index 0000000000..9870ded838 --- /dev/null +++ b/.semversioner/next-release/patch-20250213222251109897.json @@ -0,0 +1,4 @@ +{ + "type": "patch", + "description": "Optimize data iteration by removing some iterrows from code" +} From ba98ae95febdcda935c50598cd1916d0527d1d79 Mon Sep 17 00:00:00 2001 From: Alonso Guevara Date: Thu, 13 Feb 2025 16:39:41 -0600 Subject: [PATCH 3/5] Ruff --- graphrag/query/input/loaders/dfs.py | 1 + graphrag/query/input/loaders/utils.py | 48 ++++++++++++++++++--------- 2 files changed, 33 insertions(+), 16 deletions(-) diff --git a/graphrag/query/input/loaders/dfs.py b/graphrag/query/input/loaders/dfs.py index d662832b1e..6df0e7e78d 100644 --- a/graphrag/query/input/loaders/dfs.py +++ b/graphrag/query/input/loaders/dfs.py @@ -24,6 +24,7 @@ def _prepare_records(df: pd.DataFrame) -> list[dict]: """ Reset index and convert the DataFrame to a list of dictionaries. + We rename the reset index column to 'Index' for consistency. """ df_reset = df.reset_index().rename(columns={"index": "Index"}) diff --git a/graphrag/query/input/loaders/utils.py b/graphrag/query/input/loaders/utils.py index c067576d61..40fe2cfd89 100644 --- a/graphrag/query/input/loaders/utils.py +++ b/graphrag/query/input/loaders/utils.py @@ -3,7 +3,8 @@ """Data load utils.""" -from typing import Any, Mapping +from collections.abc import Mapping +from typing import Any import numpy as np @@ -22,12 +23,14 @@ def _get_value( """ if column_name is None: if required: - raise ValueError("Column name is None") + msg = "Column name is None" + raise ValueError(msg) return None if column_name in data: return data[column_name] if required: - raise ValueError(f"Column [{column_name}] not found in data") + msg = f"Column [{column_name}] not found in data" + raise ValueError(msg) return None @@ -51,11 +54,13 @@ def to_list( if isinstance(value, np.ndarray): value = value.tolist() if not isinstance(value, list): - raise ValueError(f"value is not a list: {value} ({type(value)})") + msg = f"value is not a list: {value} ({type(value)})" + raise TypeError(msg) if item_type is not None: for v in value: if not isinstance(v, item_type): - raise TypeError(f"list item is not [{item_type}]: {v} ({type(v)})") + msg = f"list item is not [{item_type}]: {v} ({type(v)})" + raise TypeError(msg) return value @@ -73,11 +78,13 @@ def to_optional_list( if isinstance(value, str): value = [value] if not isinstance(value, list): - raise ValueError(f"value is not a list: {value} ({type(value)})") + msg = f"value is not a list: {value} ({type(value)})" + raise TypeError(msg) if item_type is not None: for v in value: if not isinstance(v, item_type): - raise TypeError(f"list item is not [{item_type}]: {v} ({type(v)})") + msg = f"list item is not [{item_type}]: {v} ({type(v)})" + raise TypeError(msg) return value @@ -87,7 +94,8 @@ def to_int(data: Mapping[str, Any], column_name: str | None) -> int: if isinstance(value, float): value = int(value) if not isinstance(value, int): - raise ValueError(f"value is not an int: {value} ({type(value)})") + msg = f"value is not an int: {value} ({type(value)})" + raise TypeError(msg) return int(value) @@ -101,7 +109,8 @@ def to_optional_int(data: Mapping[str, Any], column_name: str | None) -> int | N if isinstance(value, float): value = int(value) if not isinstance(value, int): - raise ValueError(f"value is not an int: {value} ({type(value)})") + msg = f"value is not an int: {value} ({type(value)})" + raise TypeError(msg) return int(value) @@ -109,7 +118,8 @@ def to_float(data: Mapping[str, Any], column_name: str | None) -> float: """Convert and validate a value to a float.""" value = _get_value(data, column_name, required=True) if not isinstance(value, float): - raise ValueError(f"value is not a float: {value} ({type(value)})") + msg = f"value is not a float: {value} ({type(value)})" + raise TypeError(msg) return float(value) @@ -134,15 +144,18 @@ def to_dict( """Convert and validate a value to a dict.""" value = _get_value(data, column_name, required=True) if not isinstance(value, dict): - raise ValueError(f"value is not a dict: {value} ({type(value)})") + msg = f"value is not a dict: {value} ({type(value)})" + raise TypeError(msg) if key_type is not None: for k in value: if not isinstance(k, key_type): - raise TypeError(f"dict key is not [{key_type}]: {k} ({type(k)})") + msg = f"dict key is not [{key_type}]: {k} ({type(k)})" + raise TypeError(msg) if value_type is not None: for v in value.values(): if not isinstance(v, value_type): - raise TypeError(f"dict value is not [{value_type}]: {v} ({type(v)})") + msg = f"dict value is not [{value_type}]: {v} ({type(v)})" + raise TypeError(msg) return value @@ -159,13 +172,16 @@ def to_optional_dict( if value is None: return None if not isinstance(value, dict): - raise TypeError(f"value is not a dict: {value} ({type(value)})") + msg = f"value is not a dict: {value} ({type(value)})" + raise TypeError(msg) if key_type is not None: for k in value: if not isinstance(k, key_type): - raise TypeError(f"dict key is not [{key_type}]: {k} ({type(k)})") + msg = f"dict key is not [{key_type}]: {k} ({type(k)})" + raise TypeError(msg) if value_type is not None: for v in value.values(): if not isinstance(v, value_type): - raise TypeError(f"dict value is not [{value_type}]: {v} ({type(v)})") + msg = f"dict value is not [{value_type}]: {v} ({type(v)})" + raise TypeError(msg) return value From b62d6315b48ddb04cddab497f55974ee0b950e38 Mon Sep 17 00:00:00 2001 From: Alonso Guevara Date: Thu, 13 Feb 2025 16:47:28 -0600 Subject: [PATCH 4/5] Pyright --- .../summarize_descriptions/summarize_descriptions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py index 13e8d2ecdb..c6be524f80 100644 --- a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py +++ b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py @@ -89,8 +89,8 @@ async def get_summarized( node_futures = [ do_summarize_descriptions( - str(row.title), - sorted(set(row.description)), + str(row.title), # type: ignore + sorted(set(row.description)), # type: ignore ticker, semaphore, ) @@ -109,8 +109,8 @@ async def get_summarized( edge_futures = [ do_summarize_descriptions( - (str(row.source), str(row.target)), - sorted(set(row.description)), + (str(row.source), str(row.target)), # type: ignore + sorted(set(row.description)), # type: ignore ticker, semaphore, ) From 31a9c4295ba7f640304b4532b5506e7ec8229acf Mon Sep 17 00:00:00 2001 From: Alonso Guevara Date: Thu, 13 Feb 2025 17:00:48 -0600 Subject: [PATCH 5/5] Format --- .../summarize_descriptions/summarize_descriptions.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py index c6be524f80..25331b9071 100644 --- a/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py +++ b/graphrag/index/operations/summarize_descriptions/summarize_descriptions.py @@ -89,8 +89,8 @@ async def get_summarized( node_futures = [ do_summarize_descriptions( - str(row.title), # type: ignore - sorted(set(row.description)), # type: ignore + str(row.title), # type: ignore + sorted(set(row.description)), # type: ignore ticker, semaphore, ) @@ -109,8 +109,8 @@ async def get_summarized( edge_futures = [ do_summarize_descriptions( - (str(row.source), str(row.target)), # type: ignore - sorted(set(row.description)), # type: ignore + (str(row.source), str(row.target)), # type: ignore + sorted(set(row.description)), # type: ignore ticker, semaphore, )