From c064d6b5f0d2a4c7bc3ce34bd557bebeee463fab Mon Sep 17 00:00:00 2001 From: Jeffrey Aven Date: Tue, 22 Aug 2023 23:26:12 +1000 Subject: [PATCH] updates --- notebooks/includes/google-audit-setup.ipynb | 148 ++++++-------------- 1 file changed, 44 insertions(+), 104 deletions(-) diff --git a/notebooks/includes/google-audit-setup.ipynb b/notebooks/includes/google-audit-setup.ipynb index 3ec548c..b887b67 100644 --- a/notebooks/includes/google-audit-setup.ipynb +++ b/notebooks/includes/google-audit-setup.ipynb @@ -7,11 +7,15 @@ "outputs": [], "source": [ "## imports and object instantiation\n", - "import json, time, nest_asyncio, json, itertools, sys, threading\n", + "import json, time, nest_asyncio, json, itertools, sys, threading, psycopg2\n", "from pystackql import StackQL\n", "import pandas as pd\n", "from IPython.display import clear_output, display, Markdown, HTML\n", "from ipytree import Tree, Node\n", + "from psycopg2.extras import RealDictCursor\n", + "from psycopg2 import ProgrammingError\n", + "\n", + "conn = psycopg2.connect(\"dbname=stackql user=stackql host=localhost port=5444\")\n", "\n", "stackql = StackQL()\n", "nest_asyncio.apply()" @@ -106,6 +110,24 @@ " return exploded_df" ] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def run_stackql_query(query, debug=False):\n", + " try:\n", + " with conn.cursor(cursor_factory=RealDictCursor) as cur:\n", + " cur.execute(query)\n", + " rows = cur.fetchall()\n", + " return pd.DataFrame(rows)\n", + " except Exception as e:\n", + " if debug:\n", + " print(f\"Error executing query: {str(e)}\")\n", + " return pd.DataFrame()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -114,48 +136,43 @@ "source": [ "# get all folders and projects function\n", "def get_resources_recursive(entity_id, get_projects_query_fn, get_folders_query_fn, parent_display_name='organization'):\n", - " resources = []\n", + " resources_df = pd.DataFrame()\n", "\n", " # Query for projects\n", " print_overwrite(f\"Searching {entity_id} for projects...\")\n", " project_query = get_projects_query_fn(entity_id)\n", - " project_results = json.loads(stackql.execute(project_query))\n", + " projects_df = run_stackql_query(project_query)\n", + " projects_df[\"parentDisplayName\"] = parent_display_name\n", + " projects_df[\"resType\"] = \"project\"\n", + "\n", + " resources_df = pd.concat([resources_df, projects_df], ignore_index=True)\n", "\n", - " if isinstance(project_results, list):\n", - " print_overwrite(f\"Found {len(project_results)} projects in {entity_id}\")\n", - " for proj in project_results:\n", - " proj[\"parentDisplayName\"] = parent_display_name\n", - " proj[\"resType\"] = \"project\"\n", - " resources.append(proj)\n", + " print_overwrite(f\"Found {len(projects_df)} projects in {entity_id}\")\n", "\n", " # Query for folders\n", " print_overwrite(f\"Searching {entity_id} for folders...\")\n", " folder_query = get_folders_query_fn(entity_id)\n", - " folder_results = json.loads(stackql.execute(folder_query))\n", + " folders_df = run_stackql_query(folder_query)\n", + " folders_df[\"parentDisplayName\"] = parent_display_name\n", + " folders_df[\"resType\"] = \"folder\"\n", "\n", - " if isinstance(folder_results, list):\n", - " print_overwrite(f\"Found {len(folder_results)} folders in {entity_id}\")\n", - " for folder in folder_results:\n", - " folder[\"parentDisplayName\"] = parent_display_name\n", - " folder[\"resType\"] = \"folder\"\n", - " resources.append(folder)\n", + " resources_df = pd.concat([resources_df, folders_df], ignore_index=True)\n", "\n", - " # Fetch resources under this folder\n", - " if 'name' in folder:\n", - " resources.extend(get_resources_recursive(folder['name'], get_projects_query_fn, get_folders_query_fn, folder['displayName']))\n", + " print_overwrite(f\"Found {len(folders_df)} folders in {entity_id}\")\n", "\n", - " return resources\n", + " for _, folder in folders_df.iterrows():\n", + " # Fetch resources under this folder\n", + " if 'name' in folder:\n", + " child_resources_df = get_resources_recursive(folder['name'], get_projects_query_fn, get_folders_query_fn, folder['displayName'])\n", + " resources_df = pd.concat([resources_df, child_resources_df], ignore_index=True)\n", + "\n", + " return resources_df\n", "\n", "def get_all_resources(get_projects_query, get_folders_query):\n", " start_time = time.time()\n", " \n", " # Start with the root organization to get all resources\n", - " all_resources = get_resources_recursive(\"organizations/%s\" % (org_id), get_projects_query, get_folders_query)\n", - " \n", - " # Convert list to dataframe and filter\n", - " resources_df = (pd.DataFrame(all_resources)\n", - " .loc[lambda df: df['error'].isna()]\n", - " .drop('error', axis=1, errors='ignore'))\n", + " resources_df = get_resources_recursive(\"organizations/%s\" % (org_id), get_projects_query, get_folders_query)\n", " \n", " # Create root node and build the tree\n", " root = Node(\"organization\", opened=False, icon='building')\n", @@ -174,7 +191,7 @@ " cards_data = [(\"Number of Projects\", num_projects), (\"Number of Folders\", num_folders)]\n", " display_cards(cards_data)\n", " \n", - " return resources_df, projects, tree\n" + " return resources_df, projects, tree" ] }, { @@ -206,77 +223,6 @@ " return bindings_df" ] }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# regions and zones function\n", - "def get_all_regions_and_zones(projects_df, queries):\n", - "\n", - " start_time = time.time()\n", - "\n", - " all_results = []\n", - " for query in queries:\n", - " res = stackql.execute(query)\n", - " try:\n", - " parsed_result = json.loads(res)\n", - " all_results.extend(parsed_result)\n", - " except json.JSONDecodeError:\n", - " print(f\"Failed to parse result from query: {query}\")\n", - " print(f\"Raw result: {res}\")\n", - " \n", - " zones_df = (\n", - " pd.DataFrame(all_results)\n", - " .loc[lambda x: x['error'].isnull()]\n", - " .drop('error', axis=1)\n", - " .drop_duplicates()\n", - " )\n", - " \n", - " regions_df = pd.DataFrame(zones_df['region'].unique(), columns=['region'])\n", - " # For every combination of project and region\n", - " projects_regions_df = projects_df.assign(key=1).merge(regions_df.assign(key=1), on='key').drop('key', axis=1)\n", - " # For every combination of project and zone\n", - " projects_zones_df = projects_df.assign(key=1).merge(zones_df.assign(key=1), on='key').drop(['key', 'region'], axis=1)\n", - " \n", - " # python list variables\n", - " regions = regions_df['region'].tolist()\n", - " zones = zones_df['name'].tolist()\n", - " projects_regions = projects_regions_df.to_dict(orient='records')\n", - " projects_zones = projects_zones_df.to_dict(orient='records')\n", - "\n", - " number_of_rows = zones_df.shape[0]\n", - " number_of_projects = projects_df.shape[0]\n", - " elapsed_time = round(time.time() - start_time)\n", - "\n", - " print(f\"Found {number_of_rows} zones across {number_of_projects} projects in {elapsed_time} seconds\")\n", - "\n", - " return regions_df, zones_df, regions, zones, projects_regions, projects_zones" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# get all instances\n", - "def get_all_instances(queries):\n", - " start_time = time.time()\n", - "\n", - " res = stackql.executeQueriesAsync(queries)\n", - "\n", - " isinstances_df = pd.read_json(json.dumps(res))\n", - "\n", - " number_of_rows = isinstances_df.shape[0]\n", - " elapsed_time = round(time.time() - start_time)\n", - "\n", - " print(f\"Found {number_of_rows} instances in {elapsed_time} seconds\")\n", - "\n", - " return isinstances_df" - ] - }, { "cell_type": "code", "execution_count": null, @@ -303,12 +249,6 @@ "metadata": {}, "outputs": [], "source": [ - "import psycopg2\n", - "from psycopg2.extras import RealDictCursor\n", - "from psycopg2 import ProgrammingError\n", - "\n", - "conn = psycopg2.connect(\"dbname=stackql user=stackql host=localhost port=5444\")\n", - "\n", "def run_stackql_queries(queries, debug=False):\n", " start_time = time.time()\n", " all_results = []\n",