From 88a6be32fdd1e992111613578e5db91303770c43 Mon Sep 17 00:00:00 2001 From: FlorentLvr Date: Wed, 13 Mar 2024 14:05:59 +0100 Subject: [PATCH] fix: gsheet request too large -> use a function created in data --- __pipeline__.ipynb | 7 +-- .../Google_Sheets_Update_contacts_view.ipynb | 26 ++------ .../Google_Sheets_Update_growth_db.ipynb | 10 +-- ...Google_Sheets_Update_interactions_db.ipynb | 20 ++---- ...oogle_Sheets_Update_organizations_db.ipynb | 27 +++----- .../Google_Sheets_Update_people_db.ipynb | 12 ++-- ...LinkedIn_Get_interactions_from_posts.ipynb | 2 +- .../Google_Sheets_Update_deals_db.ipynb | 21 +------ utils/data.ipynb | 61 ++++++++++++++++++- 9 files changed, 95 insertions(+), 91 deletions(-) diff --git a/__pipeline__.ipynb b/__pipeline__.ipynb index 147c2b7..cf7d0b5 100644 --- a/__pipeline__.ipynb +++ b/__pipeline__.ipynb @@ -243,11 +243,10 @@ " prompt=\"Find timezone in the format 'Region/City'. If there is no exact match, please return a subjective answer based on the data you received\",\n", " message=f\"Region: {region}, Country: {country}\",\n", " )\n", - " print(\"- Timezone:\", timezone)\n", - " print()\n", - " naas.set_remote_timezone(timezone)\n", " pdump(output_dir, timezone, \"timezone\")\n", " naas.dependency.add(os.path.join(output_dir, \"timezone.pickle\"))\n", + " print(\"- Timezone:\", timezone)\n", + " naas.set_remote_timezone(timezone)\n", " print()\n", " \n", " # Create notebook steps\n", @@ -358,7 +357,7 @@ "outputs": [], "source": [ "# Schedule pipeline\n", - "cron = \"0 8 * * *\"\n", + "cron = \"0 12 * * *\"\n", "print(\"⏰ Scheduler:\", cron)\n", "naas.scheduler.add(cron=cron)" ] diff --git a/models/growth-engine/core/domain/Google_Sheets_Update_contacts_view.ipynb b/models/growth-engine/core/domain/Google_Sheets_Update_contacts_view.ipynb index 2150d2b..6cbd62b 100644 --- a/models/growth-engine/core/domain/Google_Sheets_Update_contacts_view.ipynb +++ b/models/growth-engine/core/domain/Google_Sheets_Update_contacts_view.ipynb @@ 
-176,6 +176,7 @@ "outputs": [], "source": [ "df_init = gsheet.connect(spreadsheet_url).get(sheet_name=sheet_contacts)\n", + "df_init = pload(output_dir, file_contacts)\n", "if not isinstance(df_init, pd.DataFrame):\n", " df_init = pd.DataFrame()\n", " messaging_options = {}\n", @@ -430,7 +431,9 @@ " messaging = create_chat_completion(api_key, prompt_messaging, str(tmp_df.to_dict()))\n", " except Exception as e:\n", " print(e)\n", - " print(\"Messaging options:\", messaging)\n", + " if str(messaging) == \"None\":\n", + " messaging = \"TBD\"\n", + " print(\"Messaging options:\", messaging)\n", " print()\n", " messaging_options[profile_url] = messaging\n", " pdump(output_dir, messaging_options, \"messaging_options\")\n", @@ -491,31 +494,14 @@ { "cell_type": "code", "execution_count": null, - "id": "32e91b35-c1f8-4fe6-ae78-a2d4b79c8be7", + "id": "ea691820-773d-4148-943b-34aad838a759", "metadata": { - "papermill": {}, "tags": [] }, "outputs": [], "source": [ - "df_check = pd.concat([df_init.astype(str), df_contacts.astype(str)]).drop_duplicates(keep=False)\n", - "if len(df_check) > 0:\n", - " if len(df_contacts) < 3000:\n", - " gsheet.connect(spreadsheet_url).send(data=df_contacts, sheet_name=sheet_contacts, append=False)\n", - " else:\n", - " gsheet.connect(spreadsheet_url).send(data=df_contacts[:3000], sheet_name=sheet_contacts, append=False)\n", - " gsheet.connect(spreadsheet_url).send(data=df_contacts[3000:], sheet_name=sheet_contacts, append=True)\n", - "else:\n", - " print(\"Noting to update in Google Sheets!\")" + "send_data_to_gsheet(df_contacts, df_init, spreadsheet_url, sheet_contacts)" ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "ea691820-773d-4148-943b-34aad838a759", - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { diff --git a/models/growth-engine/core/domain/Google_Sheets_Update_growth_db.ipynb b/models/growth-engine/core/domain/Google_Sheets_Update_growth_db.ipynb index a8b4b6e..e6a33f7 100644 --- 
a/models/growth-engine/core/domain/Google_Sheets_Update_growth_db.ipynb +++ b/models/growth-engine/core/domain/Google_Sheets_Update_growth_db.ipynb @@ -134,7 +134,7 @@ "# Inputs\n", "entity_dir = pload(os.path.join(naas_data_product.OUTPUTS_PATH, \"entities\", \"0\"), \"entity_dir\")\n", "input_dir = os.path.join(entity_dir, \"growth-engine\", date.today().isoformat())\n", - "file_interactions = \"linkedin_interactions\"\n", + "file_interactions = \"interactions\"\n", "\n", "# Outputs\n", "spreadsheet_url = pload(os.path.join(naas_data_product.OUTPUTS_PATH, \"entities\", \"0\"), \"abi_spreadsheet\")\n", @@ -439,17 +439,13 @@ }, "outputs": [], "source": [ - "df_check = pd.concat([df_init.astype(str), df_growth.astype(str)]).drop_duplicates(keep=False)\n", - "if len(df_check) > 0:\n", - " gsheet.connect(spreadsheet_url).send(sheet_name=sheet_growth, data=df_growth, append=False)\n", - "else:\n", - " print(\"Noting to update in Google Sheets!\")" + "send_data_to_gsheet(df_growth, df_init, spreadsheet_url, sheet_growth)" ] }, { "cell_type": "code", "execution_count": null, - "id": "13cc4241-9a56-4495-9369-bf3de2bcfb42", + "id": "c81ada64-6da0-4c75-a433-751e28b62b47", "metadata": {}, "outputs": [], "source": [] diff --git a/models/growth-engine/core/domain/Google_Sheets_Update_interactions_db.ipynb b/models/growth-engine/core/domain/Google_Sheets_Update_interactions_db.ipynb index f8babb6..ac3aaae 100644 --- a/models/growth-engine/core/domain/Google_Sheets_Update_interactions_db.ipynb +++ b/models/growth-engine/core/domain/Google_Sheets_Update_interactions_db.ipynb @@ -138,7 +138,7 @@ "\n", "# Outputs\n", "output_dir = os.path.join(entity_dir, \"growth-engine\", date.today().isoformat())\n", - "output_file = \"linkedin_interactions\"\n", + "output_file = \"interactions\"\n", "spreadsheet_url = pload(os.path.join(naas_data_product.OUTPUTS_PATH, \"entities\", \"0\"), \"abi_spreadsheet\")\n", "sheet_interaction = \"INTERACTIONS\"" ] @@ -216,11 +216,7 @@ " tmp_posts_url = 
tmp_df_reactions[\"POST_URL\"].unique().tolist()\n", " for x in tmp_posts_url:\n", " if x not in posts_url:\n", - " # Histo\n", - " if date_dir < date(2024, 5 , 1):\n", - " tmp_df_reactions[\"DATE_REACTION\"] = pd.to_datetime(tmp_df_reactions['PUBLISHED_DATE'], format='%Y-%m-%d %H:%M:%S%z').dt.tz_convert(TIMEZONE).dt.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", - " else:\n", - " tmp_df_reactions[\"DATE_REACTION\"] = tmp_df_reactions['PUBLISHED_DATE']\n", + " tmp_df_reactions[\"DATE_REACTION\"] = tmp_df_reactions['PUBLISHED_DATE']\n", " posts_url.append(x)\n", " else:\n", " tmp_df_reactions[\"DATE_REACTION\"] = pd.to_datetime(tmp_df_reactions['DATE_EXTRACT'], format='%Y-%m-%d %H:%M:%S').dt.tz_localize(pytz.timezone(\"Europe/Paris\")).dt.tz_convert(TIMEZONE).dt.strftime(\"%Y-%m-%d %H:%M:%S%z\")\n", @@ -329,10 +325,10 @@ " \n", " # Concat df\n", " df = pd.concat([df1, df2]).reset_index(drop=True)\n", - " \n", + "\n", " # Exclude Entity from Full name\n", " if len(df) > 0:\n", - " df.insert(loc=2, column=\"DATE\", value=pd.to_datetime(df['DATE_INTERACTION'], format=\"%Y-%m-%d %H:%M:%S%z\").dt.strftime(\"%a. %d %b.\"))\n", + " df.insert(loc=2, column=\"DATE\", value=pd.to_datetime(df['DATE_INTERACTION'].str[:19], format=\"%Y-%m-%d %H:%M:%S\").dt.strftime(\"%a. 
%d %b.\"))\n", " entity = df.loc[0 , \"ENTITY\"]\n", " df = df[df[\"FULLNAME\"] != entity]\n", " \n", @@ -419,17 +415,13 @@ }, "outputs": [], "source": [ - "df_check = pd.concat([df_init.astype(str), df_interactions.astype(str)]).drop_duplicates(keep=False)\n", - "if len(df_check) > 0:\n", - " gsheet.connect(spreadsheet_url).send(sheet_name=sheet_interaction, data=df_interactions, append=False)\n", - "else:\n", - " print(\"Noting to update in Google Sheets!\")" + "send_data_to_gsheet(df_interactions, df_init, spreadsheet_url, sheet_interaction)" ] }, { "cell_type": "code", "execution_count": null, - "id": "29368ef4-de34-44bc-8d8a-346909b8ea30", + "id": "c15fc896-7362-4ab9-8c99-6c45fc41c9d1", "metadata": {}, "outputs": [], "source": [] diff --git a/models/growth-engine/core/domain/Google_Sheets_Update_organizations_db.ipynb b/models/growth-engine/core/domain/Google_Sheets_Update_organizations_db.ipynb index e20dc0c..b5c08bf 100644 --- a/models/growth-engine/core/domain/Google_Sheets_Update_organizations_db.ipynb +++ b/models/growth-engine/core/domain/Google_Sheets_Update_organizations_db.ipynb @@ -172,7 +172,7 @@ "id": "4a5d888e-9fa7-4fe9-983b-c028cd0bb2a9", "metadata": {}, "source": [ - "### Get companies" + "### Get organizations" ] }, { @@ -611,12 +611,13 @@ " # Update org name with company name from LinkedIn\n", " df = df_organizations.copy()\n", " df = df[(df[\"ORGANIZATION\"] != df[\"ORG_NAME\"]) & (df[\"ORG_LINKEDIN_ID\"] != \"TBD\")]\n", - " org_names_1 = get_dict_from_df(df, \"ORG_NAME\", \"ORGANIZATION\", f\"organizations_names_{datetime.now().isoformat()}\", output_dir)\n", + " org_names_1 = get_dict_from_df(df, \"ORG_NAME\", \"ORGANIZATION\", f\"organizations_names\", output_dir)\n", " print(\"-> New Organization to be updated in People db:\", len(org_names_1))\n", " if len(org_names_1) > 0:\n", " df_org, df_p = update_dfs_from_dict(org_names_1, df_p, df_org)\n", " org_lk_urls = pload(output_dir, \"org_lk_urls\")\n", " for key, value in 
org_names_1.items():\n", + " key = key.replace(\"Not Found\", \"\")\n", " org_lk_urls[value] = org_lk_urls[key]\n", " pdump(output_dir, org_lk_urls, \"org_lk_urls\")\n", " \n", @@ -626,7 +627,7 @@ " # Get org name with data tbd with same LinkedIn URL as org found\n", " df1 = df_org.copy()\n", " df1 = df1[(df1[\"LINKEDIN_URL\"] != \"NA\") & (df1[\"ORG_LINKEDIN_ID\"] != \"TBD\")]\n", - " organizations_org_urls = get_dict_from_df(df1, \"ORGANIZATION\", \"LINKEDIN_URL\", f\"organizations_org_urls_{datetime.now().isoformat()}\", output_dir)\n", + " organizations_org_urls = get_dict_from_df(df1, \"ORGANIZATION\", \"LINKEDIN_URL\", f\"organizations_org_urls\", output_dir)\n", "\n", " # Get org name with data tbd with similar LinkedIn URL\n", " df2 = df_org.copy()\n", @@ -717,18 +718,13 @@ { "cell_type": "code", "execution_count": null, - "id": "32e91b35-c1f8-4fe6-ae78-a2d4b79c8be7", + "id": "5098f953-0607-4225-b3f3-39e06dd301db", "metadata": { - "papermill": {}, "tags": [] }, "outputs": [], "source": [ - "df_check = pd.concat([df_init.astype(str), df_organizations_u.astype(str)]).drop_duplicates(keep=False)\n", - "if len(df_check) > 0:\n", - " gsheet.connect(spreadsheet_url).send(data=df_organizations_u, sheet_name=sheet_people_organizations, append=False)\n", - "else:\n", - " print(\"Noting to update in Google Sheets!\")" + "send_data_to_gsheet(df_organizations_u, df_init, spreadsheet_url, sheet_people_organizations)" ] }, { @@ -742,24 +738,19 @@ { "cell_type": "code", "execution_count": null, - "id": "f3c0277a-239f-4d5d-b34f-655c9f7c0028", + "id": "23bcfff6-e962-4191-b79c-164ab9f65ff6", "metadata": { "tags": [] }, "outputs": [], "source": [ - "df_check = pd.concat([df_people.astype(str), df_people_u.astype(str)]).drop_duplicates(keep=False)\n", - "if len(df_check) > 0:\n", - " pdump(output_dir, df_people_u, file_people)\n", - " gsheet.connect(spreadsheet_url).send(data=df_people_u, sheet_name=sheet_people, append=False)\n", - "else:\n", - " print(\"Noting to update in 
Google Sheets!\")" + "send_data_to_gsheet(df_people_u, df_people, spreadsheet_url, sheet_people)" ] }, { "cell_type": "code", "execution_count": null, - "id": "23bcfff6-e962-4191-b79c-164ab9f65ff6", + "id": "f0ca1da1-617f-459e-a698-c5bb695a83be", "metadata": {}, "outputs": [], "source": [] diff --git a/models/growth-engine/core/domain/Google_Sheets_Update_people_db.ipynb b/models/growth-engine/core/domain/Google_Sheets_Update_people_db.ipynb index f0b6e27..a08a35a 100644 --- a/models/growth-engine/core/domain/Google_Sheets_Update_people_db.ipynb +++ b/models/growth-engine/core/domain/Google_Sheets_Update_people_db.ipynb @@ -143,7 +143,7 @@ "# Inputs\n", "entity_dir = pload(os.path.join(naas_data_product.OUTPUTS_PATH, \"entities\", \"0\"), \"entity_dir\")\n", "input_dir = os.path.join(entity_dir, \"growth-engine\", date.today().isoformat())\n", - "file_interactions = \"linkedin_interactions\"\n", + "file_interactions = \"interactions\"\n", "api_key = os.environ.get(\"NAAS_API_TOKEN\") or naas.secret.get('NAAS_API_TOKEN')\n", "li_at = os.environ.get(\"LINKEDIN_LI_AT\") or naas.secret.get(\"LINKEDIN_LI_AT\")\n", "JSESSIONID = os.environ.get(\"LINKEDIN_JSESSIONID\") or naas.secret.get(\"LINKEDIN_JSESSIONID\")\n", @@ -676,23 +676,19 @@ { "cell_type": "code", "execution_count": null, - "id": "59a498dd-a1d3-46fd-8b81-efa4df1813f1", + "id": "6219b07a-9052-4f7f-8754-b2aa22eeabc7", "metadata": { "tags": [] }, "outputs": [], "source": [ - "df_check = pd.concat([df_init.astype(str), df_people.astype(str)]).drop_duplicates(keep=False)\n", - "if len(df_check) > 0:\n", - " gsheet.connect(spreadsheet_url).send(data=df_people, sheet_name=sheet_people, append=False)\n", - "else:\n", - " print(\"Noting to update in Google Sheets!\") " + "send_data_to_gsheet(df_people, df_init, spreadsheet_url, sheet_people)" ] }, { "cell_type": "code", "execution_count": null, - "id": "6219b07a-9052-4f7f-8754-b2aa22eeabc7", + "id": "5f9806ef-87dc-4582-8d32-772eb036c8ad", "metadata": {}, "outputs": 
[], "source": [] diff --git a/models/growth-engine/core/domain/LinkedIn_Get_interactions_from_posts.ipynb b/models/growth-engine/core/domain/LinkedIn_Get_interactions_from_posts.ipynb index 1cf7f3a..27ef3e3 100644 --- a/models/growth-engine/core/domain/LinkedIn_Get_interactions_from_posts.ipynb +++ b/models/growth-engine/core/domain/LinkedIn_Get_interactions_from_posts.ipynb @@ -173,7 +173,7 @@ "\n", "# Outputs\n", "output_dir = os.path.join(entity_dir, \"growth-engine\", date.today().isoformat())\n", - "file_posts = \"linkedin_posts\"\n", + "file_posts = \"posts\"\n", "file_reactions = \"linkedin_post_reactions\"\n", "file_comments = \"linkedin_post_comments\"" ] diff --git a/models/sales-engine/core/domain/Google_Sheets_Update_deals_db.ipynb b/models/sales-engine/core/domain/Google_Sheets_Update_deals_db.ipynb index 88fb489..5a27869 100644 --- a/models/sales-engine/core/domain/Google_Sheets_Update_deals_db.ipynb +++ b/models/sales-engine/core/domain/Google_Sheets_Update_deals_db.ipynb @@ -392,30 +392,15 @@ "### Send data to Google Sheets spreadsheet" ] }, - { - "cell_type": "code", - "execution_count": null, - "id": "32e91b35-c1f8-4fe6-ae78-a2d4b79c8be7", - "metadata": { - "papermill": {}, - "tags": [] - }, - "outputs": [], - "source": [ - "df_check = pd.concat([db_deals.astype(str), df_init.astype(str)]).drop_duplicates(keep=False)\n", - "if len(df_check) > 0:\n", - " gsheet.connect(spreadsheet_url).send(sheet_name=sheet_name_output, data=db_deals, append=False)\n", - "else:\n", - " print(\"Noting to update in Google Sheets!\") " - ] - }, { "cell_type": "code", "execution_count": null, "id": "13cc4241-9a56-4495-9369-bf3de2bcfb42", "metadata": {}, "outputs": [], - "source": [] + "source": [ + "send_data_to_gsheet(db_deals, df_init, spreadsheet_url, sheet_name_output)" + ] } ], "metadata": { diff --git a/utils/data.ipynb b/utils/data.ipynb index af14d59..846dbe7 100644 --- a/utils/data.ipynb +++ b/utils/data.ipynb @@ -58,7 +58,8 @@ "import naas\n", "from 
naas_drivers import linkedin\n", "import naas_data_product\n", - "import shutil" + "import shutil\n", + "import pandas as pd" ] }, { @@ -136,6 +137,64 @@ " return None" ] }, + { + "cell_type": "markdown", + "id": "8cee202d-2085-46ff-ba31-54baee04f0be", + "metadata": {}, + "source": [ + "### Send data to Google Sheets" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "c3baae15-fcf9-4c22-9940-fedbeb20b262", + "metadata": {}, + "outputs": [], + "source": [ + "def send_data_to_gsheet(\n", + " df: pd.DataFrame,\n", + " df_init: pd.DataFrame, \n", + " spreadsheet_url: str, \n", + " sheet_name: str, \n", + " chunk_size: int = 100000\n", + "):\n", + " \"\"\"\n", + " This function compares two dataframes and, if they are different, sends the main dataframe to a Google Sheet.\n", + "\n", + " :param df: The main dataframe to be sent to Google Sheet.\n", + " :param df_init: The initial dataframe to be compared with the main dataframe.\n", + " :param spreadsheet_url: The URL of the Google Sheet to send data to.\n", + " :param sheet_name: The name of the sheet in the Google Sheet to send data to.\n", + " :param chunk_size: The size of the chunks to split the data into for sending. 
Default is 100000.\n", + " \"\"\"\n", + " # Compare dataframes\n", + " df_check = pd.concat([df.astype(str), df_init.astype(str)]).drop_duplicates(keep=False)\n", + " \n", + " # Update or Do nothing\n", + " if len(df_check) > 0:\n", + " df_size = len(df) * len(df.columns)\n", + " if df_size < chunk_size:\n", + " gsheet.connect(spreadsheet_url).send(sheet_name=sheet_name, data=df, append=False)\n", + " print(f\"✅ DataFrame successfully sent to Google Sheets!\")\n", + " else:\n", + " max_rows = int(chunk_size / len(df.columns))\n", + " start = 0\n", + " limit = start + max_rows\n", + " gsheet.connect(spreadsheet_url).send(sheet_name=sheet_name, data=df[start:limit], append=False)\n", + " print(f\"✅ Rows {start} to {limit} successfully added to Google Sheets!\")\n", + " start += max_rows\n", + " while start < len(df):\n", + " limit = start + max_rows\n", + " if limit > len(df):\n", + " limit = len(df)\n", + " gsheet.connect(spreadsheet_url).send(sheet_name=sheet_name, data=df[start:limit], append=True)\n", + " print(f\"✅ Rows {start} to {limit} successfully added to Google Sheets!\")\n", + " start += max_rows\n", + " else:\n", + " print(\"Nothing to update in Google Sheets!\")" + ] + }, { "cell_type": "markdown", "id": "e929759d-1080-40df-912a-873fa21deb92",