From 501c89cff725a3ae1aacd42c5794448328bb2e58 Mon Sep 17 00:00:00 2001 From: Jennifer Melot Date: Tue, 16 Jul 2024 22:27:41 -0400 Subject: [PATCH 1/2] Add yearly counts Closes #479 --- web/scripts/retrieve_data.py | 83 ++++++++++++++++++++++++++++++++++-- 1 file changed, 79 insertions(+), 4 deletions(-) diff --git a/web/scripts/retrieve_data.py b/web/scripts/retrieve_data.py index 56e0a63..110edfb 100644 --- a/web/scripts/retrieve_data.py +++ b/web/scripts/retrieve_data.py @@ -162,6 +162,54 @@ ("Workforce: Tech Team 1 workers", lambda row: row["other_metrics"]["tt1_jobs"]["total"]), ]) +YEARLY_COUNT_MAPPING = OrderedDict([ + ("Publications: AI publications", lambda row: row["articles"]["ai_publications"]["counts"]), + ("Publications: CV publications", lambda row: row["articles"]["cv_publications"]["counts"]), + ("Publications: NLP publications", lambda row: row["articles"]["nlp_publications"]["counts"]), + ("Publications: Robotics publications", lambda row: row["articles"]["robotics_publications"]["counts"]), + ("Publications: AI publications in top conferences", lambda row: row["articles"]["ai_pubs_top_conf"]["counts"]), + ("Patents: AI patents", lambda row: row["patents"]["ai_patents"]["counts"]), + ("Patents: AI use cases: Agriculture", lambda row: row["patents"]["Agricultural"]["counts"]), + ("Patents: AI use cases: Banking and finance", lambda row: row["patents"]["Banking_and_Finance"]["counts"]), + ("Patents: AI use cases: Business", lambda row: row["patents"]["Business"]["counts"]), + ("Patents: AI use cases: Computing in government", lambda row: row["patents"]["Computing_in_Government"]["counts"]), + ("Patents: AI use cases: Document management and publishing", + lambda row: row["patents"]["Document_Mgt_and_Publishing"]["counts"]), + ("Patents: AI use cases: Education", lambda row: row["patents"]["Education"]["counts"]), + ("Patents: AI use cases: Energy", lambda row: row["patents"]["Energy_Management"]["counts"]), + ("Patents: AI use cases: Entertainment", lambda row: row["patents"]["Entertainment"]["counts"]), + ("Patents: AI use cases: Industry and manufacturing", + lambda row: row["patents"]["Industrial_and_Manufacturing"]["counts"]), + ("Patents: AI use cases: Life sciences", lambda row: row["patents"]["Life_Sciences"]["counts"]), + ("Patents: AI use cases: Military", lambda row: row["patents"]["Military"]["counts"]), + ("Patents: AI use cases: Nanotechnology", lambda row: row["patents"]["Nanotechnology"]["counts"]), + ("Patents: AI use cases: Networking", lambda row: row["patents"]["Networks__eg_social_IOT_etc"]["counts"]), + ("Patents: AI use cases: Personal devices and computing", + lambda row: row["patents"]["Personal_Devices_and_Computing"]["counts"]), + ("Patents: AI use cases: Physical sciences and engineering", + lambda row: row["patents"]["Physical_Sciences_and_Engineering"]["counts"]), + ("Patents: AI use cases: Security", lambda row: row["patents"]["Security__eg_cybersecurity"]["counts"]), + ("Patents: AI use cases: Semiconductors", lambda row: row["patents"]["Semiconductors"]["counts"]), + ("Patents: AI use cases: Telecommunications", lambda row: row["patents"]["Telecommunications"]["counts"]), + ("Patents: AI use cases: Transportation", lambda row: row["patents"]["Transportation"]["counts"]), + ("Patents: AI applications and techniques: Analytics and algorithms", + lambda row: row["patents"]["Analytics_and_Algorithms"]["counts"]), + ("Patents: AI applications and techniques: Computer vision", lambda row: row["patents"]["Computer_Vision"]["counts"]), + ("Patents: AI applications and techniques: Control", lambda row: row["patents"]["Control"]["counts"]), + ("Patents: AI applications and techniques: Distributed AI", lambda row: row["patents"]["Distributed_AI"]["counts"]), + ("Patents: AI applications and techniques: Knowledge representation", + lambda row: row["patents"]["Knowledge_Representation"]["counts"]), + ("Patents: AI applications and techniques: Language processing", + lambda row: row["patents"]["Language_Processing"]["counts"]), + ("Patents: AI applications and techniques: Measuring and testing", + lambda row: row["patents"]["Measuring_and_Testing"]["counts"]), + ("Patents: AI applications and techniques: Planning and scheduling", + lambda row: row["patents"]["Planning_and_Scheduling"]["counts"]), + ("Patents: AI applications and techniques: Robotics", lambda row: row["patents"]["Robotics"]["counts"]), + ("Patents: AI applications and techniques: Speech processing", + lambda row: row["patents"]["Speech_Processing"]["counts"]), +]) + ### END CONSTANTS ### @@ -897,7 +945,7 @@ def clean_link(link: str) -> str: return link -def clean(refresh_images: bool, refresh_sectors: bool) -> dict: +def clean(refresh_images: bool, refresh_sectors: bool) -> tuple: """ Reads and cleans the raw data from the local cache :param refresh_images: if true, will re-download all the company logos from crunchbase; don't call with true @@ -1067,10 +1115,35 @@ def get_extra_org_meta() -> dict: return extra_meta -def update_data_delivery(clean_company_rows: dict) -> None: +def write_yearly_counts(rows: list, output_file: str) -> None: + """ + Write csv containing yearly counts of categories of publication and patents + :param rows: Company metadata + :param output_file: File where the outputs should be written + :return: None + """ + with open(output_file, mode="w") as out: + fieldnames = ["Name", "ID", "Category", "Year", "Value"] + writer = csv.DictWriter(out, fieldnames=fieldnames) + writer.writeheader() + for row in rows: + counts = {new_name: get(row) for new_name, get in YEARLY_COUNT_MAPPING.items()} + for category in counts: + for year, value in zip(YEARS, counts[category]): + output_row = { + "ID": row["cset_id"], + "Name": row["name"], + "Category": category, + "Year": year, + "Value": value + } + writer.writerow(output_row) + + +def update_data_delivery(clean_company_rows: list) -> None: """ Updates data delivery for Zenodo - :param group_data: list of clean metadata for each row + :param clean_company_rows: list of clean metadata for each row :return: None """ print("retrieving metadata") @@ -1079,6 +1152,7 @@ def update_data_delivery(clean_company_rows: dict) -> None: ids_file = "id.csv" aliases_file = "alias.csv" ticker_file = "ticker.csv" + yearly_counts_file = "yearly_publication_counts.csv" extra_org_meta = get_extra_org_meta() with open(os.path.join(td, core_file), mode="w") as out: fieldnames = list(CORE_COLUMN_MAPPING.keys())+list(extra_org_meta[list(extra_org_meta.keys())[0]].keys())+["PARAT link"] @@ -1090,6 +1164,7 @@ def update_data_delivery(clean_company_rows: dict) -> None: slugified_name = slugify(reformatted_row["Name"].replace("/", "").replace("'", "")) reformatted_row["PARAT link"] = f"https://parat.eto.tech/company/{reformatted_row['ID']}-{slugified_name}" writer.writerow(reformatted_row) + write_yearly_counts(clean_company_rows, os.path.join(td, yearly_counts_file)) write_query_to_csv( """ SELECT @@ -1135,7 +1210,7 @@ def update_data_delivery(clean_company_rows: dict) -> None: ) download_name = f"parat_data_{datetime.now().strftime('%Y%m%d')}" with zipfile.ZipFile(f"{download_name}.zip", "w") as zip: - for out_csv in [core_file, ids_file, aliases_file, ticker_file]: + for out_csv in [core_file, ids_file, aliases_file, ticker_file, yearly_counts_file]: zip.write(os.path.join(td, out_csv), os.path.join(download_name, out_csv)) From 2d3207608a5fcf609642d8dd409f2716049ef293 Mon Sep 17 00:00:00 2001 From: Jennifer Melot Date: Wed, 17 Jul 2024 16:31:56 -0400 Subject: [PATCH 2/2] Add parat links, completeness column, and split dataset and category --- web/scripts/retrieve_data.py | 119 ++++++++++++++++++++--------------- 1 file changed, 69 insertions(+), 50 deletions(-) diff --git a/web/scripts/retrieve_data.py b/web/scripts/retrieve_data.py index 110edfb..e9ebe17 100644 --- a/web/scripts/retrieve_data.py +++ b/web/scripts/retrieve_data.py @@ -162,51 +162,54 @@ ("Workforce: Tech Team 1 workers", lambda row: row["other_metrics"]["tt1_jobs"]["total"]), ]) -YEARLY_COUNT_MAPPING = OrderedDict([ - ("Publications: AI publications", lambda row: row["articles"]["ai_publications"]["counts"]), - ("Publications: CV publications", lambda row: row["articles"]["cv_publications"]["counts"]), - ("Publications: NLP publications", lambda row: row["articles"]["nlp_publications"]["counts"]), - ("Publications: Robotics publications", lambda row: row["articles"]["robotics_publications"]["counts"]), - ("Publications: AI publications in top conferences", lambda row: row["articles"]["ai_pubs_top_conf"]["counts"]), - ("Patents: AI patents", lambda row: row["patents"]["ai_patents"]["counts"]), - ("Patents: AI use cases: Agriculture", lambda row: row["patents"]["Agricultural"]["counts"]), - ("Patents: AI use cases: Banking and finance", lambda row: row["patents"]["Banking_and_Finance"]["counts"]), - ("Patents: AI use cases: Business", lambda row: row["patents"]["Business"]["counts"]), - ("Patents: AI use cases: Computing in government", lambda row: row["patents"]["Computing_in_Government"]["counts"]), - ("Patents: AI use cases: Document management and publishing", +PUBLICATION_YEARLY_COUNT_MAPPING = OrderedDict([ + ("AI publications", lambda row: row["articles"]["ai_publications"]["counts"]), + ("CV publications", lambda row: row["articles"]["cv_publications"]["counts"]), + ("NLP publications", lambda row: row["articles"]["nlp_publications"]["counts"]), + ("Robotics publications", lambda row: row["articles"]["robotics_publications"]["counts"]), + ("AI publications in top conferences", lambda row: row["articles"]["ai_pubs_top_conf"]["counts"]), +]) + +PATENT_YEARLY_COUNT_MAPPING = OrderedDict([ + ("AI patents", lambda row: row["patents"]["ai_patents"]["counts"]), + ("AI use cases: Agriculture", lambda row: row["patents"]["Agricultural"]["counts"]), + ("AI use cases: Banking and finance", lambda row: row["patents"]["Banking_and_Finance"]["counts"]), + ("AI use cases: Business", lambda row: row["patents"]["Business"]["counts"]), + ("AI use cases: Computing in government", lambda row: row["patents"]["Computing_in_Government"]["counts"]), + ("AI use cases: Document management and publishing", lambda row: row["patents"]["Document_Mgt_and_Publishing"]["counts"]), - ("Patents: AI use cases: Education", lambda row: row["patents"]["Education"]["counts"]), - ("Patents: AI use cases: Energy", lambda row: row["patents"]["Energy_Management"]["counts"]), - ("Patents: AI use cases: Entertainment", lambda row: row["patents"]["Entertainment"]["counts"]), - ("Patents: AI use cases: Industry and manufacturing", + ("AI use cases: Education", lambda row: row["patents"]["Education"]["counts"]), + ("AI use cases: Energy", lambda row: row["patents"]["Energy_Management"]["counts"]), + ("AI use cases: Entertainment", lambda row: row["patents"]["Entertainment"]["counts"]), + ("AI use cases: Industry and manufacturing", lambda row: row["patents"]["Industrial_and_Manufacturing"]["counts"]), - ("Patents: AI use cases: Life sciences", lambda row: row["patents"]["Life_Sciences"]["counts"]), - ("Patents: AI use cases: Military", lambda row: row["patents"]["Military"]["counts"]), - ("Patents: AI use cases: Nanotechnology", lambda row: row["patents"]["Nanotechnology"]["counts"]), - ("Patents: AI use cases: Networking", lambda row: row["patents"]["Networks__eg_social_IOT_etc"]["counts"]), - ("Patents: AI use cases: Personal devices and computing", + ("AI use cases: Life sciences", lambda row: row["patents"]["Life_Sciences"]["counts"]), + ("AI use cases: Military", lambda row: row["patents"]["Military"]["counts"]), + ("AI use cases: Nanotechnology", lambda row: row["patents"]["Nanotechnology"]["counts"]), + ("AI use cases: Networking", lambda row: row["patents"]["Networks__eg_social_IOT_etc"]["counts"]), + ("AI use cases: Personal devices and computing", lambda row: row["patents"]["Personal_Devices_and_Computing"]["counts"]), - ("Patents: AI use cases: Physical sciences and engineering", + ("AI use cases: Physical sciences and engineering", lambda row: row["patents"]["Physical_Sciences_and_Engineering"]["counts"]), - ("Patents: AI use cases: Security", lambda row: row["patents"]["Security__eg_cybersecurity"]["counts"]), - ("Patents: AI use cases: Semiconductors", lambda row: row["patents"]["Semiconductors"]["counts"]), - ("Patents: AI use cases: Telecommunications", lambda row: row["patents"]["Telecommunications"]["counts"]), - ("Patents: AI use cases: Transportation", lambda row: row["patents"]["Transportation"]["counts"]), - ("Patents: AI applications and techniques: Analytics and algorithms", + ("AI use cases: Security", lambda row: row["patents"]["Security__eg_cybersecurity"]["counts"]), + ("AI use cases: Semiconductors", lambda row: row["patents"]["Semiconductors"]["counts"]), + ("AI use cases: Telecommunications", lambda row: row["patents"]["Telecommunications"]["counts"]), + ("AI use cases: Transportation", lambda row: row["patents"]["Transportation"]["counts"]), + ("AI applications and techniques: Analytics and algorithms", lambda row: row["patents"]["Analytics_and_Algorithms"]["counts"]), - ("Patents: AI applications and techniques: Computer vision", lambda row: row["patents"]["Computer_Vision"]["counts"]), - ("Patents: AI applications and techniques: Control", lambda row: row["patents"]["Control"]["counts"]), - ("Patents: AI applications and techniques: Distributed AI", lambda row: row["patents"]["Distributed_AI"]["counts"]), - ("Patents: AI applications and techniques: Knowledge representation", + ("AI applications and techniques: Computer vision", lambda row: row["patents"]["Computer_Vision"]["counts"]), + ("AI applications and techniques: Control", lambda row: row["patents"]["Control"]["counts"]), + ("AI applications and techniques: Distributed AI", lambda row: row["patents"]["Distributed_AI"]["counts"]), + ("AI applications and techniques: Knowledge representation", lambda row: row["patents"]["Knowledge_Representation"]["counts"]), - ("Patents: AI applications and techniques: Language processing", + ("AI applications and techniques: Language processing", lambda row: row["patents"]["Language_Processing"]["counts"]), - ("Patents: AI applications and techniques: Measuring and testing", + ("AI applications and techniques: Measuring and testing", lambda row: row["patents"]["Measuring_and_Testing"]["counts"]), - ("Patents: AI applications and techniques: Planning and scheduling", + ("AI applications and techniques: Planning and scheduling", lambda row: row["patents"]["Planning_and_Scheduling"]["counts"]), - ("Patents: AI applications and techniques: Robotics", lambda row: row["patents"]["Robotics"]["counts"]), - ("Patents: AI applications and techniques: Speech processing", + ("AI applications and techniques: Robotics", lambda row: row["patents"]["Robotics"]["counts"]), + ("AI applications and techniques: Speech processing", lambda row: row["patents"]["Speech_Processing"]["counts"]), ]) @@ -1115,6 +1118,18 @@ def get_extra_org_meta() -> dict: return extra_meta +def get_parat_link(parat_id: str, name: str) -> str: + """ + Get link to parat detail view of a company + :param parat_id: ID of company in parat + :param name: Name of company in parat + :return: Link to detail view + """ + slugified_name = slugify(name.replace("/", "").replace("'", "")) + link = f"https://parat.eto.tech/company/{parat_id}-{slugified_name}" + return link + + def write_yearly_counts(rows: list, output_file: str) -> None: """ Write csv containing yearly counts of categories of publication and patents @@ -1123,21 +1138,26 @@ def write_yearly_counts(rows: list, output_file: str) -> None: :return: None """ with open(output_file, mode="w") as out: - fieldnames = ["Name", "ID", "Category", "Year", "Value"] + fieldnames = ["Name", "ID", "PARAT link", "Dataset", "Category", "Year", "Value", "Complete"] writer = csv.DictWriter(out, fieldnames=fieldnames) writer.writeheader() for row in rows: - counts = {new_name: get(row) for new_name, get in YEARLY_COUNT_MAPPING.items()} - for category in counts: - for year, value in zip(YEARS, counts[category]): - output_row = { - "ID": row["cset_id"], - "Name": row["name"], - "Category": category, - "Year": year, - "Value": value - } - writer.writerow(output_row) + for dataset, end_year, count_mapping in [("Publications", END_ARTICLE_YEAR, PUBLICATION_YEARLY_COUNT_MAPPING), + ("Patents", END_PATENT_YEAR, PATENT_YEARLY_COUNT_MAPPING)]: + counts = {new_name: get(row) for new_name, get in count_mapping.items()} + for category in counts: + for year, value in zip(YEARS, counts[category]): + output_row = { + "ID": row["cset_id"], + "Name": row["name"], + "PARAT link": get_parat_link(row["cset_id"], row["name"]), + "Dataset": dataset, + "Category": category, + "Year": year, + "Value": value, + "Complete": year <= end_year, + } + writer.writerow(output_row) def update_data_delivery(clean_company_rows: list) -> None: @@ -1161,8 +1181,7 @@ def update_data_delivery(clean_company_rows: list) -> None: for row in clean_company_rows: reformatted_row = {new_name: get(row) for new_name, get in CORE_COLUMN_MAPPING.items()} reformatted_row.update(extra_org_meta.get(reformatted_row["ID"], set())) - slugified_name = slugify(reformatted_row["Name"].replace("/", "").replace("'", "")) - reformatted_row["PARAT link"] = f"https://parat.eto.tech/company/{reformatted_row['ID']}-{slugified_name}" + reformatted_row["PARAT link"] = get_parat_link(reformatted_row["ID"], reformatted_row["Name"]) writer.writerow(reformatted_row) write_yearly_counts(clean_company_rows, os.path.join(td, yearly_counts_file)) write_query_to_csv(