Skip to content

Commit

Permalink
Merge pull request #480 from georgetown-cset/479-yearly-counts
Browse files Browse the repository at this point in the history
Add yearly counts
  • Loading branch information
brianlove authored Jul 18, 2024
2 parents e928537 + 2d32076 commit 0b73039
Showing 1 changed file with 100 additions and 6 deletions.
106 changes: 100 additions & 6 deletions web/scripts/retrieve_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -162,6 +162,57 @@
("Workforce: Tech Team 1 workers", lambda row: row["other_metrics"]["tt1_jobs"]["total"]),
])

def _publication_counts_getter(field):
    """
    Build an accessor for one publication category's yearly counts
    :param field: Key under row["articles"] holding the category
    :return: Function that takes a company row and returns row["articles"][field]["counts"]
    """
    def get_counts(row):
        return row["articles"][field]["counts"]
    return get_counts


# Maps the category label written to the yearly counts csv to a function that
# pulls that category's "counts" value out of a company row
PUBLICATION_YEARLY_COUNT_MAPPING = OrderedDict(
    (label, _publication_counts_getter(field))
    for label, field in (
        ("AI publications", "ai_publications"),
        ("CV publications", "cv_publications"),
        ("NLP publications", "nlp_publications"),
        ("Robotics publications", "robotics_publications"),
        ("AI publications in top conferences", "ai_pubs_top_conf"),
    )
)

def _patent_counts_getter(field):
    """
    Build an accessor for one patent category's yearly counts
    :param field: Key under row["patents"] holding the category
    :return: Function that takes a company row and returns row["patents"][field]["counts"]
    """
    def get_counts(row):
        return row["patents"][field]["counts"]
    return get_counts


# Maps the category label written to the yearly counts csv to a function that
# pulls that category's "counts" value out of a company row
PATENT_YEARLY_COUNT_MAPPING = OrderedDict(
    (label, _patent_counts_getter(field))
    for label, field in (
        ("AI patents", "ai_patents"),
        ("AI use cases: Agriculture", "Agricultural"),
        ("AI use cases: Banking and finance", "Banking_and_Finance"),
        ("AI use cases: Business", "Business"),
        ("AI use cases: Computing in government", "Computing_in_Government"),
        ("AI use cases: Document management and publishing", "Document_Mgt_and_Publishing"),
        ("AI use cases: Education", "Education"),
        ("AI use cases: Energy", "Energy_Management"),
        ("AI use cases: Entertainment", "Entertainment"),
        ("AI use cases: Industry and manufacturing", "Industrial_and_Manufacturing"),
        ("AI use cases: Life sciences", "Life_Sciences"),
        ("AI use cases: Military", "Military"),
        ("AI use cases: Nanotechnology", "Nanotechnology"),
        ("AI use cases: Networking", "Networks__eg_social_IOT_etc"),
        ("AI use cases: Personal devices and computing", "Personal_Devices_and_Computing"),
        ("AI use cases: Physical sciences and engineering", "Physical_Sciences_and_Engineering"),
        ("AI use cases: Security", "Security__eg_cybersecurity"),
        ("AI use cases: Semiconductors", "Semiconductors"),
        ("AI use cases: Telecommunications", "Telecommunications"),
        ("AI use cases: Transportation", "Transportation"),
        ("AI applications and techniques: Analytics and algorithms", "Analytics_and_Algorithms"),
        ("AI applications and techniques: Computer vision", "Computer_Vision"),
        ("AI applications and techniques: Control", "Control"),
        ("AI applications and techniques: Distributed AI", "Distributed_AI"),
        ("AI applications and techniques: Knowledge representation", "Knowledge_Representation"),
        ("AI applications and techniques: Language processing", "Language_Processing"),
        ("AI applications and techniques: Measuring and testing", "Measuring_and_Testing"),
        ("AI applications and techniques: Planning and scheduling", "Planning_and_Scheduling"),
        ("AI applications and techniques: Robotics", "Robotics"),
        ("AI applications and techniques: Speech processing", "Speech_Processing"),
    )
)

### END CONSTANTS ###


Expand Down Expand Up @@ -897,7 +948,7 @@ def clean_link(link: str) -> str:
return link


def clean(refresh_images: bool, refresh_sectors: bool) -> dict:
def clean(refresh_images: bool, refresh_sectors: bool) -> tuple:
"""
Reads and cleans the raw data from the local cache
:param refresh_images: if true, will re-download all the company logos from crunchbase; don't call with true
Expand Down Expand Up @@ -1067,10 +1118,52 @@ def get_extra_org_meta() -> dict:
return extra_meta


def update_data_delivery(clean_company_rows: dict) -> None:
def get_parat_link(parat_id: str, name: str) -> str:
    """
    Build the URL of a company's detail view in parat
    :param parat_id: ID of company in parat
    :param name: Name of company in parat
    :return: Link to detail view
    """
    # Remove "/" and "'" from the name before it is slugified
    stripped_name = name.translate(str.maketrans("", "", "/'"))
    return f"https://parat.eto.tech/company/{parat_id}-{slugify(stripped_name)}"


def write_yearly_counts(rows: list, output_file: str) -> None:
    """
    Write csv containing yearly counts of categories of publication and patents.
    One output line is written per company, dataset, category and year.
    :param rows: Company metadata; each row is read for "cset_id", "name", and the count
        sequences selected by PUBLICATION_YEARLY_COUNT_MAPPING and PATENT_YEARLY_COUNT_MAPPING
    :param output_file: File where the outputs should be written
    :return: None
    """
    fieldnames = ["Name", "ID", "PARAT link", "Dataset", "Category", "Year", "Value", "Complete"]
    # (dataset label, last year flagged as complete, category -> counts getter)
    datasets = [
        ("Publications", END_ARTICLE_YEAR, PUBLICATION_YEARLY_COUNT_MAPPING),
        ("Patents", END_PATENT_YEAR, PATENT_YEARLY_COUNT_MAPPING),
    ]
    # newline="" lets the csv module control line endings itself; without it the
    # writer's "\r\n" terminator becomes "\r\r\n" on platforms that translate "\n"
    with open(output_file, mode="w", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=fieldnames)
        writer.writeheader()
        for row in rows:
            # These values are the same for every line written for this company
            company_id = row["cset_id"]
            company_name = row["name"]
            parat_link = get_parat_link(company_id, company_name)
            for dataset, end_year, count_mapping in datasets:
                for category, get_counts in count_mapping.items():
                    # Counts are paired with YEARS by position; zip stops at the shorter one
                    for year, value in zip(YEARS, get_counts(row)):
                        writer.writerow({
                            "ID": company_id,
                            "Name": company_name,
                            "PARAT link": parat_link,
                            "Dataset": dataset,
                            "Category": category,
                            "Year": year,
                            "Value": value,
                            # True for years up to and including the dataset's end year
                            "Complete": year <= end_year,
                        })


def update_data_delivery(clean_company_rows: list) -> None:
"""
Updates data delivery for Zenodo
:param group_data: list of clean metadata for each row
:param clean_company_rows: list of clean metadata for each row
:return: None
"""
print("retrieving metadata")
Expand All @@ -1079,6 +1172,7 @@ def update_data_delivery(clean_company_rows: dict) -> None:
ids_file = "id.csv"
aliases_file = "alias.csv"
ticker_file = "ticker.csv"
yearly_counts_file = "yearly_publication_counts.csv"
extra_org_meta = get_extra_org_meta()
with open(os.path.join(td, core_file), mode="w") as out:
fieldnames = list(CORE_COLUMN_MAPPING.keys())+list(extra_org_meta[list(extra_org_meta.keys())[0]].keys())+["PARAT link"]
Expand All @@ -1087,9 +1181,9 @@ def update_data_delivery(clean_company_rows: dict) -> None:
for row in clean_company_rows:
reformatted_row = {new_name: get(row) for new_name, get in CORE_COLUMN_MAPPING.items()}
reformatted_row.update(extra_org_meta.get(reformatted_row["ID"], set()))
slugified_name = slugify(reformatted_row["Name"].replace("/", "").replace("'", ""))
reformatted_row["PARAT link"] = f"https://parat.eto.tech/company/{reformatted_row['ID']}-{slugified_name}"
reformatted_row["PARAT link"] = get_parat_link(reformatted_row["ID"], reformatted_row["Name"])
writer.writerow(reformatted_row)
write_yearly_counts(clean_company_rows, os.path.join(td, yearly_counts_file))
write_query_to_csv(
"""
SELECT
Expand Down Expand Up @@ -1135,7 +1229,7 @@ def update_data_delivery(clean_company_rows: dict) -> None:
)
download_name = f"parat_data_{datetime.now().strftime('%Y%m%d')}"
with zipfile.ZipFile(f"{download_name}.zip", "w") as zip:
for out_csv in [core_file, ids_file, aliases_file, ticker_file]:
for out_csv in [core_file, ids_file, aliases_file, ticker_file, yearly_counts_file]:
zip.write(os.path.join(td, out_csv), os.path.join(download_name, out_csv))


Expand Down

0 comments on commit 0b73039

Please sign in to comment.