From 9b576899d469b290c1492e839cd6bb315b5b820d Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 13 Dec 2022 10:16:59 +0000 Subject: [PATCH 1/2] move affiliation_outcomes.py down in pipeline - Check files lying in between; none of them uses output from affiliation_outcomes.py --- src/dataprep/pipeline.sh | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/dataprep/pipeline.sh b/src/dataprep/pipeline.sh index f8c7ac5..7b8957e 100644 --- a/src/dataprep/pipeline.sh +++ b/src/dataprep/pipeline.sh @@ -38,8 +38,6 @@ python3 -m $script_path.prep_mag.read_collab &> $logfile_path/read_collab.log python3 $script_path/prep_mag/prep_affiliations.py &> $logfile_path/prep_affiliations.log -python3 -m $script_path.prep_mag.affiliation_outcomes &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end - python3 $script_path/prep_mag/prep_citations.py &> $logfile_path/prep_citations.log python3 $script_path/prep_mag/paper_outcomes.py &> $logfile_path/paper_outcomes.log @@ -50,6 +48,9 @@ python3 $script_path/prep_mag/author_info_linking.py --years_first_field 7 \ python -m $script_path.prep_mag.author_field0 \ &> $logfile_path/author_field0.log +python3 -m $script_path.prep_mag.affiliation_outcomes &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end + + # ## Consolidate gender per author in author_sample python3 $script_path/prep_mag/author_gender.py &> $logfile_path/author_gender.log From 78bc95d577d619cfef0299f8191ed3171a8544a0 Mon Sep 17 00:00:00 2001 From: f-hafner Date: Tue, 13 Dec 2022 19:41:47 +0000 Subject: [PATCH 2/2] create tables for outcomes and field of affil --- .../main/prep_mag/affiliation_outcomes.py | 133 ++++++++++++++---- src/dataprep/pipeline.sh | 3 +- src/dataprep/temp/affiliation_outcomes.log | 9 +- 3 files changed, 114 insertions(+), 31 deletions(-) diff --git a/src/dataprep/main/prep_mag/affiliation_outcomes.py b/src/dataprep/main/prep_mag/affiliation_outcomes.py index d86d087..3710eb2 100644 --- a/src/dataprep/main/prep_mag/affiliation_outcomes.py +++ b/src/dataprep/main/prep_mag/affiliation_outcomes.py @@ -4,57 +4,138 @@ """ Script affiliation_outcomes.py -Generate tables: -- affiliation_outcomes: some features at the affiliation level. - - number of journal articles published per affiliation. In contrast to table Affiliations, - consider only keep_doctypes +Generate tables: at the affiliation-year-field0 level +- affiliation_outcomes: publication outcomes + - number of journal articles published and 10-year forward citations per + affiliation-year-field0. Field0 is assigned from the Field0 of the published paper +- affiliation_fields: keywords of published papers + - fields of study of the published paper NOTE: in the long run, we may consider to move this to prep_affiliations.py, or unify them in a new file. """ -# TODO -# add some stats on the "concentration" per author-year? -# most likely much more concentrated than fields, so not doing for now - +import argparse import sqlite3 as sqlite import warnings import time from helpers.functions import analyze_db -from helpers.variables import db_file, insert_questionmark_doctypes, keep_doctypes +from helpers.variables import db_file # ## Arguments +parser = argparse.ArgumentParser() +parser.add_argument("--fos_max_level", type=int, default=2, + help="Fields of study up to which level to include?") +args = parser.parse_args() # ## Variables; connect to db start_time = time.time() print(f"Start time: {start_time} \n") +interactive = False # Turn this on for only querying few records, ie for testing + con = sqlite.connect(database = db_file, isolation_level= None) -print("Making affiliation_outcomes table ...\n") +print("Creating temp table paper_affiliation_year") + +query_limit = "" +if interactive: + query_limit = "LIMIT 1000" + -con.execute("DROP TABLE IF EXISTS affiliation_outcomes") -con.execute(f"""CREATE TABLE affiliation_outcomes AS - SELECT AffiliationId, COUNT(DISTINCT PaperId) AS PublicationCount - from PaperAuthorAffiliations +with con as c: + c.execute(f""" + CREATE TEMP TABLE paper_affiliation_year AS + SELECT DISTINCT AffiliationId, Year, PaperId + FROM ( + SELECT a.AuthorId, a.AffiliationId, a.Year, b.Paperid + FROM AuthorAffiliation a -- ## if an author has 2 main affiliations in the same year, we count their papers at both institutions + INNER JOIN ( + SELECT PaperId, AuthorId, Year + FROM PaperAuthorUnique + INNER JOIN ( + SELECT PaperId, Year + FROM Papers + ) USING(PaperId) + {query_limit} + ) b + ON a.AuthorId=b.AuthorId AND a.Year=b.Year + -- reduces size of the data set INNER JOIN ( SELECT PaperId - FROM Papers - WHERE DocType IN ({insert_questionmark_doctypes}) - AND Year >= 1950 + FROM paper_outcomes ) USING(PaperId) + ) + """) + + c.execute("CREATE INDEX idx_paper_temp ON paper_affiliation_year (PaperId)") + + +print("Creating table affiliation_outcomes") + +with con as c: + c.execute("DROP TABLE IF EXISTS affiliation_outcomes") + + c.execute(""" + CREATE TABLE affiliation_outcomes AS + SELECT AffiliationId + , Year + , Field0 + , COUNT(PaperId) AS PaperCount + , SUM(CitationCount_y10) AS CitationCount_y10 + FROM paper_affiliation_year + INNER JOIN ( + SELECT PaperId, CitationCount_y10 + FROM paper_outcomes + ) USING(PaperId) + INNER JOIN ( + SELECT PaperId, Field0 + FROM PaperMainFieldsOfStudy + ) + USING(PaperId) + GROUP BY AffiliationId, Year, Field0 + """) + + c.execute("CREATE UNIQUE INDEX idx_affo_AffilYearField ON affiliation_outcomes (AffiliationId, Year, Field0)") + +print("Creating table affiliation_fields ") + +with con as c: + c.execute("DROP TABLE IF EXISTS affiliation_fields") + + c.execute(f""" + CREATE TABLE affiliation_fields AS + SELECT AffiliationId + , Field0 + , Year + , FieldOfStudyId + , SUM(Score) AS Score + FROM paper_affiliation_year + INNER JOIN ( + SELECT PaperId, FieldOfStudyId, Score + FROM PaperFieldsOfStudy INNER JOIN ( - SELECT AffiliationId - FROM Affiliations - ) USING (AffiliationId) - GROUP BY (AffiliationId) - """, - (keep_doctypes) - ) -con.execute("CREATE UNIQUE INDEX idx_affo_AffiliationId ON affiliation_outcomes (AffiliationId ASC)") + SELECT FieldOfStudyId + FROM FieldsOfStudy + WHERE level <= {args.fos_max_level} + ) USING(FieldOfStudyId) + ) USING(PaperId) + INNER JOIN ( + SELECT PaperId, Field0 + FROM PaperMainFieldsOfStudy + ) USING(PaperId) + GROUP BY AffiliationId, FieldOfStudyId, Year, Field0 + """) + + c.execute("CREATE UNIQUE INDEX idx_afff_AffilFieldYearField ON affiliation_fields (AffiliationId, FieldOfStudyId, Year, Field0)") + c.execute("CREATE INDEX idx_afff_Year ON affiliation_fields (Year)") + c.execute("CREATE INDEX idx_afff_FoS ON affiliation_fields (FieldOfStudyId)") + # ## Run ANALYZE, finish -analyze_db(con) +with con as c: + analyze_db(c) + con.close() diff --git a/src/dataprep/pipeline.sh b/src/dataprep/pipeline.sh index 7b8957e..37c2d42 100644 --- a/src/dataprep/pipeline.sh +++ b/src/dataprep/pipeline.sh @@ -48,7 +48,8 @@ python3 $script_path/prep_mag/author_info_linking.py --years_first_field 7 \ python -m $script_path.prep_mag.author_field0 \ &> $logfile_path/author_field0.log -python3 -m $script_path.prep_mag.affiliation_outcomes &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end +python3 -m $script_path.prep_mag.affiliation_outcomes --fos_max_level 2 \ + &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end # ## Consolidate gender per author in author_sample diff --git a/src/dataprep/temp/affiliation_outcomes.log b/src/dataprep/temp/affiliation_outcomes.log index 8e50d93..082685b 100644 --- a/src/dataprep/temp/affiliation_outcomes.log +++ b/src/dataprep/temp/affiliation_outcomes.log @@ -1,7 +1,8 @@ -Start time: 1655815997.1026988 - -Making affiliation_outcomes table ... +Start time: 1670939995.9846091 +Creating temp table paper_affiliation_year +Creating table affiliation_outcomes +Creating table affiliation_fields Running ANALYZE... -Done in 81.3799677491188 minutes. +Done in 89.05228799978892 minutes.