From 9b576899d469b290c1492e839cd6bb315b5b820d Mon Sep 17 00:00:00 2001
From: f-hafner <hafner.flavio@gmail.com>
Date: Tue, 13 Dec 2022 10:16:59 +0000
Subject: [PATCH 1/2] move affiliation_outcomes.py down in pipeline

- Checked the scripts between the old and new position; none of them uses output from
  affiliation_outcomes.py
---
 src/dataprep/pipeline.sh | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/dataprep/pipeline.sh b/src/dataprep/pipeline.sh
index f8c7ac5..7b8957e 100644
--- a/src/dataprep/pipeline.sh
+++ b/src/dataprep/pipeline.sh
@@ -38,8 +38,6 @@ python3 -m $script_path.prep_mag.read_collab &> $logfile_path/read_collab.log
 
 python3 $script_path/prep_mag/prep_affiliations.py &> $logfile_path/prep_affiliations.log
 
-python3 -m $script_path.prep_mag.affiliation_outcomes &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end
-
 python3 $script_path/prep_mag/prep_citations.py &> $logfile_path/prep_citations.log
 
 python3 $script_path/prep_mag/paper_outcomes.py &> $logfile_path/paper_outcomes.log
@@ -50,6 +48,9 @@ python3 $script_path/prep_mag/author_info_linking.py --years_first_field 7 \
 python -m $script_path.prep_mag.author_field0 \
     &> $logfile_path/author_field0.log
 
+python3 -m $script_path.prep_mag.affiliation_outcomes &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end
+
+
 # ## Consolidate gender per author in author_sample 
 python3 $script_path/prep_mag/author_gender.py &> $logfile_path/author_gender.log
 

From 78bc95d577d619cfef0299f8191ed3171a8544a0 Mon Sep 17 00:00:00 2001
From: f-hafner <hafner.flavio@gmail.com>
Date: Tue, 13 Dec 2022 19:41:47 +0000
Subject: [PATCH 2/2] create tables for outcomes and field of affil

---
 .../main/prep_mag/affiliation_outcomes.py     | 133 ++++++++++++++----
 src/dataprep/pipeline.sh                      |   3 +-
 src/dataprep/temp/affiliation_outcomes.log    |   9 +-
 3 files changed, 114 insertions(+), 31 deletions(-)

diff --git a/src/dataprep/main/prep_mag/affiliation_outcomes.py b/src/dataprep/main/prep_mag/affiliation_outcomes.py
index d86d087..3710eb2 100644
--- a/src/dataprep/main/prep_mag/affiliation_outcomes.py
+++ b/src/dataprep/main/prep_mag/affiliation_outcomes.py
@@ -4,57 +4,138 @@
 """
 Script affiliation_outcomes.py
 
-Generate tables:
-- affiliation_outcomes: some features at the affiliation level.
-    - number of journal articles published per affiliation. In contrast to table Affiliations,
-        consider only keep_doctypes
+Generate tables at the affiliation-year-field0 level:
+- affiliation_outcomes: publication outcomes
+    - number of journal articles published and 10-year forward citations per 
+        affiliation-year-field0. Field0 is assigned from the Field0 of the published paper
+- affiliation_fields: keywords of published papers
+    - Score summed per field of study of the published papers (levels up to --fos_max_level)
 NOTE: in the long run, we may consider to move this to prep_affiliations.py, or unify them in a new file.
 """
 
-# TODO
-# add some stats on the "concentration" per author-year? 
-#   most likely much more concentrated than fields, so not doing for now
-
+import argparse 
 import sqlite3 as sqlite
 import warnings
 import time 
 from helpers.functions import analyze_db
-from helpers.variables import db_file, insert_questionmark_doctypes, keep_doctypes
+from helpers.variables import db_file
 
 
 # ## Arguments
+parser = argparse.ArgumentParser()
+parser.add_argument("--fos_max_level", type=int, default=2,
+                    help="Fields of study up to which level to include?")
+args = parser.parse_args()
 
 # ## Variables; connect to db
 start_time = time.time()
 print(f"Start time: {start_time} \n")
+interactive = False # Set to True to query only a few records (LIMIT 1000), i.e. for testing
+
 
 con = sqlite.connect(database = db_file, isolation_level= None)
 
-print("Making affiliation_outcomes table ...\n")
+print("Creating temp table paper_affiliation_year")
+
+query_limit = ""
+if interactive:
+    query_limit = "LIMIT 1000"
+
 
-con.execute("DROP TABLE IF EXISTS affiliation_outcomes")
-con.execute(f"""CREATE TABLE affiliation_outcomes AS 
-            SELECT AffiliationId, COUNT(DISTINCT PaperId) AS PublicationCount
-            from PaperAuthorAffiliations
+with con as c:
+    c.execute(f"""
+        CREATE TEMP TABLE paper_affiliation_year AS 
+        SELECT DISTINCT AffiliationId, Year, PaperId
+        FROM (
+            SELECT a.AuthorId, a.AffiliationId, a.Year, b.Paperid
+            FROM AuthorAffiliation a -- ## if an author has 2 main affiliations in the same year, we count their papers at both institutions
+            INNER JOIN (
+                SELECT PaperId, AuthorId, Year
+                FROM PaperAuthorUnique
+                INNER JOIN (
+                    SELECT PaperId, Year
+                    FROM Papers
+                ) USING(PaperId)
+                {query_limit}
+            ) b
+            ON a.AuthorId=b.AuthorId AND a.Year=b.Year
+            -- reduces size of the data set 
             INNER JOIN (
                 SELECT PaperId
-                FROM Papers 
-                WHERE DocType IN ({insert_questionmark_doctypes})
-                    AND Year >= 1950
+                FROM paper_outcomes
             ) USING(PaperId)
+        )
+    """)
+
+    c.execute("CREATE INDEX idx_paper_temp ON paper_affiliation_year (PaperId)")
+
+
+print("Creating table affiliation_outcomes")
+
+with con as c:
+    c.execute("DROP TABLE IF EXISTS affiliation_outcomes")
+
+    c.execute("""
+        CREATE TABLE affiliation_outcomes AS  
+        SELECT AffiliationId
+            , Year
+            , Field0
+            , COUNT(PaperId) AS PaperCount
+            , SUM(CitationCount_y10) AS CitationCount_y10
+        FROM paper_affiliation_year 
+        INNER JOIN (
+            SELECT PaperId, CitationCount_y10 
+            FROM paper_outcomes 
+        ) USING(PaperId)
+        INNER JOIN ( 
+            SELECT PaperId, Field0 
+            FROM PaperMainFieldsOfStudy
+        ) 
+        USING(PaperId)
+        GROUP BY AffiliationId, Year, Field0
+    """)
+
+    c.execute("CREATE UNIQUE INDEX idx_affo_AffilYearField ON affiliation_outcomes (AffiliationId, Year, Field0)")
+
+print("Creating table affiliation_fields ")
+
+with con as c:
+    c.execute("DROP TABLE IF EXISTS affiliation_fields")
+
+    c.execute(f"""
+        CREATE TABLE affiliation_fields AS 
+        SELECT AffiliationId
+            , Field0
+            , Year 
+            , FieldOfStudyId
+            , SUM(Score) AS Score
+        FROM paper_affiliation_year 
+        INNER JOIN (
+            SELECT PaperId, FieldOfStudyId, Score
+            FROM PaperFieldsOfStudy 
             INNER JOIN (
-                SELECT AffiliationId 
-                FROM Affiliations
-            ) USING (AffiliationId)
-            GROUP BY (AffiliationId)   
-            """,
-            (keep_doctypes)
-            )
-con.execute("CREATE UNIQUE INDEX idx_affo_AffiliationId ON affiliation_outcomes (AffiliationId ASC)")
+                SELECT FieldOfStudyId 
+                FROM FieldsOfStudy 
+                WHERE level <= {args.fos_max_level} 
+            ) USING(FieldOfStudyId)
+        ) USING(PaperId)
+        INNER JOIN ( 
+            SELECT PaperId, Field0 
+            FROM PaperMainFieldsOfStudy
+        ) USING(PaperId)
+        GROUP BY AffiliationId, FieldOfStudyId, Year, Field0
+    """)
+
+    c.execute("CREATE UNIQUE INDEX idx_afff_AffilFieldYearField ON affiliation_fields (AffiliationId, FieldOfStudyId, Year, Field0)")
+    c.execute("CREATE INDEX idx_afff_Year ON affiliation_fields (Year)")
+    c.execute("CREATE INDEX idx_afff_FoS ON affiliation_fields (FieldOfStudyId)")
+
 
 
 # ## Run ANALYZE, finish
-analyze_db(con)
+with con as c:
+    analyze_db(c)
+
 
 con.close()
 
diff --git a/src/dataprep/pipeline.sh b/src/dataprep/pipeline.sh
index 7b8957e..37c2d42 100644
--- a/src/dataprep/pipeline.sh
+++ b/src/dataprep/pipeline.sh
@@ -48,7 +48,8 @@ python3 $script_path/prep_mag/author_info_linking.py --years_first_field 7 \
 python -m $script_path.prep_mag.author_field0 \
     &> $logfile_path/author_field0.log
 
-python3 -m $script_path.prep_mag.affiliation_outcomes &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end
+python3 -m $script_path.prep_mag.affiliation_outcomes --fos_max_level 2 \
+    &> $logfile_path/affiliation_outcomes.log #note: script_path should omit the / at the end
 
 
 # ## Consolidate gender per author in author_sample 
diff --git a/src/dataprep/temp/affiliation_outcomes.log b/src/dataprep/temp/affiliation_outcomes.log
index 8e50d93..082685b 100644
--- a/src/dataprep/temp/affiliation_outcomes.log
+++ b/src/dataprep/temp/affiliation_outcomes.log
@@ -1,7 +1,8 @@
-Start time: 1655815997.1026988 
-
-Making affiliation_outcomes table ...
+Start time: 1670939995.9846091 
 
+Creating temp table paper_affiliation_year
+Creating table affiliation_outcomes
+Creating table affiliation_fields 
 Running ANALYZE... 
 
-Done in 81.3799677491188 minutes.
+Done in 89.05228799978892 minutes.