Skip to content

Commit

Permalink
Merge pull request #26 from f-hafner/christoph-labellink-advisors
Browse files Browse the repository at this point in the history
Christoph label+link advisors. Add missing graduates fields
  • Loading branch information
f-hafner authored Dec 8, 2022
2 parents 141c0a4 + 5332e0f commit b255b76
Show file tree
Hide file tree
Showing 64 changed files with 19,555 additions and 2,579 deletions.
Binary file modified output/quality_linking.pdf
Binary file not shown.
Binary file modified output/quality_linking_advisors.pdf
Binary file not shown.
73 changes: 58 additions & 15 deletions src/dataprep/main/link/advisors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,68 @@ keywords=False
fieldofstudy_cat=False
fieldofstudy_str=False
institution=True
fields=("art"
"biology"
"business"
"chemistry"
"computer science"
"economics"
"engineering"
"environmental science"
"geography"
"geology"
"history"
"materials science"
"mathematics"
"medicine"
"philosophy"
"physics"
"political science"
"psychology"
"sociology")

fields=("chemistry"
"sociology"
"mathematics"
"biology"
"computer science"
"political science"
"engineering"
"psychology"
"environmental science"
"physics"
"geology"
"geography"
"economics")
fields=("political science")

for i in "${!fields[@]}"; do
for i in "${!fields[@]}"; do
field=${fields[$i]}
echo ${field}
screen -dmS "advisors.${field}" sh main/link/link_onefield_advisors.sh $RECALL "$field" $train_name $institution $fieldofstudy_cat $fieldofstudy_str $keywords $logfile_path
echo "Started screen ..."
done
wait



# start_field_i() {
# fields=("art"
# "biology"
# "business"
# "chemistry"
# "computer science"
# "economics"
# "engineering"
# "environmental science"
# "geography"
# "geology"
# "history"
# "materials science"
# "mathematics"
# "medicine"
# "philosophy"
# "physics"
# "political science"
# "psychology"
# "sociology")
# field=${fields[$1]}
# echo $1
# echo ${field}
# screen -dmS "advisors.${field}" sh main/link/link_onefield_advisors.sh $RECALL "$field" $train_name $institution $fieldofstudy_cat $fieldofstudy_str $keywords $logfile_path &
# while screen -list | grep -q $"advisors.${field}"
# do
# sleep 1
# done
# echo "Started screen ..."
# wait
# }
# export -f start_field_i
# parallel -j 3 start_field_i ::: $(seq 0 18)

12 changes: 11 additions & 1 deletion src/dataprep/main/link/graduates.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,17 @@ fields=("chemistry"
"geology"
"geography"
"economics")


fields=("art"
"business"
"history"
"materials science"
"medicine"
"philosophy"
)
fields=("philosophy"
)


for i in "${!fields[@]}"; do
field=${fields[$i]}
Expand Down
3 changes: 1 addition & 2 deletions src/dataprep/main/link/prep_linked_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import pdb
import argparse

# ## Arguments
# ## Arguments
parser = argparse.ArgumentParser(description = 'Inputs for author_collab')
parser.add_argument("--filter_trainname",
Expand Down Expand Up @@ -145,7 +144,7 @@

# for now, do not condition on certain time distance between
# graduation year and whenever the supervisor has a publication.
# TODO: do this after gaining some insights in the analysis
# Do this on the fly after gaining some insights in the analysis

con.execute("CREATE UNIQUE INDEX idx_cla_AuthorIdrelid ON current_links_advisors (AuthorId ASC, relationship_id ASC)")
con.execute("CREATE UNIQUE INDEX idx_cla_relid ON current_links_advisors (relationship_id ASC)") # this is also a way to make sure there are not multiple links per goid
Expand Down
2 changes: 1 addition & 1 deletion src/dataprep/main/link/setup_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __eq__(self, other):

# ## Some settings
pd.set_option('display.max.columns', None)
path_dedupe_files = datapath + "DedupeFiles/flavio/issue-21/" # TODO: this needs to be fixed at the end and any new files copied to DedupeFiles/advisors
path_dedupe_files = datapath + "DedupeFiles/"
share_blockedpairs_training = 0.66 # fraction of similar pairs as opposed to random pairs

# register [adapter for numpy.int64](https://stackoverflow.com/questions/38753737/inserting-numpy-integer-types-into-sqlite-with-python3)
Expand Down
24 changes: 20 additions & 4 deletions src/dataprep/main/reports/quality_linking.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,26 @@ lapply(packages, library, character.only = TRUE)
datapath <- "/mnt/ssd/"
db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
select_fields <- c("physics", "biology", "chemistry", "sociology",
"economics", "political science", "psychology",
"mathematics", "geography", "geology", "engineering",
"computer science", "environmental science") # fields currently matched
select_fields <- c("art",
"biology",
"business",
"chemistry",
"computer science" ,
"economics",
"engineering",
"environmental science",
"geography",
"geology" ,
"history",
"materials science",
"mathematics",
"medicine",
"philosophy",
"physics",
"political science",
"psychology" ,
"sociology") # all fields are currently matched
date_method_change <- ymd("2022-07-01") # after summer we extended the sampling period and added more features
Expand Down
32 changes: 24 additions & 8 deletions src/dataprep/main/reports/quality_linking_advisors.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,25 @@ lapply(packages, library, character.only = TRUE)
datapath <- "/mnt/ssd/"
db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
select_fields <- c("physics", "biology", "chemistry", "sociology",
"economics", "political science", "psychology",
"mathematics", "geography", "geology", "engineering",
"computer science", "environmental science") # fields currently matched
select_fields <- c("art",
"biology",
"business",
"chemistry",
"computer science" ,
"economics",
"engineering",
"environmental science",
"geography",
"geology" ,
"history",
"materials science",
"mathematics",
#"medicine",
"philosophy",
"physics",
"political science",
"psychology" ,
"sociology") # fields currently matched
# ## db connection
Expand Down Expand Up @@ -137,10 +152,11 @@ linked_advisors %>%

```{r}
keep_fields <- c("biology", "chemistry", "computer science",
"economics", "engineering", "environmental science",
"geography", "geology", "mathetmatics", "physics",
"political science", "psychology", "sociology")
keep_fields <- select_fields
# c("biology", "chemistry", "computer science",
# "economics", "engineering", "environmental science",
#   "geography", "geology", "mathematics", "physics",
# "political science", "psychology", "sociology")
score_by_year <- theses %>%
filter(degree_year >= 1985) %>%
Expand Down
4 changes: 4 additions & 0 deletions src/dataprep/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ Rscript -e "rmarkdown::render('$script_path/reports/sample_size_linking.Rmd', ou
# ## 1. Link graduates to MAG
bash $script_path/link/graduates.sh $logfile_path

# Christoph retrained with the following options:
# --train_name "christoph_degree0" --keepyears "19852015"
# need to run the write_csv_links script with these options as well
# to get all links into db
python -m $script_path.link.write_csv_links --linking_type "graduates" --train_name "christoph_fielddegree0" \
&> $logfile_path/write_csv_links_graduates.log

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
Namespace(testing=False, verbose=1, field=['art'], train_name='christoph_degree0', startyear=1985, endyear=2015, loadstartyear=1985, loadendyear=2015, mergemode='1:1', recall=0.9, institution='False', fieldofstudy_cat='False', fieldofstudy_str='False', keywords='True', retrain='True', linking_type='graduates', samplesize=100000, write_to='csv')
Have max 12 cores available
Testing is False

I set the write connection to temporary database.
id_field is [142362112] and will be passed to sql queries.

SELECT goid
, year
, firstname
, lastname
, CASE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1))
WHEN
"" THEN NULL
ELSE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1))
END AS middlename
, fieldofstudy
, keywords
, institution
, coauthors
, year_papertitle
FROM (
SELECT goid
, degree_year AS year
, fullname
, SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1) AS firstname
, REPLACE(fullname, RTRIM(fullname, REPLACE(fullname, " ", "")), "") AS lastname
, TRIM(SUBSTR(fullname, length(SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1)) + 1)) AS middle_lastname
, length(fullname) AS l_fullname
, length(SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1) ) AS l_firstname
, length(REPLACE(fullname, RTRIM(fullname, REPLACE(fullname, " ", "")), "")) AS l_lastname
, fieldname AS fieldofstudy
, university_id
, degree_year || "//" || thesistitle as year_papertitle
FROM pq_authors
INNER JOIN (
SELECT goid, fieldname
FROM pq_fields_mag
WHERE mag_field0 IN (?)
) USING (goid)
)
-- ## NOTE: use left join here as not all graduates have advisor (particularly pre-1980) and possibly also keywords
LEFT JOIN (
SELECT goid
, fields as keywords
, advisors as coauthors
FROm pq_info_linking
) USING(goid)
INNER JOIN (
SELECT university_id, normalizedname as institution
FROM pq_unis
WHERE location like "%United States%"
) USING(university_id)
WHERE year >= 1985 and year <= 2015 AND length(firstname) > 1



SELECT f.AuthorId
, f.year
, f.firstname
, f.lastname
, CASE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1))
WHEN
"" THEN NULL
ELSE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1))
END as middlename
-- ## NOTE this gives "" for middlename when it is missing
, f.fieldofstudy
, g.keywords
, g.coauthors
, g.institution
, g.year_papertitle
FROM (
SELECT a.AuthorId
, a.YearFirstPub AS year
, a.FirstName AS firstname
, REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "") AS lastname
-- https://stackoverflow.com/questions/21388820/how-to-get-the-last-index-of-a-substring-in-sqlite
, TRIM(SUBSTR(b.NormalizedName, length(a.FirstName) + 1)) AS middle_lastname
-- this gives all except the first name
, length(b.NormalizedName) as l_fullname
, length(a.FirstName) as l_firstname
, length(REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "")) as l_lastname
, e.NormalizedName AS fieldofstudy
FROM author_sample AS a
INNER JOIN (
SELECT AuthorId, NormalizedName
FROM Authors
) AS b USING(AuthorId)
INNER JOIN (
SELECT AuthorId
FROM author_field0
WHERE FieldOfStudyId_lvl0 IN (?)
AND Degree <= 0
) USING(AuthorId)
LEFT JOIN (
SELECT AuthorId, NormalizedName
FROM author_fields c
INNER JOIN (
SELECT FieldOfStudyId, NormalizedName
FROM FieldsOfStudy
) AS d USING(FieldOfStudyId)
-- ## Condition on fieldofstudy being in the level 0 id_field
INNER JOIN (
SELECT ParentFieldOfStudyId, ChildFieldOfStudyId
FROM crosswalk_fields
WHERE ParentLevel = 0
AND ParentFieldOfStudyId IN (?)
) AS e ON (e.ChildFieldOfStudyId = c.FieldOfStudyId)
WHERE FieldClass = 'first'
) AS e USING(AuthorId)
) f
LEFT JOIN (
SELECT AuthorId
, institutions as institution
, main_us_institutions_career
, coauthors
, keywords
, year_papertitle
FROM author_info_linking
) AS g USING(AuthorId)
WHERE length(firstname) > 1 AND year >= 1985 - 5 AND year <= 2015 + 5
-- ## use this to condition on people that have at least at some point their main affiliation in the US
AND g.main_us_institutions_career IS NOT NULL
AND g.institution != "chinese academy of sciences"


reading from: /mnt/ssd/DedupeFiles/graduates/settings_art_1985_2015_institutionFalse_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsTruechristoph_degree0
Link now ...
made pairs
calculated scores
made 1:1 links
Writing to database...
Filling table info...
Filled table info...
Iteration id is 53
Filling links into db...
Filled links into db...
Wrote linking info into db...
Found 7540 links for 86635 graduates with a score of at least 0.
Running ANALYZE...

Copying to csv...
Done copying to csv...
Deleted the temporary database...
Done in 5.894030499458313 minutes.
Loading

0 comments on commit b255b76

Please sign in to comment.