Skip to content

Commit

Permalink
Merge pull request #26 from f-hafner/christoph-labellink-advisors
Browse files Browse the repository at this point in the history
Christoph label+link advisors. Add missing graduates fields
  • Loading branch information
f-hafner authored Dec 8, 2022
2 parents 141c0a4 + 5332e0f commit b255b76
Show file tree
Hide file tree
Showing 64 changed files with 19,555 additions and 2,579 deletions.
Binary file modified output/quality_linking.pdf
Binary file not shown.
Binary file modified output/quality_linking_advisors.pdf
Binary file not shown.
73 changes: 58 additions & 15 deletions src/dataprep/main/link/advisors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -7,25 +7,68 @@ keywords=False
fieldofstudy_cat=False
fieldofstudy_str=False
institution=True
fields=("art"
"biology"
"business"
"chemistry"
"computer science"
"economics"
"engineering"
"environmental science"
"geography"
"geology"
"history"
"materials science"
"mathematics"
"medicine"
"philosophy"
"physics"
"political science"
"psychology"
"sociology")

fields=("chemistry"
"sociology"
"mathematics"
"biology"
"computer science"
"political science"
"engineering"
"psychology"
"environmental science"
"physics"
"geology"
"geography"
"economics")
fields=("political science")

for i in "${!fields[@]}"; do
for i in "${!fields[@]}"; do
field=${fields[$i]}
echo ${field}
screen -dmS "advisors.${field}" sh main/link/link_onefield_advisors.sh $RECALL "$field" $train_name $institution $fieldofstudy_cat $fieldofstudy_str $keywords $logfile_path
echo "Started screen ..."
done
wait



# start_field_i() {
# fields=("art"
# "biology"
# "business"
# "chemistry"
# "computer science"
# "economics"
# "engineering"
# "environmental science"
# "geography"
# "geology"
# "history"
# "materials science"
# "mathematics"
# "medicine"
# "philosophy"
# "physics"
# "political science"
# "psychology"
# "sociology")
# field=${fields[$1]}
# echo $1
# echo ${field}
# screen -dmS "advisors.${field}" sh main/link/link_onefield_advisors.sh $RECALL "$field" $train_name $institution $fieldofstudy_cat $fieldofstudy_str $keywords $logfile_path &
# while screen -list | grep -q $"advisors.${field}"
# do
# sleep 1
# done
# echo "Started screen ..."
# wait
# }
# export -f start_field_i
# parallel -j 3 start_field_i ::: $(seq 0 18)

12 changes: 11 additions & 1 deletion src/dataprep/main/link/graduates.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,17 @@ fields=("chemistry"
"geology"
"geography"
"economics")


fields=("art"
"business"
"history"
"materials science"
"medicine"
"philosophy"
)
fields=("philosophy"
)


for i in "${!fields[@]}"; do
field=${fields[$i]}
Expand Down
3 changes: 1 addition & 2 deletions src/dataprep/main/link/prep_linked_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
import pdb
import argparse

# ## Arguments
# ## Arguments
parser = argparse.ArgumentParser(description = 'Inputs for author_collab')
parser.add_argument("--filter_trainname",
Expand Down Expand Up @@ -145,7 +144,7 @@

# for now, do not condition on certain time distance between
# graduation year and whenever the supervisor has a publication.
# TODO: do this after gaining some insights in the analysis
# Do this on the fly after gaining some insights in the analysis

con.execute("CREATE UNIQUE INDEX idx_cla_AuthorIdrelid ON current_links_advisors (AuthorId ASC, relationship_id ASC)")
con.execute("CREATE UNIQUE INDEX idx_cla_relid ON current_links_advisors (relationship_id ASC)") # this is also a way to make sure there are not multiple links per goid
Expand Down
2 changes: 1 addition & 1 deletion src/dataprep/main/link/setup_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __eq__(self, other):

# ## Some settings
pd.set_option('display.max.columns', None)
path_dedupe_files = datapath + "DedupeFiles/flavio/issue-21/" # TODO: this needs to be fixed at the end and any new files copied to DedupeFiles/advisors
path_dedupe_files = datapath + "DedupeFiles/"
share_blockedpairs_training = 0.66 # fraction of similar pairs as opposed to random pairs

# register [adapter for numpy.int64](https://stackoverflow.com/questions/38753737/inserting-numpy-integer-types-into-sqlite-with-python3)
Expand Down
24 changes: 20 additions & 4 deletions src/dataprep/main/reports/quality_linking.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,26 @@ lapply(packages, library, character.only = TRUE)
datapath <- "/mnt/ssd/"
db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
select_fields <- c("physics", "biology", "chemistry", "sociology",
"economics", "political science", "psychology",
"mathematics", "geography", "geology", "engineering",
"computer science", "environmental science") # fields currently matched
select_fields <- c("art",
"biology",
"business",
"chemistry",
"computer science" ,
"economics",
"engineering",
"environmental science",
"geography",
"geology" ,
"history",
"materials science",
"mathematics",
"medicine",
"philosophy",
"physics",
"political science",
"psychology" ,
"sociology") # all fields are currently matched
date_method_change <- ymd("2022-07-01") # after summer we extended the sampling period and added more features
Expand Down
32 changes: 24 additions & 8 deletions src/dataprep/main/reports/quality_linking_advisors.Rmd
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,25 @@ lapply(packages, library, character.only = TRUE)
datapath <- "/mnt/ssd/"
db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite")
select_fields <- c("physics", "biology", "chemistry", "sociology",
"economics", "political science", "psychology",
"mathematics", "geography", "geology", "engineering",
"computer science", "environmental science") # fields currently matched
select_fields <- c("art",
"biology",
"business",
"chemistry",
"computer science" ,
"economics",
"engineering",
"environmental science",
"geography",
"geology" ,
"history",
"materials science",
"mathematics",
#"medicine",
"philosophy",
"physics",
"political science",
"psychology" ,
"sociology") # fields currently matched
# ## db connection
Expand Down Expand Up @@ -137,10 +152,11 @@ linked_advisors %>%

```{r}
keep_fields <- c("biology", "chemistry", "computer science",
"economics", "engineering", "environmental science",
"geography", "geology", "mathetmatics", "physics",
"political science", "psychology", "sociology")
keep_fields <- select_fields
# c("biology", "chemistry", "computer science",
# "economics", "engineering", "environmental science",
#   "geography", "geology", "mathematics", "physics",
# "political science", "psychology", "sociology")
score_by_year <- theses %>%
filter(degree_year >= 1985) %>%
Expand Down
4 changes: 4 additions & 0 deletions src/dataprep/pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,10 @@ Rscript -e "rmarkdown::render('$script_path/reports/sample_size_linking.Rmd', ou
# ## 1. Link graduates to MAG
bash $script_path/link/graduates.sh $logfile_path

# Christoph retrained with the following options:
# --train_name "christoph_degree0" --keepyears "19852015"
# need to run the write_csv_links script with these options as well
# to get all links into db
python -m $script_path.link.write_csv_links --linking_type "graduates" --train_name "christoph_fielddegree0" \
&> $logfile_path/write_csv_links_graduates.log

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
Namespace(testing=False, verbose=1, field=['art'], train_name='christoph_degree0', startyear=1985, endyear=2015, loadstartyear=1985, loadendyear=2015, mergemode='1:1', recall=0.9, institution='False', fieldofstudy_cat='False', fieldofstudy_str='False', keywords='True', retrain='True', linking_type='graduates', samplesize=100000, write_to='csv')
Have max 12 cores available
Testing is False

I set the write connection to temporary database.
id_field is [142362112] and will be passed to sql queries.

SELECT goid
, year
, firstname
, lastname
, CASE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1))
WHEN
"" THEN NULL
ELSE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1))
END AS middlename
, fieldofstudy
, keywords
, institution
, coauthors
, year_papertitle
FROM (
SELECT goid
, degree_year AS year
, fullname
, SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1) AS firstname
, REPLACE(fullname, RTRIM(fullname, REPLACE(fullname, " ", "")), "") AS lastname
, TRIM(SUBSTR(fullname, length(SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1)) + 1)) AS middle_lastname
, length(fullname) AS l_fullname
, length(SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1) ) AS l_firstname
, length(REPLACE(fullname, RTRIM(fullname, REPLACE(fullname, " ", "")), "")) AS l_lastname
, fieldname AS fieldofstudy
, university_id
, degree_year || "//" || thesistitle as year_papertitle
FROM pq_authors
INNER JOIN (
SELECT goid, fieldname
FROM pq_fields_mag
WHERE mag_field0 IN (?)
) USING (goid)
)
-- ## NOTE: use left join here as not all graduates have advisor (particularly pre-1980) and possibly also keywords
LEFT JOIN (
SELECT goid
, fields as keywords
, advisors as coauthors
FROm pq_info_linking
) USING(goid)
INNER JOIN (
SELECT university_id, normalizedname as institution
FROM pq_unis
WHERE location like "%United States%"
) USING(university_id)
WHERE year >= 1985 and year <= 2015 AND length(firstname) > 1



SELECT f.AuthorId
, f.year
, f.firstname
, f.lastname
, CASE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1))
WHEN
"" THEN NULL
ELSE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1))
END as middlename
-- ## NOTE this gives "" for middlename when it is missing
, f.fieldofstudy
, g.keywords
, g.coauthors
, g.institution
, g.year_papertitle
FROM (
SELECT a.AuthorId
, a.YearFirstPub AS year
, a.FirstName AS firstname
, REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "") AS lastname
-- https://stackoverflow.com/questions/21388820/how-to-get-the-last-index-of-a-substring-in-sqlite
, TRIM(SUBSTR(b.NormalizedName, length(a.FirstName) + 1)) AS middle_lastname
-- this gives all except the first name
, length(b.NormalizedName) as l_fullname
, length(a.FirstName) as l_firstname
, length(REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "")) as l_lastname
, e.NormalizedName AS fieldofstudy
FROM author_sample AS a
INNER JOIN (
SELECT AuthorId, NormalizedName
FROM Authors
) AS b USING(AuthorId)
INNER JOIN (
SELECT AuthorId
FROM author_field0
WHERE FieldOfStudyId_lvl0 IN (?)
AND Degree <= 0
) USING(AuthorId)
LEFT JOIN (
SELECT AuthorId, NormalizedName
FROM author_fields c
INNER JOIN (
SELECT FieldOfStudyId, NormalizedName
FROM FieldsOfStudy
) AS d USING(FieldOfStudyId)
-- ## Condition on fieldofstudy being in the level 0 id_field
INNER JOIN (
SELECT ParentFieldOfStudyId, ChildFieldOfStudyId
FROM crosswalk_fields
WHERE ParentLevel = 0
AND ParentFieldOfStudyId IN (?)
) AS e ON (e.ChildFieldOfStudyId = c.FieldOfStudyId)
WHERE FieldClass = 'first'
) AS e USING(AuthorId)
) f
LEFT JOIN (
SELECT AuthorId
, institutions as institution
, main_us_institutions_career
, coauthors
, keywords
, year_papertitle
FROM author_info_linking
) AS g USING(AuthorId)
WHERE length(firstname) > 1 AND year >= 1985 - 5 AND year <= 2015 + 5
-- ## use this to condition on people that have at least at some point their main affiliation in the US
AND g.main_us_institutions_career IS NOT NULL
AND g.institution != "chinese academy of sciences"


reading from: /mnt/ssd/DedupeFiles/graduates/settings_art_1985_2015_institutionFalse_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsTruechristoph_degree0
Link now ...
made pairs
calculated scores
made 1:1 links
Writing to database...
Filling table info...
Filled table info...
Iteration id is 53
Filling links into db...
Filled links into db...
Wrote linking info into db...
Found 7540 links for 86635 graduates with a score of at least 0.
Running ANALYZE...

Copying to csv...
Done copying to csv...
Deleted the temporary database...
Done in 5.894030499458313 minutes.
Loading

0 comments on commit b255b76

Please sign in to comment.