Skip to content

Commit

Permalink
Merge pull request #25 from f-hafner/flavio/issue-21
Browse files Browse the repository at this point in the history
Flavio/issue 21
  • Loading branch information
chrished authored Nov 24, 2022
2 parents a0d0807 + a463347 commit 141c0a4
Show file tree
Hide file tree
Showing 12 changed files with 2,534 additions and 949 deletions.
Binary file modified output/quality_linking_advisors.pdf
Binary file not shown.
2 changes: 1 addition & 1 deletion src/dataprep/main/link/advisors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ fields=("chemistry"
"geology"
"geography"
"economics")
fields=("biology")
fields=("political science")

for i in "${!fields[@]}"; do
field=${fields[$i]}
Expand Down
8 changes: 4 additions & 4 deletions src/dataprep/main/link/link_onefield_advisors.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,10 @@ echo "$field"

mergemode="m:1"

# python3 -m main.link.train_link_mag_proquest --linking_type "advisors" --no-test --mergemode $mergemode --train_name $train_name \
# --field "${field}" --recall $RECALL --start 1990 --end 2015 --institution $institution \
# --fieldofstudy_cat $fieldofstudy_cat --fieldofstudy_str $fieldofstudy_str --keywords $keywords \
# --verbose 2>&1 | tee $logfile_path/trainlink_mag_proquest_"${field}"_${train_name}_advisors_9015.log
python3 -m main.link.train_link_mag_proquest --linking_type "advisors" --no-test --mergemode $mergemode --train_name $train_name \
--field "${field}" --recall $RECALL --start 1990 --end 2015 --institution $institution \
--fieldofstudy_cat $fieldofstudy_cat --fieldofstudy_str $fieldofstudy_str --keywords $keywords \
--verbose 2>&1 | tee $logfile_path/trainlink_mag_proquest_"${field}"_${train_name}_advisors_9015.log

python3 -m main.link.create_link_mag_proquest --linking_type "advisors" --no-test --mergemode $mergemode --train_name $train_name \
--field "${field}" --recall $RECALL --start 1990 --end 2015 --institution $institution \
Expand Down
6 changes: 3 additions & 3 deletions src/dataprep/main/link/setup_linking.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ def __eq__(self, other):

# ## Some settings
pd.set_option('display.max.columns', None)
path_dedupe_files = datapath + "DedupeFiles/"
path_dedupe_files = datapath + "DedupeFiles/flavio/issue-21/" # TODO: temporary path for the issue-21 work; once it is finished, revert to "DedupeFiles/" and copy any new files to DedupeFiles/advisors
share_blockedpairs_training = 0.66 # fraction of similar pairs as opposed to random pairs

# register [adapter for numpy.int64](https://stackoverflow.com/questions/38753737/inserting-numpy-integer-types-into-sqlite-with-python3)
Expand Down Expand Up @@ -356,8 +356,7 @@ def __eq__(self, other):
, f.year || ";" || f.YearLastPub AS year_range
, g.all_us_institutions_year
"""
#where_stmt_mag = f"WHERE length(firstname) > 1 AND f.YearLastPub >= {args.startyear} - 5 AND year <= {args.endyear} + 5"
where_stmt_mag = f"WHERE length(firstname) > 1 AND f.YearLastPub >= {args.loadstartyear} - 5 AND year <= {args.loadendyear} + 5" # "year" is YearFirstPub
where_stmt_mag = f"WHERE f.YearLastPub >= {args.loadstartyear} - 5 AND year <= {args.loadendyear} + 5" # "year" is YearFirstPub

# note: this still sources field of study, but it is level 0 and thus the same for everyone
query_mag = f"""
Expand Down Expand Up @@ -433,6 +432,7 @@ def __eq__(self, other):
"""

if args.linking_type == "advisors":
where_stmt_pq = f"WHERE year >= {args.loadstartyear} and year <= {args.loadendyear}" # no length(firstname) > 1 filter here: in this query it would also refer to the graduate's name
query_proquest = f"""
SELECT relationship_id
, year
Expand Down
4 changes: 0 additions & 4 deletions src/dataprep/main/link/train_link_mag_proquest.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,18 +135,14 @@
elif args.linking_type == "advisors":
fields = [
{"field": "firstname", "variable name": "firstname", "type": "String", "has missing": False},
{"field": "firstname", "variable name": "same_firstname", "type": "Exact"},
{"field": "lastname", "variable name": "lastname", "type": "String", "has missing": False},
{"field": "lastname", "variable name": "same_lastname", "type": "Exact"},
{"field": "middlename", "variable name": "middlename", "type": "String", "has missing": True},
{"field": "year_range", "variable name": "year_range", "type": "Custom", "comparator": cf.compare_range_from_tuple_tempfix, "has missing": True}
]
elif args.linking_type == "grants":
fields = [
{"field": "firstname", "variable name": "firstname", "type": "String", "has missing": False},
{"field": "firstname", "variable name": "same_firstname", "type": "Exact"},
{"field": "lastname", "variable name": "lastname", "type": "String", "has missing": False},
{"field": "lastname", "variable name": "same_lastname", "type": "Exact"},
{"field": "middlename", "variable name": "middlename", "type": "String", "has missing": True},
{"field": "year_range", "variable name": "year_range", "type": "Custom", "comparator": cf.compare_range_from_tuple_tempfix, "has missing": True}
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ id_field is [86803240] and will be passed to sql queries.
FROM pq_unis --## mark: previously we linked advisors anywhere in the world (as career outcomes). for now, focus on US
WHERE location like "%United States%"
) USING(university_id)
WHERE year >= 1990 and year <= 2015 AND length(firstname) > 1
WHERE year >= 1990 and year <= 2015



Expand Down Expand Up @@ -130,18 +130,18 @@ id_field is [86803240] and will be passed to sql queries.
FROM author_info_linking
) AS g USING(AuthorId)

WHERE length(firstname) > 1 AND f.YearLastPub >= 1990 - 5 AND year <= 2015 + 5 AND institution is not NULL
WHERE f.YearLastPub >= 1990 - 5 AND year <= 2015 + 5 AND institution is not NULL


reading from: /mnt/ssd/DedupeFiles/advisors/settings_biology_1990_2015_institutionTrue_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsFalsechristoph_degree0
reading from: /mnt/ssd/DedupeFiles/flavio/issue-21/advisors/settings_biology_1990_2015_institutionTrue_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsFalsechristoph_degree0
Link now ...
made pairs
calculated scores
made m:1 links
Writing to database...
Filling table info...
Filled table info...
Iteration id is 80
Iteration id is 81
Filling links into db...
Filled links into db...
Wrote linking info into db...
Expand All @@ -150,4 +150,4 @@ Running ANALYZE...
Copying to csv...
Done copying to csv...
Deleted the temporary database...
Done in 569.3483387827873 minutes.
Done in 727.9574162801107 minutes.
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ id_field is [127313418] and will be passed to sql queries.
FROM pq_unis --## mark: previously we linked advisors anywhere in the world (as career outcomes). for now, focus on US
WHERE location like "%United States%"
) USING(university_id)
WHERE year >= 1990 and year <= 2015 AND length(firstname) > 1
WHERE year >= 1990 and year <= 2015



Expand Down Expand Up @@ -130,10 +130,10 @@ id_field is [127313418] and will be passed to sql queries.
FROM author_info_linking
) AS g USING(AuthorId)

WHERE length(firstname) > 1 AND f.YearLastPub >= 1990 - 5 AND year <= 2015 + 5 AND institution is not NULL
WHERE f.YearLastPub >= 1990 - 5 AND year <= 2015 + 5 AND institution is not NULL


reading from: /mnt/ssd/DedupeFiles/advisors/settings_geology_1990_2015_institutionTrue_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsFalsechristoph_degree0
reading from: /mnt/ssd/DedupeFiles/flavio/issue-21/advisors/settings_geology_1990_2015_institutionTrue_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsFalsechristoph_degree0
Link now ...
made pairs
calculated scores
Expand All @@ -150,4 +150,4 @@ Running ANALYZE...
Copying to csv...
Done copying to csv...
Deleted the temporary database...
Done in 6.174721789360047 minutes.
Done in 8.049215114116668 minutes.
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
Namespace(testing=False, verbose=1, field=['political science'], train_name='christoph_degree0', startyear=1990, endyear=2015, loadstartyear=1990, loadendyear=2015, mergemode='m:1', recall=0.9, institution='True', fieldofstudy_cat='False', fieldofstudy_str='False', keywords='False', retrain='True', linking_type='advisors', samplesize=100000, write_to='csv')
Have max 12 cores available
Testing is False

I set the write connection to temporary database.
id_field is [17744445] and will be passed to sql queries.

SELECT relationship_id
, year
, year AS year_range
, firstname
, lastname
, CASE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1))
WHEN
"" THEN NULL
ELSE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1))
END AS middlename
, fieldofstudy
, keywords
, institution
, year || "//" || institution as main_us_institutions_year
, year || "//" || institution as all_us_institutions_year
FROM (
SELECT goid
, relationship_id
, degree_year AS year
, a.fullname
, SUBSTR(TRIM(a.fullname),1,instr(trim(a.fullname)||' ',' ')-1) AS firstname
, REPLACE(a.fullname, RTRIM(a.fullname, REPLACE(a.fullname, " ", "")), "") AS lastname
, TRIM(SUBSTR(a.fullname, length(SUBSTR(TRIM(a.fullname),1,instr(trim(a.fullname)||' ',' ')-1)) + 1)) AS middle_lastname
, length(a.fullname) AS l_fullname
, length(SUBSTR(TRIM(a.fullname),1,instr(trim(a.fullname)||' ',' ')-1) ) AS l_firstname
, length(REPLACE(a.fullname, RTRIM(a.fullname, REPLACE(a.fullname, " ", "")), "")) AS l_lastname
, fieldname AS fieldofstudy
, university_id
FROM pq_authors
INNER JOIN (
SELECT goid, fieldname
FROM pq_fields_mag
WHERE mag_field0 IN (?)
) USING (goid)
INNER JOIN ( --# NOTE: this only keeps the theses where at least one advisor is present
SELECT *, firstname || ' ' || lastname AS fullname
FROM pq_advisors
) AS a USING(goid)
)
-- ## NOTE: use left join here as not all graduates have advisor (particularly pre-1980) and possibly also keywords
LEFT JOIN (
SELECT goid
, fields as keywords
FROM pq_info_linking
) USING(goid)
INNER JOIN (
SELECT university_id, normalizedname as institution
FROM pq_unis --## mark: previously we linked advisors anywhere in the world (as career outcomes). for now, focus on US
WHERE location like "%United States%"
) USING(university_id)
WHERE year >= 1990 and year <= 2015



SELECT f.AuthorId
, f.year
, f.YearLastPub
, f.firstname
, f.lastname
, CASE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1))
WHEN
"" THEN NULL
ELSE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1))
END as middlename
-- ## NOTE this gives "" for middlename when it is missing
, f.fieldofstudy
, g.keywords
, g.coauthors
, g.institution
, g.main_us_institutions_year

, f.year || ";" || f.YearLastPub AS year_range
, g.all_us_institutions_year

FROM (
SELECT a.AuthorId
, a.YearFirstPub AS year
, a.YearLastPub
, a.FirstName AS firstname
, REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "") AS lastname
-- https://stackoverflow.com/questions/21388820/how-to-get-the-last-index-of-a-substring-in-sqlite
, TRIM(SUBSTR(b.NormalizedName, length(a.FirstName) + 1)) AS middle_lastname
-- this gives all except the first name
, length(b.NormalizedName) as l_fullname
, length(a.FirstName) as l_firstname
, length(REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "")) as l_lastname
, e.NormalizedName AS fieldofstudy
FROM author_sample AS a
INNER JOIN (
SELECT AuthorId, NormalizedName
FROM Authors
) AS b USING(AuthorId)
INNER JOIN (
SELECT AuthorId
FROM author_field0
WHERE FieldOfStudyId_lvl0 IN (?)
AND Degree <= 0
) USING(AuthorId)
LEFT JOIN (
SELECT AuthorId, NormalizedName
FROM author_fields c
INNER JOIN (
SELECT FieldOfStudyId, NormalizedName
FROM FieldsOfStudy
) AS d USING(FieldOfStudyId)
-- ## Condition on fieldofstudy being in the level 0 id_field
INNER JOIN (
SELECT ParentFieldOfStudyId, ChildFieldOfStudyId
FROM crosswalk_fields
WHERE ParentLevel = 0
AND ParentFieldOfStudyId IN (?)
) AS e ON (e.ChildFieldOfStudyId = c.FieldOfStudyId)
WHERE FieldClass = 'first'
) AS e USING(AuthorId)
) f
LEFT JOIN (
SELECT AuthorId
, main_us_institutions_career as institution
, coauthors
, keywords
, main_us_institutions_year
, all_us_institutions_year
FROM author_info_linking
) AS g USING(AuthorId)

WHERE f.YearLastPub >= 1990 - 5 AND year <= 2015 + 5 AND institution is not NULL


reading from: /mnt/ssd/DedupeFiles/flavio/issue-21/advisors/settings_political_science_1990_2015_institutionTrue_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsFalsechristoph_degree0
Link now ...
made pairs
calculated scores
made m:1 links
Writing to database...
Filling table info...
Filled table info...
Iteration id is 80
Filling links into db...
Filled links into db...
Wrote linking info into db...
Running ANALYZE...

Copying to csv...
Done copying to csv...
Deleted the temporary database...
Done in 24.474797221024833 minutes.
Loading

0 comments on commit 141c0a4

Please sign in to comment.