-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #26 from f-hafner/christoph-labellink-advisors
Christoph label+link advisors. Add missing graduates fields
- Loading branch information
Showing
64 changed files
with
19,555 additions
and
2,579 deletions.
There are no files selected for viewing
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
146 changes: 146 additions & 0 deletions
146
src/dataprep/temp/createlink_mag_proquest_art_1:1_christoph_degree0_graduates_8515.log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,146 @@ | ||
Namespace(testing=False, verbose=1, field=['art'], train_name='christoph_degree0', startyear=1985, endyear=2015, loadstartyear=1985, loadendyear=2015, mergemode='1:1', recall=0.9, institution='False', fieldofstudy_cat='False', fieldofstudy_str='False', keywords='True', retrain='True', linking_type='graduates', samplesize=100000, write_to='csv') | ||
Have max 12 cores available | ||
Testing is False | ||
|
||
I set the write connection to temporary database. | ||
id_field is [142362112] and will be passed to sql queries. | ||
|
||
SELECT goid | ||
, year | ||
, firstname | ||
, lastname | ||
, CASE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1)) | ||
WHEN | ||
"" THEN NULL | ||
ELSE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1)) | ||
END AS middlename | ||
, fieldofstudy | ||
, keywords | ||
, institution | ||
, coauthors | ||
, year_papertitle | ||
FROM ( | ||
SELECT goid | ||
, degree_year AS year | ||
, fullname | ||
, SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1) AS firstname | ||
, REPLACE(fullname, RTRIM(fullname, REPLACE(fullname, " ", "")), "") AS lastname | ||
, TRIM(SUBSTR(fullname, length(SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1)) + 1)) AS middle_lastname | ||
, length(fullname) AS l_fullname | ||
, length(SUBSTR(TRIM(fullname),1,instr(trim(fullname)||' ',' ')-1) ) AS l_firstname | ||
, length(REPLACE(fullname, RTRIM(fullname, REPLACE(fullname, " ", "")), "")) AS l_lastname | ||
, fieldname AS fieldofstudy | ||
, university_id | ||
, degree_year || "//" || thesistitle as year_papertitle | ||
FROM pq_authors | ||
INNER JOIN ( | ||
SELECT goid, fieldname | ||
FROM pq_fields_mag | ||
WHERE mag_field0 IN (?) | ||
) USING (goid) | ||
) | ||
-- ## NOTE: use left join here as not all graduates have advisor (particularly pre-1980) and possibly also keywords | ||
LEFT JOIN ( | ||
SELECT goid | ||
, fields as keywords | ||
, advisors as coauthors | ||
FROm pq_info_linking | ||
) USING(goid) | ||
INNER JOIN ( | ||
SELECT university_id, normalizedname as institution | ||
FROM pq_unis | ||
WHERE location like "%United States%" | ||
) USING(university_id) | ||
WHERE year >= 1985 and year <= 2015 AND length(firstname) > 1 | ||
|
||
|
||
|
||
SELECT f.AuthorId | ||
, f.year | ||
, f.firstname | ||
, f.lastname | ||
, CASE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1)) | ||
WHEN | ||
"" THEN NULL | ||
ELSE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1)) | ||
END as middlename | ||
-- ## NOTE this gives "" for middlename when it is missing | ||
, f.fieldofstudy | ||
, g.keywords | ||
, g.coauthors | ||
, g.institution | ||
, g.year_papertitle | ||
FROM ( | ||
SELECT a.AuthorId | ||
, a.YearFirstPub AS year | ||
, a.FirstName AS firstname | ||
, REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "") AS lastname | ||
-- https://stackoverflow.com/questions/21388820/how-to-get-the-last-index-of-a-substring-in-sqlite | ||
, TRIM(SUBSTR(b.NormalizedName, length(a.FirstName) + 1)) AS middle_lastname | ||
-- this gives all except the first name | ||
, length(b.NormalizedName) as l_fullname | ||
, length(a.FirstName) as l_firstname | ||
, length(REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "")) as l_lastname | ||
, e.NormalizedName AS fieldofstudy | ||
FROM author_sample AS a | ||
INNER JOIN ( | ||
SELECT AuthorId, NormalizedName | ||
FROM Authors | ||
) AS b USING(AuthorId) | ||
INNER JOIN ( | ||
SELECT AuthorId | ||
FROM author_field0 | ||
WHERE FieldOfStudyId_lvl0 IN (?) | ||
AND Degree <= 0 | ||
) USING(AuthorId) | ||
LEFT JOIN ( | ||
SELECT AuthorId, NormalizedName | ||
FROM author_fields c | ||
INNER JOIN ( | ||
SELECT FieldOfStudyId, NormalizedName | ||
FROM FieldsOfStudy | ||
) AS d USING(FieldOfStudyId) | ||
-- ## Condition on fieldofstudy being in the level 0 id_field | ||
INNER JOIN ( | ||
SELECT ParentFieldOfStudyId, ChildFieldOfStudyId | ||
FROM crosswalk_fields | ||
WHERE ParentLevel = 0 | ||
AND ParentFieldOfStudyId IN (?) | ||
) AS e ON (e.ChildFieldOfStudyId = c.FieldOfStudyId) | ||
WHERE FieldClass = 'first' | ||
) AS e USING(AuthorId) | ||
) f | ||
LEFT JOIN ( | ||
SELECT AuthorId | ||
, institutions as institution | ||
, main_us_institutions_career | ||
, coauthors | ||
, keywords | ||
, year_papertitle | ||
FROM author_info_linking | ||
) AS g USING(AuthorId) | ||
WHERE length(firstname) > 1 AND year >= 1985 - 5 AND year <= 2015 + 5 | ||
-- ## use this to condition on people that have at least at some point their main affiliation in the US | ||
AND g.main_us_institutions_career IS NOT NULL | ||
AND g.institution != "chinese academy of sciences" | ||
|
||
|
||
reading from: /mnt/ssd/DedupeFiles/graduates/settings_art_1985_2015_institutionFalse_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsTruechristoph_degree0 | ||
Link now ... | ||
made pairs | ||
calculated scores | ||
made 1:1 links | ||
Writing to database... | ||
Filling table info... | ||
Filled table info... | ||
Iteration id is 53 | ||
Filling links into db... | ||
Filled links into db... | ||
Wrote linking info into db... | ||
Found 7540 links for 86635 graduates with a score of at least 0. | ||
Running ANALYZE... | ||
|
||
Copying to csv... | ||
Done copying to csv... | ||
Deleted the temporary database... | ||
Done in 5.894030499458313 minutes. |
Oops, something went wrong.