-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #25 from f-hafner/flavio/issue-21
Flavio/issue 21
- Loading branch information
Showing
12 changed files
with
2,534 additions
and
949 deletions.
There are no files selected for viewing
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
153 changes: 153 additions & 0 deletions
153
...ep/temp/createlink_mag_proquest_political science_m:1_christoph_degree0_advisors_9015.log
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,153 @@ | ||
Namespace(testing=False, verbose=1, field=['political science'], train_name='christoph_degree0', startyear=1990, endyear=2015, loadstartyear=1990, loadendyear=2015, mergemode='m:1', recall=0.9, institution='True', fieldofstudy_cat='False', fieldofstudy_str='False', keywords='False', retrain='True', linking_type='advisors', samplesize=100000, write_to='csv') | ||
Have max 12 cores available | ||
Testing is False | ||
|
||
I set the write connection to temporary database. | ||
id_field is [17744445] and will be passed to sql queries. | ||
|
||
SELECT relationship_id | ||
, year | ||
, year AS year_range | ||
, firstname | ||
, lastname | ||
, CASE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1)) | ||
WHEN | ||
"" THEN NULL | ||
ELSE TRIM(SUBSTR(middle_lastname, 1, l_fullname-l_firstname-l_lastname - 1)) | ||
END AS middlename | ||
, fieldofstudy | ||
, keywords | ||
, institution | ||
, year || "//" || institution as main_us_institutions_year | ||
, year || "//" || institution as all_us_institutions_year | ||
FROM ( | ||
SELECT goid | ||
, relationship_id | ||
, degree_year AS year | ||
, a.fullname | ||
, SUBSTR(TRIM(a.fullname),1,instr(trim(a.fullname)||' ',' ')-1) AS firstname | ||
, REPLACE(a.fullname, RTRIM(a.fullname, REPLACE(a.fullname, " ", "")), "") AS lastname | ||
, TRIM(SUBSTR(a.fullname, length(SUBSTR(TRIM(a.fullname),1,instr(trim(a.fullname)||' ',' ')-1)) + 1)) AS middle_lastname | ||
, length(a.fullname) AS l_fullname | ||
, length(SUBSTR(TRIM(a.fullname),1,instr(trim(a.fullname)||' ',' ')-1) ) AS l_firstname | ||
, length(REPLACE(a.fullname, RTRIM(a.fullname, REPLACE(a.fullname, " ", "")), "")) AS l_lastname | ||
, fieldname AS fieldofstudy | ||
, university_id | ||
FROM pq_authors | ||
INNER JOIN ( | ||
SELECT goid, fieldname | ||
FROM pq_fields_mag | ||
WHERE mag_field0 IN (?) | ||
) USING (goid) | ||
INNER JOIN ( --# NOTE: this only keeps the theses where at least one advisor is present | ||
SELECT *, firstname || ' ' || lastname AS fullname | ||
FROM pq_advisors | ||
) AS a USING(goid) | ||
) | ||
-- ## NOTE: use left join here as not all graduates have advisor (particularly pre-1980) and possibly also keywords | ||
LEFT JOIN ( | ||
SELECT goid | ||
, fields as keywords | ||
FROM pq_info_linking | ||
) USING(goid) | ||
INNER JOIN ( | ||
SELECT university_id, normalizedname as institution | ||
FROM pq_unis --## mark: previously we linked advisors anywhere in the world (as career outcomes). for now, focus on US | ||
WHERE location like "%United States%" | ||
) USING(university_id) | ||
WHERE year >= 1990 and year <= 2015 | ||
|
||
|
||
|
||
SELECT f.AuthorId | ||
, f.year | ||
, f.YearLastPub | ||
, f.firstname | ||
, f.lastname | ||
, CASE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1)) | ||
WHEN | ||
"" THEN NULL | ||
ELSE TRIM(SUBSTR(f.middle_lastname, 1, f.l_fullname - f.l_firstname - f.l_lastname - 1)) | ||
END as middlename | ||
-- ## NOTE this gives "" for middlename when it is missing | ||
, f.fieldofstudy | ||
, g.keywords | ||
, g.coauthors | ||
, g.institution | ||
, g.main_us_institutions_year | ||
|
||
, f.year || ";" || f.YearLastPub AS year_range | ||
, g.all_us_institutions_year | ||
|
||
FROM ( | ||
SELECT a.AuthorId | ||
, a.YearFirstPub AS year | ||
, a.YearLastPub | ||
, a.FirstName AS firstname | ||
, REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "") AS lastname | ||
-- https://stackoverflow.com/questions/21388820/how-to-get-the-last-index-of-a-substring-in-sqlite | ||
, TRIM(SUBSTR(b.NormalizedName, length(a.FirstName) + 1)) AS middle_lastname | ||
-- this gives all except the first name | ||
, length(b.NormalizedName) as l_fullname | ||
, length(a.FirstName) as l_firstname | ||
, length(REPLACE(b.NormalizedName, RTRIM(b.NormalizedName, REPLACE(b.NormalizedName, " ", "")), "")) as l_lastname | ||
, e.NormalizedName AS fieldofstudy | ||
FROM author_sample AS a | ||
INNER JOIN ( | ||
SELECT AuthorId, NormalizedName | ||
FROM Authors | ||
) AS b USING(AuthorId) | ||
INNER JOIN ( | ||
SELECT AuthorId | ||
FROM author_field0 | ||
WHERE FieldOfStudyId_lvl0 IN (?) | ||
AND Degree <= 0 | ||
) USING(AuthorId) | ||
LEFT JOIN ( | ||
SELECT AuthorId, NormalizedName | ||
FROM author_fields c | ||
INNER JOIN ( | ||
SELECT FieldOfStudyId, NormalizedName | ||
FROM FieldsOfStudy | ||
) AS d USING(FieldOfStudyId) | ||
-- ## Condition on fieldofstudy being in the level 0 id_field | ||
INNER JOIN ( | ||
SELECT ParentFieldOfStudyId, ChildFieldOfStudyId | ||
FROM crosswalk_fields | ||
WHERE ParentLevel = 0 | ||
AND ParentFieldOfStudyId IN (?) | ||
) AS e ON (e.ChildFieldOfStudyId = c.FieldOfStudyId) | ||
WHERE FieldClass = 'first' | ||
) AS e USING(AuthorId) | ||
) f | ||
LEFT JOIN ( | ||
SELECT AuthorId | ||
, main_us_institutions_career as institution | ||
, coauthors | ||
, keywords | ||
, main_us_institutions_year | ||
, all_us_institutions_year | ||
FROM author_info_linking | ||
) AS g USING(AuthorId) | ||
|
||
WHERE f.YearLastPub >= 1990 - 5 AND year <= 2015 + 5 AND institution is not NULL | ||
|
||
|
||
reading from: /mnt/ssd/DedupeFiles/flavio/issue-21/advisors/settings_political_science_1990_2015_institutionTrue_fieldofstudy_catFalse_fieldofstudy_strFalse_keywordsFalsechristoph_degree0 | ||
Link now ... | ||
made pairs | ||
calculated scores | ||
made m:1 links | ||
Writing to database... | ||
Filling table info... | ||
Filled table info... | ||
Iteration id is 80 | ||
Filling links into db... | ||
Filled links into db... | ||
Wrote linking info into db... | ||
Running ANALYZE... | ||
|
||
Copying to csv... | ||
Done copying to csv... | ||
Deleted the temporary database... | ||
Done in 24.474797221024833 minutes. |
Oops, something went wrong.