diff --git a/output/quality_linking.pdf b/output/quality_linking.pdf index 5c0105e..9401ad1 100644 Binary files a/output/quality_linking.pdf and b/output/quality_linking.pdf differ diff --git a/src/dataprep/main/link/prep_linked_data.py b/src/dataprep/main/link/prep_linked_data.py index 2e58946..97c5229 100644 --- a/src/dataprep/main/link/prep_linked_data.py +++ b/src/dataprep/main/link/prep_linked_data.py @@ -31,7 +31,6 @@ import pdb import argparse -# ## Arguments # ## Arguments parser = argparse.ArgumentParser(description = 'Inputs for author_collab') parser.add_argument("--filter_trainname", @@ -145,7 +144,7 @@ # for now, do not condition on certain time distance between # graduation year and whenever the supervisor has a publication. -# TODO: do this after gaining some insights in the analysis +# Do this on the fly after gaining some insights in the analysis con.execute("CREATE UNIQUE INDEX idx_cla_AuthorIdrelid ON current_links_advisors (AuthorId ASC, relationship_id ASC)") con.execute("CREATE UNIQUE INDEX idx_cla_relid ON current_links_advisors (relationship_id ASC)") # this is also a way to make sure there are not multiple links per goid diff --git a/src/dataprep/main/reports/quality_linking.Rmd b/src/dataprep/main/reports/quality_linking.Rmd index ac93083..17fc99e 100644 --- a/src/dataprep/main/reports/quality_linking.Rmd +++ b/src/dataprep/main/reports/quality_linking.Rmd @@ -22,10 +22,26 @@ lapply(packages, library, character.only = TRUE) datapath <- "/mnt/ssd/" db_file <- paste0(datapath, "AcademicGraph/AcademicGraph.sqlite") -select_fields <- c("physics", "biology", "chemistry", "sociology", - "economics", "political science", "psychology", - "mathematics", "geography", "geology", "engineering", - "computer science", "environmental science") # fields currently matched +select_fields <- c("art", + "biology", + "business", + "chemistry", + "computer science" , + "economics", + "engineering", + "environmental science", + "geography", + "geology" , + 
"history", + "materials science", + "mathematics", + "medicine", + "philosophy", + "physics", + "political science", + "psychology" , + "sociology") # all fields are currently matched + date_method_change <- ymd("2022-07-01") # after summer we extended the sampling period and added more features diff --git a/src/dataprep/pipeline.sh b/src/dataprep/pipeline.sh index 7a19d5e..f8c7ac5 100644 --- a/src/dataprep/pipeline.sh +++ b/src/dataprep/pipeline.sh @@ -83,6 +83,10 @@ Rscript -e "rmarkdown::render('$script_path/reports/sample_size_linking.Rmd', ou # ## 1. Link graduates to MAG bash $script_path/link/graduates.sh $logfile_path +# Christoph retrained with the following options: +# --train_name "christoph_degree0" --keepyears "19852015" +# need to run the write_csv_links script with these options as well +# to get all links into db python -m $script_path.link.write_csv_links --linking_type "graduates" --train_name "christoph_fielddegree0" \ &> $logfile_path/write_csv_links_graduates.log diff --git a/src/dataprep/temp/prep_linked_data.log b/src/dataprep/temp/prep_linked_data.log index 92744be..7c15b15 100644 --- a/src/dataprep/temp/prep_linked_data.log +++ b/src/dataprep/temp/prep_linked_data.log @@ -1,4 +1,4 @@ -Start time: 1664470032.5248213 +Start time: 1670232570.3368495 Using the following DocTypes for citations: ('Journal', 'Book', 'BookChapter', 'Conference')... @@ -8,16 +8,16 @@ where_stmt_iterations is current_links for graduates current_links for advisors -Time elapsed: 0.33170623779296876 minutes +Time elapsed: 0.2818102161089579 minutes Making author_citations... -Time elapsed: 7.353962099552154 minutes +Time elapsed: 11.818827704588573 minutes Making author_output... -Time elapsed: 22.382010038693746 minutes +Time elapsed: 26.798788146177927 minutes Running ANALYZE... -Done in 22.38204313913981 minutes. +Done in 26.79902730782827 minutes.
diff --git a/src/dataprep/temp/write_csv_links_graduates.log b/src/dataprep/temp/write_csv_links_graduates.log index e1f6f43..5f417a9 100644 --- a/src/dataprep/temp/write_csv_links_graduates.log +++ b/src/dataprep/temp/write_csv_links_graduates.log @@ -1,3 +1,5 @@ +Fields where files have the years in the name: dict_keys([]) +Fields where files do not have the years in the name: dict_keys(['political science', 'biology', 'geology', 'psychology', 'physics', 'engineering', 'economics', 'environmental science', 'computer science']) The links of the iteration of field political science are already in the database. The links of the iteration of field biology are already in the database. The links of the iteration of field geology are already in the database. @@ -9,4 +11,4 @@ The links of the iteration of field environmental science are already in the dat The links of the iteration of field computer science are already in the database. Running ANALYZE... -Done in 0.004560442765553793 minutes. +Done in 0.0038335204124450685 minutes. diff --git a/src/dataprep/temp/write_csv_links_graduates_addedfields.log b/src/dataprep/temp/write_csv_links_graduates_addedfields.log new file mode 100644 index 0000000..c0b4b32 --- /dev/null +++ b/src/dataprep/temp/write_csv_links_graduates_addedfields.log @@ -0,0 +1,11 @@ +Fields where files have the years in the name: dict_keys(['business', 'philosophy', 'history', 'medicine', 'art', 'materials science']) +Fields where files do not have the years in the name: dict_keys([]) +Writing field business +Writing field philosophy +Writing field history +Writing field medicine +Writing field art +Writing field materials science +Running ANALYZE... + +Done in 0.003022166093190511 minutes.