diff --git a/core/src/main/resources/scrape/journals.csv b/core/src/main/resources/scrape/journals.csv index 4179235..9ad6623 100644 --- a/core/src/main/resources/scrape/journals.csv +++ b/core/src/main/resources/scrape/journals.csv @@ -5,20 +5,20 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,31,71,0,0,0,2,0,https://doi.org/10.1093/BIOINFORMATICS/13.5.555 0,0,29,40,0,696,0,2,30,https://doi.org/10.1093/BIOINFORMATICS/BTG334 0,0,29,45,0,1079,9274,2,26,https://doi.org/10.1093/BIOINFORMATICS/BTX116 -0,0,28,95,0,1104,33116,2,35,https://doi.org/10.1093/BIOSTATISTICS/KXJ032 -0,0,23,68,5,1069,0,2,29,https://doi.org/10.1093/CZOOLO/61.5.854 -0,0,23,35,0,1240,29531,2,50,https://doi.org/10.1093/DATABASE/BAP020 -0,0,18,82,0,1265,26496,2,43,https://doi.org/10.1093/HMG/DDT384 -0,0,21,66,0,1876,21830,2,37,https://doi.org/10.1093/JHERED/ESI094 -0,0,21,113,5,1497,43204,2,27,https://doi.org/10.1093/MOLBEV/MSH194 +0,0,28,95,0,1104,33109,2,37,https://doi.org/10.1093/BIOSTATISTICS/KXJ032 +0,0,23,68,5,1069,0,2,31,https://doi.org/10.1093/CZOOLO/61.5.854 +0,0,23,35,0,1240,29531,2,52,https://doi.org/10.1093/DATABASE/BAP020 +0,0,18,82,0,1265,26532,2,45,https://doi.org/10.1093/HMG/DDT384 +0,0,21,66,0,1876,21882,2,37,https://doi.org/10.1093/JHERED/ESI094 +0,0,21,113,5,1497,43220,2,27,https://doi.org/10.1093/MOLBEV/MSH194 0,0,18,98,0,1442,17079,2,53,https://doi.org/10.1093/NAR/GKG014 -0,0,18,68,0,2330,29041,2,38,https://doi.org/10.1093/NAR/GKH408 -0,0,18,153,0,1328,47191,2,25,https://doi.org/10.1093/NAR/GNJ005 -0,0,18,80,0,1088,17343,2,37,https://doi.org/10.1093/PCP/PCR141 -0,0,23,64,0,767,32329,2,0,https://doi.org/10.1093/PROTEIN/12.1.15 -0,0,25,102,5,2036,38521,2,35,https://doi.org/10.1080/10635150500541599 +0,0,18,68,0,2330,29049,2,38,https://doi.org/10.1093/NAR/GKH408 +0,0,18,153,0,1328,47299,2,25,https://doi.org/10.1093/NAR/GNJ005 +0,0,18,80,0,1088,17363,2,39,https://doi.org/10.1093/PCP/PCR141 +0,0,23,64,0,767,32545,2,0,https://doi.org/10.1093/PROTEIN/12.1.15 +0,0,25,102,5,2036,38569,2,35,https://doi.org/10.1080/10635150500541599 # keywords question -0,0,21,83,9,1493,39412,2,37,https://doi.org/10.1093/GLYCOB/CWJ049 +0,0,21,83,9,1493,39611,2,37,https://doi.org/10.1093/GLYCOB/CWJ049 # citeseerx # registry from oaDOI @@ -61,20 +61,20 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site # wiley_full 0,0,28,83,0,511,0,1,0,https://currentprotocols.onlinelibrary.wiley.com/doi/full/10.1002/0471140864.ps1605s14 0,0,29,41,0,0,0,1,0,https://onlinelibrary.wiley.com/doi/full/10.1002/9780471650126.dob0949 -0,0,59,75,0,2214,48472,2,39,https://onlinelibrary.wiley.com/doi/full/10.1002/1097-0134(20001001)41:1<108::AID-PROT130>3.0.CO;2-S +0,0,59,75,0,2214,49801,2,39,https://onlinelibrary.wiley.com/doi/full/10.1002/1097-0134(20001001)41:1<108::AID-PROT130>3.0.CO;2-S 0,0,52,129,0,1123,0,2,33,https://onlinelibrary.wiley.com/doi/full/10.1002/1098-2272(2000)19:1%2B<::AID-GEPI15>3.0.CO;2-1 0,0,22,84,0,809,18751,2,44,https://onlinelibrary.wiley.com/doi/full/10.1002/ange.201507047 -0,0,20,73,0,1579,43185,2,28,https://onlinelibrary.wiley.com/doi/full/10.1002/cyto.a.20531 -0,0,18,58,0,1484,29518,2,40,https://onlinelibrary.wiley.com/doi/full/10.1002/humu.21438 -0,0,17,93,0,1088,24465,2,54,https://onlinelibrary.wiley.com/doi/full/10.1002/jcc.10386 +0,0,20,73,0,1579,45757,2,28,https://onlinelibrary.wiley.com/doi/full/10.1002/cyto.a.20531 +0,0,18,58,0,1484,29517,2,40,https://onlinelibrary.wiley.com/doi/full/10.1002/humu.21438 +0,0,17,93,0,1088,25018,2,54,https://onlinelibrary.wiley.com/doi/full/10.1002/jcc.10386 0,0,22,110,0,1452,0,2,57,https://onlinelibrary.wiley.com/doi/full/10.1002/pmic.200300402 0,0,22,166,0,459,0,2,53,https://onlinelibrary.wiley.com/doi/full/10.1002/pmic.200300483 -0,0,18,28,0,828,49480,2,49,https://onlinelibrary.wiley.com/doi/full/10.1002/prot.10146 +0,0,18,28,0,828,60976,2,49,https://onlinelibrary.wiley.com/doi/full/10.1002/prot.10146 0,0,64,71,0,1332,30808,2,60,https://onlinelibrary.wiley.com/doi/full/10.1002/(SICI)1097-0061(20000130)16:2<177::AID-YEA516>3.0.CO;2-9 0,0,20,52,0,694,10006,2,34,https://onlinelibrary.wiley.com/doi/full/10.1038/clpt.2012.96 0,0,32,198,0,2188,33157,2,33,https://onlinelibrary.wiley.com/doi/full/10.1046/j.1469-1809.2003.00030.x 0,0,21,59,0,1779,0,2,41,https://onlinelibrary.wiley.com/doi/full/10.1055/s-2004-817909 -0,0,25,55,0,1052,33768,2,29,https://onlinelibrary.wiley.com/doi/full/10.1111/1755-0998.12009.x +0,0,25,55,0,1052,34407,2,29,https://onlinelibrary.wiley.com/doi/full/10.1111/1755-0998.12009.x 0,0,23,123,0,1294,29792,2,33,https://onlinelibrary.wiley.com/doi/full/10.1111/2041-210X.12628 # sciencedirect @@ -96,30 +96,28 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site # springer 0,0,18,83,4,665,0,1,0,https://doi.org/10.1007/BF00182187 -0,0,25,97,6,1028,23593,2,0,https://doi.org/10.1007/s00216-009-3166-1 +0,0,25,97,6,1028,23557,2,22,https://doi.org/10.1007/s00216-009-3166-1 0,0,27,112,5,1053,0,2,0,https://doi.org/10.1007/978-0-387-49317-6_9 0,0,27,89,5,1108,0,1,0,https://doi.org/10.1007/978-1-4939-0366-5_8 0,0,21,54,3,759,0,2,0,https://doi.org/10.1007/s002510050595 -0,0,27,166,5,1409,0,2,34,https://doi.org/10.1016/j.jasms.2003.12.011 -0,0,29,132,5,1136,0,2,0,https://doi.org/10.1016/S1044-0305(01)00301-4 -0,0,23,108,8,1646,0,2,0,https://doi.org/10.1023/A:1006960004440 -0,0,25,86,5,1708,0,2,0,https://doi.org/10.1134/S1021443716020175 -0,0,26,41,3,936,0,2,0,https://doi.org/10.1140/epje/i2007-10314-1 -0,0,19,87,4,657,0,2,0,https://doi.org/10.1385/MB:22:3:301 -0,0,32,89,5,1191,0,1,35,https://doi.org/10.2165/00822942-200594030-00002 +0,0,23,108,8,1646,0,2,12,https://doi.org/10.1023/A:1006960004440 +0,0,25,86,5,1708,2206,2,11,https://doi.org/10.1134/S1021443716020175 +0,0,26,41,3,936,0,2,9,https://doi.org/10.1140/epje/i2007-10314-1 +0,0,19,87,4,657,0,2,18,https://doi.org/10.1385/MB:22:3:301 +0,0,32,89,5,1191,0,1,12,https://doi.org/10.2165/00822942-200594030-00002 0,0,25,44,10,1837,0,1,0,https://doi.org/10.1007/978-3-319-24277-4 0,0,23,135,5,0,0,1,0,https://doi.org/10.1007/0-306-47084-5_3 0,0,19,66,5,557,0,2,0,https://doi.org/10.1007/11564096_50 0,0,27,112,5,1053,0,2,0,https://doi.org/10.1007/978-0-387-49317-6_9 0,0,25,60,5,311,0,1,0,https://doi.org/10.1385/0-89603-276-0:267 -0,0,0,31,0,1825,0,1,0,https://doi.org/10.1007/978-1-4419-9863-7 +0,0,25,31,0,1825,0,1,0,https://doi.org/10.1007/978-1-4419-9863-7 # springer_ref 0,0,30,7,0,519,978,1,47,https://doi.org/10.1007/978-1-4419-9863-7_1039 0,0,30,8,0,797,973,1,33,https://doi.org/10.1007/978-1-4419-9863-7_1352 # biomedcentral -0,0,24,76,4,1279,31742,2,16,https://doi.org/10.1186/1471-2105-10-110 +0,0,24,76,4,1279,31745,2,16,https://doi.org/10.1186/1471-2105-10-110 0,0,25,69,5,1695,52426,2,11,https://doi.org/10.1186/s12859-015-0686-x 0,0,21,76,5,1640,20524,2,12,https://doi.org/10.1186/1471-2105-4-1 0,0,27,61,5,2497,55107,2,37,https://doi.org/10.1186/1471-2105-10-S10-S8 @@ -128,13 +126,13 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,33,69,5,1591,40232,2,12,https://doi.org/10.1186/gb-2002-3-12-research0077 0,0,24,120,5,681,57152,2,31,https://doi.org/10.1186/gb-2014-15-2-r35 0,0,14,98,5,0,7805,2,22,https://doi.org/10.1186/gb4173 -0,0,25,77,5,671,79355,2,23,https://doi.org/10.1186/s13059-014-0405-3 +0,0,25,77,5,671,79385,2,23,https://doi.org/10.1186/s13059-014-0405-3 0,0,24,56,5,1707,16658,2,27,https://doi.org/10.1186/1471-2164-10-375 0,0,25,109,5,1458,49717,2,13,https://doi.org/10.1186/s12864-015-1704-0 0,0,22,96,5,1359,22201,2,18,https://doi.org/10.1186/1756-0500-1-30 0,0,21,76,5,1761,36966,2,13,https://doi.org/10.1186/1752-0509-1-2 0,0,25,137,5,1157,109882,2,20,https://doi.org/10.1186/s12918-015-0211-x -0,0,21,92,5,1982,54025,2,13,https://doi.org/10.1186/1748-7188-2-1 +0,0,21,92,5,1982,54027,2,13,https://doi.org/10.1186/1748-7188-2-1 0,0,21,62,5,2184,17764,2,12,https://doi.org/10.1186/1751-0473-2-1 0,0,24,147,5,1889,58187,2,16,https://doi.org/10.1186/1471-2148-10-210 0,0,22,97,5,2106,51167,2,21,https://doi.org/10.1186/1756-8935-3-20 @@ -142,7 +140,7 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,21,53,5,1625,58798,2,18,https://doi.org/10.1186/1756-0381-3-1 0,0,26,109,5,0,157,2,0,https://doi.org/10.1186/1297-9686-26-6-537 0,0,22,107,5,1389,41423,2,13,https://doi.org/10.1186/1297-9686-44-9 -0,0,22,49,5,2043,33047,2,20,https://doi.org/10.1186/1472-6807-9-44 +0,0,22,49,5,2043,33052,2,20,https://doi.org/10.1186/1472-6807-9-44 # cshlp 0,0,19,20,0,1121,0,3,0,https://doi.org/10.1101/gr.10.4.511 @@ -339,7 +337,7 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site # f1000research_posters # pdf_a not working (because href="#") -0,0,0,59,5,3934,0,1,41,https://doi.org/10.7490/f1000research.1110127.1 +0,0,0,59,5,3933,0,1,41,https://doi.org/10.7490/f1000research.1110127.1 0,0,0,86,8,768,0,1,89,https://doi.org/10.7490/f1000research.1112656.1 0,0,0,97,6,0,0,1,33,https://doi.org/10.7490/f1000research.1113436.1 @@ -358,7 +356,7 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,22,80,5,1218,0,1,0,http://orbit.dtu.dk/en/publications/netphosbac--a-predictor-for-serthr-phosphorylation-sites-in-bacterial-proteins(9faf0130-30a2-4a89-80ec-875b20c82e67).html # tandfonline -0,0,30,62,0,1621,0,2,0,http://www.tandfonline.com/doi/abs/10.1080/07391102.2005.10507020 +0,0,30,62,0,1631,0,2,0,http://www.tandfonline.com/doi/abs/10.1080/07391102.2005.10507020 0,0,21,68,0,0,0,2,0,http://www.tandfonline.com/doi/abs/10.1081/CNV-120016428 0,0,25,61,5,1657,0,2,0,http://www.tandfonline.com/doi/abs/10.1198/jasa.2009.ap07611 0,0,28,133,6,1515,0,3,0,http://www.tandfonline.com/doi/abs/10.1080/07391102.2014.968875 @@ -367,9 +365,9 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site # tandfonline_full 0,0,21,68,0,0,0,1,0,http://www.tandfonline.com/doi/full/10.1081/CNV-120016428 -0,0,28,133,6,1515,33334,2,0,http://www.tandfonline.com/doi/full/10.1080/07391102.2014.968875 -0,0,29,164,6,1817,41664,2,0,http://www.tandfonline.com/doi/full/10.1080/07391102.2015.1095116 -0,0,22,117,0,0,7094,2,0,http://www.tandfonline.com/doi/full/10.1586/14789450.3.1.1 +0,0,28,133,6,1515,32940,2,0,http://www.tandfonline.com/doi/full/10.1080/07391102.2014.968875 +0,0,29,164,6,1817,40515,2,0,http://www.tandfonline.com/doi/full/10.1080/07391102.2015.1095116 +0,0,22,117,0,0,7091,2,0,http://www.tandfonline.com/doi/full/10.1586/14789450.3.1.1 # asm 0,0,20,196,0,2009,36823,2,45,https://doi.org/10.1128/JCM.00540-08 @@ -384,7 +382,7 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,23,62,0,627,0,2,0,https://doi.org/10.1145/2618243.2618289 # degruyter -0,0,22,81,4,973,0,1,0,https://doi.org/10.2202/1544-6115.1046 +0,0,22,81,4,975,0,1,0,https://doi.org/10.2202/1544-6115.1046 0,0,22,61,6,1545,0,1,0,https://doi.org/10.1515/1544-6115.1753 0,0,23,76,4,1459,0,1,0,https://doi.org/10.1515/sagmb-2012-0046 @@ -470,8 +468,8 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,29,86,0,496,0,1,0,https://doi.org/10.3233/978-1-61499-769-6-182 # researchgate -0,0,0,89,0,1608,0,2,0,https://doi.org/10.13140/RG.2.1.2763.4807 -0,0,0,73,0,1435,0,2,0,https://doi.org/10.13140/RG.2.1.3547.6561 +0,0,25,89,0,1608,0,2,0,https://doi.org/10.13140/RG.2.1.2763.4807 +0,0,25,73,0,1435,0,2,0,https://doi.org/10.13140/RG.2.1.3547.6561 # frontiersin 0,0,24,215,0,0,7809,2,0,https://doi.org/10.3389/FGENE.2014.00130 @@ -497,7 +495,7 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,27,95,0,579,30064,2,0,https://doi.org/10.1534/GENETICS.107.085332 # plantphysiol -0,0,17,70,0,1268,30332,2,31,https://doi.org/10.1104/PP.011577 +0,0,17,70,0,1268,30332,2,0,https://doi.org/10.1104/PP.011577 0,0,21,118,0,2160,80460,2,32,https://doi.org/10.1104/PP.110.156851 0,0,19,90,0,1123,26486,2,58,https://doi.org/10.1104/PP.15.01327 @@ -506,10 +504,10 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,22,112,0,1600,76501,2,39,https://doi.org/10.1105/TPC.113.121913 # bloodjournal, now ashpublications.org -0,0,28,67,2,0,5034,2,0,https://doi.org/10.1182/BLOOD-2010-04-282616 +0,0,28,67,2,0,5050,2,0,https://doi.org/10.1182/BLOOD-2010-04-282616 # bloodadvances, now ashpublications.org -0,0,32,107,12,1950,34205,2,0,https://doi.org/10.1182/BLOODADVANCES.2016000794 +0,0,32,107,12,1950,34217,2,0,https://doi.org/10.1182/BLOODADVANCES.2016000794 # biochemj, portlandpress.com 0,0,17,133,0,1700,0,1,0,https://doi.org/10.1042/BJ3080801 @@ -523,14 +521,14 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site # zenodo 0,0,22,46,0,59,0,1,0,https://doi.org/10.5281/ZENODO.1251638 0,0,22,19,11,546,0,1,0,https://doi.org/10.5281/ZENODO.1217112 -0,0,20,82,7,835,0,1,0,https://doi.org/10.5281/ZENODO.34090 +0,0,20,6,7,835,0,1,0,https://doi.org/10.5281/ZENODO.34090 0,0,21,53,0,83,0,1,0,https://doi.org/10.5281/ZENODO.573771 0,0,0,58,0,2627,0,2,0,https://zenodo.org/record/1259625 0,0,0,46,0,46,0,2,0,https://zenodo.org/record/1233395 # future-science -0,0,0,101,4,1334,37006,2,36,https://doi.org/10.2144/000113999 -0,0,0,126,5,724,10940,2,40,https://doi.org/10.2144/000113978 +0,0,0,101,4,1334,37014,2,36,https://doi.org/10.2144/000113999 +0,0,0,126,5,724,10943,2,40,https://doi.org/10.2144/000113978 # jstatsoft 0,0,21,68,0,1302,0,2,0,https://doi.org/10.18637/JSS.V046.I11 @@ -559,13 +557,13 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,20,159,5,1579,0,3,6,https://doi.org/10.3390/ijms15057594 # mdpi_full -0,0,16,154,8,1489,40737,2,22,https://www.mdpi.com/1999-4915/4/11/3209/htm -0,0,20,80,5,1168,22350,2,16,https://www.mdpi.com/1422-0067/17/8/1215/htm -0,0,20,77,4,1175,27055,2,34,https://www.mdpi.com/1422-0067/18/2/274/htm -0,0,16,75,3,1287,50026,2,12,https://www.mdpi.com/1999-4893/6/2/352/htm -0,0,20,87,4,2209,33410,2,7,https://www.mdpi.com/1422-0067/20/5/1070/htm -0,0,21,129,3,1648,52141,2,18,https://www.mdpi.com/2218-1989/6/4/39/htm -0,0,20,159,5,1564,23971,2,6,https://www.mdpi.com/1422-0067/15/5/7594/htm +0,0,16,154,8,1489,40855,2,22,https://www.mdpi.com/1999-4915/4/11/3209/htm +0,0,20,80,5,1168,22359,2,16,https://www.mdpi.com/1422-0067/17/8/1215/htm +0,0,20,77,4,1175,27089,2,34,https://www.mdpi.com/1422-0067/18/2/274/htm +0,0,16,75,3,1265,50015,2,12,https://www.mdpi.com/1999-4893/6/2/352/htm +0,0,20,87,4,2209,33431,2,7,https://www.mdpi.com/1422-0067/20/5/1070/htm +0,0,21,129,3,1648,52188,2,18,https://www.mdpi.com/2218-1989/6/4/39/htm +0,0,20,159,5,1564,24019,2,6,https://www.mdpi.com/1422-0067/15/5/7594/htm # preprints 0,0,0,26,4,976,0,2,0,https://doi.org/10.20944/preprints201905.0056.v1 @@ -585,7 +583,7 @@ pmid,pmcid,doi,title,keywords,abstract,fulltext,links,corresp,site 0,0,0,56,2,1770,0,2,0,https://doi.org/10.26434/chemrxiv.8178722.v1 # iop -0,0,24,120,0,2178,55355,2,0,https://doi.org/10.1088/1741-2552/ab208d +0,0,24,120,0,2178,55211,2,0,https://doi.org/10.1088/1741-2552/ab208d 0,0,24,100,0,0,6170,2,0,https://doi.org/10.1088/1752-7163/ab2fa2 0,0,24,135,0,2014,31338,2,0,https://doi.org/10.1088/1361-6560/ab2f47 diff --git a/core/src/main/resources/scrape/journals.yaml b/core/src/main/resources/scrape/journals.yaml index 7b321ae..58ebaa5 100644 --- a/core/src/main/resources/scrape/journals.yaml +++ b/core/src/main/resources/scrape/journals.yaml @@ -1,5 +1,5 @@ # -# Copyright © 2016, 2018 Erik Jaaniso +# Copyright © 2016, 2018, 2020 Erik Jaaniso # # This file is part of PubFetcher. # @@ -156,8 +156,8 @@ oup: doi: .ww-citation-primary > a title: h1 keywords: .kwd-group > .kwd-part - abstract: section.abstract > p, section.abstract > .sec > * - fulltext: .widget-ArticleFulltext > div > div > :not(.abstract-title):not(section.abstract):not(.article-metadata-panel):not(.article-metadata-standalone-panel):not(.kwd-group):not(.backreferences-title):not(.ref-list):not(.copyright):not(.reveal-modal):not(.license):not(.authorNotes-section-title):not(.authorNotes-section-title + p):not(.backacknowledgements-title):not(.backacknowledgements-title + p):not(.pdf-notice):not(:has(.footnote-content)):not(.table-modal) + abstract: section.abstract > p, section.abstract > .sec > *, section.abstract > div > .title, section.abstract > div > p + fulltext: .widget-ArticleFulltext > div > div > :not(.abstract-title):not(section.abstract):not(.article-metadata-panel):not(.article-metadata-standalone-panel):not(.kwd-group):not(.backreferences-title):not(.ref-list):not(.copyright):not(.reveal-modal):not(.license):not(.authorNotes-section-title):not(.authorNotes-section-title + p):not(.backacknowledgements-title):not(.backacknowledgements-title + p):not(.pdf-notice):not(:has(.footnote-content)):not(.table-modal):not(.widget-FirstPagePreview) pdf_a: .article-pdfLink corresp_author_names: .info-card-author:has(.info-author-correspondence) .info-card-name corresp_author_emails: .info-card-author .info-author-correspondence a, .info-card-author:has(.info-author-correspondence) .info-card-affilitation a[href^=mailto] @@ -212,9 +212,8 @@ springer: keywords: .Keyword, .c-article-subject-list__subject abstract: 'section.Abstract > p, #book-description > *, #Abs1-content > p' fulltext: '#body > section > *, article div[data-article-body=true] > section > :not(#abstract-section):not(#Abs1-section):not(#references-section):not(#Bib1-section):not(#acknowledgements-section):not(#Ack1-section):not(#author-information-section):not(#author-contribution-section):not(#article-comments-section):not(#rightslink-section):not(#article-info-section):not(#further-reading-section):not(#ethics-section) > *' - pdf_a: '.test-pdf-link > :not([id$=no_access_banner]) > a' - corresp_author_names: .authors__list li:has(.authors__contact) .authors__name - corresp_author_emails: .authors__list li .authors__contact a + pdf_a: '.test-pdf-link > :not([id$=no_access_banner]) > a, .c-pdf-download__link' + corresp_author_names: .c-author-list a[data-test=author-name]:has(.u-icon) springer_ref: doi: .FulltextWrapper .ChapterDOI @@ -250,8 +249,8 @@ nature: doi: 'h3:has(abbr[title=Digital Object Identifier]) + p > a[data-track-action=view doi], #article-info-section a[data-track-action=view doi]' title: 'article header h1, #article #content > .article > h2.article-title' keywords: div[data-component=article-subject-links] a[data-track-action=view subject], .c-article-subject-list__subject - abstract: 'article div[data-article-body=true] > section > #abstract-section > div > *, article div[data-article-body=true] > section > #Abs1-section > div > *, article div[data-article-body=true] > section > #Abs2-section > div > *, #article #content > .article > #Summary + p' - fulltext: 'article div[data-article-body=true] > section > :not(#abstract-section):not(#Abs1-section):not(#Abs2-section):not(#references-section):not(#Bib1-section):not(#acknowledgements-section):not(#Ack1-section):not(#author-information-section):not(#author-contribution-section):not(#article-comments-section):not(#rightslink-section):not(#article-info-section):not(#further-reading-section):not(#ethics-section) > *, article div[data-article-body=true] > div:not(:has(#access-options)):not([aria-hidden=true]):not(.c-pdf-button__container)' + abstract: 'article .c-article-body > section > #abstract-section > div > *, article .c-article-body > section > #Abs1-section > div > *, article .c-article-body > section > #Abs2-section > div > *, #article #content > .article > #Summary + p' + fulltext: 'article .c-article-body > section > :not(#abstract-section):not(#Abs1-section):not(#Abs2-section):not(#references-section):not(#Bib1-section):not(#acknowledgements-section):not(#Ack1-section):not(#author-information-section):not(#author-contribution-section):not(#article-comments-section):not(#rightslink-section):not(#article-info-section):not(#further-reading-section):not(#ethics-section) > *, article .c-article-body > div:not(:has(#access-options)):not([aria-hidden=true]):not(.c-pdf-button__container)' pdf_a: a[data-article-pdf] corresp_author_names: .c-author-list a[data-test=author-name]:has(.u-icon) @@ -412,7 +411,7 @@ f1000research_articles: f1000research_posters: title: .asset-title > h1 keywords_split: .asset-subcontainer__title:containsOwn(Keywords) + * - abstract: .abstract__content + abstract: .abstract__content:not(:has(.abstract__content)) # pdf_a not working (because href="#") corresp_author_names: .asset-details-container-large .asset-authors > a corresp_author_emails: .asset-details-container-large .asset-authors > a @@ -468,11 +467,11 @@ acm: pdf_a: article .citation a[title=PDF] degruyter: - doi: .article-doi - title: h1.entryTitle - keywords: .articleBody_keywords > a - abstract: .articleBody_abstract > p - pdf_a: .gs-access-fullcontentlink .pdf-link + doi: dt.c-List__item:matchesOwn(^DOI:$) + dd.c-List__item > a + title: '#pageBody h1' + keywords: dt.c-List__item:matchesOwn(^Keywords:$) + dd.c-List__item > a + abstract: '#pageBody section.abstract > p' + pdf_a: '#pdf-download' hindawi: doi: .middle_content pre a[href^=http://dx.doi.org], .article_citation a[aria-label=Doi-link] @@ -573,9 +572,9 @@ iospress: abstract: '#contentcolumn .metadata > .abstract > p' researchgate: - #doi: .publication-details__section > .publication-meta > div:first-child + div - title: .publication-header > h1, .publication-details__section > h1 - abstract: .publication-abstract > :not(:first-child), .publication-details__section > div > div:matches(^Abstract$) + div + doi: .research-detail-header-section__metadata > div:matchesOwn(^DOI:) > a + title: main h1 + abstract: main .research-detail-middle-section__abstract pdf_a: a:matches(^Download full-text PDF$) frontiersin: @@ -649,13 +648,13 @@ mdpi: corresp_author_names: .art-authors > span:has(.emailCaptcha) > span > a mdpi_full: - doi: article .html-art-header a - title: '#html-article-title' - keywords_split: '#html-keywords > div' - abstract: '#html-abstract > div' - fulltext: 'article > .html-body > section > *, article > .html-back > #html-glossary' - pdf_a: article .download > a:matchesOwn(^Download PDF$) - corresp_author_names: .html-author-group > span:has(.emailCaptcha) > span > a + doi: '#abstract .bib-identity > a' + title: '#abstract h1' + keywords: '#abstract .art-keywords a' + abstract: '#abstract .art-abstract' + fulltext: 'article .html-body section > :not(section), article #SupplementaryFiles, article #html-glossary' + pdf_a: '#abstract .download > a:matchesOwn(^Download PDF$)' + corresp_author_names: .art-authors > span:has(.emailCaptcha) > span > a preprints: title: '#submission-content > h1' @@ -669,16 +668,16 @@ aip: title: .publicationContentTitle > h1 keywords: .topicList a:matches(^Topics$) + ul > .topicTags abstract: article .abstractSection - fulltext: .hlFld-Fulltext .NLM_sec > :not(.NLM_sec) - pdf_a: .article-menu > .download-pdf a.pdf + fulltext: .hlFld-Fulltext .NLM_sec > :not(.NLM_sec):not(.sectionInfo), .hlFld-Fulltext .NLM_sec > .sectionInfo > .sectionHeading + pdf_a: .article-menu > .download-pdf a:not(.notVisible) corresp_author_names: .entryAuthor > .contrib-author:has(.email) > a[href^=/author/] # corresp_author_emails: .entryAuthor > .contrib-author > .email # email-protection chemrxiv: - title: .item-left > h2.title - keywords: .item-right > .tags a - abstract: .item-left > .description - pdf_a: .actions-bar .download-button + title: main h2 + keywords: main h3:matchesOwn(^Keyword\(s\)$) + div > a + abstract: main ._1bqUT + pdf_a: '#a11y-1-tab-tab-download' iop: doi: '#doi' @@ -699,7 +698,6 @@ iop: - 'ieeexplore\.ieee\.org' - 'xlink\.rsc\.org' # for fulltext_a, pdf_a - 'pubs\.rsc\.org' # for fulltext_a, pdf_a -- 'f1000research\.com/+posters' # highwire2 - 'mcponline\.org' diff --git a/core/src/main/resources/scrape/webpages.csv b/core/src/main/resources/scrape/webpages.csv index ea79349..19abe3f 100644 --- a/core/src/main/resources/scrape/webpages.csv +++ b/core/src/main/resources/scrape/webpages.csv @@ -1,25 +1,25 @@ title,content,license,language,webpage # bioconductor.org -9,642,5,8,http://bioconductor.org/packages/release/bioc/html/RIPSeeker.html -7,835,12,8,https://bioconductor.org/packages/release/bioc/html/biomaRt.html +9,536,5,0,http://bioconductor.org/packages/release/bioc/html/RIPSeeker.html +7,878,12,18,https://bioconductor.org/packages/release/bioc/html/biomaRt.html 9,264,12,18,http://bioconductor.org/packages/release/bioc/html/BiocStyle.html 8,640,5,28,http://bioconductor.org/packages/release/bioc/html/synapter.html 10,247,5,8,http://www.bioconductor.org/packages//2.13/bioc/html/DAVIDQuery.html # bioconductor.org vignettes -23,54890,0,0,http://bioconductor.org/packages/release/bioc/vignettes/EBImage/inst/doc/EBImage-introduction.html -69,30057,0,0,http://bioconductor.org/packages/release/bioc/vignettes/IHW/inst/doc/introduction_to_ihw.html +23,54306,0,0,http://bioconductor.org/packages/release/bioc/vignettes/EBImage/inst/doc/EBImage-introduction.html +69,30055,0,0,http://bioconductor.org/packages/release/bioc/vignettes/IHW/inst/doc/introduction_to_ihw.html # git.bioconductor.org 0,0,0,0,https://git.bioconductor.org/packages/RIVER # github.com -7,942,0,11,https://github.com/sgoodswe/vacceed +7,175,0,11,https://github.com/sgoodswe/vacceed 4,5887,0,27,https://github.com/SciLifeLab/facs/ -10,10673,7,6,https://github.com/petmri/ROCKETSHIP -4,1863,3,7,https://github.com/pschorderet/NEAT -9,422,3,24,https://github.com/arq5x/poretools +10,10673,15,6,https://github.com/petmri/ROCKETSHIP +4,1863,11,7,https://github.com/pschorderet/NEAT +9,422,11,24,https://github.com/arq5x/poretools # github.com user 7,1281,0,0,https://github.com/SeqWare @@ -49,16 +49,17 @@ title,content,license,language,webpage 4,493,0,0,https://github.com/chapmanb/bcbb/blob/master/nextgen/README.md # galaxy.pasteur.fr +# switched off, as requires JavaScript that for this case is too resource intensive for some reason # java.lang.NullPointerException: JavascriptThread has not created a Document! -0,0,0,0,https://galaxy.pasteur.fr/tool_runner?tool_id=toolshed.pasteur.fr/repos/odoppelt/taxonomy_analysis/taxoptimizer/0.0.2 -100,1845,0,0,https://galaxy.pasteur.fr/root?tool_id=aggregate_scores_in_intervals2 -85,1425,0,0,https://galaxy.pasteur.fr/root?tool_id=clustalw -50,187,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.g2.bx.psu.edu%2Frepos%2Fdevteam%2Fbam_to_sam%2Fbam_to_sam%2F1.0.3 -88,2368,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.pasteur.fr%2Frepos%2Fafelten%2Fmicrobiome_analyses%2FCD-HIT%2F4.6.1 +#0,0,0,0,https://galaxy.pasteur.fr/tool_runner?tool_id=toolshed.pasteur.fr/repos/odoppelt/taxonomy_analysis/taxoptimizer/0.0.2 +#100,1845,0,0,https://galaxy.pasteur.fr/root?tool_id=aggregate_scores_in_intervals2 +#85,1425,0,0,https://galaxy.pasteur.fr/root?tool_id=clustalw +#50,187,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.g2.bx.psu.edu%2Frepos%2Fdevteam%2Fbam_to_sam%2Fbam_to_sam%2F1.0.3 +#88,2368,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.pasteur.fr%2Frepos%2Fafelten%2Fmicrobiome_analyses%2FCD-HIT%2F4.6.1 # java.lang.NullPointerException: JavascriptThread has not created a Document! -0,0,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.pasteur.fr/repos/khillion/salmonella_crispr_typing/salmonella_crispr_typing/1.0.0 -75,774,0,0,https://galaxy.pasteur.fr/root?tool_id=vcf_intersect -101,73,0,0,https://galaxy.pasteur.fr/tool_runner?tool_id=CONVERTER_genbank_to_fasta +#0,0,0,0,https://galaxy.pasteur.fr/root?tool_id=toolshed.pasteur.fr/repos/khillion/salmonella_crispr_typing/salmonella_crispr_typing/1.0.0 +#75,774,0,0,https://galaxy.pasteur.fr/root?tool_id=vcf_intersect +#101,73,0,0,https://galaxy.pasteur.fr/tool_runner?tool_id=CONVERTER_genbank_to_fasta # emboss.open-bio.org # begins wrongly with "The master copies of EMBOSS documentation are available at on the EMBOSS Wiki." @@ -98,7 +99,7 @@ title,content,license,language,webpage 87,412,5,0,https://cran.r-project.org/web/packages/linkcomm/index.html 54,373,5,0,https://cran.r-project.org/web/packages/lme4/index.html 39,163,5,0,https://cran.r-project.org/web/packages/GUniFrac/index.html -0,4421,0,0,https://cran.r-project.org/web/packages/linkcomm/NEWS +0,4484,0,0,https://cran.r-project.org/web/packages/linkcomm/NEWS # cran.r-project.org vignettes 25,29820,0,0,https://cran.r-project.org/web/packages/pergola/vignettes/my-vignette.html @@ -122,13 +123,8 @@ title,content,license,language,webpage 0,0,0,0,https://bitbucket.org/paiyetan/xglycscan/src # sanger.ac.uk -7,233,0,0,http://www.sanger.ac.uk/science/tools/artemis -22,888,0,0,http://www.sanger.ac.uk/science/tools/ssahasnp - -# dna.leeds.ac.uk -21,1465,0,0,http://dna.leeds.ac.uk/AffyErrorRate/ -14,1519,0,0,http://dna.leeds.ac.uk/agile/AgileGenotyper/ -12,1762,0,0,http://dna.leeds.ac.uk/methylviewer/ +7,468,392,0,http://www.sanger.ac.uk/science/tools/artemis +8,1772,699,0,http://www.sanger.ac.uk/science/tools/ssahasnp # tools.proteomecenter.org 14,1004,0,0,http://tools.proteomecenter.org/Libra.php @@ -143,7 +139,7 @@ title,content,license,language,webpage # zhanglab.ccmb.med.umich.edu 47,776,0,0,https://zhanglab.ccmb.med.umich.edu/BSpred/ -79,3568,0,0,https://zhanglab.ccmb.med.umich.edu/EvoDesign/ +79,3747,0,0,https://zhanglab.ccmb.med.umich.edu/EvoDesign/ # bioinformatics.mdanderson.org 10,1261,6,4,http://bioinformatics.mdanderson.org/main/BreakTrans @@ -152,17 +148,13 @@ title,content,license,language,webpage 6,302,19,0,https://www.bcgsc.ca/resources/software/anchor # compomics.github.io -13,12795,0,0,https://compomics.github.io/projects/peptide-shaker.html +13,13547,0,0,https://compomics.github.io/projects/peptide-shaker.html 8,397,0,0,http://compomics.github.io/projects/reporter.html # rostlab.org wiki -7,8025,0,0,https://rostlab.org/owiki/index.php/CHOPPER +7,8021,0,0,https://rostlab.org/owiki/index.php/CHOPPER 10,1450,0,0,https://rostlab.org/owiki/index.php/Uniqueprot -# uea.ac.uk -5,3042,0,0,https://www.uea.ac.uk/computing/psiko -10,2306,0,0,http://www.uea.ac.uk/computing/my-closure - # rna.informatik.uni-freiburg.de 25,3437,0,0,http://rna.informatik.uni-freiburg.de/CopraRNA/Input.jsp 27,2231,0,0,http://rna.informatik.uni-freiburg.de/MARNA/Input.jsp @@ -174,7 +166,7 @@ title,content,license,language,webpage # pypi.org 16,779,26,6,https://pypi.python.org/pypi/omics_pipe -13,1963,35,69,http://pypi.python.org/pypi/plastid +13,2311,35,97,http://pypi.python.org/pypi/plastid 17,69,48,11,https://pypi.org/project/bio-apricot/ # bioinformatics.org wiki diff --git a/core/src/main/resources/scrape/webpages.yaml b/core/src/main/resources/scrape/webpages.yaml index f947d79..7bdd9b5 100644 --- a/core/src/main/resources/scrape/webpages.yaml +++ b/core/src/main/resources/scrape/webpages.yaml @@ -50,9 +50,9 @@ 'github\.com/+[^/]+/+[^/]+/*$': title: h1 > [itemprop=name] - content: '.repository-content [itemprop=about], #readme > .Box-body > * > *' - license: .numbers-summary a:has(.octicon-law):not(:matches(^View license$)) - language: .repository-lang-stats-numbers a:has(.percent:matchesOwn(^(100|[1-9][0-9]|[5-9])\.[0-9]%$)) > .lang # only take languages with percentage >= 5% + content: '.BorderGrid-cell > h2:matchesOwn(^About$) + p, #readme > .Box-body > * > *' + license: a:has(.octicon-law):not(:matches(^View license$)) + language: a[href~=/search\?l=]:has(span:matchesOwn(^(100|[1-9][0-9]|[5-9])\.[0-9]%$)) > span.text-bold 'github\.com/+[^/]+/*$': title: h1 @@ -60,7 +60,7 @@ 'github\.com/+[^/]+/+[^/]+/+wiki(/|$)': title: h1 > [itemprop=name] - content: 'h1.gh-header-title, #wiki-body > :not(#wiki-footer), #wiki-rightbar #wiki-pages-box a, #wiki-rightbar .wiki-custom-sidebar' + content: 'h1.gh-header-title, #wiki-body > :not(#wiki-footer), .wiki-rightbar #wiki-pages-box a, .wiki-rightbar .wiki-custom-sidebar' 'github\.com/+[^/]+/+[^/]+/+tree/': title: h1 > [itemprop=name] @@ -77,7 +77,7 @@ 'galaxy\.pasteur\.fr/+(root|tool_runner)': title: '#center .portlet-title-text' content: '#center .portlet-body .ui-form-title-text, #center .ui-form-help' - javascript: 'true' + javascript: 'false' # too resource intensive 'emboss\.open-bio\.org/+rel/': title: body > :first-child @@ -130,12 +130,9 @@ # javascript: 'true' 'sanger\.ac\.uk': - title: '#main-content > .pagetitle' - content: '#overview > :not(h4), .view-tool-type a' - -'dna\.leeds\.ac\.uk': - title: '#content > h1' - content: '#content > :not(h1)' + title: '#main .panel-outer h2' + content: '#main .panel-outer .button-container a, #main .panel-outer .intro' + license: '#main .panel-outer label:matchesOwn(^License and Citation$) + div' 'tools\.proteomecenter\.org': title: h1.firstHeading @@ -169,10 +166,6 @@ title: h1.firstHeading content: '#mw-content-text > :not(#toc)' -'uea\.ac\.uk': - title: '#breadcrumbs li.last > a' - content: .journal-content-article - 'rna\.informatik\.uni-freiburg\.de': title: .content > h1 content: .content > :not(h1):not(.footer) @@ -250,3 +243,9 @@ # Results in java.lang.StackOverflowError after ~1.5 minutes 'seltarbase\.org': javascript: 'false' + +'who\.int': + javascript: 'false' + +'ncbi\.nlm\.nih\.gov': + javascript: 'false' diff --git a/core/src/main/resources/test/europepmc-xml.csv b/core/src/main/resources/test/europepmc-xml.csv index a653c5e..648a5fc 100644 --- a/core/src/main/resources/test/europepmc-xml.csv +++ b/core/src/main/resources/test/europepmc-xml.csv @@ -3,7 +3,7 @@ id,pmid,pmcid,doi,title,keywords,abstract,fulltext,corresp,journal # keywords PMC3131510,0,10,25,133,6,1474,74547,70,42 PMC5123367,0,10,25,110,5,952,34856,40,26 -PMC3608327,8,10,22,57,4,1539,54438,42,38 +PMC3608327,8,10,22,57,4,1539,54439,42,38 PMC5210299,0,10,25,61,3,1153,31801,86,18 # abstract @@ -39,7 +39,7 @@ PMC1142402,8,10,18,100,0,1369,48472,54,22 PMC2730554,0,10,32,105,5,905,50952,45,16 PMC2784320,8,10,26,89,0,696,34874,36,14 PMC3775632,0,10,17,85,7,1169,32969,66,5 -PMC4300491,8,10,21,117,4,1595,62121,113,25 +PMC4300491,8,10,21,117,4,1595,62122,113,25 PMC4829337,8,10,29,109,10,1539,13365,34,11 PMC5180080,8,10,21,74,0,0,10424,36,30 diff --git a/core/src/main/resources/test/oadoi.csv b/core/src/main/resources/test/oadoi.csv index 7aaf8a2..cf0fc25 100644 --- a/core/src/main/resources/test/oadoi.csv +++ b/core/src/main/resources/test/oadoi.csv @@ -13,22 +13,20 @@ doi,oa,links,title,journal # 1 link 10.1101/gr.6861907,1,1,139,15 -10.1186/1471-2105-15-S11-S10,1,1,86,18 10.1107/S0907444910045749,1,1,50,59 -10.3389/fgene.2012.00035,1,1,110,21 10.6019/tol.eva-w.2016.00001.1,1,1,47,0 +10.1073/pnas.1523899113,1,1,97,47 + +# 2 links +10.1186/1471-2105-15-S11-S10,1,2,86,18 +10.1371/journal.pone.0167047,1,2,92,8 +10.1214/10-AOAS338,1,2,89,32 # 3 links +10.3389/fgene.2012.00035,1,3,110,21 10.1093/nar/gks1246,1,3,109,22 -10.1073/pnas.1523899113,1,3,97,47 -10.1371/journal.pone.0167047,1,3,92,8 -# 4 links +# 4 or more links 10.1038/nature12373,1,4,44,6 -10.1214/10-AOAS338,1,4,89,32 - -# 5 links 10.3897/BDJ.4.e8740,1,5,104,25 - -# 8 links 10.1371/journal.pgen.1002453,1,8,60,13 diff --git a/core/src/main/resources/test/pubmed-html.csv b/core/src/main/resources/test/pubmed-html.csv index dba5f6f..b330fd8 100644 --- a/core/src/main/resources/test/pubmed-html.csv +++ b/core/src/main/resources/test/pubmed-html.csv @@ -1,18 +1,18 @@ id,pmid,pmcid,doi,title,keywords,mesh,abstract # subtitle -11598180,8,8,22,28,0,21,1303 +11598180,8,8,22,27,0,21,1303 # keywords -27942268,8,10,25,111,5,0,951 -28049428,8,10,25,62,3,6,943 -27140611,8,10,23,100,5,13,930 +27942268,8,10,25,110,5,0,951 +28049428,8,10,25,61,3,6,977 +27140611,8,10,23,99,5,13,930 # abstract #27942268 #28049428 -20161787,8,10,28,78,0,15,1584 -17973982,8,10,23,80,0,7,2305 +20161787,8,10,28,77,0,15,1654 +17973982,8,10,23,79,0,7,2338 # no abstract -24727771,8,10,16,91,0,10,0 +24727771,8,10,16,90,0,10,0