From d03a43c7e826e1c1a7675dfe4520e895bffaebc8 Mon Sep 17 00:00:00 2001 From: xnn727 <125815443+xuanan20020@users.noreply.github.com> Date: Tue, 5 Dec 2023 19:27:33 +1100 Subject: [PATCH] Predatory journal checker (#10592) * add PredatoryJournalRepository class * Refactor PredatoryJournalRepository to match JournalAbbreviationRepository design * Add PredatoryJournalChecker and PredatoryJournalLoader classes * Add Integrity Message for en Resources * Integrate PredatoryJournalChecker into IntegrityCheck * Initialize PredatoryJournalRepository on Launch * Add PredatoryJournalCheckerTest and more logging * Refactor PredatoryJournalLoader to switch from temp dir to user's app data * Add MV file generation for predatory journal lists * update CHANGELOG.md * Refactor and create own record class for journal information move to own gradle task and file * checkstyle, rename methods in gradle * run rewrite * fix test * fix duplicate handling * fix gradle task fix zwsp in name * more exception handling * Make serializable to fix mvstore Just use simpler levenstein distance should be enough add javadoc * checkstyle * fuck you checkstyle * checkstyle * refactor test * use same copy behavior as for journal abbrevs * make link elemens non static fix checkstyle * fix static vars * Split loader into crawler and "real" loader * Checkstyle * Fix variable type --------- Co-authored-by: Siedlerchr Co-authored-by: Oliver Kopp --- CHANGELOG.md | 1 + build.gradle | 15 +- .../intellij-89-run-with-intellij.md | 6 +- .../jabref/cli/JournalListMvGenerator.java | 8 +- src/main/java/org/jabref/cli/Launcher.java | 3 + .../cli/PredatoryJournalsMvGenerator.java | 47 +++++ src/main/java/org/jabref/gui/Globals.java | 2 + src/main/java/org/jabref/gui/JabRefFrame.java | 1 + src/main/java/org/jabref/gui/MainMenu.java | 6 +- .../gui/integrity/IntegrityCheckAction.java | 7 +- .../logic/integrity/IntegrityCheck.java | 6 +- .../integrity/PredatoryJournalChecker.java | 29 ++++ 
.../PredatoryJournalInformation.java | 16 ++ .../PredatoryJournalListCrawler.java | 160 ++++++++++++++++++ .../predatory/PredatoryJournalListLoader.java | 32 ++++ .../predatory/PredatoryJournalRepository.java | 61 +++++++ src/main/resources/l10n/JabRef_en.properties | 1 + .../logic/integrity/IntegrityCheckTest.java | 12 +- .../PredatoryJournalCheckerTest.java | 70 ++++++++ 19 files changed, 467 insertions(+), 16 deletions(-) create mode 100644 src/main/java/org/jabref/cli/PredatoryJournalsMvGenerator.java create mode 100644 src/main/java/org/jabref/logic/integrity/PredatoryJournalChecker.java create mode 100644 src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalInformation.java create mode 100644 src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalListCrawler.java create mode 100644 src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalListLoader.java create mode 100644 src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalRepository.java create mode 100644 src/test/java/org/jabref/logic/integrity/PredatoryJournalCheckerTest.java diff --git a/CHANGELOG.md b/CHANGELOG.md index 6d4d298bfb7..c8b3ec8c5b3 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -16,6 +16,7 @@ Note that this project **does not** adhere to [Semantic Versioning](https://semv - We added a button to let users reset the cite command to the default value. [#10569](https://github.com/JabRef/jabref/issues/10569) - We added the option to use System Preference for Light/Dark Theme [#8729](https://github.com/JabRef/jabref/issues/8729). - We added [scholar.archive.org](https://scholar.archive.org/) as a new fetcher. [#10498](https://github.com/JabRef/jabref/issues/10498) +- We integrated predatory journal checking as part of the Integrity Checker based on the [check-bib-for-predatory](https://github.com/CfKu/check-bib-for-predatory). 
[koppor#348](https://github.com/koppor/jabref/issues/348) - We added a 'More options' section in the main table right click menu opening the preferences dialog. [#9432](https://github.com/JabRef/jabref/issues/9432) ### Changed diff --git a/build.gradle b/build.gradle index 36c746ae76b..e34673740da 100644 --- a/build.gradle +++ b/build.gradle @@ -327,8 +327,19 @@ tasks.register("generateJournalListMV", JavaExec) { !file("build/resources/main/journals/journal-list.mv").exists() } } -jar.dependsOn "generateJournalListMV" -compileTestJava.dependsOn "generateJournalListMV" + +tasks.register("generatePredatoryJournalListMV", JavaExec) { + group = "JabRef" + description = "Load predatory journal information from online sources to a H2 MVStore" + classpath = sourceSets.main.runtimeClasspath + mainClass = "org.jabref.cli.PredatoryJournalsMvGenerator" + onlyIf { + !file("build/resources/main/journals/predatory-journals.mv").exists() + } +} + +jar.dependsOn("generateJournalListMV", "generatePredatoryJournalListMV") +compileTestJava.dependsOn("generateJournalListMV","generatePredatoryJournalListMV") tasks.register('generateCitaviSource', XjcTask) { group = 'JabRef' diff --git a/docs/getting-into-the-code/guidelines-for-setting-up-a-local-workspace/intellij-89-run-with-intellij.md b/docs/getting-into-the-code/guidelines-for-setting-up-a-local-workspace/intellij-89-run-with-intellij.md index b0897e95a65..17ec4d6ea85 100644 --- a/docs/getting-into-the-code/guidelines-for-setting-up-a-local-workspace/intellij-89-run-with-intellij.md +++ b/docs/getting-into-the-code/guidelines-for-setting-up-a-local-workspace/intellij-89-run-with-intellij.md @@ -11,16 +11,16 @@ It is also possible to use IntelliJ's internal build and run system to launch Ja Due to [IDEA-119280](https://youtrack.jetbrains.com/issue/IDEA-119280), it is a bit more work. 1. Navigate to **File > Settings... > Build, Execution, Deployment > Build Tools > Gradle**. -2. 
Change the setting "Build an run using:" to "IntelliJ IDEA". +2. Change the setting "Build and run using:" to "IntelliJ IDEA". 3. Navigate to **File > Settings... > Build, Execution, Deployment > Compiler > Java Compiler**. 4. Uncheck `--Use 'release' option for cross-compilation`. 5. **Build > Build Project** 6. Open the project view (Alt+1 , on mac cmd>+1) 7. Copy all build resources to the folder of the build classes - 1. Navigate to the folder `out/production/resources` + 1. Navigate to the folder `build/resources/main` 2. Select all folders below (`bst`, `csl-locales`, ...) 3. Press Ctrl+C to mark them for copying - 4. Select the folder `classes` + 4. Select the folder `out/production/classes` 5. Press Ctrl+V to start the copy process 8. Locate the class `Launcher` (e.g., by ctrl+N and then typing `Launcher`). Press Enter to jump to that class.
diff --git a/src/main/java/org/jabref/cli/JournalListMvGenerator.java b/src/main/java/org/jabref/cli/JournalListMvGenerator.java index 9a4fd3d67c6..dad187dc81a 100644 --- a/src/main/java/org/jabref/cli/JournalListMvGenerator.java +++ b/src/main/java/org/jabref/cli/JournalListMvGenerator.java @@ -37,15 +37,15 @@ public static void main(String[] args) throws IOException { // we currently do not have good support for BibTeX strings "journal_abbreviations_ieee_strings.csv" - ); + ); Files.createDirectories(journalListMvFile.getParent()); try (DirectoryStream stream = Files.newDirectoryStream(abbreviationsDirectory, "*.csv"); MVStore store = new MVStore.Builder(). - fileName(journalListMvFile.toString()). - compressHigh(). - open()) { + fileName(journalListMvFile.toString()). + compressHigh(). + open()) { MVMap fullToAbbreviation = store.openMap("FullToAbbreviation"); stream.forEach(Unchecked.consumer(path -> { String fileName = path.getFileName().toString(); diff --git a/src/main/java/org/jabref/cli/Launcher.java b/src/main/java/org/jabref/cli/Launcher.java index a49ea92e865..2e8033267d6 100644 --- a/src/main/java/org/jabref/cli/Launcher.java +++ b/src/main/java/org/jabref/cli/Launcher.java @@ -12,6 +12,7 @@ import org.jabref.gui.Globals; import org.jabref.gui.MainApplication; import org.jabref.logic.journals.JournalAbbreviationLoader; +import org.jabref.logic.journals.predatory.PredatoryJournalListLoader; import org.jabref.logic.l10n.Localization; import org.jabref.logic.net.ProxyAuthenticator; import org.jabref.logic.net.ProxyPreferences; @@ -169,6 +170,8 @@ private static void initGlobals(PreferencesService preferences) { // Read list(s) of journal names and abbreviations Globals.journalAbbreviationRepository = JournalAbbreviationLoader .loadRepository(preferences.getJournalAbbreviationPreferences()); + Globals.predatoryJournalRepository = PredatoryJournalListLoader + .loadRepository(); Globals.entryTypesManager = preferences.getCustomEntryTypesRepository(); 
Globals.protectedTermsLoader = new ProtectedTermsLoader(preferences.getProtectedTermsPreferences()); diff --git a/src/main/java/org/jabref/cli/PredatoryJournalsMvGenerator.java b/src/main/java/org/jabref/cli/PredatoryJournalsMvGenerator.java new file mode 100644 index 00000000000..574e287abf8 --- /dev/null +++ b/src/main/java/org/jabref/cli/PredatoryJournalsMvGenerator.java @@ -0,0 +1,47 @@ +package org.jabref.cli; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; + +import org.jabref.logic.journals.predatory.PredatoryJournalInformation; +import org.jabref.logic.journals.predatory.PredatoryJournalListCrawler; + +import org.h2.mvstore.MVMap; +import org.h2.mvstore.MVStore; + +public class PredatoryJournalsMvGenerator { + public static void main(String[] args) throws IOException { + boolean verbose = (args.length == 1) && ("--verbose".equals(args[0])); + + Path predatoryJournalsMvFile = Path.of("build", "resources", "main", "journals", "predatory-journals.mv"); + Files.createDirectories(predatoryJournalsMvFile.getParent()); + + try (MVStore store = new MVStore.Builder() + .fileName(predatoryJournalsMvFile.toString()) + .compressHigh() + .backgroundExceptionHandler((t, e) -> { + System.err.println("Exception occurred in Thread " + t + "with exception " + e); + e.printStackTrace(); + }) + .open()) { + MVMap predatoryJournalsMap = store.openMap("PredatoryJournals"); + + PredatoryJournalListCrawler loader = new PredatoryJournalListCrawler(); + Set predatoryJournals = loader.loadFromOnlineSources(); + + var resultMap = predatoryJournals.stream().collect(Collectors.toMap(PredatoryJournalInformation::name, Function.identity(), + (predatoryJournalInformation, predatoryJournalInformation2) -> { + if (verbose) { + System.out.println("Double entry " + predatoryJournalInformation.name()); + } + return predatoryJournalInformation2; + })); + + 
predatoryJournalsMap.putAll(resultMap); + } + } +} diff --git a/src/main/java/org/jabref/gui/Globals.java b/src/main/java/org/jabref/gui/Globals.java index 04ead50bf63..06437d7f6c9 100644 --- a/src/main/java/org/jabref/gui/Globals.java +++ b/src/main/java/org/jabref/gui/Globals.java @@ -9,6 +9,7 @@ import org.jabref.gui.util.DefaultTaskExecutor; import org.jabref.gui.util.TaskExecutor; import org.jabref.logic.journals.JournalAbbreviationRepository; +import org.jabref.logic.journals.predatory.PredatoryJournalRepository; import org.jabref.logic.protectedterms.ProtectedTermsLoader; import org.jabref.logic.remote.RemotePreferences; import org.jabref.logic.remote.server.RemoteListenerServerManager; @@ -51,6 +52,7 @@ public class Globals { * Only GUI code is allowed to access it, logic code should use dependency injection. */ public static JournalAbbreviationRepository journalAbbreviationRepository; + public static PredatoryJournalRepository predatoryJournalRepository; /** * This field is initialized upon startup. 
diff --git a/src/main/java/org/jabref/gui/JabRefFrame.java b/src/main/java/org/jabref/gui/JabRefFrame.java index 8a829134b96..3f0f95a6955 100644 --- a/src/main/java/org/jabref/gui/JabRefFrame.java +++ b/src/main/java/org/jabref/gui/JabRefFrame.java @@ -488,6 +488,7 @@ private void initLayout() { taskExecutor, dialogService, Globals.journalAbbreviationRepository, + Globals.predatoryJournalRepository, entryTypesManager, undoManager, Globals.getClipboardManager()); diff --git a/src/main/java/org/jabref/gui/MainMenu.java b/src/main/java/org/jabref/gui/MainMenu.java index 8ac907cf6ed..589c740f448 100644 --- a/src/main/java/org/jabref/gui/MainMenu.java +++ b/src/main/java/org/jabref/gui/MainMenu.java @@ -65,6 +65,7 @@ import org.jabref.logic.importer.IdFetcher; import org.jabref.logic.importer.WebFetchers; import org.jabref.logic.journals.JournalAbbreviationRepository; +import org.jabref.logic.journals.predatory.PredatoryJournalRepository; import org.jabref.logic.l10n.Localization; import org.jabref.logic.util.OS; import org.jabref.model.entry.BibEntryTypesManager; @@ -82,6 +83,7 @@ public class MainMenu extends MenuBar { private final TaskExecutor taskExecutor; private final DialogService dialogService; private final JournalAbbreviationRepository abbreviationRepository; + private final PredatoryJournalRepository predatoryJournalRepository; private final BibEntryTypesManager entryTypesManager; private final UndoManager undoManager; private final ClipBoardManager clipBoardManager; @@ -95,6 +97,7 @@ public MainMenu(JabRefFrame frame, TaskExecutor taskExecutor, DialogService dialogService, JournalAbbreviationRepository abbreviationRepository, + PredatoryJournalRepository predatoryJournalRepository, BibEntryTypesManager entryTypesManager, UndoManager undoManager, ClipBoardManager clipBoardManager) { @@ -107,6 +110,7 @@ public MainMenu(JabRefFrame frame, this.taskExecutor = taskExecutor; this.dialogService = dialogService; this.abbreviationRepository = abbreviationRepository; 
+ this.predatoryJournalRepository = predatoryJournalRepository; this.entryTypesManager = entryTypesManager; this.undoManager = undoManager; this.clipBoardManager = clipBoardManager; @@ -224,7 +228,7 @@ private void createMenu() { quality.getItems().addAll( factory.createMenuItem(StandardActions.FIND_DUPLICATES, new DuplicateSearch(frame, dialogService, stateManager, preferencesService, entryTypesManager, taskExecutor)), factory.createMenuItem(StandardActions.MERGE_ENTRIES, new MergeEntriesAction(dialogService, stateManager, preferencesService)), - factory.createMenuItem(StandardActions.CHECK_INTEGRITY, new IntegrityCheckAction(frame, preferencesService, dialogService, stateManager, taskExecutor, abbreviationRepository)), + factory.createMenuItem(StandardActions.CHECK_INTEGRITY, new IntegrityCheckAction(frame, preferencesService, dialogService, stateManager, taskExecutor, abbreviationRepository, predatoryJournalRepository)), factory.createMenuItem(StandardActions.CLEANUP_ENTRIES, new CleanupAction(frame, preferencesService, dialogService, stateManager, taskExecutor)), new SeparatorMenuItem(), diff --git a/src/main/java/org/jabref/gui/integrity/IntegrityCheckAction.java b/src/main/java/org/jabref/gui/integrity/IntegrityCheckAction.java index 5e6e2df5a37..0acf35de17a 100644 --- a/src/main/java/org/jabref/gui/integrity/IntegrityCheckAction.java +++ b/src/main/java/org/jabref/gui/integrity/IntegrityCheckAction.java @@ -14,6 +14,7 @@ import org.jabref.logic.integrity.IntegrityCheck; import org.jabref.logic.integrity.IntegrityMessage; import org.jabref.logic.journals.JournalAbbreviationRepository; +import org.jabref.logic.journals.predatory.PredatoryJournalRepository; import org.jabref.logic.l10n.Localization; import org.jabref.model.database.BibDatabaseContext; import org.jabref.model.entry.BibEntry; @@ -29,19 +30,22 @@ public class IntegrityCheckAction extends SimpleCommand { private final PreferencesService preferencesService; private final StateManager stateManager; 
private final JournalAbbreviationRepository abbreviationRepository; + private final PredatoryJournalRepository predatoryJournalRepository; public IntegrityCheckAction(JabRefFrame frame, PreferencesService preferencesService, DialogService dialogService, StateManager stateManager, TaskExecutor taskExecutor, - JournalAbbreviationRepository abbreviationRepository) { + JournalAbbreviationRepository abbreviationRepository, + PredatoryJournalRepository predatoryJournalRepository) { this.frame = frame; this.stateManager = stateManager; this.taskExecutor = taskExecutor; this.preferencesService = preferencesService; this.dialogService = dialogService; this.abbreviationRepository = abbreviationRepository; + this.predatoryJournalRepository = predatoryJournalRepository; this.executable.bind(needsDatabase(this.stateManager)); } @@ -53,6 +57,7 @@ public void execute() { preferencesService.getFilePreferences(), preferencesService.getCitationKeyPatternPreferences(), abbreviationRepository, + predatoryJournalRepository, preferencesService.getEntryEditorPreferences().shouldAllowIntegerEditionBibtex()); Task> task = new Task<>() { diff --git a/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java b/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java index 1a4207c98c1..bcf87382a55 100644 --- a/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java +++ b/src/main/java/org/jabref/logic/integrity/IntegrityCheck.java @@ -6,6 +6,7 @@ import org.jabref.logic.citationkeypattern.CitationKeyPatternPreferences; import org.jabref.logic.journals.JournalAbbreviationRepository; +import org.jabref.logic.journals.predatory.PredatoryJournalRepository; import org.jabref.model.database.BibDatabase; import org.jabref.model.database.BibDatabaseContext; import org.jabref.model.entry.BibEntry; @@ -22,6 +23,7 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext, FilePreferences filePreferences, CitationKeyPatternPreferences citationKeyPatternPreferences, 
JournalAbbreviationRepository journalAbbreviationRepository, + PredatoryJournalRepository predatoryJournalRepository, boolean allowIntegerEdition) { this.bibDatabaseContext = bibDatabaseContext; @@ -40,7 +42,9 @@ public IntegrityCheck(BibDatabaseContext bibDatabaseContext, new CitationKeyDuplicationChecker(bibDatabaseContext.getDatabase()), new AmpersandChecker(), new LatexIntegrityChecker(), - new JournalInAbbreviationListChecker(StandardField.JOURNAL, journalAbbreviationRepository) + new JournalInAbbreviationListChecker(StandardField.JOURNAL, journalAbbreviationRepository), + new PredatoryJournalChecker(predatoryJournalRepository, + List.of(StandardField.JOURNAL, StandardField.PUBLISHER, StandardField.BOOKTITLE)) )); if (bibDatabaseContext.isBiblatexMode()) { entryCheckers.add(new UTF8Checker(bibDatabaseContext.getMetaData().getEncoding().orElse(StandardCharsets.UTF_8))); diff --git a/src/main/java/org/jabref/logic/integrity/PredatoryJournalChecker.java b/src/main/java/org/jabref/logic/integrity/PredatoryJournalChecker.java new file mode 100644 index 00000000000..eb4f7030ced --- /dev/null +++ b/src/main/java/org/jabref/logic/integrity/PredatoryJournalChecker.java @@ -0,0 +1,29 @@ +package org.jabref.logic.integrity; + +import java.util.List; +import java.util.Objects; + +import org.jabref.logic.journals.predatory.PredatoryJournalRepository; +import org.jabref.logic.l10n.Localization; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.Field; + +public class PredatoryJournalChecker implements EntryChecker { + + private final PredatoryJournalRepository predatoryJournalRepository; + private final List fieldNames; + + public PredatoryJournalChecker(PredatoryJournalRepository predatoryJournalRepository, List fieldsToCheck) { + this.predatoryJournalRepository = Objects.requireNonNull(predatoryJournalRepository); + this.fieldNames = fieldsToCheck; + } + + @Override + public List check(BibEntry entry) { + return 
entry.getFieldMap().entrySet().stream() + .filter(field -> fieldNames.contains(field.getKey())) + .filter(field -> predatoryJournalRepository.isKnownName(field.getValue())) + .map(field -> new IntegrityMessage(Localization.lang("Predatory journal %0 found", field.getValue()), entry, field.getKey())) + .toList(); + } +} diff --git a/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalInformation.java b/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalInformation.java new file mode 100644 index 00000000000..83c936f8748 --- /dev/null +++ b/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalInformation.java @@ -0,0 +1,16 @@ +package org.jabref.logic.journals.predatory; + +import java.io.Serializable; + +/** + * Represents predatory journal information + * + * @param name The full journal name + * @param abbr Abbreviation, if any + * @param url Url of the journal + */ +public record PredatoryJournalInformation( + String name, + String abbr, + String url) implements Serializable { // must implement @Serializable otherwise MVStore fails +} diff --git a/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalListCrawler.java b/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalListCrawler.java new file mode 100644 index 00000000000..dfedda4eec5 --- /dev/null +++ b/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalListCrawler.java @@ -0,0 +1,160 @@ +package org.jabref.logic.journals.predatory; + +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.Reader; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URISyntaxException; +import java.net.URL; +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +import org.jabref.logic.net.URLDownload; +import org.jabref.model.strings.StringUtil; + +import 
org.apache.commons.csv.CSVFormat; +import org.apache.commons.csv.CSVParser; +import org.apache.commons.csv.CSVRecord; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * Converts (hard-coded) online resources to a set. {@link #loadFromOnlineSources} is the method containing the result. + */ +public class PredatoryJournalListCrawler { + + private record PJSource(URL url, Optional<Pattern> elementPattern) { + PJSource(String url, String regex) { + this(createURL(url), Optional.of(Pattern.compile(regex))); + } + + PJSource(String url) { + this(createURL(url), Optional.empty()); + } + + private static URL createURL(String urlString) { + try { + return new URI(urlString).toURL(); + } catch (MalformedURLException | URISyntaxException ex) { + throw new IllegalArgumentException("Malformed URL has occurred in PJSource", ex); + } + } + } + + private static final Logger LOGGER = LoggerFactory.getLogger(PredatoryJournalListCrawler.class); + private static final Pattern PATTERN_NAME = Pattern.compile("(?<=\">).*?(?=<)"); + private static final Pattern PATTERN_URL = Pattern.compile("http.*?(?=\")"); + private static final Pattern PATTERN_ABBR = Pattern.compile("(?<=\\()[^ ]*(?=\\))"); + private final List<PJSource> predatorySources = List.of( + new PJSource("https://raw.githubusercontent.com/stop-predatory-journals/stop-predatory-journals.github.io/master/_data/journals.csv"), + new PJSource("https://raw.githubusercontent.com/stop-predatory-journals/stop-predatory-journals.github.io/master/_data/publishers.csv"), + new PJSource("https://beallslist.net/", + "<li>.*?</li>"), + new PJSource("https://beallslist.net/standalone-journals/", + "<li>.*?</li>"), + new PJSource("https://beallslist.net/hijacked-journals/", + "<tr>.*?</tr>") + ); + + private final List<String> linkElements = new ArrayList<>(); + + private final List<PredatoryJournalInformation> predatoryJournalInformation = new ArrayList<>(); + + /** + * Loads predatory journal information from online resources + * This method should be only called once when building JabRef + * + * @return the set of journal information + */ + public HashSet<PredatoryJournalInformation> loadFromOnlineSources() { + predatorySources.forEach(this::crawl); + linkElements.forEach(this::clean); + return new HashSet<>(predatoryJournalInformation); + } + + private void crawl(PJSource source) { + try { + URLDownload download = new URLDownload(source.url); + + if (!download.canBeReached()) { + LOGGER.warn("Url {} is unreachable", source.url); + } else if (source.url.getPath().contains(".csv")) { + handleCSV(new InputStreamReader(download.asInputStream())); + } else { + if (source.elementPattern.isPresent()) { + handleHTML(source.elementPattern.get(), download.asString()); + } + } + } catch (IOException ex) { + LOGGER.error("Could not crawl source for predatory journals {}", source.url, ex); + } + } + + private void handleCSV(Reader reader) throws IOException { + CSVFormat format = CSVFormat.EXCEL.builder().setSkipHeaderRecord(true).build(); + CSVParser csvParser = new CSVParser(reader, format); + + for (CSVRecord csvRecord : csvParser) { + String name = csvRecord.get(1); + String abbr = csvRecord.get(2); + String url = csvRecord.get(0); + + if (StringUtil.isNullOrEmpty(name)) { + if (!abbr.isEmpty()) { + name = abbr; + } else { + continue; + } + } + // changes column order from CSV (source: url, name, abbr) + predatoryJournalInformation.add(new PredatoryJournalInformation(decode(name), decode(abbr), url)); + } + } + + private void handleHTML(Pattern pattern, String body) { + Matcher matcher = pattern.matcher(body); + while (matcher.find()) { + linkElements.add(matcher.group()); + } + } + + private void clean(String item) { + Matcher m_name =
PATTERN_NAME.matcher(item); + Matcher m_url = PATTERN_URL.matcher(item); + Matcher m_abbr = PATTERN_ABBR.matcher(item); + + // using `if` gets only first link in element, `while` gets all, but this may not be desirable + // e.g. this way only the hijacked journals are recorded and not the authentic originals + if (m_name.find() && m_url.find()) { + String name = m_name.group(); + if (name != null) { + name = name.replace("\u200B", ""); // zero width space + } + String abbr = m_abbr.find() ? m_abbr.group() : ""; + String url = m_url.group(); + + if (StringUtil.isNullOrEmpty(name)) { + if (!abbr.isEmpty()) { + name = abbr; + } else { + return; + } + } + predatoryJournalInformation.add(new PredatoryJournalInformation(decode(name), decode(abbr), url)); + } + } + + private String decode(String s) { + return Optional.ofNullable(s) + .orElse("") + .replace(",", "") + .replace("&amp;", "&") + .replace("&#8217;", "'") + .replace("&#8211;", "-"); + } +} diff --git a/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalListLoader.java b/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalListLoader.java new file mode 100644 index 00000000000..2b03f9dcb36 --- /dev/null +++ b/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalListLoader.java @@ -0,0 +1,32 @@ +package org.jabref.logic.journals.predatory; + +import java.net.URISyntaxException; +import java.net.URL; +import java.nio.file.Path; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +public class PredatoryJournalListLoader { + + private static final Logger LOGGER = LoggerFactory.getLogger(PredatoryJournalListLoader.class); + + public static PredatoryJournalRepository loadRepository() { + PredatoryJournalRepository repository = new PredatoryJournalRepository(); + + Path path; + try { + URL resource = PredatoryJournalRepository.class.getResource("/journals/predatory-journals.mv"); + if (resource == null) { + LOGGER.error("predatoryJournal-list.mv not found. 
Using demo list."); + return new PredatoryJournalRepository(); + } + path = Path.of(resource.toURI()); + } catch (URISyntaxException e) { + LOGGER.error("Could not determine path to predatoryJournal-list.mv. Using demo list."); + return new PredatoryJournalRepository(); + } + + return new PredatoryJournalRepository(path); + } +} diff --git a/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalRepository.java b/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalRepository.java new file mode 100644 index 00000000000..64b930026d2 --- /dev/null +++ b/src/main/java/org/jabref/logic/journals/predatory/PredatoryJournalRepository.java @@ -0,0 +1,61 @@ +package org.jabref.logic.journals.predatory; + +import java.nio.file.Path; +import java.util.HashMap; +import java.util.Locale; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.stream.Collectors; + +import org.jabref.logic.util.strings.StringSimilarity; + +import org.h2.mvstore.MVMap; +import org.h2.mvstore.MVStore; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +/** + * A repository for all predatory journals and publishers, providing a lookup by name. + */ +public class PredatoryJournalRepository { + private final Logger LOGGER = LoggerFactory.getLogger(PredatoryJournalRepository.class); + private final Map<String, PredatoryJournalInformation> predatoryJournals = new HashMap<>(); + private final StringSimilarity match = new StringSimilarity(); + + /** + * Initializes the internal data based on the predatory journals found in the given MV file + */ + public PredatoryJournalRepository(Path mvStore) { + MVMap<String, PredatoryJournalInformation> predatoryJournalsMap; + try (MVStore store = new MVStore.Builder().readOnly().fileName(mvStore.toAbsolutePath().toString()).open()) { + predatoryJournalsMap = store.openMap("PredatoryJournals"); + predatoryJournals.putAll(predatoryJournalsMap); + } + } + + /** + * Initializes the repository with demonstration data. Used if no predatory journal list file is found. 
+ */ + public PredatoryJournalRepository() { + predatoryJournals.put("Demo", new PredatoryJournalInformation("Demo", "Demo", "")); + } + + /** + * Returns true if the given journal name is contained in the list in its full form + */ + public boolean isKnownName(String journalName) { + String journal = journalName.trim().replaceAll(Matcher.quoteReplacement("\\&"), "&"); + + if (predatoryJournals.containsKey(journal)) { + LOGGER.debug("Found predatory journal {}", journal); + return true; + } + + var matches = predatoryJournals.keySet().stream() + .filter(key -> match.isSimilar(journal.toLowerCase(Locale.ROOT), key.toLowerCase(Locale.ROOT))) + .collect(Collectors.toList()); + + LOGGER.info("Found multiple possible predatory journals {}", String.join(", ", matches)); + return !matches.isEmpty(); + } +} diff --git a/src/main/resources/l10n/JabRef_en.properties b/src/main/resources/l10n/JabRef_en.properties index 7ea2dceb5e7..ea1e11f80ae 100644 --- a/src/main/resources/l10n/JabRef_en.properties +++ b/src/main/resources/l10n/JabRef_en.properties @@ -2621,6 +2621,7 @@ Would\ you\ like\ to\ enable\ fetching\ of\ journal\ information?\ This\ can\ be Enable=Enable Keep\ disabled=Keep disabled +Predatory\ journal\ %0\ found=Predatory journal %0 found Hide\ user\ comments=Hide user comments Show\ user\ comments\ field=Show user comments field diff --git a/src/test/java/org/jabref/logic/integrity/IntegrityCheckTest.java b/src/test/java/org/jabref/logic/integrity/IntegrityCheckTest.java index 5a589970aba..21b2d0b2274 100644 --- a/src/test/java/org/jabref/logic/integrity/IntegrityCheckTest.java +++ b/src/test/java/org/jabref/logic/integrity/IntegrityCheckTest.java @@ -13,6 +13,7 @@ import org.jabref.logic.citationkeypattern.CitationKeyPatternPreferences; import org.jabref.logic.citationkeypattern.GlobalCitationKeyPattern; import org.jabref.logic.journals.JournalAbbreviationLoader; +import org.jabref.logic.journals.predatory.PredatoryJournalListLoader; import 
org.jabref.model.database.BibDatabase; import org.jabref.model.database.BibDatabaseContext; import org.jabref.model.database.BibDatabaseMode; @@ -139,7 +140,8 @@ void testEntryIsUnchangedAfterChecks() { new IntegrityCheck(context, mock(FilePreferences.class), createCitationKeyPatternPreferences(), - JournalAbbreviationLoader.loadBuiltInRepository(), false) + JournalAbbreviationLoader.loadBuiltInRepository(), + PredatoryJournalListLoader.loadRepository(), false) .check(); assertEquals(clonedEntry, entry); @@ -171,7 +173,8 @@ private void assertWrong(BibDatabaseContext context) { List messages = new IntegrityCheck(context, mock(FilePreferences.class), createCitationKeyPatternPreferences(), - JournalAbbreviationLoader.loadBuiltInRepository(), false) + JournalAbbreviationLoader.loadBuiltInRepository(), + PredatoryJournalListLoader.loadRepository(), false) .check(); assertNotEquals(Collections.emptyList(), messages); } @@ -182,8 +185,9 @@ private void assertCorrect(BibDatabaseContext context) { List messages = new IntegrityCheck(context, filePreferencesMock, createCitationKeyPatternPreferences(), - JournalAbbreviationLoader.loadBuiltInRepository(), false - ).check(); + JournalAbbreviationLoader.loadBuiltInRepository(), + PredatoryJournalListLoader.loadRepository(), false) + .check(); assertEquals(Collections.emptyList(), messages); } diff --git a/src/test/java/org/jabref/logic/integrity/PredatoryJournalCheckerTest.java b/src/test/java/org/jabref/logic/integrity/PredatoryJournalCheckerTest.java new file mode 100644 index 00000000000..b56d997c252 --- /dev/null +++ b/src/test/java/org/jabref/logic/integrity/PredatoryJournalCheckerTest.java @@ -0,0 +1,70 @@ +package org.jabref.logic.integrity; + +import java.util.Collections; +import java.util.List; + +import org.jabref.logic.journals.predatory.PredatoryJournalListLoader; +import org.jabref.model.entry.BibEntry; +import org.jabref.model.entry.field.StandardField; + +import org.junit.jupiter.api.BeforeAll; +import 
org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertEquals; + +class PredatoryJournalCheckerTest { + + static PredatoryJournalChecker checker; + + @BeforeAll + static void initChecker() { + checker = new PredatoryJournalChecker(PredatoryJournalListLoader.loadRepository(), + List.of(StandardField.JOURNAL, StandardField.PUBLISHER, StandardField.BOOKTITLE)); + } + + @Test + void journalIsNotPredatory() { + BibEntry entry = new BibEntry().withField(StandardField.JOURNAL, "IEEE Software"); + assertEquals(Collections.emptyList(), checker.check(entry)); + } + + @Test + void journalIsPredatory() { + String journalName = "European International Journal of Science and Technology"; + BibEntry entry = new BibEntry().withField(StandardField.JOURNAL, journalName); + assertEquals(List.of(new IntegrityMessage("Predatory journal %s found".formatted(journalName), + entry, StandardField.JOURNAL)), checker.check(entry)); + } + + @Test + void journalIsPredatoryCaseInsensitive() { + String journalName = "european international journal of science and technology"; + BibEntry entry = new BibEntry().withField(StandardField.JOURNAL, journalName); + assertEquals(List.of(new IntegrityMessage("Predatory journal %s found".formatted(journalName), + entry, StandardField.JOURNAL)), checker.check(entry)); + } + + @Test + void journalIsPredatoryExtraCharacters() { + String journalName = "European International Journal, of Science and Technology"; + BibEntry entry = new BibEntry().withField(StandardField.JOURNAL, journalName); + assertEquals(List.of(new IntegrityMessage("Predatory journal %s found".formatted(journalName), + entry, StandardField.JOURNAL)), checker.check(entry)); + } + + @Test + void publisherIsPredatory() { + String publisherName = "Academia Scholarly Journals"; + BibEntry entry = new BibEntry().withField(StandardField.PUBLISHER, publisherName); + assertEquals(List.of(new IntegrityMessage("Predatory journal %s found".formatted(publisherName), + entry, 
StandardField.PUBLISHER)), checker.check(entry)); + } + + @Test + void bookTitleIsPredatory() { + String bookTitle = "Biosciences International"; + BibEntry entry = new BibEntry().withField(StandardField.BOOKTITLE, bookTitle); + assertEquals(List.of(new IntegrityMessage("Predatory journal %s found".formatted(bookTitle), + entry, StandardField.BOOKTITLE)), checker.check(entry)); + } +}