Make PdfBibExtractor be Importer

JabRef · Dec 19, 2024 · 783b2a5 · 783b2a5
1 parent 7c9bb38
commit 783b2a5
Show file tree

Hide file tree

Showing 11 changed files with 160 additions and 30 deletions.
diff --git a/src/main/java/module-info.java b/src/main/java/module-info.java
@@ -193,6 +193,5 @@
     requires mslinks;
     requires org.antlr.antlr4.runtime;
     requires org.libreoffice.uno;
-    requires org.jetbrains.annotations;
     // endregion
 }
diff --git a/src/main/java/org/jabref/gui/fieldeditors/LinkedFileViewModel.java b/src/main/java/org/jabref/gui/fieldeditors/LinkedFileViewModel.java
@@ -30,13 +30,19 @@
 import org.jabref.gui.linkedfile.DeleteFileAction;
 import org.jabref.gui.linkedfile.DownloadLinkedFileAction;
 import org.jabref.gui.linkedfile.LinkedFileEditDialog;
+import org.jabref.gui.mergeentries.MultiMergeEntriesView;
 import org.jabref.gui.preferences.GuiPreferences;
 import org.jabref.gui.util.ControlHelper;
 import org.jabref.logic.FilePreferences;
 import org.jabref.logic.externalfiles.LinkedFileHandler;
 import org.jabref.logic.importer.Importer;
 import org.jabref.logic.importer.ParserResult;
 import org.jabref.logic.importer.fileformat.PdfImporter;
+import org.jabref.logic.importer.fileformat.pdf.PdfEmbeddedBibExtractor;
+import org.jabref.logic.importer.fileformat.pdf.PdfFirstPageBibExtractor;
+import org.jabref.logic.importer.fileformat.pdf.PdfGrobidBibExtractor;
+import org.jabref.logic.importer.fileformat.pdf.PdfVerbatimBibExtractor;
+import org.jabref.logic.importer.fileformat.pdf.PdfXmpBibExtractor;
 import org.jabref.logic.l10n.Localization;
 import org.jabref.logic.util.TaskExecutor;
 import org.jabref.logic.util.io.FileUtil;
@@ -450,25 +456,22 @@ public ValidationStatus fileExistsValidationStatus() {
         return fileExistsValidator.getValidationStatus();
     }
 
-    public void parsePdfMetadata() {
+    /**
+     * Extracts bibliographic data from the linked PDF with several extractors and shows a merge dialog.
+     * <p>
+     * Nothing happens when {@code linkedFile.findIn(...)} cannot resolve the file. Otherwise a
+     * {@link MultiMergeEntriesView} is filled with the current entry plus one source per extractor
+     * (verbatim, embedded, Grobid only if enabled in the Grobid preferences, XMP, first page).
+     * If the dialog returns an entry, the current entry is removed from the database and the
+     * returned entry is inserted instead; if the dialog returns nothing, the database is left untouched.
+     */
+    public void parsePdfMetadataAndShowMergeDialog() {
         linkedFile.findIn(databaseContext, preferences.getFilePreferences()).ifPresent(filePath -> {
-            try {
-                PdfImporter importer = new PdfImporter(preferences.getImportFormatPreferences());
-                ParserResult result = importer.importDatabase(filePath);
-
-                // This code duplication is the result of mixing 2 idioms: exceptions in method specification
-                // vs exceptions in return type.
-                if (result.isInvalid()) {
-                    LOGGER.error("Unable to extract PDF metadata: {}", result.getErrorMessage());
-                    dialogService.notify(Localization.lang("Unable to extract PDF metadata: %0", result.getErrorMessage()));
-                }
-
-                databaseContext.getDatabase().removeEntry(entry);
-                databaseContext.getDatabase().insertEntries(result.getDatabase().getEntries());
-            } catch (Exception e) {
-                LOGGER.error("Unable to extract PDF metadata", e);
-                dialogService.notify(Localization.lang("Unable to extract PDF metadata: %0", e.getMessage()));
+            MultiMergeEntriesView dialog = new MultiMergeEntriesView(preferences, taskExecutor);
+            dialog.setTitle(Localization.lang("Merge PDF metadata"));
+            dialog.addSource(Localization.lang("Entry"), entry);
+            dialog.addSource(Localization.lang("Verbatim"), wrapImporterToSupplier(new PdfVerbatimBibExtractor(preferences.getImportFormatPreferences()), filePath));
+            dialog.addSource(Localization.lang("Embedded"), wrapImporterToSupplier(new PdfEmbeddedBibExtractor(preferences.getImportFormatPreferences()), filePath));
+            if (preferences.getGrobidPreferences().isGrobidEnabled()) {
+                dialog.addSource("Grobid", wrapImporterToSupplier(new PdfGrobidBibExtractor(preferences.getImportFormatPreferences()), filePath));
             }
+            dialog.addSource(Localization.lang("XMP metadata"), wrapImporterToSupplier(new PdfXmpBibExtractor(preferences.getXmpPreferences()), filePath));
+            dialog.addSource(Localization.lang("Content"), wrapImporterToSupplier(new PdfFirstPageBibExtractor(), filePath));
+            dialogService.showCustomDialogAndWait(dialog).ifPresent(newEntry -> {
+                databaseContext.getDatabase().removeEntry(entry);
+                databaseContext.getDatabase().insertEntry(newEntry);
+            });
         });
     }
 

diff --git a/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java b/src/main/java/org/jabref/gui/fieldeditors/LinkedFilesEditor.java
@@ -237,7 +237,7 @@ private Node createFileDisplay(LinkedFileViewModel linkedFile) {
         parsePdfMetadata.visibleProperty().bind(linkedFile.isOfflinePdfProperty());
         parsePdfMetadata.setOnAction(event -> {
             GrobidUseDialogHelper.showAndWaitIfUserIsUndecided(dialogService, preferences.getGrobidPreferences());
-            linkedFile.parsePdfMetadata();
+            linkedFile.parsePdfMetadataAndShowMergeDialog();
         });
         parsePdfMetadata.getStyleClass().setAll("icon-button");
 

diff --git a/src/main/java/org/jabref/gui/importer/ImportCommand.java b/src/main/java/org/jabref/gui/importer/ImportCommand.java
@@ -24,6 +24,7 @@
 import org.jabref.logic.importer.Importer;
 import org.jabref.logic.importer.ParserResult;
 import org.jabref.logic.importer.fileformat.PdfImporter;
+import org.jabref.logic.importer.fileformat.pdf.PdfGrobidBibExtractor;
 import org.jabref.logic.l10n.Localization;
 import org.jabref.logic.preferences.CliPreferences;
 import org.jabref.logic.util.BackgroundTask;
@@ -152,7 +153,7 @@ private ParserResult doImport(List<Path> files, Importer importFormat) throws IO
                     imports.add(importFormatReader.importUnknownFormat(filename, fileUpdateMonitor));
                 } else {
                     UiTaskExecutor.runAndWaitInJavaFXThread(() -> {
-                        if (importer.get() instanceof PdfImporter
+                        if (((importer.get() instanceof PdfGrobidBibExtractor) || (importer.get() instanceof PdfImporter))
                                 && GrobidUseDialogHelper.showAndWaitIfUserIsUndecided(dialogService, preferences.getGrobidPreferences())) {
                             importFormatReader.reset();
                         }

diff --git a/src/main/java/org/jabref/logic/importer/ImportFormatReader.java b/src/main/java/org/jabref/logic/importer/ImportFormatReader.java
@@ -27,11 +27,18 @@
 import org.jabref.logic.importer.fileformat.PdfImporter;
 import org.jabref.logic.importer.fileformat.RepecNepImporter;
 import org.jabref.logic.importer.fileformat.RisImporter;
+import org.jabref.logic.importer.fileformat.pdf.PdfEmbeddedBibExtractor;
+import org.jabref.logic.importer.fileformat.pdf.PdfFirstPageBibExtractor;
+import org.jabref.logic.importer.fileformat.pdf.PdfGrobidBibExtractor;
+import org.jabref.logic.importer.fileformat.pdf.PdfVerbatimBibExtractor;
+import org.jabref.logic.importer.fileformat.pdf.PdfXmpBibExtractor;
 import org.jabref.logic.l10n.Localization;
 import org.jabref.model.database.BibDatabases;
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.util.FileUpdateMonitor;
 
+import org.apache.pdfbox.pdmodel.PDFormContentStream;
+
 public class ImportFormatReader {
 
     public static final String BIBTEX_FORMAT = "BibTeX";
@@ -70,6 +77,13 @@ public void reset() {
         formats.add(new MsBibImporter());
         formats.add(new OvidImporter());
         formats.add(new PdfImporter(importFormatPreferences));
+        formats.add(new PdfVerbatimBibExtractor(importFormatPreferences));
+        formats.add(new PdfFirstPageBibExtractor());
+        formats.add(new PdfEmbeddedBibExtractor(importFormatPreferences));
+        if (importFormatPreferences.grobidPreferences().isGrobidEnabled()) {
+            formats.add(new PdfGrobidBibExtractor(importFormatPreferences));
+        }
+        formats.add(new PdfXmpBibExtractor(importFormatPreferences.xmpPreferences()));
         formats.add(new RepecNepImporter(importFormatPreferences));
         formats.add(new RisImporter());
         formats.add(new CffImporter(citationKeyPatternPreferences));

diff --git a/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfBibExtractor.java b/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfBibExtractor.java
@@ -1,20 +1,68 @@
 package org.jabref.logic.importer.fileformat.pdf;
 
+import java.io.BufferedReader;
 import java.io.IOException;
 import java.nio.file.Path;
 import java.util.List;
 
+import org.jabref.gui.fieldeditors.LinkedFileViewModel;
+import org.jabref.logic.importer.Importer;
 import org.jabref.logic.importer.ParseException;
+import org.jabref.logic.importer.ParserResult;
+import org.jabref.logic.l10n.Localization;
+import org.jabref.logic.util.StandardFileType;
+import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException;
+import org.jabref.logic.xmp.XmpUtilReader;
 import org.jabref.model.entry.BibEntry;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 
 /**
- * Intermediate class to bundle all PDF analysis steps.
+ * Intermediate class to bundle all PDF analysis steps. {@link PdfBibExtractor}s are also {@link org.jabref.logic.importer.Importer}s,
+ * which gives users more fine-grained control over how a {@link BibEntry} is extracted from a PDF file.
  * <p>
- * Note, that this step should not add PDF file to {@link BibEntry}, it will be finally added
- * in {@link org.jabref.logic.importer.fileformat.PdfImporter}.
+ * {@link PdfBibExtractor}s are used in two places in JabRef:
+ * 1. {@link org.jabref.logic.importer.fileformat.PdfImporter}: uses several {@link PdfBibExtractor} and automatically
+ *    merges them into 1 {@link BibEntry}.
+ * 2. {@link LinkedFileViewModel#parsePdfMetadataAndShowMergeDialog()}: also uses several {@link PdfBibExtractor}, but
+ *    it shows a merge dialog (instead of automatic merge).
+ * <p>
+ * Note that this step should not add the PDF file to the {@link BibEntry}; it is finally added either in
+ * {@link PdfBibExtractor#importDatabase(Path)} or {@link org.jabref.logic.importer.fileformat.PdfImporter}.
  */
-public interface PdfBibExtractor {
-    List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException;
+public abstract class PdfBibExtractor extends Importer {
+    /**
+     * Extracts entries from an already loaded PDF document.
+     *
+     * @param filePath path of the PDF file the document was loaded from
+     * @param document the loaded PDF document
+     * @return the entries extracted by this extractor
+     * @throws IOException    if reading the PDF fails
+     * @throws ParseException if the extracted data cannot be parsed
+     */
+    public abstract List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException;
+
+    /**
+     * Checks whether the first line of the input starts with the PDF header {@code %PDF}.
+     *
+     * @param input reader positioned at the start of the file content
+     * @return {@code true} if the first line starts with {@code %PDF}; {@code false} otherwise,
+     *         including for empty input
+     * @throws IOException if reading the first line fails
+     */
+    @Override
+    public boolean isRecognizedFormat(BufferedReader input) throws IOException {
+        // readLine() returns null at end of stream (e.g., an empty file); treat that as "not a PDF"
+        String firstLine = input.readLine();
+        return (firstLine != null) && firstLine.startsWith("%PDF");
+    }
+
+    /**
+     * Not supported: a PDF cannot be imported from a character stream.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public ParserResult importDatabase(BufferedReader reader) throws IOException {
+        throw new UnsupportedOperationException("PdfBibExtractor does not support importDatabase(BufferedReader reader). "
+                + "Instead use importDatabase(Path filePath).");
+    }
+
+    /**
+     * Not supported: a PDF cannot be imported from a string.
+     *
+     * @throws UnsupportedOperationException always
+     */
+    @Override
+    public ParserResult importDatabase(String data) throws IOException {
+        throw new UnsupportedOperationException("PdfBibExtractor does not support importDatabase(String data). "
+                + "Instead use importDatabase(Path filePath).");
+    }
+
+    /**
+     * Loads the PDF at the given path and runs {@link #importDatabase(Path, PDDocument)} on it.
+     * The document is closed again before this method returns.
+     * <p>
+     * NOTE(review): the class Javadoc states that the PDF file is attached to the entry here, but this
+     * method only wraps the extracted entries - confirm whether the file link should be added.
+     *
+     * @param filePath path of the PDF file
+     * @return the extracted entries, or an error result if the PDF is encrypted in an unsupported way,
+     *         cannot be read, or cannot be parsed
+     */
+    @Override
+    public ParserResult importDatabase(Path filePath) {
+        try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(filePath)) {
+            return new ParserResult(importDatabase(filePath, document));
+        } catch (EncryptedPdfsNotSupportedException e) {
+            return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
+        } catch (IOException | ParseException exception) {
+            return ParserResult.fromError(exception);
+        }
+    }
+
+    /**
+     * @return {@link StandardFileType#PDF}
+     */
+    @Override
+    public StandardFileType getFileType() {
+        return StandardFileType.PDF;
+    }
 }
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfEmbeddedBibExtractor.java b/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfEmbeddedBibExtractor.java
@@ -9,6 +9,7 @@
 import org.jabref.logic.importer.ImportFormatPreferences;
 import org.jabref.logic.importer.ParseException;
 import org.jabref.logic.importer.fileformat.BibtexParser;
+import org.jabref.logic.l10n.Localization;
 import org.jabref.logic.util.io.FileUtil;
 import org.jabref.model.entry.BibEntry;
 
@@ -25,7 +26,7 @@
 /**
  * Imports an embedded Bib-File from the PDF.
  */
-public class PdfEmbeddedBibExtractor implements PdfBibExtractor {
+public class PdfEmbeddedBibExtractor extends PdfBibExtractor {
 
     private final BibtexParser bibtexParser;
 
@@ -106,4 +107,14 @@ private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpe
         }
         return embeddedFile;
     }
+
+    /**
+     * @return the fixed, non-localized name {@code "PDFembeddedbibfile"}
+     */
+    @Override
+    public String getName() {
+        return "PDFembeddedbibfile";
+    }
+
+    /**
+     * @return the description text, passed through {@link Localization#lang}
+     */
+    @Override
+    public String getDescription() {
+        return Localization.lang("Imports a BibTeX file found inside a PDF.");
+    }
 }
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfFirstPageBibExtractor.java b/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfFirstPageBibExtractor.java
@@ -15,6 +15,7 @@
 
 import org.jabref.logic.importer.fileformat.BibliographyFromPdfImporter;
 import org.jabref.logic.importer.fileformat.PdfImporter;
+import org.jabref.logic.l10n.Localization;
 import org.jabref.logic.os.OS;
 import org.jabref.logic.util.PdfUtils;
 import org.jabref.model.entry.BibEntry;
@@ -42,7 +43,7 @@
  * <p>
  * If several PDF importers should be tried, use {@link PdfImporter}.
  */
-public class PdfFirstPageBibExtractor implements PdfBibExtractor {
+public class PdfFirstPageBibExtractor extends PdfBibExtractor {
 
     private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}");
 
@@ -649,4 +650,14 @@ private void readLastBlock() {
             }
         }
     }
+
+    /**
+     * @return the fixed, non-localized name {@code "PDFcontent"}
+     */
+    @Override
+    public String getName() {
+        return "PDFcontent";
+    }
+
+    /**
+     * @return the description text, passed through {@link Localization#lang}
+     */
+    @Override
+    public String getDescription() {
+        return Localization.lang("This importer parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.");
+    }
 }
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfGrobidBibExtractor.java b/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfGrobidBibExtractor.java
@@ -7,14 +7,15 @@
 import org.jabref.logic.importer.ImportFormatPreferences;
 import org.jabref.logic.importer.ParseException;
 import org.jabref.logic.importer.util.GrobidService;
+import org.jabref.logic.l10n.Localization;
 import org.jabref.model.entry.BibEntry;
 
 import org.apache.pdfbox.pdmodel.PDDocument;
 
 /**
  * Wraps the GrobidService function to be used as a {@link PdfBibExtractor}.
  */
-public class PdfGrobidBibExtractor implements PdfBibExtractor {
+public class PdfGrobidBibExtractor extends PdfBibExtractor {
 
     private final GrobidService grobidService;
     private final ImportFormatPreferences importFormatPreferences;
@@ -27,4 +28,19 @@ public PdfGrobidBibExtractor(ImportFormatPreferences importFormatPreferences) {
     public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException {
         return grobidService.processPDF(filePath, importFormatPreferences);
     }
+
+    /**
+     * @return the fixed identifier {@code "grobidPdf"}
+     */
+    @Override
+    public String getId() {
+        return "grobidPdf";
+    }
+
+    /**
+     * @return the fixed, non-localized name {@code "Grobid"}
+     */
+    @Override
+    public String getName() {
+        return "Grobid";
+    }
+
+    /**
+     * @return the description text, passed through {@link Localization#lang}
+     */
+    @Override
+    public String getDescription() {
+        return Localization.lang("Imports BibTeX data of a PDF using Grobid.");
+    }
 }
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfVerbatimBibExtractor.java b/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfVerbatimBibExtractor.java
@@ -7,6 +7,7 @@
 import org.jabref.logic.importer.ImportFormatPreferences;
 import org.jabref.logic.importer.ParseException;
 import org.jabref.logic.importer.fileformat.BibtexParser;
+import org.jabref.logic.l10n.Localization;
 import org.jabref.logic.util.PdfUtils;
 import org.jabref.model.entry.BibEntry;
 
@@ -15,7 +16,7 @@
 /**
  * This importer imports a verbatim BibTeX entry from the first page of the PDF.
  */
-public class PdfVerbatimBibExtractor implements PdfBibExtractor {
+public class PdfVerbatimBibExtractor extends PdfBibExtractor {
 
     private final ImportFormatPreferences importFormatPreferences;
 
@@ -35,4 +36,14 @@ public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws
 
         return result;
     }
+
+    /**
+     * @return the fixed, non-localized name {@code "PdfVerbatimBibText"}
+     */
+    @Override
+    public String getName() {
+        return "PdfVerbatimBibText";
+    }
+
+    /**
+     * @return the description text, passed through {@link Localization#lang}
+     */
+    @Override
+    public String getDescription() {
+        return Localization.lang("Scrapes the first page of a PDF for BibTeX information.");
+    }
 }
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfXmpBibExtractor.java b/src/main/java/org/jabref/logic/importer/fileformat/pdf/PdfXmpBibExtractor.java
@@ -4,6 +4,7 @@
 import java.nio.file.Path;
 import java.util.List;
 
+import org.jabref.logic.l10n.Localization;
 import org.jabref.logic.xmp.XmpPreferences;
 import org.jabref.logic.xmp.XmpUtilReader;
 import org.jabref.model.entry.BibEntry;
@@ -13,7 +14,7 @@
 /**
  * Wraps the XMPUtility function to be used as an Importer.
  */
-public class PdfXmpBibExtractor implements PdfBibExtractor {
+public class PdfXmpBibExtractor extends PdfBibExtractor {
 
     private final XmpPreferences xmpPreferences;
 
@@ -24,4 +25,19 @@ public PdfXmpBibExtractor(XmpPreferences xmpPreferences) {
     public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException {
         return new XmpUtilReader().readXmp(filePath, xmpPreferences);
     }
+
+    /**
+     * @return the name {@code "XMP-annotated PDF"}, passed through {@link Localization#lang}
+     */
+    @Override
+    public String getName() {
+        return Localization.lang("XMP-annotated PDF");
+    }
+
+    /**
+     * @return the fixed identifier {@code "xmp"}
+     */
+    @Override
+    public String getId() {
+        return "xmp";
+    }
+
+    /**
+     * @return the description text, passed through {@link Localization#lang}
+     */
+    @Override
+    public String getDescription() {
+        return Localization.lang("Imports BibTeX data using XMP data of a PDF.");
+    }
 }