Skip to content

Commit

Permalink
Make PdfBibExtractor be Importer
Browse files Browse the repository at this point in the history
  • Loading branch information
InAnYan committed Dec 19, 2024
1 parent 7c9bb38 commit 783b2a5
Show file tree
Hide file tree
Showing 11 changed files with 160 additions and 30 deletions.
1 change: 0 additions & 1 deletion src/main/java/module-info.java
Original file line number Diff line number Diff line change
Expand Up @@ -193,6 +193,5 @@
requires mslinks;
requires org.antlr.antlr4.runtime;
requires org.libreoffice.uno;
requires org.jetbrains.annotations;
// endregion
}
37 changes: 20 additions & 17 deletions src/main/java/org/jabref/gui/fieldeditors/LinkedFileViewModel.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,19 @@
import org.jabref.gui.linkedfile.DeleteFileAction;
import org.jabref.gui.linkedfile.DownloadLinkedFileAction;
import org.jabref.gui.linkedfile.LinkedFileEditDialog;
import org.jabref.gui.mergeentries.MultiMergeEntriesView;
import org.jabref.gui.preferences.GuiPreferences;
import org.jabref.gui.util.ControlHelper;
import org.jabref.logic.FilePreferences;
import org.jabref.logic.externalfiles.LinkedFileHandler;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.fileformat.PdfImporter;
import org.jabref.logic.importer.fileformat.pdf.PdfEmbeddedBibExtractor;
import org.jabref.logic.importer.fileformat.pdf.PdfFirstPageBibExtractor;
import org.jabref.logic.importer.fileformat.pdf.PdfGrobidBibExtractor;
import org.jabref.logic.importer.fileformat.pdf.PdfVerbatimBibExtractor;
import org.jabref.logic.importer.fileformat.pdf.PdfXmpBibExtractor;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.TaskExecutor;
import org.jabref.logic.util.io.FileUtil;
Expand Down Expand Up @@ -450,25 +456,22 @@ public ValidationStatus fileExistsValidationStatus() {
return fileExistsValidator.getValidationStatus();
}

public void parsePdfMetadata() {
public void parsePdfMetadataAndShowMergeDialog() {
linkedFile.findIn(databaseContext, preferences.getFilePreferences()).ifPresent(filePath -> {
try {
PdfImporter importer = new PdfImporter(preferences.getImportFormatPreferences());
ParserResult result = importer.importDatabase(filePath);

// This code duplication is the result of mixing 2 idioms: exceptions in method specification
// vs exceptions in return type.
if (result.isInvalid()) {
LOGGER.error("Unable to extract PDF metadata: {}", result.getErrorMessage());
dialogService.notify(Localization.lang("Unable to extract PDF metadata: %0", result.getErrorMessage()));
}

databaseContext.getDatabase().removeEntry(entry);
databaseContext.getDatabase().insertEntries(result.getDatabase().getEntries());
} catch (Exception e) {
LOGGER.error("Unable to extract PDF metadata", e);
dialogService.notify(Localization.lang("Unable to extract PDF metadata: %0", e.getMessage()));
MultiMergeEntriesView dialog = new MultiMergeEntriesView(preferences, taskExecutor);
dialog.setTitle(Localization.lang("Merge PDF metadata"));
dialog.addSource(Localization.lang("Entry"), entry);
dialog.addSource(Localization.lang("Verbatim"), wrapImporterToSupplier(new PdfVerbatimBibExtractor(preferences.getImportFormatPreferences()), filePath));
dialog.addSource(Localization.lang("Embedded"), wrapImporterToSupplier(new PdfEmbeddedBibExtractor(preferences.getImportFormatPreferences()), filePath));
if (preferences.getGrobidPreferences().isGrobidEnabled()) {
dialog.addSource("Grobid", wrapImporterToSupplier(new PdfGrobidBibExtractor(preferences.getImportFormatPreferences()), filePath));
}
dialog.addSource(Localization.lang("XMP metadata"), wrapImporterToSupplier(new PdfXmpBibExtractor(preferences.getXmpPreferences()), filePath));
dialog.addSource(Localization.lang("Content"), wrapImporterToSupplier(new PdfFirstPageBibExtractor(), filePath));
dialogService.showCustomDialogAndWait(dialog).ifPresent(newEntry -> {
databaseContext.getDatabase().removeEntry(entry);
databaseContext.getDatabase().insertEntry(newEntry);
});
});
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ private Node createFileDisplay(LinkedFileViewModel linkedFile) {
parsePdfMetadata.visibleProperty().bind(linkedFile.isOfflinePdfProperty());
parsePdfMetadata.setOnAction(event -> {
GrobidUseDialogHelper.showAndWaitIfUserIsUndecided(dialogService, preferences.getGrobidPreferences());
linkedFile.parsePdfMetadata();
linkedFile.parsePdfMetadataAndShowMergeDialog();
});
parsePdfMetadata.getStyleClass().setAll("icon-button");

Expand Down
3 changes: 2 additions & 1 deletion src/main/java/org/jabref/gui/importer/ImportCommand.java
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.importer.fileformat.PdfImporter;
import org.jabref.logic.importer.fileformat.pdf.PdfGrobidBibExtractor;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.preferences.CliPreferences;
import org.jabref.logic.util.BackgroundTask;
Expand Down Expand Up @@ -152,7 +153,7 @@ private ParserResult doImport(List<Path> files, Importer importFormat) throws IO
imports.add(importFormatReader.importUnknownFormat(filename, fileUpdateMonitor));
} else {
UiTaskExecutor.runAndWaitInJavaFXThread(() -> {
if (importer.get() instanceof PdfImporter
if (((importer.get() instanceof PdfGrobidBibExtractor) || (importer.get() instanceof PdfImporter))
&& GrobidUseDialogHelper.showAndWaitIfUserIsUndecided(dialogService, preferences.getGrobidPreferences())) {
importFormatReader.reset();
}
Expand Down
14 changes: 14 additions & 0 deletions src/main/java/org/jabref/logic/importer/ImportFormatReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,11 +27,18 @@
import org.jabref.logic.importer.fileformat.PdfImporter;
import org.jabref.logic.importer.fileformat.RepecNepImporter;
import org.jabref.logic.importer.fileformat.RisImporter;
import org.jabref.logic.importer.fileformat.pdf.PdfEmbeddedBibExtractor;
import org.jabref.logic.importer.fileformat.pdf.PdfFirstPageBibExtractor;
import org.jabref.logic.importer.fileformat.pdf.PdfGrobidBibExtractor;
import org.jabref.logic.importer.fileformat.pdf.PdfVerbatimBibExtractor;
import org.jabref.logic.importer.fileformat.pdf.PdfXmpBibExtractor;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.database.BibDatabases;
import org.jabref.model.entry.BibEntry;
import org.jabref.model.util.FileUpdateMonitor;

import org.apache.pdfbox.pdmodel.PDFormContentStream;

public class ImportFormatReader {

public static final String BIBTEX_FORMAT = "BibTeX";
Expand Down Expand Up @@ -70,6 +77,13 @@ public void reset() {
formats.add(new MsBibImporter());
formats.add(new OvidImporter());
formats.add(new PdfImporter(importFormatPreferences));
formats.add(new PdfVerbatimBibExtractor(importFormatPreferences));
formats.add(new PdfFirstPageBibExtractor());
formats.add(new PdfEmbeddedBibExtractor(importFormatPreferences));
if (importFormatPreferences.grobidPreferences().isGrobidEnabled()) {
formats.add(new PdfGrobidBibExtractor(importFormatPreferences));
}
formats.add(new PdfXmpBibExtractor(importFormatPreferences.xmpPreferences()));
formats.add(new RepecNepImporter(importFormatPreferences));
formats.add(new RisImporter());
formats.add(new CffImporter(citationKeyPatternPreferences));
Expand Down
Original file line number Diff line number Diff line change
@@ -1,20 +1,68 @@
package org.jabref.logic.importer.fileformat.pdf;

import java.io.BufferedReader;
import java.io.IOException;
import java.nio.file.Path;
import java.util.List;

import org.jabref.gui.fieldeditors.LinkedFileViewModel;
import org.jabref.logic.importer.Importer;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.ParserResult;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.StandardFileType;
import org.jabref.logic.xmp.EncryptedPdfsNotSupportedException;
import org.jabref.logic.xmp.XmpUtilReader;
import org.jabref.model.entry.BibEntry;

import org.apache.pdfbox.pdmodel.PDDocument;

/**
* Intermediate class to bundle all PDF analysis steps.
* Intermediate class to bundle all PDF analysis steps. {@link PdfBibExtractor}s are also {@link org.jabref.logic.importer.Importer}s,
* which gives users more fine-grained control over how a {@link BibEntry} is extracted from a PDF file.
* <p>
* Note, that this step should not add PDF file to {@link BibEntry}, it will be finally added
* in {@link org.jabref.logic.importer.fileformat.PdfImporter}.
* {@link PdfBibExtractor}s are used in two places in JabRef:
* 1. {@link org.jabref.logic.importer.fileformat.PdfImporter}: uses several {@link PdfBibExtractor} and automatically
* merges them into 1 {@link BibEntry}.
* 2. {@link LinkedFileViewModel#parsePdfMetadataAndShowMergeDialog()}: also uses several {@link PdfBibExtractor}, but
* it shows a merge dialog (instead of automatic merge).
* <p>
* Note that this step should not add the PDF file to the {@link BibEntry}; the file is added at the end, either in
* {@link PdfBibExtractor#importDatabase(Path)} or in {@link org.jabref.logic.importer.fileformat.PdfImporter}.
*/
public interface PdfBibExtractor {
List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException;
public abstract class PdfBibExtractor extends Importer {
    /**
     * Extracts bibliographic entries from an already opened PDF document.
     *
     * @param filePath path of the PDF file
     * @param document the loaded PDF document that belongs to {@code filePath}
     * @return the entries extracted by this extractor
     * @throws IOException    if reading the PDF fails
     * @throws ParseException if the extracted content cannot be parsed
     */
    public abstract List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException;

    /**
     * Checks whether the input starts with the {@code %PDF} marker.
     *
     * @param input reader positioned at the beginning of the candidate file
     * @return {@code true} if the first line starts with {@code %PDF}; {@code false} otherwise,
     *         including when the input is empty
     * @throws IOException if the first line cannot be read
     */
    @Override
    public boolean isRecognizedFormat(BufferedReader input) throws IOException {
        // readLine() returns null at end of stream; an empty input is simply not a PDF.
        String firstLine = input.readLine();
        return (firstLine != null) && firstLine.startsWith("%PDF");
    }

    /**
     * Not supported: extractors work on a PDF file path, not on a character stream.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public ParserResult importDatabase(BufferedReader reader) throws IOException {
        throw new UnsupportedOperationException("PdfBibExtractor does not support importDatabase(BufferedReader reader). "
                + "Instead use importDatabase(Path filePath).");
    }

    /**
     * Not supported: extractors work on a PDF file path, not on in-memory text.
     *
     * @throws UnsupportedOperationException always
     */
    @Override
    public ParserResult importDatabase(String data) throws IOException {
        throw new UnsupportedOperationException("PdfBibExtractor does not support importDatabase(String data). "
                + "Instead use importDatabase(Path filePath).");
    }

    /**
     * Loads the PDF at {@code filePath} and delegates to {@link #importDatabase(Path, PDDocument)}.
     * Failures are reported through the returned {@link ParserResult} instead of being thrown.
     *
     * @param filePath path of the PDF file to import
     * @return a result wrapping the extracted entries, or an error result if the PDF is encrypted
     *         in an unsupported way or cannot be read or parsed
     */
    @Override
    public ParserResult importDatabase(Path filePath) {
        // try-with-resources closes the document once extraction has finished.
        try (PDDocument document = new XmpUtilReader().loadWithAutomaticDecryption(filePath)) {
            return new ParserResult(importDatabase(filePath, document));
        } catch (EncryptedPdfsNotSupportedException e) {
            return ParserResult.fromErrorMessage(Localization.lang("Decryption not supported."));
        } catch (IOException | ParseException exception) {
            return ParserResult.fromError(exception);
        }
    }

    /**
     * @return {@link StandardFileType#PDF}, the only file type handled by these extractors
     */
    @Override
    public StandardFileType getFileType() {
        return StandardFileType.PDF;
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.io.FileUtil;
import org.jabref.model.entry.BibEntry;

Expand All @@ -25,7 +26,7 @@
/**
* Imports an embedded Bib-File from the PDF.
*/
public class PdfEmbeddedBibExtractor implements PdfBibExtractor {
public class PdfEmbeddedBibExtractor extends PdfBibExtractor {

private final BibtexParser bibtexParser;

Expand Down Expand Up @@ -106,4 +107,14 @@ private static PDEmbeddedFile getEmbeddedFile(PDComplexFileSpecification fileSpe
}
return embeddedFile;
}

/**
 * Returns the fixed name of this importer.
 */
@Override
public String getName() {
    final String importerName = "PDFembeddedbibfile";
    return importerName;
}

/**
 * Returns the localized, human-readable description of this importer.
 */
@Override
public String getDescription() {
    final String description = Localization.lang("Imports a BibTeX file found inside a PDF.");
    return description;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import org.jabref.logic.importer.fileformat.BibliographyFromPdfImporter;
import org.jabref.logic.importer.fileformat.PdfImporter;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.os.OS;
import org.jabref.logic.util.PdfUtils;
import org.jabref.model.entry.BibEntry;
Expand Down Expand Up @@ -42,7 +43,7 @@
* <p>
* If several PDF importers should be tried, use {@link PdfImporter}.
*/
public class PdfFirstPageBibExtractor implements PdfBibExtractor {
public class PdfFirstPageBibExtractor extends PdfBibExtractor {

private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}");

Expand Down Expand Up @@ -649,4 +650,14 @@ private void readLastBlock() {
}
}
}

/**
 * Returns the fixed name of this importer.
 */
@Override
public String getName() {
    final String importerName = "PDFcontent";
    return importerName;
}

/**
 * Returns the localized, human-readable description of this importer.
 */
@Override
public String getDescription() {
    final String description = Localization.lang("This importer parses data of the first page of the PDF and creates a BibTeX entry. Currently, Springer and IEEE formats are supported.");
    return description;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,14 +7,15 @@
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.util.GrobidService;
import org.jabref.logic.l10n.Localization;
import org.jabref.model.entry.BibEntry;

import org.apache.pdfbox.pdmodel.PDDocument;

/**
* Wraps the GrobidService function to be used as a {@link PdfBibExtractor}.
*/
public class PdfGrobidBibExtractor implements PdfBibExtractor {
public class PdfGrobidBibExtractor extends PdfBibExtractor {

private final GrobidService grobidService;
private final ImportFormatPreferences importFormatPreferences;
Expand All @@ -27,4 +28,19 @@ public PdfGrobidBibExtractor(ImportFormatPreferences importFormatPreferences) {
public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException, ParseException {
    // The opened document is not used here: only the file path and the import
    // preferences are handed to the Grobid service.
    List<BibEntry> grobidEntries = grobidService.processPDF(filePath, importFormatPreferences);
    return grobidEntries;
}

/**
 * Returns the fixed identifier of this importer.
 */
@Override
public String getId() {
    final String importerId = "grobidPdf";
    return importerId;
}

/**
 * Returns the fixed name of this importer.
 */
@Override
public String getName() {
    final String importerName = "Grobid";
    return importerName;
}

/**
 * Returns the localized, human-readable description of this importer.
 */
@Override
public String getDescription() {
    final String description = Localization.lang("Imports BibTeX data of a PDF using Grobid.");
    return description;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import org.jabref.logic.importer.ImportFormatPreferences;
import org.jabref.logic.importer.ParseException;
import org.jabref.logic.importer.fileformat.BibtexParser;
import org.jabref.logic.l10n.Localization;
import org.jabref.logic.util.PdfUtils;
import org.jabref.model.entry.BibEntry;

Expand All @@ -15,7 +16,7 @@
/**
* This importer imports a verbatim BibTeX entry from the first page of the PDF.
*/
public class PdfVerbatimBibExtractor implements PdfBibExtractor {
public class PdfVerbatimBibExtractor extends PdfBibExtractor {

private final ImportFormatPreferences importFormatPreferences;

Expand All @@ -35,4 +36,14 @@ public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws

return result;
}

/**
 * Returns the fixed name of this importer.
 */
@Override
public String getName() {
    final String importerName = "PdfVerbatimBibText";
    return importerName;
}

/**
 * Returns the localized, human-readable description of this importer.
 */
@Override
public String getDescription() {
    final String description = Localization.lang("Scrapes the first page of a PDF for BibTeX information.");
    return description;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import java.nio.file.Path;
import java.util.List;

import org.jabref.logic.l10n.Localization;
import org.jabref.logic.xmp.XmpPreferences;
import org.jabref.logic.xmp.XmpUtilReader;
import org.jabref.model.entry.BibEntry;
Expand All @@ -13,7 +14,7 @@
/**
* Wraps the XMPUtility function to be used as an Importer.
*/
public class PdfXmpBibExtractor implements PdfBibExtractor {
public class PdfXmpBibExtractor extends PdfBibExtractor {

private final XmpPreferences xmpPreferences;

Expand All @@ -24,4 +25,19 @@ public PdfXmpBibExtractor(XmpPreferences xmpPreferences) {
public List<BibEntry> importDatabase(Path filePath, PDDocument document) throws IOException {
    // The opened document is not used here: only the file path and the XMP
    // preferences are handed to a fresh XmpUtilReader.
    XmpUtilReader xmpReader = new XmpUtilReader();
    List<BibEntry> xmpEntries = xmpReader.readXmp(filePath, xmpPreferences);
    return xmpEntries;
}

/**
 * Returns the localized name of this importer.
 */
@Override
public String getName() {
    final String importerName = Localization.lang("XMP-annotated PDF");
    return importerName;
}

/**
 * Returns the fixed identifier of this importer.
 */
@Override
public String getId() {
    final String importerId = "xmp";
    return importerId;
}

/**
 * Returns the localized, human-readable description of this importer.
 */
@Override
public String getDescription() {
    final String description = Localization.lang("Imports BibTeX data using XMP data of a PDF.");
    return description;
}
}

0 comments on commit 783b2a5

Please sign in to comment.