JabRef · ar-rana · Dec 24, 2024 · Dec 25, 2024 · Dec 25, 2024 · Dec 25, 2024
diff --git a/buildres/abbrv.jabref.org b/buildres/abbrv.jabref.org
diff --git a/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java b/src/main/java/org/jabref/logic/importer/fileformat/PdfContentImporter.java
@@ -24,6 +24,7 @@
 import org.jabref.model.entry.BibEntry;
 import org.jabref.model.entry.LinkedFile;
 import org.jabref.model.entry.field.StandardField;
+import org.jabref.model.entry.identifier.ArXivIdentifier;
 import org.jabref.model.entry.identifier.DOI;
 import org.jabref.model.entry.types.EntryType;
 import org.jabref.model.entry.types.StandardEntryType;
@@ -51,6 +52,8 @@ public class PdfContentImporter extends PdfImporter {
 
     private static final Pattern YEAR_EXTRACT_PATTERN = Pattern.compile("\\d{4}");
 
+    private static final int ARXIV_PREFIX_LENGTH = "arxiv:".length();
+
     // input lines into several lines
     private String[] lines;
 
@@ -364,12 +367,14 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         String volume = null;
         String number = null;
         String pages = null;
+        String arXivId = null;
         // year is a class variable as the method extractYear() uses it;
         String publisher = null;
 
         EntryType type = StandardEntryType.InProceedings;
         if (curString.length() > 4) {
             // special case: possibly conference as first line on the page
+            arXivId = getArXivId(null);
             extractYear();
             doi = getDoi(null);
             if (curString.contains("Conference")) {
@@ -387,7 +392,8 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
                 }
             }
         }
-
+        // sometimes ArXiv ID is read before title
+        arXivId = getArXivId(arXivId);
         // start: title
         fillCurStringWithNonEmptyLines();
         title = streamlineTitle(curString);
@@ -507,6 +513,7 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
                 }
             } else {
                 doi = getDoi(doi);
+                arXivId = getArXivId(arXivId);
 
                 if ((publisher == null) && curString.contains("IEEE")) {
                     // IEEE has the conference things at the end
@@ -531,6 +538,10 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
             }
         }
 
+        if (arXivId != null && arXivId.contains(year)) {
+            year = null;
+        }
+
         BibEntry entry = new BibEntry();
         entry.setType(type);
 
@@ -557,20 +568,26 @@ Optional<BibEntry> getEntryFromPDFContent(String firstpageContents, String lineS
         if (doi != null) {
             entry.setField(StandardField.DOI, doi);
         }
+        if (arXivId != null) {
+            entry.setField(StandardField.EPRINT, arXivId);
+        }
         if (series != null) {
             entry.setField(StandardField.SERIES, series);
         }
         if (volume != null) {
             entry.setField(StandardField.VOLUME, volume);
         }
-        if (number != null) {
+        if (number != null && number.chars().allMatch(Character::isDigit)) {
             entry.setField(StandardField.NUMBER, number);
         }
         if (pages != null) {
             entry.setField(StandardField.PAGES, pages);
         }
         if (year != null) {
             entry.setField(StandardField.YEAR, year);
+        } else if (arXivId != null) {
+            year = "20" + arXivId.substring(0, 2);
+            entry.setField(StandardField.YEAR, year);
         }
         if (publisher != null) {
             entry.setField(StandardField.PUBLISHER, publisher);
@@ -592,6 +609,26 @@ private String getDoi(String doi) {
         return doi;
     }
 
+    /**
+     * Returns the arXiv identifier for the current document, looking at the current line only if
+     * none has been found so far.
+     * <p>
+     * The first space-delimited token of {@code curString} is handed to {@link ArXivIdentifier#parse}.
+     * When it parses, the remainder of the line is kept in {@code curString} for {@code extractYear()}
+     * (the arXiv stamp usually ends with the submission date), the line is then cleared and
+     * {@code proceedToNextNonEmptyLine()} is called so the stamp is not read again as other content.
+     *
+     * @param arXivId identifier found by an earlier call, or {@code null} if none was found yet
+     * @return {@code arXivId} if it was non-null; otherwise the identifier parsed from the current
+     *         line, or {@code null} if the line does not start with one
+     */
+    private String getArXivId(String arXivId) {
+        if (arXivId != null) {
+            return arXivId;
+        }
+
+        // A limit of 2 guarantees at least one array element; without it a line consisting only of
+        // spaces splits into an empty array and [0] would throw.
+        String firstToken = curString.split(" ", 2)[0];
+        String parsedId = ArXivIdentifier.parse(firstToken).map(ArXivIdentifier::asString).orElse(null);
+        if (parsedId == null) {
+            return null;
+        }
+
+        // Skip exactly the token that was parsed instead of assuming it carried an "arxiv:" prefix;
+        // the token is always a prefix of curString, so this substring cannot go out of bounds.
+        curString = curString.substring(firstToken.length());
+        // NOTE(review): extractYear() presumably reads curString - confirm against its definition
+        extractYear();
+        curString = "";
+        proceedToNextNonEmptyLine();
+
+        return parsedId;
+    }
+
     private String getFirstPageContents(PDDocument document) throws IOException {
         PDFTextStripper stripper = new PDFTextStripper();
 

diff --git a/src/main/resources/csl-styles b/src/main/resources/csl-styles
diff --git a/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java b/src/test/java/org/jabref/logic/importer/fileformat/PdfContentImporterTest.java
@@ -129,6 +129,42 @@ British Journal of Nutrition (2008), 99, 1–11 doi: 10.1017/S0007114507795296
         assertEquals(Optional.of(entry), importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty()));
     }
 
+    @Test
+    void extractArXivFromPage1() {
+        String firstPageContent = """
+                arXiv:2408.06224v1 [cs.SE] 12 Aug 2024
+                A Multi-Year Grey Literature Review on AI-assisted Test Automation
+
+                Filippo Riccaa, Alessandro Marchettob and Andrea Stoccoc
+
+                aUniversity of Genoa, Via Balbi 5, Genova, 16126, Italy
+                bUniversity of Trento, Via Sommarive 9, Trento, 38123, Italy
+                cTechnical University of Munich, Boltzmannstraße 3, Munich, 85748, Germany
+                dfortiss GmbH, Guerickestraße 25, Munich, 80805, Germany
+
+                Keywords:
+                Test Automation
+                Artificial Intelligence
+                AI-assisted Test Automation
+                Grey Literature
+                Automated Test Generation
+                Self-Healing Test Scripts
+
+                *Corresponding author
+                [email protected] (F. Ricca)
+                https://person.dibris.unige.it/ricca-filippo/ (F. Ricca)
+                ORCID(s): 0000-0002-3928-5408 (F. Ricca); 0000-0002-6833-896X (A. Marchetto); 0000-0001-8956-3894 (A. Stocco)""";
+
+        // Expected entry: eprint ID and year match the arXiv stamp on the first line of the page
+        BibEntry expected = new BibEntry(StandardEntryType.TechReport)
+                .withField(StandardField.AUTHOR, "Filippo Riccaa and Alessandro Marchettob and Andrea Stoccoc")
+                .withField(StandardField.TITLE, "A Multi-Year Grey Literature Review on AI-assisted Test Automation")
+                .withField(StandardField.YEAR, "2024")
+                .withField(StandardField.EPRINT, "2408.06224v1")
+                .withField(StandardField.KEYWORDS, "Test Automation Artificial Intelligence AI-assisted Test Automation Grey Literature Automated Test Generation Self-Healing Test Scripts");
+        assertEquals(Optional.of(expected), importer.getEntryFromPDFContent(firstPageContent, "\n", Optional.empty()));
+    }
+
     @ParameterizedTest
     @MethodSource("providePdfData")
     void pdfTitleExtraction(String expectedTitle, String filePath) throws Exception {
+0 −9		journals/journal_abbreviations_mathematics.csv
+1 −20		journals/journal_abbreviations_ubc.csv
+0 −279		acta-medica-portuguesa.csl
+1 −1		anesthesiology.csl
+0 −5		apa-5th-edition.csl
+0 −5		apa-6th-edition-no-ampersand.csl
+0 −5		apa-6th-edition.csl
+0 −5		apa-annotated-bibliography.csl
+0 −5		apa-cv.csl
+0 −5		apa-no-ampersand.csl
+0 −5		apa-no-doi-no-issue.csl
+0 −5		apa-no-initials.csl
+0 −5		apa-numeric-superscript-brackets.csl
+0 −5		apa-numeric-superscript.csl
+0 −5		apa-old-doi-prefix.csl
+0 −5		apa-single-spaced.csl
+0 −5		apa-with-abstract.csl
+0 −5		apa.csl
+1 −11		bern-university-of-applied-sciences-school-of-agricultural-forest-and-food-sciences-hafl.csl
+0 −416		gayana.csl
+0 −24		royal-society-of-chemistry-with-titles.csl
+1 −1		spec/spec_helper.rb