forked from JabRef/jabref
-
Notifications
You must be signed in to change notification settings - Fork 15
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
…abRef#10549) * Create Fetcher and Transformer for ScholarArchive * Finish change requirement including code style, testing, some error , and comment. * Finish and fix archive scholar fetcher * add url * fix arch * fix test * fix var name * remove comments * add changelog * fuck this changelog --------- Co-authored-by: youliyou <[email protected]> Co-authored-by: Siedlerchr <[email protected]>
- Loading branch information
1 parent
94e2285
commit a2aa2c6
Showing
6 changed files
with
304 additions
and
15 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
158 changes: 158 additions & 0 deletions
158
src/main/java/org/jabref/logic/importer/fetcher/ScholarArchiveFetcher.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,158 @@ | ||
package org.jabref.logic.importer.fetcher; | ||
|
||
import java.net.MalformedURLException;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.Optional;
import java.util.stream.IntStream;
|
||
import org.jabref.logic.importer.FetcherException; | ||
import org.jabref.logic.importer.PagedSearchBasedParserFetcher; | ||
import org.jabref.logic.importer.ParseException; | ||
import org.jabref.logic.importer.Parser; | ||
import org.jabref.logic.importer.fetcher.transformers.ScholarArchiveQueryTransformer; | ||
import org.jabref.logic.importer.util.JsonReader; | ||
import org.jabref.logic.net.URLDownload; | ||
import org.jabref.model.entry.AuthorList; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.entry.types.EntryType; | ||
import org.jabref.model.entry.types.StandardEntryType; | ||
|
||
import jakarta.ws.rs.core.MediaType; | ||
import kong.unirest.json.JSONArray; | ||
import kong.unirest.json.JSONException; | ||
import kong.unirest.json.JSONObject; | ||
import org.apache.http.client.utils.URIBuilder; | ||
import org.apache.lucene.queryparser.flexible.core.nodes.QueryNode; | ||
import org.slf4j.Logger; | ||
import org.slf4j.LoggerFactory; | ||
|
||
public class ScholarArchiveFetcher implements PagedSearchBasedParserFetcher { | ||
|
||
public static final String FETCHER_NAME = "ScholarArchive"; | ||
|
||
private static final Logger LOGGER = LoggerFactory.getLogger(ScholarArchiveFetcher.class); | ||
|
||
private static final String API_URL = "https://scholar.archive.org/search"; | ||
|
||
/** | ||
* Gets the query URL by luceneQuery and pageNumber. | ||
* | ||
* @param luceneQuery the search query | ||
* @param pageNumber the number of the page indexed from 0 | ||
* @return URL | ||
*/ | ||
@Override | ||
public URL getURLForQuery(QueryNode luceneQuery, int pageNumber) throws URISyntaxException, MalformedURLException, FetcherException { | ||
URIBuilder uriBuilder = new URIBuilder(API_URL); | ||
uriBuilder.addParameter("q", new ScholarArchiveQueryTransformer().transformLuceneQuery(luceneQuery).orElse("")); | ||
uriBuilder.addParameter("from", String.valueOf(getPageSize() * pageNumber)); | ||
uriBuilder.addParameter("size", String.valueOf(getPageSize())); | ||
uriBuilder.addParameter("format", "json"); | ||
|
||
LOGGER.debug("using URL for search {}", uriBuilder.build()); | ||
return uriBuilder.build().toURL(); | ||
} | ||
|
||
@Override | ||
public URLDownload getUrlDownload(URL url) { | ||
URLDownload download = new URLDownload(url); | ||
download.addHeader("Accept", MediaType.APPLICATION_JSON); | ||
return download; | ||
} | ||
|
||
/** | ||
* Gets the list of BibEntry by given Json response from scholar archive fetcher API | ||
* | ||
* @return Parser, list of BibEntry | ||
*/ | ||
@Override | ||
public Parser getParser() { | ||
return inputStream -> { | ||
JSONObject response = JsonReader.toJsonObject(inputStream); | ||
List<BibEntry> entries = new ArrayList<>(); | ||
if (response.has("results")) { | ||
JSONArray results = response.getJSONArray("results"); | ||
for (int i = 0; i < results.length(); i++) { | ||
JSONObject jsonEntry = results.getJSONObject(i); | ||
BibEntry entry = parseJSONtoBibtex(jsonEntry); | ||
entries.add(entry); | ||
} | ||
} | ||
|
||
return entries; | ||
}; | ||
} | ||
|
||
@Override | ||
public String getName() { | ||
return FETCHER_NAME; | ||
} | ||
|
||
private BibEntry parseJSONtoBibtex(JSONObject jsonEntry) throws ParseException { | ||
try { | ||
BibEntry entry = new BibEntry(); | ||
EntryType entryType = StandardEntryType.InCollection; | ||
JSONObject biblio = jsonEntry.optJSONObject("biblio"); | ||
|
||
JSONArray abstracts = jsonEntry.getJSONArray("abstracts"); | ||
String foundAbstract = IntStream.range(0, abstracts.length()) | ||
.mapToObj(abstracts::getJSONObject) | ||
.map(object -> object.optString("body")) | ||
.findFirst().orElse(""); | ||
|
||
String url = Optional.ofNullable(jsonEntry.optJSONObject("fulltext")).map(fullText -> fullText.optString("access_url")).orElse(""); | ||
|
||
// publication type | ||
String type = biblio.optString("release_type"); | ||
entry.setField(StandardField.TYPE, type); | ||
if (type.toLowerCase().contains("book")) { | ||
entryType = StandardEntryType.Book; | ||
} else if (type.toLowerCase().contains("article")) { | ||
entryType = StandardEntryType.Article; | ||
} | ||
entry.setType(entryType); | ||
|
||
entry.setField(StandardField.TITLE, biblio.optString("title")); | ||
entry.setField(StandardField.JOURNAL, biblio.optString("container_name")); | ||
entry.setField(StandardField.DOI, biblio.optString("doi")); | ||
entry.setField(StandardField.ISSUE, biblio.optString("issue")); | ||
entry.setField(StandardField.LANGUAGE, biblio.optString("lang_code")); | ||
entry.setField(StandardField.PUBLISHER, biblio.optString("publisher")); | ||
|
||
entry.setField(StandardField.YEAR, String.valueOf(biblio.optInt("release_year"))); | ||
entry.setField(StandardField.VOLUME, String.valueOf(biblio.optInt("volume_int"))); | ||
entry.setField(StandardField.ABSTRACT, foundAbstract); | ||
entry.setField(StandardField.URL, url); | ||
|
||
String dateString = biblio.optString("date"); | ||
entry.setField(StandardField.DATE, dateString); | ||
|
||
// Authors are in contrib_names | ||
if (biblio.has("contrib_names")) { | ||
JSONArray authors = biblio.getJSONArray("contrib_names"); | ||
List<String> authorList = new ArrayList<>(); | ||
for (int i = 0; i < authors.length(); i++) { | ||
authorList.add(authors.getString(i)); | ||
} | ||
AuthorList parsedAuthors = AuthorList.parse(String.join(" and ", authorList)); | ||
entry.setField(StandardField.AUTHOR, parsedAuthors.getAsLastFirstNamesWithAnd(false)); | ||
} | ||
|
||
if (biblio.has("issns")) { | ||
JSONArray issn = biblio.getJSONArray("issns"); | ||
List<String> issnList = new ArrayList<>(); | ||
for (int i = 0; i < issn.length(); i++) { | ||
issnList.add(issn.getString(i)); | ||
} | ||
entry.setField(StandardField.ISSN, String.join(" ", issnList)); | ||
} | ||
return entry; | ||
} catch (JSONException exception) { | ||
throw new ParseException("ScholarArchive API JSON format has changed", exception); | ||
} | ||
} | ||
} |
71 changes: 71 additions & 0 deletions
71
...n/java/org/jabref/logic/importer/fetcher/transformers/ScholarArchiveQueryTransformer.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
package org.jabref.logic.importer.fetcher.transformers; | ||
|
||
/** | ||
* This class extends the AbstractQueryTransformer to provide specific implementations | ||
* for transforming standard queries into ones suitable for the Scholar Archive's unique format. | ||
*/ | ||
public class ScholarArchiveQueryTransformer extends AbstractQueryTransformer { | ||
|
||
@Override | ||
protected String getLogicalAndOperator() { | ||
return " AND "; | ||
} | ||
|
||
@Override | ||
protected String getLogicalOrOperator() { | ||
return " OR "; | ||
} | ||
|
||
@Override | ||
protected String getLogicalNotOperator() { | ||
return "NOT "; | ||
} | ||
|
||
@Override | ||
protected String handleAuthor(String author) { | ||
return createKeyValuePair("contrib_names", author); | ||
} | ||
|
||
@Override | ||
protected String handleTitle(String title) { | ||
return createKeyValuePair("title", title); | ||
} | ||
|
||
@Override | ||
protected String handleJournal(String journalTitle) { | ||
return createKeyValuePair("container_name", journalTitle); | ||
} | ||
|
||
/** | ||
* Handles the year query by formatting it specifically for a range search in the Scholar Archive. | ||
* This method is for an exact year match. | ||
* | ||
* @param year the publication year to be searched in the Scholar Archive. | ||
* @return A string query segment formatted for the year search. | ||
*/ | ||
@Override | ||
protected String handleYear(String year) { | ||
return "publication.startDate:[" + year + " TO " + year + "]"; | ||
} | ||
|
||
/** | ||
* Handles a year range query, transforming it for the Scholar Archive's query format. | ||
* If only a start year is provided, the range will extend to the current year. | ||
* | ||
* @param yearRange the range of years to be searched in the Scholar Archive, usually in the format "startYear-endYear". | ||
* @return A string query segment formatted for the year range search. | ||
*/ | ||
@Override | ||
protected String handleYearRange(String yearRange) { | ||
parseYearRange(yearRange); | ||
if (endYear == Integer.MAX_VALUE) { | ||
// If no specific end year is set, it assumes the range extends to the current year. | ||
return yearRange; | ||
} | ||
return "publication.startDate:[" + startYear + " TO " + endYear + "]"; | ||
} | ||
} | ||
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
src/test/java/org/jabref/logic/importer/fetcher/ScholarArchiveFetcherTest.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
package org.jabref.logic.importer.fetcher; | ||
|
||
import java.util.List; | ||
|
||
import org.jabref.logic.importer.FetcherException; | ||
import org.jabref.model.entry.BibEntry; | ||
import org.jabref.model.entry.field.StandardField; | ||
import org.jabref.model.entry.types.StandardEntryType; | ||
import org.jabref.testutils.category.FetcherTest; | ||
|
||
import org.junit.jupiter.api.BeforeEach; | ||
import org.junit.jupiter.api.Test; | ||
|
||
import static org.junit.jupiter.api.Assertions.assertEquals; | ||
import static org.junit.jupiter.api.Assertions.assertTrue; | ||
|
||
@FetcherTest | ||
public class ScholarArchiveFetcherTest { | ||
private ScholarArchiveFetcher fetcher; | ||
private BibEntry bibEntry; | ||
|
||
@BeforeEach | ||
public void setUp() { | ||
fetcher = new ScholarArchiveFetcher(); | ||
bibEntry = new BibEntry(StandardEntryType.InCollection) | ||
.withField(StandardField.TITLE, "Query expansion using associated queries") | ||
.withField(StandardField.AUTHOR, "Billerbeck, Bodo and Scholer, Falk and Williams, Hugh E. and Zobel, Justin") | ||
.withField(StandardField.VOLUME, "0") | ||
.withField(StandardField.DOI, "10.1145/956863.956866") | ||
.withField(StandardField.JOURNAL, "Proceedings of the twelfth international conference on Information and knowledge management - CIKM '03") | ||
.withField(StandardField.PUBLISHER, "ACM Press") | ||
.withField(StandardField.TYPE, "paper-conference") | ||
.withField(StandardField.YEAR, "2003") | ||
.withField(StandardField.URL, "https://web.archive.org/web/20170810164449/http://goanna.cs.rmit.edu.au/~jz/fulltext/cikm03.pdf"); | ||
} | ||
|
||
@Test | ||
public void getNameReturnsCorrectName() { | ||
assertEquals("ScholarArchive", fetcher.getName()); | ||
} | ||
|
||
@Test | ||
public void performSearchReturnsExpectedResults() throws FetcherException { | ||
List<BibEntry> fetchedEntries = fetcher.performSearch("query"); | ||
fetchedEntries.forEach(entry -> entry.clearField(StandardField.ABSTRACT)); | ||
assertTrue(fetchedEntries.contains(bibEntry), "Found the following entries " + fetchedEntries); | ||
} | ||
} | ||
|
||
|
||
|