-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #1719 from CentreForDigitalHumanities/feature/gall…
…ica-fix Add Gallica/Figaro corpus definition
- Loading branch information
Showing
12 changed files
with
476 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import os | ||
|
||
import pytest | ||
|
||
here = os.path.abspath(os.path.dirname(__file__)) | ||
|
||
|
||
@pytest.fixture()
def gallica_corpus_settings(settings):
    """Restrict the configured corpora to the Figaro corpus definition.

    Sets ``CORPORA`` on the ``settings`` fixture to a single entry, "figaro",
    pointing at the ``figaro.py`` file that lives next to this module.
    """
    figaro_definition = os.path.join(here, "figaro.py")
    settings.CORPORA = {"figaro": figaro_definition}
|
||
|
||
class MockResponse(object):
    """Minimal stand-in for an HTTP response, backed by a file on disk.

    Only the ``content`` attribute is provided; it is read from
    ``mock_content_file`` on every access.
    """

    def __init__(self, filepath):
        # path of the file whose text is served as the response content
        self.mock_content_file = filepath

    @property
    def content(self):
        """Return the text of the backing file, decoded as UTF-8."""
        # The encoding is pinned so that the non-ASCII characters in the test
        # data decode the same way regardless of the platform's default locale.
        with open(self.mock_content_file, "r", encoding="utf-8") as f:
            return f.read()
|
||
|
||
def mock_response(url: str) -> MockResponse:
    """Return a MockResponse serving the test data file that matches `url`.

    The checks are ordered from most to least specific, because the later
    patterns ("&", "?") would also match some of the earlier URL shapes:

    - URL ends with "date"      -> Years.xml
    - URL contains "&"          -> Issues.xml
    - URL contains "?"          -> OAIRecord.xml
    - URL ends with "texteBrut" -> RoughText.html

    Raises:
        ValueError: if `url` matches none of the known patterns.
    """
    data_dir = os.path.join(here, "tests", "data", "figaro")
    if url.endswith("date"):
        basename = "Years.xml"
    elif "&" in url:
        basename = "Issues.xml"
    elif "?" in url:
        basename = "OAIRecord.xml"
    elif url.endswith("texteBrut"):
        basename = "RoughText.html"
    else:
        # fail with a clear message rather than an unbound-variable error
        raise ValueError(f"No mock data available for URL: {url}")
    return MockResponse(os.path.join(data_dir, basename))
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
from datetime import datetime | ||
from typing import Union | ||
|
||
from django.conf import settings | ||
from ianalyzer_readers.xml_tag import Tag | ||
from ianalyzer_readers.extract import XML | ||
|
||
from addcorpus.python_corpora.corpus import FieldDefinition | ||
from addcorpus.es_mappings import ( | ||
keyword_mapping, | ||
) | ||
|
||
from corpora.gallica.gallica import Gallica | ||
|
||
|
||
def join_issue_strings(issue_description: Union[list[str], None]) -> Union[str, None]:
    """Concatenate the first two issue description strings.

    The strings are joined without a separator. Returns None when
    `issue_description` is None or empty.
    """
    if not issue_description:
        return None
    leading_parts = issue_description[:2]
    return "".join(leading_parts)
|
||
|
||
class Figaro(Gallica):
    """Corpus definition for the newspaper Le Figaro, built on the Gallica base."""

    # descriptive metadata
    title = "Le Figaro"
    description = "Newspaper archive, 1854-1953"
    category = "periodical"
    image = "figaro.jpg"

    # date range covered by the corpus
    min_date = datetime(1854, 1, 1)
    max_date = datetime(1953, 12, 31)

    # identifier of this corpus on Gallica, used to build the request URLs
    corpus_id = "cb34355551z"
    # index name; can be overridden through the FIGARO_INDEX setting
    es_index = getattr(settings, 'FIGARO_INDEX', 'figaro')

    contributor = FieldDefinition(
        name="contributor",
        description="Persons who contributed to this issue",
        es_mapping=keyword_mapping(enable_full_text_search=True),
        extractor=XML(Tag("dc:contributor"), multiple=True),
    )

    issue = FieldDefinition(
        name="issue",
        description="Issue description",
        es_mapping=keyword_mapping(),
        extractor=XML(
            Tag("dc:description"),
            multiple=True,
            # only the first two descriptions are kept, see join_issue_strings
            transform=join_issue_strings,
        ),
    )

    def __init__(self):
        """Assemble the field list from the shared Gallica fields and the
        Figaro-specific `contributor` and `issue` fields."""
        corpus_fields = []
        corpus_fields.append(self.content())
        corpus_fields.append(self.contributor)
        corpus_fields.append(self.date(self.min_date, self.max_date))
        corpus_fields.append(self.identifier())
        corpus_fields.append(self.issue)
        corpus_fields.append(self.url())
        self.fields = corpus_fields
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,162 @@ | ||
from datetime import datetime | ||
import logging | ||
from time import sleep | ||
|
||
from bs4 import BeautifulSoup | ||
from ianalyzer_readers.xml_tag import Tag | ||
from ianalyzer_readers.extract import Metadata, XML | ||
import requests | ||
|
||
from addcorpus.python_corpora.corpus import XMLCorpusDefinition | ||
from addcorpus.python_corpora.corpus import FieldDefinition | ||
from addcorpus.python_corpora.filters import DateFilter | ||
from addcorpus.es_mappings import ( | ||
keyword_mapping, | ||
date_mapping, | ||
main_content_mapping, | ||
) | ||
from addcorpus.es_settings import es_settings | ||
|
||
logger = logging.getLogger('indexing') | ||
|
||
def get_content(content: BeautifulSoup) -> str:
    """Return the text content of the parsed HTML from the `texteBrut` request.

    The text is taken from all <p> elements that follow the first <hr>
    element as siblings, concatenated without a separator.

    Returns an empty string when the document contains no <hr> element,
    so that an unexpected page layout does not abort the whole run.
    """
    separator = content.find("hr")
    if separator is None:
        return ""
    return "".join(
        paragraph.get_text() for paragraph in separator.find_next_siblings("p")
    )
|
||
|
||
def get_publication_id(identifier: str) -> str:
    """Return the last path segment of `identifier`.

    For example, "https://gallica.bnf.fr/ark:/12148/bpt6k296099q" gives
    "bpt6k296099q". Returns None when `identifier` cannot be split as a
    string (for instance when it is None).
    """
    try:
        return identifier.split("/")[-1]
    except (AttributeError, TypeError):
        # not a string: there is no identifier to extract
        return None
|
||
|
||
class Gallica(XMLCorpusDefinition):
    """Base corpus definition for publications retrieved from Gallica.

    Documents are not read from disk: `sources` requests the metadata record
    and the plain-text rendering of each issue from `data_url`. Subclasses set
    `corpus_id` and assign `fields`, typically using the field factory methods
    defined here (`content`, `date`, `identifier`, `url`).
    """

    languages = ["fr"]
    data_url = "https://gallica.bnf.fr"
    corpus_id = ""  # each corpus on Gallica has an "ark" id

    @property
    def es_settings(self):
        """Index settings with stopword and stemming analysis for the first
        language of the corpus."""
        return es_settings(
            self.languages[:1], stopword_analysis=True, stemming_analysis=True
        )

    def sources(self, start: datetime, end: datetime):
        """Yield one `(metadata_xml, {"content": parsed_html})` tuple per issue.

        First the list of years is requested and filtered to the years of
        `start` and `end` (inclusive). For each year the issue identifiers
        ("ark" numbers) are requested, and for each issue the OAI record and
        the `texteBrut` page are fetched.

        Connection errors are logged as warnings:
        - while listing the issues of a year: stop processing altogether;
        - while fetching an OAI record: skip the remaining issues of the year;
        - while fetching the full text: skip this issue only.
        """
        # Errors raised by `requests` do not derive from the builtin
        # ConnectionError, so both classes have to be named explicitly.
        connection_errors = (requests.exceptions.ConnectionError, ConnectionError)

        # obtain list of ark numbers
        response = requests.get(
            f"{self.data_url}/services/Issues?ark=ark:/12148/{self.corpus_id}/date"
        )
        year_soup = BeautifulSoup(response.content, "xml")
        years = [
            year.string
            for year in year_soup.find_all("year")
            if start.year <= int(year.string) <= end.year
        ]
        for year in years:
            try:
                response = requests.get(
                    f"{self.data_url}/services/Issues?ark=ark:/12148/{self.corpus_id}/date&date={year}"
                )
            except connection_errors:
                logger.warning(f"Connection error when processing year {year}")
                break
            ark_soup = BeautifulSoup(response.content, "xml")
            ark_numbers = [issue_tag["ark"] for issue_tag in ark_soup.find_all("issue")]
            # pause between requests; presumably to avoid overloading the server
            sleep(2)

            for ark in ark_numbers:
                try:
                    source_response = requests.get(
                        f"{self.data_url}/services/OAIRecord?ark={ark}"
                    )
                except connection_errors:
                    logger.warning(f"Connection error encountered in issue {ark}")
                    break
                sleep(2)

                if not source_response:
                    # no usable metadata record for this issue
                    continue

                try:
                    content_response = requests.get(
                        f"{self.data_url}/ark:/12148/{ark}.texteBrut"
                    )
                except connection_errors:
                    logger.warning(
                        f"Connection error when fetching full text of issue {ark}"
                    )
                    # Skip the issue: continuing would either fail on an
                    # unbound name or attach the text of the previous issue.
                    continue
                sleep(10)

                parsed_content = BeautifulSoup(content_response.content, "lxml-html")
                yield (
                    source_response.content,
                    {"content": parsed_content},
                )

    def content(self):
        """Field with the full text, extracted from the "content" metadata
        (the parsed `texteBrut` page) through `get_content`."""
        return FieldDefinition(
            name="content",
            description="Content of publication",
            display_name="Content",
            display_type="text_content",
            es_mapping=main_content_mapping(
                token_counts=True,
                stopword_analysis=True,
                stemming_analysis=True,
                language=self.languages[0],
            ),
            extractor=Metadata("content", transform=get_content),
        )

    def date(self, min_date: datetime, max_date: datetime):
        """Field with the publication date, read from the `dc:date` tag.

        `min_date` and `max_date` set the range of the date search filter.
        """
        return FieldDefinition(
            name="date",
            display_name="Date",
            description="The date of the publication.",
            es_mapping=date_mapping(),
            extractor=XML(
                Tag("dc:date"),
            ),
            results_overview=True,
            search_filter=DateFilter(
                min_date, max_date, description="Search only within this time range."
            ),
            visualizations=["resultscount", "termfrequency"],
            csv_core=True,
        )

    def identifier(self):
        """Field with the publication id: the last path segment of the
        `dc:identifier` tag."""
        return FieldDefinition(
            name="id",
            display_name="Publication ID",
            description="Identifier of the publication on Gallica",
            es_mapping=keyword_mapping(),
            extractor=XML(Tag("dc:identifier"), transform=get_publication_id),
            csv_core=True,
        )

    def url(self):
        """Field with the unmodified value of the `dc:identifier` tag, shown
        as a link; it is excluded from search."""
        return FieldDefinition(
            name="url",
            display_name="Source URL",
            display_type="url",
            description="URL to scan on Gallica",
            es_mapping=keyword_mapping(),
            extractor=XML(Tag("dc:identifier")),
            searchable=False,
        )

    # define fields property so it can be set in __init__
    @property
    def fields(self):
        return self._fields

    @fields.setter
    def fields(self, value):
        self._fields = value
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Empty file.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
<?xml version="1.0" encoding="UTF-8" standalone="no"?> | ||
<issues compile_time="0:00:06.013" date="1930" listType="issue" parentArk="cb34355551z/date"> | ||
<issue ark="bpt6k296099q" dayOfYear="1">01 janvier 1930</issue> | ||
</issues> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,62 @@ | ||
<?xml version="1.0" encoding="UTF-8" standalone="no"?> | ||
<results ResultsGenerationSearchTime="0:00:00.011" countResults="1" resultType="CVOAIRecordSearchService" searchTime=""> | ||
<visibility_rights>all</visibility_rights> | ||
<notice> | ||
<record> | ||
<header> | ||
<identifier>oai:bnf.fr:gallica/ark:/12148/bpt6k296099q</identifier> | ||
<datestamp>2024-06-21</datestamp> | ||
<setSpec>gallica:corpus:BnPlCo00</setSpec> | ||
<setSpec>gallica:corpus:Pam1</setSpec> | ||
<setSpec>gallica:corpus:bresil</setSpec> | ||
<setSpec>gallica:theme:0:07</setSpec> | ||
<setSpec>gallica:typedoc:periodiques:fascicules</setSpec> | ||
</header> | ||
<metadata> | ||
<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"> | ||
<dc:identifier>https://gallica.bnf.fr/ark:/12148/bpt6k296099q</dc:identifier> | ||
<dc:date>1930-01-01</dc:date> | ||
<dc:description>01 janvier 1930</dc:description> | ||
<dc:description>1930/01/01 (Numéro 1).</dc:description> | ||
<dc:description/> | ||
<dc:title>Figaro : journal non politique</dc:title> | ||
<dc:contributor>Villemessant, Hippolyte de (1810-1879). Directeur de publication</dc:contributor> | ||
<dc:contributor>Jouvin, Benoît (1810-1886). Directeur de publication</dc:contributor> | ||
<dc:publisher>Figaro (Paris)</dc:publisher> | ||
<dc:type xml:lang="fre">texte</dc:type> | ||
<dc:type xml:lang="eng">text</dc:type> | ||
<dc:type xml:lang="fre">publication en série imprimée</dc:type> | ||
<dc:type xml:lang="eng">printed serial</dc:type> | ||
<dc:language>fre</dc:language> | ||
<dc:relation>Notice du catalogue : http://catalogue.bnf.fr/ark:/12148/cb34355551z</dc:relation> | ||
<dc:source>Bibliothèque nationale de France</dc:source> | ||
<dc:rights xml:lang="fre">domaine public</dc:rights> | ||
<dc:rights xml:lang="eng">public domain</dc:rights> | ||
<dc:relation>http://gallica.bnf.fr/ark:/12148/cb34355551z/date</dc:relation> | ||
<dc:description>Appartient à l’ensemble documentaire : BIPFPIG00</dc:description> | ||
<dc:description>Appartient à l’ensemble documentaire : BIPFPIG63</dc:description> | ||
<dc:description>Appartient à l’ensemble documentaire : BIPFPIG69</dc:description> | ||
<dc:description>Appartient à l’ensemble documentaire : Pam1</dc:description> | ||
<dc:description>Appartient à l’ensemble documentaire : BnPlCo00</dc:description> | ||
<dc:description>Appartient à l’ensemble documentaire : BnPlCo01</dc:description> | ||
<dc:description>Appartient à l’ensemble documentaire : FranceBr</dc:description> | ||
<dc:format>Nombre total de vues : 164718</dc:format> | ||
</oai_dc:dc> | ||
</metadata> | ||
</record> | ||
</notice> | ||
<provenance>bnf.fr</provenance> | ||
<sdewey>07</sdewey> | ||
<dewey>0</dewey> | ||
<source>Bibliothèque nationale de France</source> | ||
<typedoc>fascicule</typedoc> | ||
<nqamoyen>0.0</nqamoyen> | ||
<title>Figaro : journal non politique</title> | ||
<date nbIssue="1">1930-01-01</date> | ||
<first_indexation_date>15/10/2007</first_indexation_date> | ||
<streamable>false</streamable> | ||
<listBibVirt> | ||
<label>gallica</label> | ||
<label>communpatrimoine</label> | ||
</listBibVirt> | ||
</results> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
<!DOCTYPE html><html><head><title>Figaro : journal non politique | Gallica</title><meta name="title" content="Figaro : journal non politique | Gallica"><meta name="description" content="Figaro : journal non politique -- 1930-01-01 -- fascicules"><meta name="DC.type" content=""><meta name="DC.description" content="01 janvier 1930"><meta name="DC.description" content="1930/01/01 (Numéro 1)."><meta name="DC.description" content="Appartient à l’ensemble documentaire : BIPFPIG00"><meta name="DC.description" content="Appartient à l’ensemble documentaire : BIPFPIG63"><meta name="DC.description" content="Appartient à l’ensemble documentaire : BIPFPIG69"><meta name="DC.description" content="Appartient à l’ensemble documentaire : Pam1"><meta name="DC.description" content="Appartient à l’ensemble documentaire : BnPlCo00"><meta name="DC.description" content="Appartient à l’ensemble documentaire : BnPlCo01"><meta name="DC.description" content="Appartient à l’ensemble documentaire : FranceBr"><meta name="DC.title" content="Figaro : journal non politique"><meta name="DC.date" content="1930-01-01"><meta name="DC.rights" content="domaine public"><meta name="DC.rights" content="public domain"><meta name="DC.identifier" content="oai:bnf.fr:gallica/ark:/12148/bpt6k296099q"><meta name="p:domain_verify" content="12c0fefa160572d58c3754d6158b2d99"></meta><script type="text/javascript" src="/ruxitagentjs_ICA2NVfjqru_10257230921194352.js" data-dtconfig="app=3c476fda10179998|featureHash=ICA2NVfjqru|vcv=2|rdnt=1|uxrgce=1|bp=3|cuc=7drwxnvu|mel=100000|ssv=4|lastModification=1728477932099|dtVersion=10257230921194352|tp=500,50,0,1|uxdcw=1500|agentUri=/ruxitagentjs_ICA2NVfjqru_10257230921194352.js|reportUrl=/rb_67d25af6-aa00-4621-84c7-37086540adb6|rid=RID_-34823944|rpid=-672779903|domain=bnf.fr"></script></head><body><p>Reminder of your request:</p><br><p>Downloading format: : <strong>Text</strong></p><p>View <strong>1</strong> to <strong>8</strong> on <strong>8</strong></p><p>Number of pages: 
<strong>8</strong></p><p>Full notice</p><p><strong>Title : </strong>Figaro : journal non politique</p><p><strong>Publisher : </strong>Figaro (Paris)</p><p><strong>Publication date : </strong>1930-01-01</p><p><strong>Contributor : </strong>Villemessant, Hippolyte de (1810-1879). Directeur de publication</p><p><strong>Contributor : </strong>Jouvin, Benoît (1810-1886). Directeur de publication</p><p><strong>Relationship : </strong><a target="_blank" href="https://gallica.bnf.fr/http://catalogue.bnf.fr/ark:/12148/cb34355551z">http://catalogue.bnf.fr/ark:/12148/cb34355551z</a></p><p><strong>Relationship : </strong><a target="_blank" href="https://gallica.bnf.fr/https://gallica.bnf.fr/ark:/12148/cb34355551z/date">https://gallica.bnf.fr/ark:/12148/cb34355551z/date</a></p><p><strong>Type : </strong>text</p><p><strong>Type : </strong>printed serial</p><p><strong>Language : </strong>french</p><p><strong>Format : </strong>Nombre total de vues : 164718</p><p><strong>Description : </strong>01 janvier 1930</p><p><strong>Description : </strong>1930/01/01 (Numéro 1).</p><p><strong>Description : </strong>Collection numérique : Bibliographie de la presse française politique et d'information générale</p><p><strong>Description : </strong>Collection numérique : BIPFPIG63</p><p><strong>Description : </strong>Collection numérique : BIPFPIG69</p><p><strong>Description : </strong>Collection numérique : Arts de la marionnette</p><p><strong>Description : </strong>Collection numérique : Commun Patrimoine: bibliothèque numérique du réseau des médiathèques de Plaine commune</p><p><strong>Description : </strong>Collection numérique : La Commune de Paris</p><p><strong>Description : </strong>Collection numérique : France-Brésil</p><p><strong>Rights : </strong>Consultable en ligne</p><p><strong>Rights : </strong>Public domain</p><p><strong>Identifier : </strong><a target="_blank" href="https://gallica.bnf.fr/ark:/12148/bpt6k296099q">ark:/12148/bpt6k296099q</a></p><p><strong>Source : 
</strong>Bibliothèque nationale de France</p><p><strong>Provenance : </strong>Bibliothèque nationale de France</p><p><strong>Online date : </strong>15/10/2007</p><p>The text displayed may contain some errors. The text of this document has been generated automatically by an optical character recognition (OCR) program. The | ||
estimated recognition rate for this document is 0%.<br/></p><hr><p>SOMMAIRE DE FIGARO PAGE 2. <span style="color:Slategray;">Les</span> Cours, les Ambassades, le Monde et la Ville. <span style="color:Slategray;">Les</span> <span style="color:Slategray;">Echos.</span> La fin du Bulletin vert. <span style="color:Slategray;">1929-1930.</span> </p><p>PAGE 3. La Dernière Heure. Avant la Conférence de La Haye. Les méfaits de la tempête. </p><p>PAGE 4. <span style="color:Slategray;">La</span> Vie sportive. Revue de la Presse. Anne Douglas Sedgwick Marthe Ludérac. </p><p>PAGE 5. Henri Rebois L'Art espagnol à <span style="color:Slategray;">l'Exposition</span> de Barcelone. Robert Brussel Le Mouvement musical. <span style="color:Slategray;">Guy</span> de Passillé Les Etrennes. Jacques Patin Les Premières. Les Alguazils Courrier des Lettres. Marc Hélys Revues étrangères. PAGE 6. La Bourse La Cote des Valeurs. Le Programme des spectacles. </p><p>PAGE 7. Courrier des théâtres. Les Courses LA POLITIQUE </p><p><span style="color:Slategray;">La</span> <span style="color:Slategray;">diplomatie</span> </p><hr></body></html> |
Oops, something went wrong.