[Feat] AC spider #223

Open · wants to merge 5 commits into master
34 changes: 13 additions & 21 deletions covid19br/common/models/full_report.py
@@ -127,26 +127,20 @@ def add_new_bulletin(self, bulletin: BulletinModel):
self._auto_calculated_total.increase_deaths(bulletin.deaths)

def check_total_death_cases(self) -> bool:
if not self._official_total_bulletins:
return False
auto_calculated_deaths = self._auto_calculated_total.deaths
return all(
[
auto_calculated_deaths == official_bulletin.deaths
for official_bulletin in self._official_total_bulletins
]
)
death_cases_values = set()
if self._auto_calculated_total.has_deaths:
death_cases_values.add(self._auto_calculated_total.deaths)
for official_bulletin in self._official_total_bulletins:
death_cases_values.add(official_bulletin.deaths)
return len(death_cases_values) == 1

def check_total_confirmed_cases(self) -> bool:
if not self._official_total_bulletins:
return False
auto_calculated_cases = self._auto_calculated_total.confirmed_cases
return all(
[
auto_calculated_cases == official_bulletin.confirmed_cases
for official_bulletin in self._official_total_bulletins
]
)
confirmed_cases_values = set()
if self._auto_calculated_total.has_confirmed_cases:
confirmed_cases_values.add(self._auto_calculated_total.confirmed_cases)
for official_bulletin in self._official_total_bulletins:
confirmed_cases_values.add(official_bulletin.confirmed_cases)
return len(confirmed_cases_values) == 1

def to_csv_rows(self):
rows = []
@@ -357,9 +351,7 @@ def _auto_detect_warnings(self):
"são apenas a soma automática dos dados dos municípios."
),
)
elif not self._auto_calculated_total.is_empty and (
not self.check_total_confirmed_cases() or not self.check_total_death_cases()
):
if not self.check_total_confirmed_cases() or not self.check_total_death_cases():
sources_data = "\n".join(
[
f"Fonte {{ {' | '.join(bulletin.sources)} }}: "
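The rewritten checks treat agreement as "exactly one distinct value across every source that reported one". A minimal standalone sketch of the idea (the function and numbers are illustrative, not the project's models):

```python
# Illustrative set-based consensus check, mirroring the technique in
# check_total_death_cases / check_total_confirmed_cases above.
def all_sources_agree(auto_total, official_totals):
    values = set()
    if auto_total is not None:  # only count the auto total when it has data
        values.add(auto_total)
    values.update(official_totals)
    # Exactly one distinct value: every source that reported a number
    # reported the same one. An empty set (no data at all) fails too.
    return len(values) == 1

assert all_sources_agree(1969, [1969, 1969])      # everyone agrees
assert not all_sources_agree(1969, [1969, 1970])  # one source diverges
assert not all_sources_agree(None, [])            # no data at all
```

Note one behavioral change: the old version returned False whenever there were no official bulletins, while the set-based version passes when the auto-calculated total is the only value available.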
98 changes: 98 additions & 0 deletions covid19br/parsers/acre.py
@@ -0,0 +1,98 @@
import re
from rows.plugins import pdf

from covid19br.common.constants import State
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.parsers.extractor_utils import match_object_from_regexp, is_only_number

REGEXP_DATE = re.compile("([0-9]+) de (.+) de ([0-9]{4})$")
CITY_NAME_TABLE_COLUMN = 0
CONFIRMED_CASES_TABLE_COLUMN = 2
DEATH_CASES_TABLE_COLUMN = 4


def parse_int(value):
return int(value.replace(".", ""))


class AcreBulletinExtractor:
state = State.AC

def __init__(self, filename):
self.doc = pdf.PyMuPDFBackend(filename)

@property
def date(self):
first_page_objects = next(
self.doc.text_objects(
starts_after=re.compile("BOLETIM(.+)"),
ends_before=re.compile("SITUAÇÃO ATUAL(.+)"),
)
)
date_obj, *_ = match_object_from_regexp(REGEXP_DATE, first_page_objects) or [
None
]
if date_obj:
return NormalizationUtils.extract_in_full_date(" ".join(date_obj))

@property
def official_total(self):
first_page_objects = next(
self.doc.text_objects(
starts_after=re.compile("SITUAÇÃO ATUAL(.+)"),
ends_before=re.compile("DISTRIBUIÇÃO DOS CASOS(.+)"),
)
)

# Unfortunately the text labels are images, which makes it difficult for us to get the numbers based on them.
# So we infer which values we need based on their position (sometimes there are "ghost objects"
# on the page, but they are on the far left and won't interfere with this logic).
remaining_number_objs = [
obj for obj in first_page_objects if is_only_number(obj.text)
]
# We start by sorting the objects by x and dropping the last 2 on the right (the small numbers on the bulletin)
ordered_by_x_axis = sorted(remaining_number_objs, key=lambda obj: obj.x0)
remaining_number_objs = ordered_by_x_axis[:-2]
# Of the 3 numbers on the far right, the death count is the bottom-most one
*_, death_cases_obj = sorted(
remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0)
)
remaining_number_objs = remaining_number_objs[:-3]
# Of the 3 right-most numbers remaining (the middle column), the confirmed count is the middle one
_, confirmed_cases_obj, _ = sorted(
remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0)
)

return {"confirmados": confirmed_cases_obj.text, "mortes": death_cases_obj.text}

@property
def data(self):
table_page_number = self._get_table_page_number()
if not table_page_number:
return None
page_objs = next(self.doc.text_objects(
starts_after=re.compile(".+DISTRIBUIÇÃO DOS CASOS CONFIRMADOS.+"),
ends_before=re.compile("Fonte:.+"),
page_numbers=(table_page_number,),
))

# remove headers
city_column_header = next(obj for obj in page_objs if "munic" in obj.text.lower())
table_objs = [obj for obj in page_objs if obj.y0 > city_column_header.y1]

lines = pdf.group_objects("y", table_objs, check_group=pdf.object_contains_center)
for line in lines:
city = line[CITY_NAME_TABLE_COLUMN].text.strip()
deaths = line[DEATH_CASES_TABLE_COLUMN].text.strip()
confirmed = line[CONFIRMED_CASES_TABLE_COLUMN].text.strip()
yield {
"municipio": city,
"confirmados": confirmed,
"mortes": deaths,
}

def _get_table_page_number(self):
for page_number, page_objs in enumerate(self.doc.text_objects(), start=1):
for obj in page_objs:
if "TABELA" in obj.text and "DISTRIBUIÇÃO DOS CASOS CONFIRMADOS" in obj.text:
return page_number
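Because the panel labels are images, `official_total` selects numbers purely by coordinates. A standalone sketch of that positional selection with made-up coordinates, using a namedtuple in place of the PDF text objects (and skipping the step that drops the two small right-most numbers):

```python
from collections import namedtuple

# Stand-in for the rows/PyMuPDF text objects; coordinates are invented.
# In PyMuPDF, y grows downward, so a larger y0 means lower on the page.
TextObj = namedtuple("TextObj", "text x0 y0")

objs = [
    TextObj("118.000", x0=300, y0=100),  # middle column, top
    TextObj("120.569", x0=300, y0=150),  # middle column, middle -> confirmed
    TextObj("1.600", x0=300, y0=200),    # middle column, bottom
    TextObj("10", x0=500, y0=100),       # right column, top
    TextObj("20", x0=500, y0=150),       # right column, middle
    TextObj("1.969", x0=500, y0=200),    # right column, bottom -> deaths
]

ordered = sorted(objs, key=lambda obj: obj.x0)  # left to right
# Bottom-most number of the right column:
*_, deaths = sorted(ordered[-3:], key=lambda obj: (obj.y0, obj.x0))
# Middle number of the middle column:
_, confirmed, _ = sorted(ordered[-6:-3], key=lambda obj: (obj.y0, obj.x0))
print(confirmed.text, deaths.text)  # 120.569 1.969
```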
13 changes: 13 additions & 0 deletions covid19br/parsers/extractor_utils.py
@@ -0,0 +1,13 @@
import re


def match_object_from_regexp(regexp, objects):
"""Return the matching result for"""
for obj in objects:
result = regexp.findall(obj.text)
if result:
return result


def is_only_number(value):
return re.compile("^([0-9.,]+)$").findall(value.strip())
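Possible usage of these helpers, with an illustrative stand-in for the PDF text objects the parsers pass in:

```python
import re
from collections import namedtuple

from covid19br.parsers.extractor_utils import (
    is_only_number,
    match_object_from_regexp,
)

# Minimal stand-in: the helpers only rely on a `.text` attribute.
TextObj = namedtuple("TextObj", "text")

REGEXP_DATE = re.compile("([0-9]+) de (.+) de ([0-9]{4})$")
objs = [TextObj("BOLETIM"), TextObj("25 de fevereiro de 2022")]

print(match_object_from_regexp(REGEXP_DATE, objs))
# [('25', 'fevereiro', '2022')]

print(bool(is_only_number("1.969")))   # True  (separators allowed)
print(bool(is_only_number("Fonte:")))  # False
```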
9 changes: 1 addition & 8 deletions covid19br/parsers/tocantins.py
@@ -8,19 +8,12 @@
from covid19br.common.constants import State
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.common.demographic_utils import DemographicUtils
from covid19br.parsers.extractor_utils import match_object_from_regexp

REGEXP_DAY_MONTH = re.compile("([0-9]+) de (.+)$")
REGEXP_YEAR = re.compile("^de ([0-9]{4})$")


def match_object_from_regexp(regexp, objects):
"""Return the matching result for"""
for obj in objects:
result = regexp.findall(obj.text)
if result:
return result


def parse_int(value):
return int(value.replace(".", ""))

2 changes: 2 additions & 0 deletions covid19br/run_spider.py
@@ -9,6 +9,7 @@
sys.path[0] = "/".join(sys.path[0].split("/")[:-1])

from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.spiders.spider_ac import SpiderAC
from covid19br.spiders.spider_ba import SpiderBA
from covid19br.spiders.spider_ce import SpiderCE
from covid19br.spiders.spider_pr import SpiderPR
@@ -17,6 +18,7 @@

# TODO: automatically retrieve spiders that extend the base class
AVAILABLE_SPIDERS = [
SpiderAC,
SpiderBA,
SpiderCE,
SpiderPR,
150 changes: 150 additions & 0 deletions covid19br/spiders/spider_ac.py
@@ -0,0 +1,150 @@
import re
import scrapy
import tempfile

from covid19br.common.base_spider import BaseCovid19Spider
from covid19br.common.constants import State, ReportQuality
from covid19br.common.models.bulletin_models import (
StateTotalBulletinModel,
CountyBulletinModel,
)
from covid19br.parsers.acre import AcreBulletinExtractor

REGEXP_CASES = re.compile(
"O número de infectados (?:passou )?(?:subiu )?(?:para )?(?:permanece.? )?(?:em )?([0-9.]+) em todo o estado"
)
REGEXP_DEATHS = re.compile(
"o número oficial de mortes por covid-19 (?:.+) ([0-9.]+)[,]? em todo o estado"
)


class SpiderAC(BaseCovid19Spider):
state = State.AC
name = State.AC.value
information_delay_in_days = 0
report_qualities = [ReportQuality.COUNTY_BULLETINS]

news_base_url = "https://agencia.ac.gov.br/"
news_query_params = "?s=covid19"

def pre_init(self):
self.requested_dates = list(self.requested_dates)

def start_requests(self):
yield scrapy.Request(self.news_base_url + self.news_query_params)

def parse(self, response, **kwargs):
news_per_date = {}
news_divs = response.xpath("//header[@class='entry-header']")
for div in news_divs:
url = div.xpath(".//h2[@class='entry-title']//a/@href").get()
if "boletim-sesacre" in url:
raw_date = (
div.xpath(".//span[@class='date-post']//text()[2]").get().strip()
)
date = self.normalizer.extract_numeric_date(raw_date)
news_per_date[date] = url

for date in news_per_date:
if date in self.requested_dates:
yield scrapy.Request(
news_per_date[date],
callback=self.parse_news_bulletin,
cb_kwargs={"date": date},
)

# handle pagination
if news_per_date and self.start_date < min(news_per_date):
last_page_number = 1
last_page_url = response.request.url
if "page" in last_page_url:
url, *_query_params = last_page_url.split("?")
if url[-1] == "/":
url = url[:-1]
*_url_path, last_page_number = url.split("/")
last_page_number = self.normalizer.ensure_integer(last_page_number)
next_page_number = last_page_number + 1
next_page_url = (
f"{self.news_base_url}page/{next_page_number}/{self.news_query_params}"
)
yield scrapy.Request(next_page_url, callback=self.parse)

def parse_news_bulletin(self, response, date):
self._extract_cases_and_deaths_from_news(response, date)

pdf_url = response.xpath(
"//div[@class='entry-content']//a[contains(@href, '.pdf') and contains(@href, 'BOLETIM')]/@href"
).get()
yield scrapy.Request(
pdf_url, callback=self.parse_pdf_bulletin, cb_kwargs={"date": date}
)

def parse_pdf_bulletin(self, response, date):
source = response.request.url
with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") as tmp:
tmp.write(response.body)

extractor = AcreBulletinExtractor(tmp.name)

pdf_date = extractor.date
if pdf_date and pdf_date != date:
self.logger.warning(
f"PDF date does not match for pdf {source}. Aborting extraction."
)
return

pdf_official_total = extractor.official_total
if pdf_official_total:
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
confirmed_cases=pdf_official_total["confirmados"],
deaths=pdf_official_total["mortes"],
source=response.request.url + " | Painel na primeira pag. do pdf.",
)
self.add_new_bulletin_to_report(bulletin, date)

pdf_data = list(extractor.data)
if not pdf_data:
if "parcial" not in source.lower():
self.logger.error(
f"Couldn't extract data from pdf that is not parcial. Pdf source: {source}."
)
return

for row in pdf_data:
if row["municipio"].lower() == "total":
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
confirmed_cases=row["confirmados"],
deaths=row["mortes"],
source=response.request.url
+ " | Tabela com dados dos municípios do pdf.",
)
else:
bulletin = CountyBulletinModel(
date=date,
state=self.state,
city=row["municipio"],
confirmed_cases=row["confirmados"],
deaths=row["mortes"],
source=response.request.url,
)
self.add_new_bulletin_to_report(bulletin, date)

def _extract_cases_and_deaths_from_news(self, response, date):
body_text = " ".join(
response.xpath("//div[@class='entry-content']//p//text()").extract()
)
cases, *_other_matches = REGEXP_CASES.findall(body_text) or [None]
deaths, *_other_matches = REGEXP_DEATHS.findall(body_text) or [None]
if cases or deaths:
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
deaths=deaths,
confirmed_cases=cases,
source=response.request.url + " | Corpo da notícia",
)
self.add_new_bulletin_to_report(bulletin, date)
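The spider first tries to read the state totals straight from the news body; the `if cases or deaths:` guard simply skips the news bulletin when neither pattern matches. A quick check of `REGEXP_CASES` against an invented sentence in the phrasing it targets:

```python
import re

# Same pattern as REGEXP_CASES above; the sample sentence is made up,
# following one of the phrasings the spider expects in the news body.
REGEXP_CASES = re.compile(
    "O número de infectados (?:passou )?(?:subiu )?(?:para )?"
    "(?:permanece.? )?(?:em )?([0-9.]+) em todo o estado"
)

text = "O número de infectados subiu para 120.569 em todo o estado."
cases, *_ = REGEXP_CASES.findall(text) or [None]
print(cases)  # 120.569
```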
Binary file added tests/data/AC/TO-2022-01-30.pdf
Binary file not shown.
Binary file added tests/data/AC/TO-2022-02-12.pdf
Binary file not shown.
24 changes: 24 additions & 0 deletions tests/data/AC/TO-2022-02-25.csv
@@ -0,0 +1,24 @@
municipio,confirmados,mortes
Acrelândia,2.700,39
Assis Brasil,1.995,25
Brasileia,3.574,46
Bujari,1.588,17
Capixaba,848,17
Cruzeiro do Sul,10.724,184
Epitaciolândia,1.850,38
Feijó,3.747,67
Jordão,722,2
Mâncio Lima,3.070,36
Manoel Urbano,995,17
M. Thaumaturgo,1.613,13
Plácido de Castro,2.624,25
Porto Acre,2.044,42
Porto Walter,558,7
Rio Branco,60.653,1.161
Rodrigues Alves,1.040,14
Santa Rosa do Purus,1.164,7
Sena Madureira,6.995,86
Senador Guiomard,1.703,43
Tarauacá,7.001,50
Xapuri,3.361,33
TOTAL,120.569,1.969
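The expected CSV keeps the Brazilian-style thousands separator used in the bulletin (e.g. `120.569`). The `parse_int` helper defined in `acre.py` above handles exactly this format; a standalone copy as a quick check:

```python
# Standalone copy of parse_int from acre.py: drops the Brazilian
# thousands separator ('.') before converting to int.
def parse_int(value):
    return int(value.replace(".", ""))

print(parse_int("120.569"))  # 120569
print(parse_int("1.969"))    # 1969
```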
Binary file added tests/data/AC/TO-2022-02-25.pdf
Binary file not shown.