[Feat] AC spider #223

Open · wants to merge 5 commits into master
34 changes: 13 additions & 21 deletions covid19br/common/models/full_report.py
@@ -127,26 +127,20 @@ def add_new_bulletin(self, bulletin: BulletinModel):
self._auto_calculated_total.increase_deaths(bulletin.deaths)

def check_total_death_cases(self) -> bool:
if not self._official_total_bulletins:
return False
auto_calculated_deaths = self._auto_calculated_total.deaths
return all(
[
auto_calculated_deaths == official_bulletin.deaths
for official_bulletin in self._official_total_bulletins
]
)
death_cases_values = set()
if self._auto_calculated_total.has_deaths:
death_cases_values.add(self._auto_calculated_total.deaths)
for official_bulletin in self._official_total_bulletins:
death_cases_values.add(official_bulletin.deaths)
return len(death_cases_values) == 1

def check_total_confirmed_cases(self) -> bool:
if not self._official_total_bulletins:
return False
auto_calculated_cases = self._auto_calculated_total.confirmed_cases
return all(
[
auto_calculated_cases == official_bulletin.confirmed_cases
for official_bulletin in self._official_total_bulletins
]
)
confirmed_cases_values = set()
if self._auto_calculated_total.has_confirmed_cases:
confirmed_cases_values.add(self._auto_calculated_total.confirmed_cases)
for official_bulletin in self._official_total_bulletins:
confirmed_cases_values.add(official_bulletin.confirmed_cases)
return len(confirmed_cases_values) == 1

def to_csv_rows(self):
rows = []
@@ -357,9 +351,7 @@ def _auto_detect_warnings(self):
"são apenas a soma automática dos dados dos municípios."
),
)
elif not self._auto_calculated_total.is_empty and (
not self.check_total_confirmed_cases() or not self.check_total_death_cases()
):
if not self.check_total_confirmed_cases() or not self.check_total_death_cases():
sources_data = "\n".join(
[
f"Fonte {{ {' | '.join(bulletin.sources)} }}: "
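The rewritten checks treat agreement as "exactly one distinct value across every source that reported one". A minimal standalone sketch of the idea (the function and numbers are illustrative, not the project's models):

```python
# Illustrative set-based consensus check, mirroring the technique in
# check_total_death_cases / check_total_confirmed_cases above.
def all_sources_agree(auto_total, official_totals):
    values = set()
    if auto_total is not None:  # only count the auto total when it has data
        values.add(auto_total)
    values.update(official_totals)
    # Exactly one distinct value: every source that reported a number
    # reported the same one. An empty set (no data at all) fails too.
    return len(values) == 1

assert all_sources_agree(1969, [1969, 1969])      # everyone agrees
assert not all_sources_agree(1969, [1969, 1970])  # one source diverges
assert not all_sources_agree(None, [])            # no data at all
```

Note one behavioral change: the old version returned False whenever there were no official bulletins, while the set-based version passes when the auto-calculated total is the only value available.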
98 changes: 98 additions & 0 deletions covid19br/parsers/acre.py
@@ -0,0 +1,98 @@
import re
from rows.plugins import pdf

from covid19br.common.constants import State
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.parsers.extractor_utils import match_object_from_regexp, is_only_number

REGEXP_DATE = re.compile("([0-9]+) de (.+) de ([0-9]{4})$")
CITY_NAME_TABLE_COLUMN = 0
CONFIRMED_CASES_TABLE_COLUMN = 2
DEATH_CASES_TABLE_COLUMN = 4


def parse_int(value):
return int(value.replace(".", ""))


class AcreBulletinExtractor:
state = State.AC

def __init__(self, filename):
self.doc = pdf.PyMuPDFBackend(filename)

@property
def date(self):
first_page_objects = next(
self.doc.text_objects(
starts_after=re.compile("BOLETIM(.+)"),
ends_before=re.compile("SITUAÇÃO ATUAL(.+)"),
)
)
date_obj, *_ = match_object_from_regexp(REGEXP_DATE, first_page_objects) or [
None
]
if date_obj:
return NormalizationUtils.extract_in_full_date(" ".join(date_obj))

@property
def official_total(self):
first_page_objects = next(
self.doc.text_objects(
starts_after=re.compile("SITUAÇÃO ATUAL(.+)"),
ends_before=re.compile("DISTRIBUIÇÃO DOS CASOS(.+)"),
)
)

# Unfortunately the text labels are images, which makes it difficult for us to get the numbers based on them.
# So we infer which values we need based on their position (sometimes there are "ghost objects"
# on the page, but they are on the far left and won't interfere with this logic).
remaining_number_objs = [
obj for obj in first_page_objects if is_only_number(obj.text)
]
# We start by sorting the objects by x and dropping the last 2 on the right (the small numbers on the bulletin)
ordered_by_x_axis = sorted(remaining_number_objs, key=lambda obj: obj.x0)
remaining_number_objs = ordered_by_x_axis[:-2]
# Of the 3 numbers on the far right, the death count is the bottom-most one
*_, death_cases_obj = sorted(
remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0)
)
remaining_number_objs = remaining_number_objs[:-3]
# Of the 3 right-most numbers remaining (the middle column), the confirmed count is the middle one
_, confirmed_cases_obj, _ = sorted(
remaining_number_objs[-3:], key=lambda obj: (obj.y0, obj.x0)
)

return {"confirmados": confirmed_cases_obj.text, "mortes": death_cases_obj.text}

@property
def data(self):
table_page_number = self._get_table_page_number()
if not table_page_number:
return None
page_objs = next(self.doc.text_objects(
starts_after=re.compile(".+DISTRIBUIÇÃO DOS CASOS CONFIRMADOS.+"),
ends_before=re.compile("Fonte:.+"),
page_numbers=(table_page_number,),
))

# remove headers
city_column_header = next(obj for obj in page_objs if "munic" in obj.text.lower())
table_objs = [obj for obj in page_objs if obj.y0 > city_column_header.y1]

lines = pdf.group_objects("y", table_objs, check_group=pdf.object_contains_center)
for line in lines:
city = line[CITY_NAME_TABLE_COLUMN].text.strip()
deaths = line[DEATH_CASES_TABLE_COLUMN].text.strip()
confirmed = line[CONFIRMED_CASES_TABLE_COLUMN].text.strip()
yield {
"municipio": city,
"confirmados": confirmed,
"mortes": deaths,
}

def _get_table_page_number(self):
for page_number, page_objs in enumerate(self.doc.text_objects(), start=1):
for obj in page_objs:
if "TABELA" in obj.text and "DISTRIBUIÇÃO DOS CASOS CONFIRMADOS" in obj.text:
return page_number
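Because the panel labels are images, `official_total` selects numbers purely by coordinates. A standalone sketch of that positional selection with made-up coordinates, using a namedtuple in place of the PDF text objects (and skipping the step that drops the two small right-most numbers):

```python
from collections import namedtuple

# Stand-in for the rows/PyMuPDF text objects; coordinates are invented.
# In PyMuPDF, y grows downward, so a larger y0 means lower on the page.
TextObj = namedtuple("TextObj", "text x0 y0")

objs = [
    TextObj("118.000", x0=300, y0=100),  # middle column, top
    TextObj("120.569", x0=300, y0=150),  # middle column, middle -> confirmed
    TextObj("1.600", x0=300, y0=200),    # middle column, bottom
    TextObj("10", x0=500, y0=100),       # right column, top
    TextObj("20", x0=500, y0=150),       # right column, middle
    TextObj("1.969", x0=500, y0=200),    # right column, bottom -> deaths
]

ordered = sorted(objs, key=lambda obj: obj.x0)  # left to right
# Bottom-most number of the right column:
*_, deaths = sorted(ordered[-3:], key=lambda obj: (obj.y0, obj.x0))
# Middle number of the middle column:
_, confirmed, _ = sorted(ordered[-6:-3], key=lambda obj: (obj.y0, obj.x0))
print(confirmed.text, deaths.text)  # 120.569 1.969
```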
13 changes: 13 additions & 0 deletions covid19br/parsers/extractor_utils.py
@@ -0,0 +1,13 @@
import re


def match_object_from_regexp(regexp, objects):
"""Return the matching result for"""
for obj in objects:
result = regexp.findall(obj.text)
if result:
return result


def is_only_number(value):
return re.compile("^([0-9.,]+)$").findall(value.strip())
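Possible usage of these helpers, with an illustrative stand-in for the PDF text objects the parsers pass in:

```python
import re
from collections import namedtuple

from covid19br.parsers.extractor_utils import (
    is_only_number,
    match_object_from_regexp,
)

# Minimal stand-in: the helpers only rely on a `.text` attribute.
TextObj = namedtuple("TextObj", "text")

REGEXP_DATE = re.compile("([0-9]+) de (.+) de ([0-9]{4})$")
objs = [TextObj("BOLETIM"), TextObj("25 de fevereiro de 2022")]

print(match_object_from_regexp(REGEXP_DATE, objs))
# [('25', 'fevereiro', '2022')]

print(bool(is_only_number("1.969")))   # True  (separators allowed)
print(bool(is_only_number("Fonte:")))  # False
```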
9 changes: 1 addition & 8 deletions covid19br/parsers/tocantins.py
@@ -8,19 +8,12 @@
from covid19br.common.constants import State
from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.common.demographic_utils import DemographicUtils
from covid19br.parsers.extractor_utils import match_object_from_regexp

REGEXP_DAY_MONTH = re.compile("([0-9]+) de (.+)$")
REGEXP_YEAR = re.compile("^de ([0-9]{4})$")


def match_object_from_regexp(regexp, objects):
"""Return the matching result for"""
for obj in objects:
result = regexp.findall(obj.text)
if result:
return result


def parse_int(value):
return int(value.replace(".", ""))

2 changes: 2 additions & 0 deletions covid19br/run_spider.py
@@ -9,6 +9,7 @@
sys.path[0] = "/".join(sys.path[0].split("/")[:-1])

from covid19br.common.data_normalization_utils import NormalizationUtils
from covid19br.spiders.spider_ac import SpiderAC
from covid19br.spiders.spider_ba import SpiderBA
from covid19br.spiders.spider_ce import SpiderCE
from covid19br.spiders.spider_pr import SpiderPR
@@ -17,6 +18,7 @@

# TODO: automatically retrieve spiders that extend the base class
AVAILABLE_SPIDERS = [
SpiderAC,
SpiderBA,
SpiderCE,
SpiderPR,
150 changes: 150 additions & 0 deletions covid19br/spiders/spider_ac.py
@@ -0,0 +1,150 @@
import re
import scrapy
import tempfile

from covid19br.common.base_spider import BaseCovid19Spider
from covid19br.common.constants import State, ReportQuality
from covid19br.common.models.bulletin_models import (
StateTotalBulletinModel,
CountyBulletinModel,
)
from covid19br.parsers.acre import AcreBulletinExtractor

REGEXP_CASES = re.compile(
"O número de infectados (?:passou )?(?:subiu )?(?:para )?(?:permanece.? )?(?:em )?([0-9.]+) em todo o estado"
)
REGEXP_DEATHS = re.compile(
"o número oficial de mortes por covid-19 (?:.+) ([0-9.]+)[,]? em todo o estado"
)


class SpiderAC(BaseCovid19Spider):
state = State.AC
name = State.AC.value
information_delay_in_days = 0
report_qualities = [ReportQuality.COUNTY_BULLETINS]

news_base_url = "https://agencia.ac.gov.br/"
news_query_params = "?s=covid19"

def pre_init(self):
self.requested_dates = list(self.requested_dates)

def start_requests(self):
yield scrapy.Request(self.news_base_url + self.news_query_params)

def parse(self, response, **kwargs):
news_per_date = {}
news_divs = response.xpath("//header[@class='entry-header']")
for div in news_divs:
url = div.xpath(".//h2[@class='entry-title']//a/@href").get()
if "boletim-sesacre" in url:
raw_date = (
div.xpath(".//span[@class='date-post']//text()[2]").get().strip()
)
date = self.normalizer.extract_numeric_date(raw_date)
news_per_date[date] = url

for date in news_per_date:
if date in self.requested_dates:
yield scrapy.Request(
news_per_date[date],
callback=self.parse_news_bulletin,
cb_kwargs={"date": date},
)

# handle pagination
if news_per_date and self.start_date < min(news_per_date):
last_page_number = 1
last_page_url = response.request.url
if "page" in last_page_url:
url, *_query_params = last_page_url.split("?")
if url[-1] == "/":
url = url[:-1]
*_url_path, last_page_number = url.split("/")
last_page_number = self.normalizer.ensure_integer(last_page_number)
next_page_number = last_page_number + 1
next_page_url = (
f"{self.news_base_url}page/{next_page_number}/{self.news_query_params}"
)
yield scrapy.Request(next_page_url, callback=self.parse)

def parse_news_bulletin(self, response, date):
self._extract_cases_and_deaths_from_news(response, date)

pdf_url = response.xpath(
"//div[@class='entry-content']//a[contains(@href, '.pdf') and contains(@href, 'BOLETIM')]/@href"
).get()
yield scrapy.Request(
pdf_url, callback=self.parse_pdf_bulletin, cb_kwargs={"date": date}
)

def parse_pdf_bulletin(self, response, date):
source = response.request.url
with tempfile.NamedTemporaryFile(mode="wb", suffix=".pdf") as tmp:
tmp.write(response.body)

extractor = AcreBulletinExtractor(tmp.name)

pdf_date = extractor.date
if pdf_date and pdf_date != date:
self.logger.warning(
f"PDF date does not match for pdf {source}. Aborting extraction."
)
return

pdf_official_total = extractor.official_total
if pdf_official_total:
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
confirmed_cases=pdf_official_total["confirmados"],
deaths=pdf_official_total["mortes"],
source=response.request.url + " | Painel na primeira pag. do pdf.",
)
self.add_new_bulletin_to_report(bulletin, date)

pdf_data = list(extractor.data)
if not pdf_data:
if "parcial" not in source.lower():
self.logger.error(
f"Couldn't extract data from pdf that is not parcial. Pdf source: {source}."
)
return

for row in pdf_data:
if row["municipio"].lower() == "total":
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
confirmed_cases=row["confirmados"],
deaths=row["mortes"],
source=response.request.url
+ " | Tabela com dados dos municípios do pdf.",
)
else:
bulletin = CountyBulletinModel(
date=date,
state=self.state,
city=row["municipio"],
confirmed_cases=row["confirmados"],
deaths=row["mortes"],
source=response.request.url,
)
self.add_new_bulletin_to_report(bulletin, date)

def _extract_cases_and_deaths_from_news(self, response, date):
body_text = " ".join(
response.xpath("//div[@class='entry-content']//p//text()").extract()
)
cases, *_other_matches = REGEXP_CASES.findall(body_text) or [None]
deaths, *_other_matches = REGEXP_DEATHS.findall(body_text) or [None]
if cases or deaths:
bulletin = StateTotalBulletinModel(
date=date,
state=self.state,
deaths=deaths,
confirmed_cases=cases,
source=response.request.url + " | Corpo da notícia",
)
self.add_new_bulletin_to_report(bulletin, date)
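The spider first tries to read the state totals straight from the news body; the `if cases or deaths:` guard simply skips the news bulletin when neither pattern matches. A quick check of `REGEXP_CASES` against an invented sentence in the phrasing it targets:

```python
import re

# Same pattern as REGEXP_CASES above; the sample sentence is made up,
# following one of the phrasings the spider expects in the news body.
REGEXP_CASES = re.compile(
    "O número de infectados (?:passou )?(?:subiu )?(?:para )?"
    "(?:permanece.? )?(?:em )?([0-9.]+) em todo o estado"
)

text = "O número de infectados subiu para 120.569 em todo o estado."
cases, *_ = REGEXP_CASES.findall(text) or [None]
print(cases)  # 120.569
```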
Binary file added tests/data/AC/TO-2022-01-30.pdf
Binary file not shown.
Binary file added tests/data/AC/TO-2022-02-12.pdf
Binary file not shown.
24 changes: 24 additions & 0 deletions tests/data/AC/TO-2022-02-25.csv
@@ -0,0 +1,24 @@
municipio,confirmados,mortes
Acrelândia,2.700,39
Assis Brasil,1.995,25
Brasileia,3.574,46
Bujari,1.588,17
Capixaba,848,17
Cruzeiro do Sul,10.724,184
Epitaciolândia,1.850,38
Feijó,3.747,67
Jordão,722,2
Mâncio Lima,3.070,36
Manoel Urbano,995,17
M. Thaumaturgo,1.613,13
Plácido de Castro,2.624,25
Porto Acre,2.044,42
Porto Walter,558,7
Rio Branco,60.653,1.161
Rodrigues Alves,1.040,14
Santa Rosa do Purus,1.164,7
Sena Madureira,6.995,86
Senador Guiomard,1.703,43
Tarauacá,7.001,50
Xapuri,3.361,33
TOTAL,120.569,1.969
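The expected CSV keeps the Brazilian-style thousands separator used in the bulletin (e.g. `120.569`). The `parse_int` helper defined in `acre.py` above handles exactly this format; a standalone copy as a quick check:

```python
# Standalone copy of parse_int from acre.py: drops the Brazilian
# thousands separator ('.') before converting to int.
def parse_int(value):
    return int(value.replace(".", ""))

print(parse_int("120.569"))  # 120569
print(parse_int("1.969"))    # 1969
```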
Binary file added tests/data/AC/TO-2022-02-25.pdf
Binary file not shown.