Skip to content
This repository has been archived by the owner on Oct 2, 2023. It is now read-only.

Mantém arquivos para a extração local #412

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .github/workflows/cicd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ jobs:
DJANGO_SETTINGS_MODULE: "web.settings"
DJANGO_CONFIGURATION: "Test"
DATABASE_URL: "postgres://postgres:postgres@localhost:5432/mariaquiteria"
DATA_DIR: ${{ github.workspace }}
run: |
python manage.py collectstatic
pytest
Expand Down
33 changes: 19 additions & 14 deletions web/datasets/services.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import boto3
import requests
from django.conf import settings


class S3Client:
Expand All @@ -23,17 +24,18 @@ def _upload_to_s3(self, temp_file_path, bucket_file_path):
)

def upload_file(self, location_or_url, relative_file_path, prefix=""):
bucket_file_path = f"{self.bucket_folder}/files/{relative_file_path}"

location = Path(location_or_url)
if not location.exists():
# se não é um arquivo local, assumimos que é uma url
file_name, temp_file_path = self.create_temp_file(
location_or_url, relative_file_path, prefix
location_or_url, bucket_file_path, prefix
)
else:
file_name, temp_file_path = location.name, str(location)
file_name, temp_file_path = location.name, str(location.absolute())

bucket_file_path = f"{self.bucket_folder}/files/{relative_file_path}"
bucket_file_path = f"{bucket_file_path}{file_name}"
bucket_file_path = f"{bucket_file_path}/{file_name}"
url = (
f"https://{self.bucket}.s3.{self.bucket_region}.amazonaws.com/"
f"{bucket_file_path}"
Expand All @@ -45,27 +47,26 @@ def upload_file(self, location_or_url, relative_file_path, prefix=""):

@staticmethod
def create_temp_file(url, relative_file_path="", prefix=""):
    """Download `url` into the local data directory and return its location.

    The file is stored under ``settings.DATA_DIR / relative_file_path``
    (the directory is created on demand) and keeps the last, still
    URL-encoded, path segment of `url` as its name, optionally prefixed
    with ``prefix-``.

    Returns a ``(file_name, absolute_path_str)`` tuple.
    """
    temporary_directory = Path(settings.DATA_DIR) / relative_file_path
    temporary_directory.mkdir(parents=True, exist_ok=True)

    # NOTE(review): no raise_for_status() here — an HTTP error page would be
    # written to disk as-is. Confirm whether that best-effort behavior is
    # intended before changing it.
    response = requests.get(url)
    temp_file_name = url[url.rfind("/") + 1 :]
    if prefix:
        temp_file_name = f"{prefix}-{temp_file_name}"
    temp_file_path = temporary_directory / temp_file_name

    temp_file_path.write_bytes(response.content)
    return temp_file_name, str(temp_file_path.absolute())

def download_file(self, s3_file_path):
temporary_directory = f"{Path.cwd()}/data/tmp/"
Path(temporary_directory).mkdir(parents=True, exist_ok=True)
Path(settings.DATA_DIR).mkdir(parents=True, exist_ok=True)

start_index = s3_file_path.rfind("/") + 1
file_name = s3_file_path[start_index:]

local_path = f"{temporary_directory}{file_name}"
local_path = Path(settings.DATA_DIR) / file_name
with open(local_path, "wb") as file_:
self.client.download_fileobj(self.bucket, s3_file_path, file_)

Expand All @@ -81,7 +82,11 @@ def _upload_to_s3(self, temp_file_path, bucket_file_path):
pass

def download_file(self, s3_file_path):
    """Fake download: return the local path the real client would produce.

    Nothing is fetched from S3 — the path is simply derived from
    ``settings.DATA_DIR``, mirroring the real client's return value.
    """
    return f"{settings.DATA_DIR}/{s3_file_path}"

@staticmethod
def delete_temp_file(temp_file_path):
    """No-op counterpart of the real client's temp-file cleanup (fake client)."""
    pass


def get_s3_client(settings):
Expand Down
9 changes: 4 additions & 5 deletions web/datasets/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class WebserviceException(Exception):


@shared_task
def content_from_file(file_pk=None, path=None, keep_file=True):
def content_from_file(file_pk=None, path=None):
if not any([file_pk, path]):
raise Exception("Ou `file_pk` ou `path` devem ser informados.")

Expand All @@ -45,16 +45,15 @@ def content_from_file(file_pk=None, path=None, keep_file=True):
return a_file.content

path = client.download_file(a_file.s3_file_path)
keep_file = False

if not Path(path).exists():
info(f"Arquivo {path} não encontrado.")
return

raw = parser.from_file(path)

if not keep_file:
Path(path).unlink()
if raw is not None:
Path(path).unlink(missing_ok=True)

if a_file:
a_file.content = raw["content"] or ""
Expand All @@ -79,7 +78,7 @@ def backup_file(file_id):
model_name = file_obj.content_object._meta.model_name
relative_file_path = (
f"{model_name}/{file_obj.created_at.year}/"
f"{file_obj.created_at.month}/{file_obj.created_at.day}/"
f"{file_obj.created_at.month}/{file_obj.created_at.day}"
)

location = file_obj.local_path or file_obj.url
Expand Down
42 changes: 18 additions & 24 deletions web/datasets/tests/test_services.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
import os
from pathlib import Path

from django.conf import settings
Expand All @@ -9,18 +8,19 @@

class TestS3Client:
def test_upload_file(self):
    """Uploading from a URL builds the bucket path/URL and keeps the local copy."""
    relative_path = "TestModel/2020/10/23"
    s3_url, bucket_file_path = client.upload_file(
        "https://www.google.com/robots.txt", relative_path
    )

    expected_file_path = f"maria-quiteria-local/files/{relative_path}/robots.txt"
    expected_s3_url = f"https://teste.s3.brasil.amazonaws.com/{bucket_file_path}"
    real_path = Path(settings.DATA_DIR) / expected_file_path

    assert s3_url == expected_s3_url
    assert bucket_file_path == expected_file_path
    # The downloaded file must survive the upload (kept for local extraction).
    assert real_path.exists()
    real_path.unlink()  # cleanup; note: skipped if an assert above fails

def test_create_temp_file(self):
url = (
Expand All @@ -31,9 +31,7 @@ def test_create_temp_file(self):

assert temp_file_name == "4924SUSPENS%C3%83O.pdf"
assert Path(temp_file_path).is_file() is True

client.delete_temp_file(temp_file_path)
assert Path(temp_file_path).is_file() is False
Path(temp_file_path).unlink()

def test_create_temp_file_with_prefix(self):
url = (
Expand All @@ -46,54 +44,50 @@ def test_create_temp_file_with_prefix(self):

assert temp_file_name == expected_file_name
assert Path(temp_file_path).is_file() is True

client.delete_temp_file(temp_file_path)
assert Path(temp_file_path).is_file() is False
Path(temp_file_path).unlink()

def test_create_temp_file_with_relative_file_path(self):
    """A relative path nests the temp file inside DATA_DIR and keeps it on disk."""
    url = (
        "http://www.feiradesantana.ba.gov.br/licitacoes/"
        "respostas/4924SUSPENS%C3%83O.pdf"
    )
    relative_file_path = "extra"
    temp_file_name, temp_file_path = client.create_temp_file(
        url, relative_file_path=relative_file_path
    )

    # The name stays URL-encoded, exactly as it appears in the URL.
    assert temp_file_name == "4924SUSPENS%C3%83O.pdf"
    assert Path(temp_file_path).is_file() is True
    Path(temp_file_path).unlink()  # cleanup; skipped if an assert fails

def test_download_file(self):
    """download_file returns the absolute path of the locally kept file."""
    relative_path = "TestModel/2020/10/23"
    s3_url, relative_file_path = client.upload_file(
        "https://www.google.com/robots.txt", relative_path
    )

    expected_file_path = f"maria-quiteria-local/files/{relative_path}/robots.txt"
    expected_s3_url = f"https://teste.s3.brasil.amazonaws.com/{expected_file_path}"
    real_path = Path(settings.DATA_DIR) / expected_file_path

    assert s3_url == expected_s3_url
    assert relative_file_path == expected_file_path
    # The uploaded file is kept locally so download_file can resolve it.
    assert real_path.exists()

    absolute_file_path = client.download_file(relative_file_path)

    assert absolute_file_path == str(real_path.absolute())
    real_path.unlink()  # cleanup; skipped if an assert fails

def test_upload_file_from_local_path(self):
    """Uploading an existing local file reuses it instead of downloading."""
    local_path = Path("conteudo.txt")
    local_path.write_text("Testando")
    relative_path = "TestModel/2021/06/23"
    s3_url, bucket_file_path = client.upload_file(str(local_path), relative_path)

    expected_file_path = f"maria-quiteria-local/files/{relative_path}/conteudo.txt"
    expected_s3_url = f"https://teste.s3.brasil.amazonaws.com/{bucket_file_path}"

    assert s3_url == expected_s3_url
    assert bucket_file_path == expected_file_path
    local_path.unlink()  # cleanup; skipped if an assert fails
2 changes: 2 additions & 0 deletions web/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,8 @@ class Common(Configuration):

ENABLE_NEW_RELIC = False

DATA_DIR = values.Value(default="/data", environ_prefix=None)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
DATA_DIR = values.Value(default="/data", environ_prefix=None)
DATA_DIR = values.PathValue(default="/data", environ_prefix=None)

Isso já verifica se o valor é um caminho de arquivo ou diretório válido, e se ele existe. Se quiser que ele não verifique se existe, dá para usar check_exists=False — e, no futuro, talvez esse PathValue vá retornar um pathlib.Path, o que deixaria o nosso código mais simples ainda : )

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nesse caso é melhor que não verifique porque o volume é criado depois que a imagem é buildada. :)



class Dev(Common):
DEBUG = True
Expand Down