From c1951251c54927ae130c9c790340a377306ffc8a Mon Sep 17 00:00:00 2001 From: KonstantAnxiety Date: Fri, 20 Dec 2024 16:25:34 +0300 Subject: [PATCH] feat: BI-5952 WIP file-uploader-api handler for processing presigned file from bucket --- .../dl_file_uploader_api_lib/schemas/files.py | 5 ++ .../dl_file_uploader_api_lib/views/files.py | 51 ++++++++++++++++++- .../db/test_files_api.py | 12 ++++- .../ext/test_update_data_gsheets.py | 2 +- .../ext/test_update_data_yadocs.py | 2 +- .../ext/test_yadocs.py | 2 +- lib/dl_s3/dl_s3/utils.py | 21 ++++++++ lib/dl_testing/dl_testing/s3_utils.py | 13 ----- 8 files changed, 90 insertions(+), 18 deletions(-) create mode 100644 lib/dl_s3/dl_s3/utils.py diff --git a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib/schemas/files.py b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib/schemas/files.py index a60f731f6..179ae198e 100644 --- a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib/schemas/files.py +++ b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib/schemas/files.py @@ -76,6 +76,11 @@ class MakePresignedUrlRequestSchema(ma.Schema): content_md5 = ma.fields.String(required=True) +class DownloadPresignedUrlRequestSchema(ma.Schema): + filename = ma.fields.String(required=True) + key = ma.fields.String(required=True) + + class PresignedUrlSchema(ma.Schema): class PresignedUrlFields(ma.Schema): class Meta: diff --git a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib/views/files.py b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib/views/files.py index eeea0c7c0..8932e50d4 100644 --- a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib/views/files.py +++ b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib/views/files.py @@ -13,6 +13,7 @@ from aiohttp import web from aiohttp.multipart import BodyPartReader +from dl_s3.utils import s3_file_exists from redis.asyncio.lock import Lock as RedisLock from dl_api_commons.aiohttp.aiohttp_wrappers import ( @@ -53,6 +54,8 @@ LOGGER = logging.getLogger(__name__) +S3_KEY_PARTS_SEPARATOR = "--" # used to separate author user_id from the rest of the s3 object key to sign it + def get_file_type_from_name( filename: Optional[str], @@ -136,7 +139,10 @@ async def post(self) -> web.StreamResponse: content_md5: str = req_data["content_md5"] s3 = self.dl_request.get_s3_service() - s3_key = "{}_{}".format(self.dl_request.rci.user_id or "unknown", str(uuid.uuid4())) + s3_key = S3_KEY_PARTS_SEPARATOR.join(( + self.dl_request.rci.user_id or "unknown", + str(uuid.uuid4()), + )) url = await s3.client.generate_presigned_post( Bucket=s3.tmp_bucket_name, @@ -154,6 +160,49 @@ async def post(self) -> web.StreamResponse: ) +class DownloadPresignedUrlView(FileUploaderBaseView): + async def post(self) -> web.StreamResponse: + req_data = await self._load_post_request_schema_data(files_schemas.DownloadPresignedUrlRequestSchema) + filename: str = req_data["filename"] + key: str = req_data["key"] + + file_type = get_file_type_from_name(filename=filename, allow_xlsx=self.request.app["ALLOW_XLSX"]) + + s3 = self.dl_request.get_s3_service() + + file_exists = await s3_file_exists(s3.client, s3.tmp_bucket_name, key) + if not file_exists: + raise exc.DocumentNotFound() + + user_id_from_key = key.split(S3_KEY_PARTS_SEPARATOR)[0] + if user_id_from_key != self.dl_request.rci.user_id: + exc.PermissionDenied() + + rmm = self.dl_request.get_redis_model_manager() + dfile = DataFile( + manager=rmm, + filename=filename, + file_type=file_type, + status=FileProcessingStatus.in_progress, + ) + LOGGER.info(f"Data file id: {dfile.id}") + + await dfile.save() + + task_processor = self.dl_request.get_task_processor() + if file_type == FileType.xlsx: + await task_processor.schedule(ProcessExcelTask(file_id=dfile.id)) + LOGGER.info(f"Scheduled ProcessExcelTask for file_id {dfile.id}") + else: + await task_processor.schedule(ParseFileTask(file_id=dfile.id)) + LOGGER.info(f"Scheduled ParseFileTask for file_id {dfile.id}") + + return web.json_response( + files_schemas.FileUploadResponseSchema().dump({"file_id": dfile.id, "title": dfile.filename}), + status=HTTPStatus.CREATED, + ) + + class LinksView(FileUploaderBaseView): REQUIRED_RESOURCES: ClassVar[frozenset[RequiredResource]] = frozenset() # Don't skip CSRF check diff --git a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/db/test_files_api.py b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/db/test_files_api.py index 81b2882a6..0ee5740eb 100644 --- a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/db/test_files_api.py +++ b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/db/test_files_api.py @@ -4,6 +4,7 @@ import uuid import attr +from dl_file_uploader_api_lib.views.files import S3_KEY_PARTS_SEPARATOR import pytest from dl_api_commons.base_models import RequestContextInfo @@ -28,7 +29,7 @@ async def test_file_upload_cors(fu_client, s3_tmp_bucket, upload_file_req): @pytest.mark.asyncio -async def test_make_presigned_url(fu_client, s3_tmp_bucket): +async def test_make_presigned_url(fu_client, s3_tmp_bucket, rci): expected_url_fields = ("key", "x-amz-algorithm", "x-amz-credential", "x-amz-date", "policy", "x-amz-signature") resp = await fu_client.make_request(ReqBuilder.presigned_url("mymd5")) @@ -36,6 +37,15 @@ async def test_make_presigned_url(fu_client, s3_tmp_bucket): assert "url" in resp.json, resp.json assert "fields" in resp.json, resp.json assert all(field in resp.json["fields"] for field in expected_url_fields), resp.json + key = resp.json["fields"]["key"] + key_parts = key.split(S3_KEY_PARTS_SEPARATOR) + assert len(key_parts) == 2, key_parts + assert key_parts[0] == rci.user_id + + +@pytest.mark.asyncio +async def test_download_presigned_url(fu_client, s3_tmp_bucket, rci): + # TODO test @pytest.mark.asyncio diff --git a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/ext/test_update_data_gsheets.py b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/ext/test_update_data_gsheets.py index 7b1d799b9..e1d59880e 100644 --- a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/ext/test_update_data_gsheets.py +++ b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/ext/test_update_data_gsheets.py @@ -15,7 +15,7 @@ from dl_core_testing.connection import make_conn_key from dl_file_uploader_api_lib_tests.req_builder import ReqBuilder from dl_file_uploader_lib import exc -from dl_testing.s3_utils import s3_file_exists +from dl_s3.utils import s3_file_exists from dl_connector_bundle_chs3.chs3_gsheets.core.constants import CONNECTION_TYPE_GSHEETS_V2 from dl_connector_bundle_chs3.chs3_gsheets.core.lifecycle import GSheetsFileS3ConnectionLifecycleManager diff --git a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/ext/test_update_data_yadocs.py b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/ext/test_update_data_yadocs.py index 98ea488f5..cce64cfd2 100644 --- a/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/ext/test_update_data_yadocs.py +++ b/lib/dl_file_uploader_api_lib/dl_file_uploader_api_lib_tests/ext/test_update_data_yadocs.py @@ -15,7 +15,7 @@ from dl_core_testing.connection import make_conn_key from dl_file_uploader_api_lib_tests.req_builder import ReqBuilder from dl_file_uploader_lib import exc -from dl_testing.s3_utils import s3_file_exists +from dl_s3.utils import s3_file_exists from dl_connector_bundle_chs3.chs3_yadocs.core.constants import CONNECTION_TYPE_YADOCS from dl_connector_bundle_chs3.chs3_yadocs.core.lifecycle import YaDocsFileS3ConnectionLifecycleManager diff --git a/lib/dl_file_uploader_worker_lib/dl_file_uploader_worker_lib_tests/ext/test_yadocs.py b/lib/dl_file_uploader_worker_lib/dl_file_uploader_worker_lib_tests/ext/test_yadocs.py index 934e784a6..6f6ad5fcc 100644 --- a/lib/dl_file_uploader_worker_lib/dl_file_uploader_worker_lib_tests/ext/test_yadocs.py +++ b/lib/dl_file_uploader_worker_lib/dl_file_uploader_worker_lib_tests/ext/test_yadocs.py @@ -8,7 +8,7 @@ ) from dl_file_uploader_task_interface.tasks import DownloadYaDocsTask from dl_task_processor.state import wait_task -from dl_testing.s3_utils import s3_file_exists +from dl_s3.utils import s3_file_exists @pytest.mark.asyncio diff --git a/lib/dl_s3/dl_s3/utils.py b/lib/dl_s3/dl_s3/utils.py new file mode 100644 index 000000000..9f06913eb --- /dev/null +++ b/lib/dl_s3/dl_s3/utils.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +from typing import TYPE_CHECKING + +import botocore.exceptions + +if TYPE_CHECKING: + from types_aiobotocore_s3 import S3Client as AsyncS3Client + + +async def s3_file_exists(s3_client: AsyncS3Client, bucket: str, key: str) -> bool: + try: + s3_resp = await s3_client.head_object( + Bucket=bucket, + Key=key, + ) + except botocore.exceptions.ClientError as ex: + if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404: + return False + raise + return s3_resp["ResponseMetadata"]["HTTPStatusCode"] == 200 diff --git a/lib/dl_testing/dl_testing/s3_utils.py b/lib/dl_testing/dl_testing/s3_utils.py index 6cac0791f..f3327ae04 100644 --- a/lib/dl_testing/dl_testing/s3_utils.py +++ b/lib/dl_testing/dl_testing/s3_utils.py @@ -83,19 +83,6 @@ async def get_lc_rules_number(s3_client: AsyncS3Client, bucket: str) -> int: return len(lc_config["Rules"]) -async def s3_file_exists(s3_client: AsyncS3Client, bucket: str, key: str) -> bool: - try: - s3_resp = await s3_client.head_object( - Bucket=bucket, - Key=key, - ) - except botocore.exceptions.ClientError as ex: - if ex.response["ResponseMetadata"]["HTTPStatusCode"] == 404: - return False - raise - return s3_resp["ResponseMetadata"]["HTTPStatusCode"] == 200 - - S3_TBL_FUNC_TEMPLATE = """s3( '{s3_endpoint}/{bucket}/{filename}', '{key_id}',