Refactor and Standardize S3 Functionality #2008

Open
pt2302 opened this issue Oct 23, 2023 · 0 comments

pt2302 commented Oct 23, 2023

Description/Context

There are several places in the OCW Studio codebase where different functions that communicate with S3 perform closely related tasks. These functions end up duplicating functionality, and they also do things slightly differently from one another. Some examples are listed below:

def move_s3_object(from_path, to_path):
    """Move an S3 object from one path to another"""
    s3 = get_boto3_resource("s3")
    bucket = settings.AWS_STORAGE_BUCKET_NAME
    extra_args = {"ACL": "public-read"}
    s3.meta.client.copy(
        {"Bucket": bucket, "Key": from_path}, bucket, to_path, extra_args
    )
    s3.Object(bucket, from_path).delete()

@transaction.atomic
def rename_file(obj_text_id, obj_new_filename):
    """Rename the file on S3 associated with the WebsiteContent object to a new filename."""  # noqa: E501
    obj = WebsiteContent.objects.get(text_id=obj_text_id)
    site = obj.website
    df = DriveFile.objects.get(resource=obj)
    s3 = get_boto3_resource("s3")
    # slugify just the provided name and then make the extensions lowercase
    filepath = Path(obj_new_filename)
    new_filename = slugify(obj_new_filename.rstrip("".join(filepath.suffixes)))
    if filepath.suffixes:
        new_filename += "".join(filepath.suffixes).lower()
    df_path = df.s3_key.split("/")
    df_path[-1] = new_filename
    new_key = "/".join(df_path)
    # check if an object with the new filename already exists in this course
    existing_obj = WebsiteContent.objects.filter(Q(website=site) & Q(file=new_key))
    if existing_obj:
        old_obj = existing_obj.first()
        if old_obj == obj:
            msg = "New filename is the same as the existing filename."
            raise ValueError(msg)
        dependencies = old_obj.get_content_dependencies()
        if dependencies:
            raise ValueError(
                "Not renaming file due to dependencies in existing content: "
                + str(dependencies)
            )
        log.info("Found existing file with same name. Overwriting it.")
        old_obj.delete()
        backend = get_sync_backend(site)
        backend.sync_all_content_to_backend()
    old_key = df.s3_key
    df.s3_key = new_key
    obj.file = new_key
    obj.filename = get_dirpath_and_filename(new_filename)[1]
    df.save()
    obj.save()
    s3.Object(settings.AWS_STORAGE_BUCKET_NAME, new_key).copy_from(
        CopySource=settings.AWS_STORAGE_BUCKET_NAME + "/" + old_key
    )
    s3.Object(settings.AWS_STORAGE_BUCKET_NAME, old_key).delete()

"""S3 utility functions"""
import boto3
from django.conf import settings
def get_boto3_options(extra_options=None):
"""
Provides default boto3 options, connecting to Minio if the environment is dev
Args:
extra_options (dict): (Optional) Extra options to append
Returns:
dict: A dictionary of options to initialize an s3 resource or client with
""" # noqa: D401
options = {
"aws_access_key_id": settings.AWS_ACCESS_KEY_ID,
"aws_secret_access_key": settings.AWS_SECRET_ACCESS_KEY,
}
if settings.ENVIRONMENT == "dev":
options.update({"endpoint_url": "http://10.1.0.100:9000"})
if extra_options:
options.update(extra_options)
return options
def get_boto3_resource(service_type, extra_options=None):
"""
Provides an S3 resource
Args:
service_type (string): The AWS service_type to initialize the resource with
extra_options (dict): (Optional) Extra options to initialize the resource with
Returns:
s3.ServiceResource: An S3 resource
""" # noqa: D401
return boto3.resource(service_type, **get_boto3_options(extra_options))
def get_boto3_client(service_type, extra_options=None):
"""
Provides an S3 client
Args:
service_type (string): The AWS service_type to initialize the resource with
extra_options (dict): (Optional) Extra options to initialize the resource with
Returns:
s3.Client: An S3 client
""" # noqa: D401
return boto3.client(service_type, **get_boto3_options(extra_options))
def get_s3_object_and_read(obj, iteration=0):
"""
Attempts to read S3 data, and tries again up to MAX_S3_GET_ITERATIONS if it encounters an error.
This helps to prevent read timeout errors from stopping sync.
Args:
obj (s3.ObjectSummary): The S3 ObjectSummary we are trying to read
iteration (int): A number tracking how many times this function has been run
Returns:
bytes: The contents of a json file read from S3
""" # noqa: E501, D401
try:
return obj.get()["Body"].read()
except Exception: # pylint: disable=broad-except # noqa: BLE001
if iteration < settings.MAX_S3_GET_ITERATIONS:
return get_s3_object_and_read(obj, iteration + 1)
else:
raise

from django.conf import settings  # noqa: INP001
from django.core.management import BaseCommand
from django.db.models import Q

from content_sync.tasks import sync_website_content
from gdrive_sync.models import DriveFile
from main.s3_utils import get_boto3_client
from websites.constants import WEBSITE_SOURCE_STUDIO
from websites.models import Website, WebsiteContent


class Command(BaseCommand):
    """Moves nonvideo files mistakenly placed in a `Website.short_id` path to a `Website.name` path"""  # noqa: E501

    help = __doc__  # noqa: A003

    def handle(self, *args, **options):  # noqa: ARG002
        s3 = get_boto3_client("s3")
        for site in Website.objects.filter(source=WEBSITE_SOURCE_STUDIO).values(
            "uuid", "name", "short_id"
        ):
            if site["name"] != site["short_id"]:
                for drive_file in (
                    DriveFile.objects.exclude(video__isnull=False)
                    .filter(
                        Q(website__uuid=site["uuid"])
                        & Q(s3_key__contains=site["short_id"])
                    )
                    .iterator()
                ):
                    old_s3_key = drive_file.s3_key
                    new_s3_key = drive_file.s3_key.replace(
                        f'{drive_file.s3_prefix}/{site["short_id"]}',
                        f'{drive_file.s3_prefix}/{site["name"]}',
                        1,
                    )
                    if old_s3_key == new_s3_key:
                        continue
                    try:
                        self.stdout.write(f"Moving {old_s3_key} to {new_s3_key}")
                        s3.copy_object(
                            Bucket=settings.AWS_STORAGE_BUCKET_NAME,
                            CopySource=f"{settings.AWS_STORAGE_BUCKET_NAME}/{old_s3_key}",
                            Key=new_s3_key,
                            ACL="public-read",
                        )
                        s3.delete_object(
                            Bucket=settings.AWS_STORAGE_BUCKET_NAME,
                            Key=drive_file.s3_key,
                        )
                        drive_file.s3_key = new_s3_key
                        drive_file.save()
                        content = WebsiteContent.objects.filter(file=old_s3_key).first()
                        if content:
                            content.file = new_s3_key
                            content.save()
                    except Exception as exc:  # noqa: BLE001
                        self.stderr.write(
                            f"Error copying {old_s3_key} to {new_s3_key}: {exc!s}"
                        )
                sync_website_content.delay(site["name"])
        self.stdout.write("Finished moving s3 objects")

Plan/Design

  1. Review all instances of S3-related functions in the OCW Studio codebase and verify which ones duplicate behavior.
  2. Have a single, clean place to communicate with S3. A reasonable option may be main/s3_utils.py; a rough sketch of what this could look like follows this list.
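As a starting point, here is a minimal sketch of a consolidated helper. It assumes the existing get_boto3_resource helper and the AWS_STORAGE_BUCKET_NAME setting shown above; the exact signature (a keyword-only acl argument) is hypothetical, not a settled API.

# Hypothetical consolidated move helper for main/s3_utils.py -- a sketch, not a final design.
# It folds the copy-then-delete pattern used by move_s3_object, rename_file, and the
# management command above into one place, with the ACL passed explicitly instead of
# being hard-coded differently at each call site.
from django.conf import settings


def move_s3_object(from_key, to_key, *, acl="public-read"):
    """Copy an S3 object to a new key, then delete the original object."""
    s3 = get_boto3_resource("s3")  # existing helper in main/s3_utils.py
    bucket = settings.AWS_STORAGE_BUCKET_NAME
    # copy() accepts ExtraArgs as its fourth argument; pass None to skip them
    s3.meta.client.copy(
        {"Bucket": bucket, "Key": from_key},
        bucket,
        to_key,
        {"ACL": acl} if acl else None,
    )
    s3.Object(bucket, from_key).delete()

rename_file and the management command could then call this helper instead of issuing their own copy_from/copy_object and delete calls, so the bucket, ACL, and Minio-endpoint handling would all live in one module.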