
Mntsup 171 #5

Merged: 3 commits, merged on Nov 15, 2024.

Changes shown are from 1 commit.
4 changes: 2 additions & 2 deletions build.gradle

@@ -5,8 +5,8 @@ plugins {
 
 ext {
     base_img = 'open-source.docker.xenit.eu/oracle-python'
-    base_img_version = 'main-2.1.1'
-    img_version = '2.1.0'
+    base_img_version = 'main-2.1.2'
+    img_version = '2.2.0'
 }
 
 createDockerFile {
5 changes: 5 additions & 0 deletions src/main/docker/docker_root/requirements.txt

@@ -2,3 +2,8 @@ configargparse
 requests[socks]
 records
 humanfriendly
+python-dateutil
+pytimeparse2
+types-python-dateutil
+types-requests
+types-humanfriendly
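
Note: the types-* entries are stub-only packages with no runtime behavior. Together with the new "# type: ignore" comments in swarmclean.py they suggest the code is now checked with a static type checker such as mypy (an assumption; the PR does not say so). A minimal sketch of the pattern:

    # untyped third-party modules are silenced per-import
    import configargparse  # type: ignore
    import pytimeparse2  # type: ignore

    # these imports check cleanly because the types-python-dateutil,
    # types-requests and types-humanfriendly stub packages supply signatures
    import requests
    import humanfriendly
    from dateutil.parser import parse as parsedate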
65 changes: 43 additions & 22 deletions src/main/docker/docker_root/swarmclean.py

@@ -1,22 +1,23 @@
 #!/usr/bin/python3
-import configargparse
+import configargparse # type: ignore
 import requests
 import re
 import logging
 import time
 import sys
 import socket
-from datetime import datetime
+from datetime import datetime, timezone
 import os
 import random
 import urllib.parse
-import records
-from dataclasses import dataclass, asdict
+import records # type: ignore
 import humanfriendly
 import binascii
+from dateutil.parser import parse as parsedate
+import pytimeparse2 # type: ignore
+from attr import define, field
 
 sys.path.insert(0,sys.path[0]+'/castorsdk')
-import scspHeaders
+import scspHeaders # type: ignore
 
 def yes_or_no(question):
     reply = str(input(question+' (y/n): ')).lower().strip()
@@ -52,18 +53,15 @@ def yes_or_no(question):
     r'^X-Castor-Meta-Error-Message'
 ]
 
-headersAllow = "(" + ")|(".join(headersToCopy) + ")"
-#print('allow='+headersAllow)
-headersAllow = re.compile(headersAllow, re.IGNORECASE)
+headersAllow_string = "(" + ")|(".join(headersToCopy) + ")"
+headersAllow = re.compile(headersAllow_string, re.IGNORECASE)
 
-headersSkip = "(" + ")|(".join(headersToSkip) + ")"
-#print('skip='+headersSkip)
-headersSkip = re.compile(headersSkip, re.IGNORECASE)
+headersSkip_string = "(" + ")|(".join(headersToSkip) + ")"
+headersSkip = re.compile(headersSkip_string, re.IGNORECASE)
 
 
 
 def script_init():
-    hostname = socket.gethostname()
     parser = configargparse.ArgumentParser(
         default_config_files = ['swarmclean.conf'],
         description = """
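
Note: the rename is presumably for the type checker's benefit: headersAllow previously held a str and was then rebound to a compiled Pattern, which mypy flags as an incompatible assignment. For reference, the alternation pattern works like this (the header names are a hypothetical subset of headersToCopy):

    import re

    headersToCopy = [r'^Castor-', r'^Content-Type']  # hypothetical subset

    headersAllow_string = "(" + ")|(".join(headersToCopy) + ")"
    # -> "(^Castor-)|(^Content-Type)"
    headersAllow = re.compile(headersAllow_string, re.IGNORECASE)

    assert headersAllow.match('castor-system-name')  # IGNORECASE applies
    assert headersAllow.match('X-Custom-Header') is None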
Expand Down Expand Up @@ -122,7 +120,7 @@ def script_init():
'-R',
'--report_folder',
env_var = 'SCL_REPORT_FOLDER',
default = f"/tmp/swarmclean",
default = "/tmp/swarmclean",
help = 'folder where report files will be written'
)

@@ -186,6 +184,16 @@ def script_init():
         required = True,
         help = 'alfresco_db | regex'
     )
+
+    parser.add_argument(
+        '-a',
+        '--min_age',
+        env_var = 'SCL_MIN_AGE',
+        default = '1 week',
+        help = 'minimum age for objects; objects that are older will be deleted (duration string parsed with pytimeparse2)'
+    )
 
     # regex
     parser.add_argument(
         '-f',
         '--filter_regex',
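
Note: pytimeparse2 turns human-readable durations into timedeltas; SwarmClean.__init__ further down uses exactly this call to compute the deletion cutoff. A quick sketch:

    from datetime import datetime, timezone

    import pytimeparse2  # type: ignore

    # '1 week' -> timedelta(days=7); compact forms such as '32m' or
    # '2h32m' are accepted as well
    min_age = pytimeparse2.parse('1 week', as_timedelta=True)

    # objects last modified after this instant are too young to delete
    max_creation_date = datetime.now(timezone.utc) - min_age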
@@ -228,7 +236,7 @@ def script_init():
     addLoggingLevel('TRACE', logging.DEBUG + 5) # level between info and debug
     numeric_level = getattr(logging, args.loglevel.upper(), None)
     if not isinstance(numeric_level, int):
-        raise ValueError( f"Invalid log level: { loglevel }")
+        raise ValueError( f"Invalid log level: { args.loglevel }")
     logging.basicConfig(level=numeric_level,format='%(asctime)s %(name)-5s %(levelname)-8s - %(message)s',datefmt='%Y-%m-%d %H:%M:%S')
 
     return args
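
Note: the fix above matters more than it looks: the old message interpolated a bare loglevel, a name that does not exist in this scope, so the error path raised NameError instead of the intended ValueError. For reference, the lookup it guards:

    import logging

    # valid level names resolve to their numeric value...
    assert getattr(logging, 'INFO', None) == 20
    # ...anything else falls back to None and trips the isinstance guard
    assert getattr(logging, 'BOGUS', None) is None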
@@ -308,10 +316,15 @@ def query_single_value(self, query: str, arg_values={}):
         return self.do_query(query, arg_values)[0][0]
 #end class AlfrescoDB
 
-@dataclass
+
+def parse_http_timestamp(timestamp: str) -> datetime:
+    return parsedate(timestamp).astimezone()
+
+
+@define
 class SwarmObject:
     name: str
     bytes: int
+    last_modified: datetime = field(converter=parse_http_timestamp)
 
 
 class Swarm:

Review thread on parse_http_timestamp:

Reviewer: Is it correct for me to assume this method takes in the http-header timestamp string, e.g. "Wed, 21 Oct 2015 07:28:00 GMT", and converts it into a datetime object?

Collaborator (author): indeed
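
To make the exchange above concrete, a sketch of what the new function and the attrs converter do with the reviewer's sample timestamp (object name and size are hypothetical):

    from dateutil.parser import parse as parsedate

    ts = parsedate("Wed, 21 Oct 2015 07:28:00 GMT")
    # -> 2015-10-21 07:28:00+00:00, a timezone-aware datetime
    local = ts.astimezone()  # same instant, shifted to the local timezone

    # field(converter=parse_http_timestamp) runs during the generated
    # __init__, so the raw string from the listing JSON passes straight in:
    obj = SwarmObject(
        name='0000content.bin',  # hypothetical
        bytes=4096,              # hypothetical
        last_modified='Wed, 21 Oct 2015 07:28:00 GMT',
    )
    assert obj.last_modified.tzinfo is not None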
@@ -327,7 +340,7 @@ def __init__(self, args):
 
         # if using swarm gateway, set up basic AUTH
         if args['swarm_use_contentgateway']:
-            logging.debug(f"Using Swarm gateway, setting up basic auth.")
+            logging.debug("Using Swarm gateway, setting up basic auth.")
             self.swarm_session.auth = (args['swarm_user'], args['swarm_password'])
 
         if args['swarm_proxy']:
@@ -346,7 +359,8 @@ def list_bucket_contents_filtered(self, filter_function, max_batch_size):
         batch_size = 0
         paging_marker = ''
         while True:
-            response = self.swarm_session.get(self.make_swarm_url(self.args['swarm_bucket'], f"fields=name,content-length&format=json&size={ self.paging_size }&marker={ paging_marker }"))
+            # field tmBorn is named last_modified in the json result
+            response = self.swarm_session.get(self.make_swarm_url(self.args['swarm_bucket'], f"fields=name,content-length,tmBorn&format=json&size={ self.paging_size }&marker={ paging_marker }"))
             response.raise_for_status()
             logging.debug(response.content)
             objects = response.json()
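
For illustration, a hypothetical page of this listing, with the shape inferred from the SwarmObject(**object) splat below; the exact timestamp format is an assumption (parsedate accepts both HTTP-date and ISO forms):

    # hypothetical response.json() payload
    objects = [
        {
            "name": "someobject.bin",  # hypothetical
            "bytes": 4096,             # from content-length
            "last_modified": "Wed, 21 Oct 2015 07:28:00 GMT",  # tmBorn
        },
    ]
    # each dict splats straight into the attrs class:
    # swarm_object = SwarmObject(**objects[0])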
@@ -356,6 +370,7 @@ def list_bucket_contents_filtered(self, filter_function, max_batch_size):
 
             for object in objects:
                 swarm_object = SwarmObject(**object)
+
                 if filter_function(swarm_object):
                     if batch_size + swarm_object.bytes > max_batch_size:
                         if batch_size == 0:
@@ -381,7 +396,7 @@ def is_object_deletable(self, object_info):
         if 'Lifepoint' in object_info:
             lifepoints=scspHeaders.lifepointsFromString(object_info['Lifepoint'])
             for lp in lifepoints:
-                if lp.end == None or time.time() <= lp.end.sinceEpoch():
+                if lp.end is None or time.time() <= lp.end.sinceEpoch():
                     if lp.constraint == 'deletable=no':
                         logging.debug(f"{ object_info['Castor-System-Name'] } has 'deletable=no' lifepoint")
                         return False
@@ -424,8 +439,11 @@ def delete_object(self, object_name):
 
 class SwarmClean:
     def __init__(self, args):
-        self.args = args
         logging.debug(f"args={ args }")
+        self.args = args
+
+        self.max_creation_date = datetime.now(timezone.utc) - pytimeparse2.parse(args.min_age, as_timedelta=True)
+        logging.info(f"max_creation_date={self.max_creation_date}")
 
         if self.args.execute:
             self.args.dryrun = False
@@ -456,7 +474,10 @@ def __init__(self, args):
     #end def __init__
 
     def isDeletionCandidate(self, swarm_object):
-        if args.filter_method == 'alfresco_db':
+        logging.debug(f"modified={swarm_object.last_modified}")
+        if swarm_object.last_modified > self.max_creation_date:
+            result=False
+        elif args.filter_method == 'alfresco_db':
             content_url_short = swarm_object.name[-12:]
             content_url_crc = binascii.crc32(bytes(f"swarm://{self.args.swarm_domain}/{swarm_object.name}", 'ascii'))
             # table has an index on content_url_short + content_url_crc
@@ -469,7 +490,7 @@ def __init__(self, args):
         ) == 0
         elif args.filter_method == 'regex':
             result = self.filterRegex.match(swarm_object.name)
-        logging.trace(f"filter { swarm_object.name }: { bool(result) } - size { humanfriendly.format_size(swarm_object.bytes, binary=True) }")
+        logging.trace(f"filter { swarm_object.name }: { bool(result) } - last_modified={swarm_object.last_modified} - size={ humanfriendly.format_size(swarm_object.bytes, binary=True) }")
         return result
     #end def filter