Skip to content

Commit

Permalink
MNTSUP-165 - fix review findings
Browse files Browse the repository at this point in the history
  • Loading branch information
wimfabri committed Oct 9, 2023
1 parent 52aa06b commit 28bacb6
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 12 deletions.
1 change: 0 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,4 @@ build
.settings/
bin/
out/
*.iml
**/__pycache__
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ Content selection:
The script will delete a single batch of objects whose combined size is less than --batch_size.
When the objects have a 'deletable=no' lifepoint this will be replaced with 'deletable=yes'. This is a COPY operation and will cause the storage used for the object to be temporarily doubled.
When the objects have a 'deletable=no' lifepoint this will be replaced with 'deletable=yes'. This is a COPY operation and will cause the storage used for the object to be temporarily doubled. The object is deleted immediately after the copy.
---
Expand Down
25 changes: 15 additions & 10 deletions src/main/docker/docker_root/swarmclean.py
Original file line number Diff line number Diff line change
Expand Up @@ -187,8 +187,8 @@ def script_init():
)
parser.add_argument(
'-f',
'--filter',
env_var = 'SCL_FILTER',
'--filter_regex',
env_var = 'SCL_FILTER_REGEX',
help = 'filter regex, objects that match will be deleted'
)

Expand Down Expand Up @@ -304,7 +304,7 @@ def do_query(self, query: str, arg_values={}):
#end def do_query

def query_single_value(self, query: str, arg_values={}):
return self.do_query(query, **arg_values)[0][0]
return self.do_query(query, arg_values)[0][0]
#end class AlfrescoDB

@dataclass
Expand All @@ -319,6 +319,8 @@ def __init__(self, args):
self.args = args
self.swarm_servers = args['swarm_servers'].split(',')

self.paging_size = 1000 # swarm default = 1000

# setup Swarm session
self.swarm_session = requests.Session()

Expand All @@ -338,12 +340,12 @@ def make_swarm_url(self, sub_path, args=''):
return url
#end def make_swarm_url

def list_bucket_contents(self, filter_function, max_batch_size):
def list_bucket_contents_filtered(self, filter_function, max_batch_size):
object_list = []
batch_size = 0
paging_marker = ''
while True:
response = self.swarm_session.get(self.make_swarm_url(self.args['swarm_bucket'], f"fields=name,content-length&format=json&paging_marker={ paging_marker }"))
response = self.swarm_session.get(self.make_swarm_url(self.args['swarm_bucket'], f"fields=name,content-length&format=json&size={ self.paging_size }&marker={ paging_marker }"))
response.raise_for_status()
logging.debug(response.content)
objects = response.json()
Expand All @@ -362,6 +364,9 @@ def list_bucket_contents(self, filter_function, max_batch_size):
batch_size += swarm_object.bytes
logging.trace(f"batch size { batch_size }")
object_list.append(swarm_object)
paging_marker = objects[-1]['name']
logging.debug(f"next page - marker={paging_marker}")

#end def list_bucket_contents

def get_info(self, object_name):
Expand Down Expand Up @@ -443,23 +448,23 @@ def __init__(self, args):
db_args = { key: value for key, value in vars(args).items() if key[0:2] == 'db' }
self.alfresco_db = AlfrescoDB(db_args)
elif args.filter_method == 'regex':
logging.info(f"using filter type 'regex' with regex { args.filter }")
self.filterRegex=re.compile(args.filter)
logging.info(f"using filter type 'regex' with regex { args.filter_regex }")
self.filterRegex=re.compile(args.filter_regex)
else:
raise ValueError( f"Invalid filter_method: { args.filter_method }")
#end def __init__

def filter(self, swarm_object):
def isDeletionCandidate(self, swarm_object):
if args.filter_method == 'alfresco_db':
result = len(self.alfresco_db.do_query("select id from alf_content_url where content_url like :object_name", {'object_name': f"%/{swarm_object.name}"}).all()) == 0
result = self.alfresco_db.query_single_value("select count(*) from alf_content_url where content_url like :object_name", {'object_name': f"%/{swarm_object.name}"}) == 0
elif args.filter_method == 'regex':
result = self.filterRegex.match(swarm_object.name)
logging.trace(f"filter { swarm_object.name }: { bool(result) } - size { humanfriendly.format_size(swarm_object.bytes, binary=True) }")
return result
#end def filter

def main(self):
objects_to_delete = self.swarm.list_bucket_contents(self.filter, self.batch_size)
objects_to_delete = self.swarm.list_bucket_contents_filtered(self.isDeletionCandidate, self.batch_size)
for swarm_object in objects_to_delete['list']:
logging.info(f"to delete: { swarm_object.name }")
logging.info(f"total size: { humanfriendly.format_size(objects_to_delete['size'], binary=True) }")
Expand Down

0 comments on commit 28bacb6

Please sign in to comment.