"""
This command checks all the files stored in local storage (GridFS) and compares them to the list
of messages already on the node. Files that are not linked to any message are scheduled for
deletion.
"""
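
# A rough sketch of the decision rule implemented below (illustrative only; the names
# mirror the sets built in list_files_to_preserve() and run()):
#   files_to_preserve = temporary_files | aggregates | posts | stores | file_pins
#   files_to_delete = gridfs_files - files_to_preserve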
import asyncio
import datetime as dt
from typing import Any, Dict, FrozenSet, List, cast

import pytz
import typer
from aleph_message.models import MessageType
from configmanager import Config

import aleph.model
from aleph.ccn_cli.cli_config import CliConfig
from aleph.config import get_defaults
from aleph.model import init_db_globals
from aleph.model.filepin import PermanentPin
from aleph.model.hashes import delete_value as delete_gridfs_file
from aleph.model.messages import Message

gc_ns = typer.Typer()

# Fetch the item hashes of all messages of a given type that store their content in
# local storage (item_type == "storage"). Field names are given as dotted paths so that
# both top-level fields ("item_hash") and nested ones ("content.item_hash") can be used.
async def get_hashes(
    msg_type: MessageType, item_type_field: str, item_hash_field: str
) -> FrozenSet[str]:
    def rgetitem(dictionary: Any, fields: List[str]) -> Any:
        # Recursively resolve a dotted path that was split into its components.
        value = dictionary[fields[0]]
        if len(fields) > 1:
            return rgetitem(value, fields[1:])
        return value

    return frozenset(
        [
            rgetitem(msg, item_hash_field.split("."))
            async for msg in Message.collection.find(
                {"type": msg_type, item_type_field: "storage"},
                {item_hash_field: 1},
                batch_size=1000,
            )
        ]
    )
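
# Illustrative example of the dotted-path lookup performed by rgetitem above
# (hypothetical data, not taken from a real message):
#   rgetitem({"content": {"item_hash": "abc"}}, ["content", "item_hash"]) -> "abc"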

def print_files_to_preserve(files_to_preserve: Dict[str, FrozenSet[str]]) -> None:
    typer.echo("The following files will be preserved:")
    for file_type, files in files_to_preserve.items():
        typer.echo(f"* {len(files)} {file_type}")


async def list_files_to_preserve(
    gridfs_files_dict: Dict[str, Dict],
    temporary_files_ttl: int,
) -> Dict[str, FrozenSet[str]]:
    files_to_preserve_dict = {}

    # Preserve any file that was uploaded less than temporary_files_ttl seconds ago
    current_datetime = pytz.utc.localize(dt.datetime.utcnow())
    files_to_preserve_dict["temporary files"] = frozenset(
        [
            file["filename"]
            for file in gridfs_files_dict.values()
            if file["uploadDate"]
            > current_datetime - dt.timedelta(seconds=temporary_files_ttl)
        ]
    )

    # Get all the messages that potentially store data in local storage:
    # * AGGREGATEs with item_type=="storage"
    # * POSTs with item_type=="storage"
    # * STOREs with content.item_type=="storage"
    files_to_preserve_dict["aggregates"] = await get_hashes(
        MessageType.aggregate, "item_type", "item_hash"
    )
    files_to_preserve_dict["posts"] = await get_hashes(
        MessageType.post, "item_type", "item_hash"
    )
    files_to_preserve_dict["stores"] = await get_hashes(
        MessageType.store, "content.item_type", "content.item_hash"
    )

    # We also keep permanent pins, even if they are also stored on IPFS
    files_to_preserve_dict["file pins"] = frozenset(
        [
            pin["multihash"]
            async for pin in PermanentPin.collection.find({}, {"multihash": 1})
        ]
    )

    return files_to_preserve_dict

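# Illustrative shape of the mapping returned by list_files_to_preserve (hypothetical hashes):
#   {
#       "temporary files": frozenset({"abc..."}),
#       "aggregates": frozenset(...),
#       "posts": frozenset(...),
#       "stores": frozenset(...),
#       "file pins": frozenset({"Qm..."}),
#   }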

async def run(ctx: typer.Context, dry_run: bool):
    config = Config(schema=get_defaults())
    cli_config = cast(CliConfig, ctx.obj)
    config.yaml.load(str(cli_config.config_file_path))

    init_db_globals(config=config)
    if aleph.model.db is None:  # for mypy
        raise ValueError("DB not initialized as expected.")

    # Get all the files currently in GridFS, keyed by filename
    gridfs_files_dict = {
        file["filename"]: file
        async for file in aleph.model.db["fs.files"].find(
            projection={"_id": 0, "filename": 1, "length": 1, "uploadDate": 1},
            batch_size=1000,
        )
    }
    gridfs_files = frozenset(gridfs_files_dict.keys())

    typer.echo(f"Found {len(gridfs_files_dict)} files in local storage.")

    files_to_preserve_dict = await list_files_to_preserve(
        gridfs_files_dict=gridfs_files_dict,
        temporary_files_ttl=config.storage.temporary_files_ttl.value,
    )
    files_to_preserve = frozenset().union(*files_to_preserve_dict.values())
    files_to_delete = gridfs_files - files_to_preserve

    if cli_config.verbose:
        print_files_to_preserve(files_to_preserve_dict)

    reclaimed_space = sum(
        gridfs_files_dict[filename]["length"] for filename in files_to_delete
    )
    typer.echo(
        f"{len(files_to_delete)} files will be deleted, freeing {reclaimed_space} bytes."
    )

    if dry_run:
        if cli_config.verbose:
            typer.echo("The following files would be deleted:")
            for file_to_delete in files_to_delete:
                typer.echo(f"* {file_to_delete}")

    else:
        for file_to_delete in files_to_delete:
            typer.echo(f"Deleting {file_to_delete}...")
            await delete_gridfs_file(file_to_delete)

    typer.echo("Done.")


@gc_ns.command(name="run")
def run_gc(
    ctx: typer.Context,
    dry_run: bool = typer.Option(
        False, help="If set, display files to delete without deleting them."
    ),
):
    asyncio.run(run(ctx, dry_run))
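
# Usage sketch, assuming this Typer sub-app is mounted as a "gc" command group in the
# CCN CLI (the exact entry point depends on how gc_ns is registered by the caller):
#   <ccn-cli> gc run --dry-run   # list the files that would be deleted
#   <ccn-cli> gc run             # actually delete the unreferenced files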