From 1802eeffe3c479f003b9ad6856f1307a67856009 Mon Sep 17 00:00:00 2001
From: MMaker
Date: Wed, 26 Feb 2025 13:36:36 -0500
Subject: [PATCH] Add cleanup script

---
 .gitignore |  3 +-
 clean.py   | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 89 insertions(+), 1 deletion(-)
 create mode 100644 clean.py

diff --git a/.gitignore b/.gitignore
index 8f5e87a..6e36383 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,5 @@
 venv
 .venv
 __pycache__
-cookies.txt
\ No newline at end of file
+cookies.txt
+.env
\ No newline at end of file
diff --git a/clean.py b/clean.py
new file mode 100644
index 0000000..e20faa0
--- /dev/null
+++ b/clean.py
@@ -0,0 +1,87 @@
+import os
+import sys
+import datetime
+import argparse
+import logging
+import boto3
+from botocore.client import Config as BotoConfig
+from dotenv import load_dotenv
+
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s',
+    datefmt='%Y-%m-%d %H:%M:%S'
+)
+logger = logging.getLogger(__name__)
+
+def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
+    required_env_vars = [
+        'NICONICOGAY_S3_ACCESS_KEY',
+        'NICONICOGAY_S3_SECRET_KEY',
+        'NICONICOGAY_S3_BUCKET_NAME',
+        'NICONICOGAY_S3_REGION'
+    ]
+    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
+    if missing_vars:
+        logger.error(f"Missing required environment variables: {', '.join(missing_vars)}")
+        sys.exit(1)
+
+    try:
+        s3_session = boto3.Session()
+        s3_client = s3_session.client(
+            's3',
+            aws_access_key_id=os.environ['NICONICOGAY_S3_ACCESS_KEY'],
+            aws_secret_access_key=os.environ['NICONICOGAY_S3_SECRET_KEY'],
+            region_name=os.environ['NICONICOGAY_S3_REGION'],
+            endpoint_url=f"https://{os.environ['NICONICOGAY_S3_REGION']}.digitaloceanspaces.com",
+            config=BotoConfig(s3={'addressing_style': 'virtual'}),
+        )
+
+        bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME']
+        cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)
+        paginator = s3_client.get_paginator('list_objects_v2')
+        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/")
+
+        total_files = 0
+        objects_to_delete = []
+
+        for page in page_iterator:
+            if 'Contents' not in page:
+                continue
+
+            for obj in page['Contents']:
+                total_files += 1
+                if obj['LastModified'] < cutoff_date: # type: ignore
+                    objects_to_delete.append({'Key': obj['Key']}) # type: ignore
+
+        if len(objects_to_delete) == 0:
+            logger.info("No files to delete")
+            return
+
+        if dry_run:
+            logger.info(f"DRY RUN: Would delete {len(objects_to_delete)} out of {total_files} files")
+        else:
+            # Delete in batches of 1000 (the S3 DeleteObjects API caps each request at 1,000 keys)
+            for i in range(0, len(objects_to_delete), 1000):
+                batch = objects_to_delete[i:i+1000]
+                s3_client.delete_objects(
+                    Bucket=bucket_name,
+                    Delete={'Objects': batch}
+                )
+            logger.info(f"Successfully deleted {len(objects_to_delete)} out of {total_files} files")
+
+    except Exception as e:
+        logger.error(f"Error: {e}")
+        sys.exit(1)
+
+if __name__ == "__main__":
+    load_dotenv()
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--dry-run",
+        action="store_true",
+        help="Show what would be deleted without actually deleting anything"
+    )
+    args = parser.parse_args()
+
+    cleanup_old_files(dry_run=args.dry_run)
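
---

Usage: `python clean.py --dry-run` logs what would be removed without deleting
anything; omit the flag to actually delete.

One caveat on the batch loop: delete_objects() can return HTTP 200 while still
failing on individual keys, which it reports under 'Errors' in the response
body, so the "Successfully deleted" log line above may overstate what was
removed. A minimal sketch of per-batch error checking follows (not part of the
patch itself; it reuses s3_client, bucket_name, objects_to_delete, total_files,
and logger exactly as defined in clean.py):

    deleted_count = 0
    for i in range(0, len(objects_to_delete), 1000):
        batch = objects_to_delete[i:i+1000]
        response = s3_client.delete_objects(
            Bucket=bucket_name,
            # 'Quiet' mode: S3 echoes back only the keys that failed to delete
            Delete={'Objects': batch, 'Quiet': True}
        )
        errors = response.get('Errors', [])
        for err in errors:
            logger.warning(f"Failed to delete {err['Key']}: {err['Message']}")
        deleted_count += len(batch) - len(errors)
    logger.info(f"Deleted {deleted_count} out of {total_files} files")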