Add access tracker for cleaning CDN

This commit is contained in:
MMaker 2025-08-18 21:28:42 -04:00
commit c18260ebcb
Signed by: mmaker
GPG key ID: CCE79B8FEDA40FB2
4 changed files with 104 additions and 4 deletions

View file

@@ -6,6 +6,7 @@ import logging
import boto3
from botocore.client import Config as BotoConfig
from dotenv import load_dotenv
from access_tracker import AccessTracker
logging.basicConfig(
level=logging.INFO,
@@ -15,6 +16,7 @@ logging.basicConfig(
logger = logging.getLogger(__name__)
def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
access_tracker = AccessTracker()
required_env_vars = [
'NICONICOGAY_S3_ACCESS_KEY',
'NICONICOGAY_S3_SECRET_KEY',
@@ -38,12 +40,13 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
)
bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME']
cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)
cutoff_timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp() - (days * 24 * 60 * 60)
paginator = s3_client.get_paginator('list_objects_v2')
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/")
total_files = 0
objects_to_delete = []
access_times = access_tracker.get_all_access_times()
for page in page_iterator:
if 'Contents' not in page:
@@ -51,8 +54,31 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
for obj in page['Contents']:
total_files += 1
if obj['LastModified'] < cutoff_date: # type: ignore
objects_to_delete.append({'Key': obj['Key']}) # type: ignore
key = obj['Key'] # type: ignore
# Extract video_id from S3 key (e.g., "niconico/sm12345.mp4" -> "sm12345")
if key.startswith(f"{directory_prefix}/") and key.endswith('.mp4'):
video_id = key[len(f"{directory_prefix}/"):-4] # Remove prefix and .mp4 extension
last_access = access_times.get(video_id)
should_delete = False
if last_access is None:
# No access record - delete files that haven't been accessed since tracking started
# For safety, only delete files older than the cutoff date
if obj['LastModified'].timestamp() < cutoff_timestamp: # type: ignore
should_delete = True
logger.debug(f"Will delete {video_id}: no access record and file is old")
elif last_access < cutoff_timestamp:
# Has access record but last access was too long ago
should_delete = True
logger.debug(f"Will delete {video_id}: last accessed {(datetime.datetime.now().timestamp() - last_access) / (24*60*60):.1f} days ago")
if should_delete:
objects_to_delete.append({'Key': key})
# Remove the access record since we're deleting the file
if not dry_run:
access_tracker.remove_access_record(video_id)
if len(objects_to_delete) == 0:
logger.info("No files to delete")