Add access tracker for cleaning CDN

commit c18260ebcb by MMaker, 2025-08-18 21:28:42 -04:00
Signed by: mmaker (GPG key ID: CCE79B8FEDA40FB2)
4 changed files with 104 additions and 4 deletions

.gitignore (3 changes)

@@ -2,4 +2,5 @@ venv
 .venv
 __pycache__
 cookies.txt
-.env
+.env
+access_times.json

access_tracker.py (new file, 65 lines)

@@ -0,0 +1,65 @@
import json
import os
import threading
import time
from typing import Dict, Optional
import logging

logger = logging.getLogger(__name__)

class AccessTracker:
    """Tracks when video URLs are accessed, storing data in JSON file and keeping it in memory"""

    def __init__(self, json_file_path: str = "access_times.json"):
        self.json_file_path = json_file_path
        self.access_times: Dict[str, float] = {}
        self.lock = threading.Lock()
        self._load_from_file()

    def _load_from_file(self) -> None:
        """Load access times from JSON file into memory"""
        try:
            if os.path.exists(self.json_file_path):
                with open(self.json_file_path, 'r') as f:
                    self.access_times = json.load(f)
                logger.info(f"Loaded {len(self.access_times)} access times from {self.json_file_path}")
            else:
                logger.info(f"Access times file {self.json_file_path} does not exist, starting fresh")
        except Exception as e:
            logger.error(f"Error loading access times from {self.json_file_path}: {e}")
            self.access_times = {}

    def _save_to_file(self) -> None:
        """Save current access times from memory to JSON file"""
        try:
            with open(self.json_file_path, 'w') as f:
                json.dump(self.access_times, f, indent=2)
            logger.debug(f"Saved {len(self.access_times)} access times to {self.json_file_path}")
        except Exception as e:
            logger.error(f"Error saving access times to {self.json_file_path}: {e}")

    def record_access(self, video_id: str) -> None:
        """Record that a video was accessed at the current time"""
        current_time = time.time()
        with self.lock:
            self.access_times[video_id] = current_time
            self._save_to_file()
        logger.debug(f"Recorded access for {video_id} at {current_time}")

    def get_last_access(self, video_id: str) -> Optional[float]:
        """Get the last access time for a video (returns None if never accessed)"""
        with self.lock:
            return self.access_times.get(video_id)

    def get_all_access_times(self) -> Dict[str, float]:
        """Get a copy of all access times"""
        with self.lock:
            return self.access_times.copy()

    def remove_access_record(self, video_id: str) -> None:
        """Remove access record for a video (e.g., when video is deleted)"""
        with self.lock:
            if video_id in self.access_times:
                del self.access_times[video_id]
                self._save_to_file()
                logger.debug(f"Removed access record for {video_id}")

app.py (8 changes)

@@ -17,6 +17,7 @@ from botocore.client import Config as BotoConfig
 import urllib.parse
 from dotenv import load_dotenv
+from access_tracker import AccessTracker
 
 load_dotenv()
 
 logging.basicConfig(
@@ -83,6 +84,8 @@ download_tracker = {
 download_lock = threading.Lock()
 download_queue = []
 
+access_tracker = AccessTracker()
+
 def download_and_upload_video(video_id, url, video_quality):
     try:
         with download_lock:
@@ -411,6 +414,11 @@ if you want to download videos, please consider using a tool like nndownload: ht
     logger.info(f"{video_id}: Caching HTML response")
     cache.set(f"{video_id}{cache_html_suffix}", html_response, expire=CACHE_EXPIRATION_HTML)
 
+    # Record access time for CDN cleanup purposes
+    if is_video_in_cdn(video_id):
+        access_tracker.record_access(video_id)
+        logger.debug(f"{video_id}: Recorded access time for CDN tracking")
+
     logger.info(f"{video_id}: Returning response")
     logger.debug(f"{video_id}: HTML response:\n----------\n{html_response}\n----------")
     return Response(html_response, mimetype="text/html")
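
is_video_in_cdn() is defined elsewhere in app.py and is not part of this diff. For orientation, a hypothetical sketch of such a check, assuming module-level s3_client and bucket_name objects (names not confirmed by this commit) and the niconico/<video_id>.mp4 key layout the cleanup script relies on:

from botocore.exceptions import ClientError

def is_video_in_cdn(video_id):
    # Hypothetical: a successful HEAD request means the object is in the bucket;
    # s3_client and bucket_name are assumed globals, not shown in this commit.
    try:
        s3_client.head_object(Bucket=bucket_name, Key=f"niconico/{video_id}.mp4")
        return True
    except ClientError:
        return False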

cleanup script (filename not shown)

@@ -6,6 +6,7 @@ import logging
 import boto3
 from botocore.client import Config as BotoConfig
 from dotenv import load_dotenv
+from access_tracker import AccessTracker
 
 logging.basicConfig(
     level=logging.INFO,
@@ -15,6 +16,7 @@ logging.basicConfig(
 logger = logging.getLogger(__name__)
 
 def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
+    access_tracker = AccessTracker()
     required_env_vars = [
         'NICONICOGAY_S3_ACCESS_KEY',
         'NICONICOGAY_S3_SECRET_KEY',
@@ -38,12 +40,13 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
     )
     bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME']
-    cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)
+    cutoff_timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp() - (days * 24 * 60 * 60)
 
     paginator = s3_client.get_paginator('list_objects_v2')
     page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/")
 
     total_files = 0
     objects_to_delete = []
+    access_times = access_tracker.get_all_access_times()
 
     for page in page_iterator:
         if 'Contents' not in page:
@@ -51,8 +54,31 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
         for obj in page['Contents']:
             total_files += 1
-            if obj['LastModified'] < cutoff_date: # type: ignore
-                objects_to_delete.append({'Key': obj['Key']}) # type: ignore
+            key = obj['Key'] # type: ignore
+            # Extract video_id from S3 key (e.g., "niconico/sm12345.mp4" -> "sm12345")
+            if key.startswith(f"{directory_prefix}/") and key.endswith('.mp4'):
+                video_id = key[len(f"{directory_prefix}/"):-4] # Remove prefix and .mp4 extension
+                last_access = access_times.get(video_id)
+
+                should_delete = False
+                if last_access is None:
+                    # No access record - delete files that haven't been accessed since tracking started
+                    # For safety, only delete files older than the cutoff date
+                    if obj['LastModified'].timestamp() < cutoff_timestamp: # type: ignore
+                        should_delete = True
+                        logger.debug(f"Will delete {video_id}: no access record and file is old")
+                elif last_access < cutoff_timestamp:
+                    # Has access record but last access was too long ago
+                    should_delete = True
+                    logger.debug(f"Will delete {video_id}: last accessed {(datetime.datetime.now().timestamp() - last_access) / (24*60*60):.1f} days ago")
+
+                if should_delete:
+                    objects_to_delete.append({'Key': key})
+                    # Remove the access record since we're deleting the file
+                    if not dry_run:
+                        access_tracker.remove_access_record(video_id)
 
     if len(objects_to_delete) == 0:
         logger.info("No files to delete")