Add access tracker for cleaning CDN
This commit is contained in:
		
					parent
					
						
							
								e43a67b0d5
							
						
					
				
			
			
				commit
				
					
						c18260ebcb
					
				
			
		
					 4 changed files with 104 additions and 4 deletions
				
			
		
							
								
								
									
										3
									
								
								.gitignore
									
										
									
									
										vendored
									
									
								
							
							
						
						
									
										3
									
								
								.gitignore
									
										
									
									
										vendored
									
									
								
							|  | @ -2,4 +2,5 @@ venv | ||||||
| .venv | .venv | ||||||
| __pycache__ | __pycache__ | ||||||
| cookies.txt | cookies.txt | ||||||
| .env | .env | ||||||
|  | access_times.json | ||||||
							
								
								
									
										65
									
								
								access_tracker.py
									
										
									
									
									
										Normal file
									
								
							
							
						
						
									
										65
									
								
								access_tracker.py
									
										
									
									
									
										Normal file
									
								
							|  | @ -0,0 +1,65 @@ | ||||||
|  | import json | ||||||
|  | import os | ||||||
|  | import threading | ||||||
|  | import time | ||||||
|  | from typing import Dict, Optional | ||||||
|  | import logging | ||||||
|  | 
 | ||||||
logger = logging.getLogger(__name__)


class AccessTracker:
    """Track when each video was last accessed, persisted to a JSON file.

    The mapping ``video_id -> POSIX timestamp`` is kept in memory and the
    whole mapping is rewritten to ``json_file_path`` after every change.
    All public methods hold ``self.lock`` while touching the mapping, so one
    instance can be shared between threads.

    NOTE: every save dumps this instance's in-memory mapping wholesale.
    If another process writes the same file, whichever process saves last
    wins; the instances do not merge their state.
    """

    def __init__(self, json_file_path: str = "access_times.json"):
        """Create a tracker backed by *json_file_path* and load existing data."""
        self.json_file_path = json_file_path
        self.access_times: Dict[str, float] = {}
        self.lock = threading.Lock()
        self._load_from_file()

    def _load_from_file(self) -> None:
        """Load access times from the JSON file into memory.

        A missing file is not an error (the tracker starts empty). An
        unreadable, corrupt, or wrongly-shaped file is logged and also
        results in an empty mapping; nothing is raised to the caller.
        """
        try:
            # EAFP: open directly instead of checking os.path.exists first,
            # so there is no window between the check and the open.
            with open(self.json_file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.info(f"Access times file {self.json_file_path} does not exist, starting fresh")
            return
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            logger.error(f"Error loading access times from {self.json_file_path}: {e}")
            self.access_times = {}
            return

        if not isinstance(data, dict):
            # Valid JSON but not an object: later .get() calls would break.
            logger.error(
                f"Error loading access times from {self.json_file_path}: "
                f"expected a JSON object, got {type(data).__name__}"
            )
            self.access_times = {}
            return

        # Keep only numeric timestamps; bool is excluded explicitly because
        # it is a subclass of int.
        loaded = {
            key: float(value)
            for key, value in data.items()
            if isinstance(value, (int, float)) and not isinstance(value, bool)
        }
        skipped = len(data) - len(loaded)
        if skipped:
            logger.warning(
                f"Ignored {skipped} non-numeric access times in {self.json_file_path}"
            )
        self.access_times = loaded
        logger.info(f"Loaded {len(self.access_times)} access times from {self.json_file_path}")

    def _save_to_file(self) -> None:
        """Write the in-memory access times to the JSON file.

        The data is written to a sibling temporary file and then moved over
        the target with ``os.replace`` so that a reader never observes a
        truncated or half-written file. If that is not possible (for
        example the rename is refused by the OS), the file is rewritten in
        place as before. Failures are logged, never raised.

        Callers are expected to hold ``self.lock``.
        """
        # The PID keeps temp names distinct when several processes share the file.
        tmp_path = f"{self.json_file_path}.{os.getpid()}.tmp"
        try:
            # Serialise first so an encoding error cannot leave a partial file.
            payload = json.dumps(self.access_times, indent=2)
            try:
                with open(tmp_path, 'w', encoding='utf-8') as f:
                    f.write(payload)
                os.replace(tmp_path, self.json_file_path)
            except OSError:
                # Atomic path unavailable; fall back to a direct overwrite.
                with open(self.json_file_path, 'w', encoding='utf-8') as f:
                    f.write(payload)
        except (OSError, TypeError, ValueError) as e:
            logger.error(f"Error saving access times to {self.json_file_path}: {e}")
        else:
            logger.debug(f"Saved {len(self.access_times)} access times to {self.json_file_path}")
        finally:
            # Best-effort cleanup: the temp file only still exists if the
            # rename did not happen.
            try:
                os.remove(tmp_path)
            except OSError:
                pass

    def record_access(self, video_id: str) -> None:
        """Record that *video_id* was accessed now and persist the change."""
        current_time = time.time()
        with self.lock:
            self.access_times[video_id] = current_time
            self._save_to_file()
        logger.debug(f"Recorded access for {video_id} at {current_time}")

    def get_last_access(self, video_id: str) -> Optional[float]:
        """Return the last access timestamp for *video_id*, or None if unknown."""
        with self.lock:
            return self.access_times.get(video_id)

    def get_all_access_times(self) -> Dict[str, float]:
        """Return a shallow copy of the whole ``video_id -> timestamp`` mapping."""
        with self.lock:
            return self.access_times.copy()

    def remove_access_record(self, video_id: str) -> None:
        """Forget *video_id* (e.g. when the video is deleted) and persist.

        Does nothing, and does not rewrite the file, if there is no record.
        """
        with self.lock:
            if video_id not in self.access_times:
                return
            del self.access_times[video_id]
            self._save_to_file()
            logger.debug(f"Removed access record for {video_id}")
							
								
								
									
										8
									
								
								app.py
									
										
									
									
									
								
							
							
						
						
									
										8
									
								
								app.py
									
										
									
									
									
								
							|  | @ -17,6 +17,7 @@ from botocore.client import Config as BotoConfig | ||||||
| import urllib.parse | import urllib.parse | ||||||
| 
 | 
 | ||||||
| from dotenv import load_dotenv | from dotenv import load_dotenv | ||||||
|  | from access_tracker import AccessTracker | ||||||
| load_dotenv() | load_dotenv() | ||||||
| 
 | 
 | ||||||
| logging.basicConfig( | logging.basicConfig( | ||||||
|  | @ -83,6 +84,8 @@ download_tracker = { | ||||||
| download_lock = threading.Lock() | download_lock = threading.Lock() | ||||||
| download_queue = [] | download_queue = [] | ||||||
| 
 | 
 | ||||||
|  | access_tracker = AccessTracker() | ||||||
|  | 
 | ||||||
| def download_and_upload_video(video_id, url, video_quality): | def download_and_upload_video(video_id, url, video_quality): | ||||||
|     try: |     try: | ||||||
|         with download_lock: |         with download_lock: | ||||||
|  | @ -411,6 +414,11 @@ if you want to download videos, please consider using a tool like nndownload: ht | ||||||
|         logger.info(f"{video_id}: Caching HTML response") |         logger.info(f"{video_id}: Caching HTML response") | ||||||
|         cache.set(f"{video_id}{cache_html_suffix}", html_response, expire=CACHE_EXPIRATION_HTML) |         cache.set(f"{video_id}{cache_html_suffix}", html_response, expire=CACHE_EXPIRATION_HTML) | ||||||
| 
 | 
 | ||||||
|  |     # Record access time for CDN cleanup purposes | ||||||
|  |     if is_video_in_cdn(video_id): | ||||||
|  |         access_tracker.record_access(video_id) | ||||||
|  |         logger.debug(f"{video_id}: Recorded access time for CDN tracking") | ||||||
|  | 
 | ||||||
|     logger.info(f"{video_id}: Returning response") |     logger.info(f"{video_id}: Returning response") | ||||||
|     logger.debug(f"{video_id}: HTML response:\n----------\n{html_response}\n----------") |     logger.debug(f"{video_id}: HTML response:\n----------\n{html_response}\n----------") | ||||||
|     return Response(html_response, mimetype="text/html") |     return Response(html_response, mimetype="text/html") | ||||||
|  |  | ||||||
							
								
								
									
										32
									
								
								clean.py
									
										
									
									
									
								
							
							
						
						
									
										32
									
								
								clean.py
									
										
									
									
									
								
							|  | @ -6,6 +6,7 @@ import logging | ||||||
| import boto3 | import boto3 | ||||||
| from botocore.client import Config as BotoConfig | from botocore.client import Config as BotoConfig | ||||||
| from dotenv import load_dotenv | from dotenv import load_dotenv | ||||||
|  | from access_tracker import AccessTracker | ||||||
| 
 | 
 | ||||||
| logging.basicConfig( | logging.basicConfig( | ||||||
|     level=logging.INFO, |     level=logging.INFO, | ||||||
|  | @ -15,6 +16,7 @@ logging.basicConfig( | ||||||
| logger = logging.getLogger(__name__) | logger = logging.getLogger(__name__) | ||||||
| 
 | 
 | ||||||
| def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"): | def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"): | ||||||
|  |     access_tracker = AccessTracker() | ||||||
|     required_env_vars = [ |     required_env_vars = [ | ||||||
|         'NICONICOGAY_S3_ACCESS_KEY', |         'NICONICOGAY_S3_ACCESS_KEY', | ||||||
|         'NICONICOGAY_S3_SECRET_KEY', |         'NICONICOGAY_S3_SECRET_KEY', | ||||||
|  | @ -38,12 +40,13 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"): | ||||||
|         ) |         ) | ||||||
|          |          | ||||||
|         bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME'] |         bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME'] | ||||||
|         cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days) |         cutoff_timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp() - (days * 24 * 60 * 60) | ||||||
|         paginator = s3_client.get_paginator('list_objects_v2') |         paginator = s3_client.get_paginator('list_objects_v2') | ||||||
|         page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/") |         page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/") | ||||||
|          |          | ||||||
|         total_files = 0 |         total_files = 0 | ||||||
|         objects_to_delete = [] |         objects_to_delete = [] | ||||||
|  |         access_times = access_tracker.get_all_access_times() | ||||||
|          |          | ||||||
|         for page in page_iterator: |         for page in page_iterator: | ||||||
|             if 'Contents' not in page: |             if 'Contents' not in page: | ||||||
|  | @ -51,8 +54,31 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"): | ||||||
|              |              | ||||||
|             for obj in page['Contents']: |             for obj in page['Contents']: | ||||||
|                 total_files += 1 |                 total_files += 1 | ||||||
|                 if obj['LastModified'] < cutoff_date:  # type: ignore |                 key = obj['Key']  # type: ignore | ||||||
|                     objects_to_delete.append({'Key': obj['Key']})  # type: ignore |                  | ||||||
|  |                 # Extract video_id from S3 key (e.g., "niconico/sm12345.mp4" -> "sm12345") | ||||||
|  |                 if key.startswith(f"{directory_prefix}/") and key.endswith('.mp4'): | ||||||
|  |                     video_id = key[len(f"{directory_prefix}/"):-4]  # Remove prefix and .mp4 extension | ||||||
|  |                      | ||||||
|  |                     last_access = access_times.get(video_id) | ||||||
|  |                     should_delete = False | ||||||
|  |                      | ||||||
|  |                     if last_access is None: | ||||||
|  |                         # No access record - delete files that haven't been accessed since tracking started | ||||||
|  |                         # For safety, only delete files older than the cutoff date | ||||||
|  |                         if obj['LastModified'].timestamp() < cutoff_timestamp:  # type: ignore | ||||||
|  |                             should_delete = True | ||||||
|  |                             logger.debug(f"Will delete {video_id}: no access record and file is old") | ||||||
|  |                     elif last_access < cutoff_timestamp: | ||||||
|  |                         # Has access record but last access was too long ago | ||||||
|  |                         should_delete = True | ||||||
|  |                         logger.debug(f"Will delete {video_id}: last accessed {(datetime.datetime.now().timestamp() - last_access) / (24*60*60):.1f} days ago") | ||||||
|  |                      | ||||||
|  |                     if should_delete: | ||||||
|  |                         objects_to_delete.append({'Key': key}) | ||||||
|  |                         # Remove the access record since we're deleting the file | ||||||
|  |                         if not dry_run: | ||||||
|  |                             access_tracker.remove_access_record(video_id) | ||||||
| 
 | 
 | ||||||
|         if len(objects_to_delete) == 0: |         if len(objects_to_delete) == 0: | ||||||
|             logger.info("No files to delete") |             logger.info("No files to delete") | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue