Add access tracker for cleaning CDN
This commit is contained in:
parent
e43a67b0d5
commit
c18260ebcb
4 changed files with 104 additions and 4 deletions
3
.gitignore
vendored
3
.gitignore
vendored
|
@ -2,4 +2,5 @@ venv
|
|||
.venv
|
||||
__pycache__
|
||||
cookies.txt
|
||||
.env
|
||||
.env
|
||||
access_times.json
|
65
access_tracker.py
Normal file
65
access_tracker.py
Normal file
|
@ -0,0 +1,65 @@
|
|||
import json
import os
import threading
import time
from typing import Dict, Optional
import logging

logger = logging.getLogger(__name__)


class AccessTracker:
    """Tracks when video URLs are accessed, storing data in a JSON file and keeping it in memory.

    All reads and writes of the in-memory mapping happen under a single lock,
    so instances are safe to share across threads. Every mutation is persisted
    to disk immediately so the data survives process restarts.
    """

    def __init__(self, json_file_path: str = "access_times.json"):
        # Path of the JSON persistence file.
        self.json_file_path = json_file_path
        # video_id -> last access time (Unix timestamp, seconds).
        self.access_times: Dict[str, float] = {}
        self.lock = threading.Lock()
        self._load_from_file()

    def _load_from_file(self) -> None:
        """Load access times from the JSON file into memory.

        Never raises: a missing, unreadable, or corrupt file results in an
        empty mapping so the tracker always starts in a usable state.
        """
        try:
            if os.path.exists(self.json_file_path):
                with open(self.json_file_path, 'r') as f:
                    data = json.load(f)
                # Guard against a corrupted or hand-edited file whose top-level
                # value is not a JSON object (e.g. a list or string); otherwise
                # later .get()/.copy() calls would crash.
                self.access_times = data if isinstance(data, dict) else {}
                logger.info(f"Loaded {len(self.access_times)} access times from {self.json_file_path}")
            else:
                logger.info(f"Access times file {self.json_file_path} does not exist, starting fresh")
        except Exception as e:
            logger.error(f"Error loading access times from {self.json_file_path}: {e}")
            self.access_times = {}

    def _save_to_file(self) -> None:
        """Save current access times from memory to the JSON file atomically.

        Writes to a temporary sibling file and renames it over the target so a
        crash mid-write cannot leave a truncated JSON file behind (the old
        in-place write could corrupt the file and silently lose all records).
        Caller is expected to hold self.lock.
        """
        try:
            tmp_path = f"{self.json_file_path}.tmp"
            with open(tmp_path, 'w') as f:
                json.dump(self.access_times, f, indent=2)
            # os.replace is atomic on both POSIX and Windows.
            os.replace(tmp_path, self.json_file_path)
            logger.debug(f"Saved {len(self.access_times)} access times to {self.json_file_path}")
        except Exception as e:
            logger.error(f"Error saving access times to {self.json_file_path}: {e}")

    def record_access(self, video_id: str) -> None:
        """Record that a video was accessed at the current time."""
        current_time = time.time()
        with self.lock:
            self.access_times[video_id] = current_time
            self._save_to_file()
            logger.debug(f"Recorded access for {video_id} at {current_time}")

    def get_last_access(self, video_id: str) -> Optional[float]:
        """Get the last access time for a video (returns None if never accessed)."""
        with self.lock:
            return self.access_times.get(video_id)

    def get_all_access_times(self) -> Dict[str, float]:
        """Get a copy of all access times (safe to iterate without holding the lock)."""
        with self.lock:
            return self.access_times.copy()

    def remove_access_record(self, video_id: str) -> None:
        """Remove access record for a video (e.g., when video is deleted).

        A no-op (no save) if the video has no record.
        """
        with self.lock:
            if video_id in self.access_times:
                del self.access_times[video_id]
                self._save_to_file()
                logger.debug(f"Removed access record for {video_id}")
|
8
app.py
8
app.py
|
@ -17,6 +17,7 @@ from botocore.client import Config as BotoConfig
|
|||
import urllib.parse
|
||||
|
||||
from dotenv import load_dotenv
|
||||
from access_tracker import AccessTracker
|
||||
load_dotenv()
|
||||
|
||||
logging.basicConfig(
|
||||
|
@ -83,6 +84,8 @@ download_tracker = {
|
|||
download_lock = threading.Lock()
|
||||
download_queue = []
|
||||
|
||||
access_tracker = AccessTracker()
|
||||
|
||||
def download_and_upload_video(video_id, url, video_quality):
|
||||
try:
|
||||
with download_lock:
|
||||
|
@ -411,6 +414,11 @@ if you want to download videos, please consider using a tool like nndownload: ht
|
|||
logger.info(f"{video_id}: Caching HTML response")
|
||||
cache.set(f"{video_id}{cache_html_suffix}", html_response, expire=CACHE_EXPIRATION_HTML)
|
||||
|
||||
# Record access time for CDN cleanup purposes
|
||||
if is_video_in_cdn(video_id):
|
||||
access_tracker.record_access(video_id)
|
||||
logger.debug(f"{video_id}: Recorded access time for CDN tracking")
|
||||
|
||||
logger.info(f"{video_id}: Returning response")
|
||||
logger.debug(f"{video_id}: HTML response:\n----------\n{html_response}\n----------")
|
||||
return Response(html_response, mimetype="text/html")
|
||||
|
|
32
clean.py
32
clean.py
|
@ -6,6 +6,7 @@ import logging
|
|||
import boto3
|
||||
from botocore.client import Config as BotoConfig
|
||||
from dotenv import load_dotenv
|
||||
from access_tracker import AccessTracker
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
|
@ -15,6 +16,7 @@ logging.basicConfig(
|
|||
logger = logging.getLogger(__name__)
|
||||
|
||||
def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
|
||||
access_tracker = AccessTracker()
|
||||
required_env_vars = [
|
||||
'NICONICOGAY_S3_ACCESS_KEY',
|
||||
'NICONICOGAY_S3_SECRET_KEY',
|
||||
|
@ -38,12 +40,13 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
|
|||
)
|
||||
|
||||
bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME']
|
||||
cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)
|
||||
cutoff_timestamp = datetime.datetime.now(datetime.timezone.utc).timestamp() - (days * 24 * 60 * 60)
|
||||
paginator = s3_client.get_paginator('list_objects_v2')
|
||||
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/")
|
||||
|
||||
total_files = 0
|
||||
objects_to_delete = []
|
||||
access_times = access_tracker.get_all_access_times()
|
||||
|
||||
for page in page_iterator:
|
||||
if 'Contents' not in page:
|
||||
|
@ -51,8 +54,31 @@ def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
|
|||
|
||||
for obj in page['Contents']:
|
||||
total_files += 1
|
||||
if obj['LastModified'] < cutoff_date: # type: ignore
|
||||
objects_to_delete.append({'Key': obj['Key']}) # type: ignore
|
||||
key = obj['Key'] # type: ignore
|
||||
|
||||
# Extract video_id from S3 key (e.g., "niconico/sm12345.mp4" -> "sm12345")
|
||||
if key.startswith(f"{directory_prefix}/") and key.endswith('.mp4'):
|
||||
video_id = key[len(f"{directory_prefix}/"):-4] # Remove prefix and .mp4 extension
|
||||
|
||||
last_access = access_times.get(video_id)
|
||||
should_delete = False
|
||||
|
||||
if last_access is None:
|
||||
# No access record - delete files that haven't been accessed since tracking started
|
||||
# For safety, only delete files older than the cutoff date
|
||||
if obj['LastModified'].timestamp() < cutoff_timestamp: # type: ignore
|
||||
should_delete = True
|
||||
logger.debug(f"Will delete {video_id}: no access record and file is old")
|
||||
elif last_access < cutoff_timestamp:
|
||||
# Has access record but last access was too long ago
|
||||
should_delete = True
|
||||
logger.debug(f"Will delete {video_id}: last accessed {(datetime.datetime.now().timestamp() - last_access) / (24*60*60):.1f} days ago")
|
||||
|
||||
if should_delete:
|
||||
objects_to_delete.append({'Key': key})
|
||||
# Remove the access record since we're deleting the file
|
||||
if not dry_run:
|
||||
access_tracker.remove_access_record(video_id)
|
||||
|
||||
if len(objects_to_delete) == 0:
|
||||
logger.info("No files to delete")
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue