Compare commits

...

63 Commits
master ... s3

Author SHA1 Message Date
e43a67b0d5
Add another reason code 2025-03-30 00:46:24 -04:00
71fa5ad6b6
Separate HTML caches 2025-03-04 10:34:30 -05:00
abbe5c3bd9
Re-order download allowed logic 2025-03-04 10:24:26 -05:00
8089130b7e
Only log "too long" if not a Twitterbot 2025-03-04 10:23:38 -05:00
d8da55520f
Handle another reason code 2025-03-04 10:22:54 -05:00
a31d7d5a90
Add extra logging 2025-03-03 22:06:15 -05:00
df8537e811
Remove play button for videos not downloaded 2025-03-03 22:03:32 -05:00
070eed8f41
Print oEmbed response in debug 2025-03-03 13:38:29 -05:00
36c34bd4f7
Better description modification 2025-03-03 13:28:44 -05:00
3456e74afb
Extra newline in log 2025-03-03 13:17:32 -05:00
d770c7df41
Set log level via env var 2025-03-03 13:16:34 -05:00
b0b552ee82
Log HTML response 2025-03-03 13:15:19 -05:00
6a78b81084
Remove title and category from OG description 2025-03-03 13:14:12 -05:00
aa755dc186
Try to fix oEmbed 2025-03-03 13:06:22 -05:00
a075a5a7a2
Warning if no OG tags found 2025-03-03 13:01:26 -05:00
255a12fcc6
Attempt to fix Twitter card display v2 2025-03-03 12:49:17 -05:00
a07da68e4f
Revert "Attempt to fix Twitter card display"
This reverts commit 6e41e842fd0f1ebacf6001ef8125465708f080a2.
2025-03-03 12:46:25 -05:00
6e41e842fd
Attempt to fix Twitter card display 2025-03-03 12:44:40 -05:00
eccea59070
Revert experimental placeholder video
Discord seems to cache the video regardless of
what it's told.
2025-02-27 16:07:14 -05:00
753d4c691a
Try to prevent caching of placeholder 2025-02-27 16:05:39 -05:00
bca73594f5
Fix again 2025-02-27 15:56:20 -05:00
7cbc5f84c9
Fix placeholder video return 2025-02-27 15:53:05 -05:00
9905d91479
Fix for Discord user agent for files 2025-02-27 15:40:34 -05:00
2ade81b3be
oops 2025-02-27 15:35:38 -05:00
86b490bab1
fix 2025-02-27 15:33:07 -05:00
5d5588f4f5
Experimental placeholder video functionality 2025-02-27 15:15:11 -05:00
d8ffe43857
Clearer log 2025-02-27 13:18:30 -05:00
dd95661352
Handle deleted videos 2025-02-27 13:14:09 -05:00
19befc9eb5
Refactor out allow check 2025-02-27 12:52:37 -05:00
96326f543f
Better log phrasing 2025-02-27 12:39:38 -05:00
c6d53e0c1c
Opposite 2025-02-27 12:08:35 -05:00
0b8f0dc1b9
Try to ignore connection pool warnings 2025-02-27 12:03:17 -05:00
e6d7278624
Ignore hidden videos 2025-02-27 11:53:51 -05:00
2ca6d6aa73
Better cache control, linter cleanup 2025-02-27 11:38:48 -05:00
cc21a2322e
Granular time logging 2025-02-27 11:32:18 -05:00
c7a2ae2b6e
nit 2025-02-27 11:31:33 -05:00
c120d9ba92
Tweak response logging level 2025-02-27 11:30:58 -05:00
419dd19faa
Clean up logging levels 2025-02-27 11:30:12 -05:00
4ac1fba240
Found the real cache issue :^) 2025-02-27 11:24:39 -05:00
c3ceb007f3
Found the issue :^) 2025-02-27 11:19:38 -05:00
e2d6cabed5
More logging 2025-02-27 11:18:24 -05:00
246de3e29d
Add diskcache logging 2025-02-27 11:16:14 -05:00
6e95c1dd52
Add some logs 2025-02-27 11:13:50 -05:00
1963ba53d9
Cache video CDN status 2025-02-27 11:12:42 -05:00
8f222ff957
Cleaner log format 2025-02-27 11:01:48 -05:00
ae803c0fe0
Make S3 optional 2025-02-27 10:54:51 -05:00
aa836a4f55
oops 2025-02-27 10:48:46 -05:00
ac86c5f5ee
Only download video if request from Discord 2025-02-27 10:29:36 -05:00
fe5c547055
Pass in user session for nndownload 2025-02-27 07:43:10 -05:00
1ce10dfae4
Remove raise for status check temporarily 2025-02-27 06:58:15 -05:00
1802eeffe3
Add cleanup script 2025-02-26 13:36:36 -05:00
71b7dac492
Remove tags that might be breaking Twitter 2025-02-26 08:45:20 -05:00
5a194507a0
Get correct video dimensions 2025-02-26 00:31:51 -05:00
77f9545db6
Fixes 2025-02-25 18:21:29 -05:00
c456200ae0
Handle unknown videos 2025-02-25 18:12:13 -05:00
8ebacc84b0
Try to fix site name again 2025-02-25 18:10:12 -05:00
b3539d7a47
Format numbers 2025-02-25 18:08:15 -05:00
3201aea856
Change mylist emoji 2025-02-25 18:06:46 -05:00
be23df3591
Meta tag cleanup
Maybe fixes site title not displaying?
2025-02-25 18:04:59 -05:00
e48159ce14
owoembed
(oEmbed)
2025-02-25 17:56:06 -05:00
e9eb88c13f
Add more meta tags 2025-02-25 17:23:50 -05:00
add8f1bbde
Load .env 2025-02-25 16:28:12 -05:00
e532c45a92
Add video proxying support
Requires S3 setup
2025-02-25 16:23:31 -05:00
4 changed files with 500 additions and 46 deletions

1
.gitignore vendored
View File

@@ -2,3 +2,4 @@ venv
.venv
__pycache__
cookies.txt
.env

452
app.py
View File

@@ -1,90 +1,452 @@
# NOTE(review): this span is a unified-diff rendering that lost its +/- markers,
# so removed and replacement lines appear side by side (duplicate flask import,
# two logging.basicConfig calls, two cache initialisations, two cookie-warning
# lines, two "User-Agent" entries). Reconcile against the actual repository
# before treating this as runnable code.
import os
import http.cookiejar
import json
import re
import requests
from bs4 import BeautifulSoup
from flask import Flask, Response
from flask import Flask, Response, request, jsonify
from diskcache import Cache
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
import threading
import time
import tempfile
import nndownload
import boto3
from botocore.client import Config as BotoConfig
import urllib.parse
from dotenv import load_dotenv
# Load .env before any NICONICOGAY_* configuration is read below
load_dotenv()
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Silence noisy urllib3 connection-pool warnings
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
logger = logging.getLogger(__name__)
# App log level configurable via env var (e.g. NICONICOGAY_LOG=DEBUG)
logger.setLevel(os.environ.get('NICONICOGAY_LOG', 'INFO').upper())
app = Flask(__name__)
CACHE_EXPIRATION_SECONDS = 3600 # 1 hour
HOST = os.environ.get('NICONICOGAY_HOST', 'https://nicovideo.gay')
S3_BUCKET_NAME = os.environ.get('NICONICOGAY_S3_BUCKET_NAME')
S3_REGION = os.environ.get('NICONICOGAY_S3_REGION')
CDN_BASE_URL = os.environ.get('NICONICOGAY_CDN_BASE_URL')
MAX_CONCURRENT_DOWNLOADS = 3
CACHE_EXPIRATION_HTML = 60 * 60 # 1 hour
CACHE_EXPIRATION_CDN = 60 * 60 * 24 * 7 # 1 week
CACHE_SIZE_LIMIT = 100 * 1024 * 1024 # 100 MB
# NOTE(review): diff residue — the one-line cache init below was replaced by
# the if/else that follows it.
cache = None if os.environ.get('NICONICOGAY_DISABLE_CACHE', '') != '' else Cache("disk_cache", size_limit=CACHE_SIZE_LIMIT)
cache = None
if os.environ.get('NICONICOGAY_DISABLE_CACHE', '') != '1':
    cache = Cache("disk_cache", size_limit=CACHE_SIZE_LIMIT)
    logger.debug("Using disk cache")
else:
    logger.info("Disk cache disabled")
user_session = None
cookie_jar = http.cookiejar.MozillaCookieJar('cookies.txt')
try:
    cookie_jar.load(ignore_discard=True, ignore_expires=True)
    # Pull the niconico login token out of the cookie jar, if present
    user_session = next((cookie.value for cookie in cookie_jar if cookie.name == 'user_session'), None)
except FileNotFoundError:
    # NOTE(review): diff residue — old warning() line kept beside new info() line
    logger.warning("cookies.txt not found, starting with empty cookie jar")
    logger.info("cookies.txt not found, starting with empty cookie jar")
s = requests.Session()
s.headers.update({
    # NOTE(review): diff residue — hard-coded UA replaced by env-configurable one
    "User-Agent": "Twitterbot/1.0"
    "User-Agent": os.environ.get('NICONICOGAY_USER_AGENT', 'Twitterbot/1.0')
})
s.cookies = cookie_jar # type: ignore
# NOTE(review): diff residue — the @app.route/def proxy header below is the
# REMOVED handler (its replacement appears later in the file); the S3 client
# setup and download-queue state that follow are module-level in the new code.
@app.route("/watch/<video_id>")
def proxy(video_id):
    logger.info(f"Received request for video ID: {video_id}")
# --- module-level S3 client setup (new revision) ---
s3_client = None
# Only build an S3 client when both credentials are present in the environment
if all(key in os.environ for key in [
    'NICONICOGAY_S3_ACCESS_KEY',
    'NICONICOGAY_S3_SECRET_KEY',
]):
    s3_session = boto3.Session()
    s3_client = s3_session.client(
        's3',
        aws_access_key_id=os.environ['NICONICOGAY_S3_ACCESS_KEY'],
        aws_secret_access_key=os.environ['NICONICOGAY_S3_SECRET_KEY'],
        region_name=S3_REGION,
        # DigitalOcean Spaces S3-compatible endpoint; virtual-host addressing
        endpoint_url=f"https://{S3_REGION}.digitaloceanspaces.com",
        config=BotoConfig(s3={'addressing_style': 'virtual'}),
    )
else:
    logger.info("S3 credentials not provided. Videos will not be downloaded.")
# NOTE(review): diff residue — this cache lookup is the removed handler's body
# (`video_id` is not defined at module scope).
if cache:
    cached_html = cache.get(video_id)
    if cached_html is not None:
        logger.info(f"Using cached response for video ID: {video_id}")
        return Response(cached_html, mimetype="text/html") # type: ignore
# Shared state for the background download worker
download_tracker = {
    'active_downloads': 0,  # downloads currently running
    'in_progress': set(),   # video IDs currently being downloaded
}
download_lock = threading.Lock()
download_queue = []
# NOTE(review): diff residue — removed lines from the old handler body.
# Not in cache or cache expired; fetch from nicovideo.jp
real_url = f"https://www.nicovideo.jp/watch/{video_id}"
def download_and_upload_video(video_id, url, video_quality):
    """Download a niconico video with nndownload and upload it to the S3 CDN.

    Runs on a worker thread spawned by download_worker. Tracks itself in
    download_tracker for the duration of the call.

    Returns True on a successful upload, False otherwise.
    """
    try:
        # NOTE(review): diff residue — `real_url` is not defined in this scope;
        # this log line belongs to the removed version of the code.
        logger.info(f"Fetching content from URL: {real_url}")
        with download_lock:
            download_tracker['active_downloads'] += 1
            download_tracker['in_progress'].add(video_id)
        # delete=True: the OS-level handle cleans up the file if we crash hard
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=True) as temp_file:
            temp_path = temp_file.name
            try:
                logger.info(f"{video_id}: Starting download")
                nndownload_args = [
                    "--no-login",
                    "--user-agent", "Googlebot/2.1",
                    "--video-quality", video_quality,
                    "--output-path", temp_path,
                    url
                ]
                if user_session:
                    nndownload_args += ["--session-cookie", user_session]
                    # Drop "--no-login" (first element) when a session cookie
                    # is available, so the download runs authenticated.
                    nndownload_args = nndownload_args[1:]
                nndownload.execute(*nndownload_args)
                if os.path.exists(temp_path) and s3_client and S3_BUCKET_NAME:
                    logger.info(f"{video_id}: Downloaded, uploading to CDN")
                    try:
                        s3_key = f"niconico/{video_id}.mp4"
                        s3_client.upload_file(
                            temp_path,
                            S3_BUCKET_NAME,
                            s3_key,
                            # public-read: the CDN serves this object directly
                            ExtraArgs={'ContentType': 'video/mp4', 'ACL': 'public-read'}
                        )
                        logger.info(f"{video_id}: Upload successful to CDN")
                        if cache is not None:
                            cache.set(f"{video_id}_cdn", True, expire=CACHE_EXPIRATION_CDN)
                            # Clear HTML cache for this video to ensure next view gets updated HTML
                            cache.delete(f"{video_id}_html")
                            logger.debug(f"{video_id}: Cleared HTML cache")
                        return True
                    except Exception as e:
                        logger.error(f"{video_id}: Error uploading to CDN: {e}")
                        return False
                else:
                    logger.error(f"{video_id}: Failed to download or S3 client not configured")
                    return False
            finally:
                # Belt-and-braces removal; NamedTemporaryFile also deletes on close
                if os.path.exists(temp_path):
                    os.unlink(temp_path)
                    logger.debug(f"Removed temporary file: {temp_path}")
    except Exception as e:
        logger.error(f"{video_id}: Error in download process: {e}")
        return False
    finally:
        # Always release this video's slot, success or failure
        with download_lock:
            download_tracker['active_downloads'] -= 1
            download_tracker['in_progress'].discard(video_id)
def download_worker():
    """Background loop that launches queued video downloads.

    Polls `download_queue` once a second; whenever fewer than
    MAX_CONCURRENT_DOWNLOADS downloads are active, starts a thread for the
    first queued video not already in progress. Runs forever; unexpected
    errors are logged and retried after a back-off.
    """
    while True:
        try:
            # Bug fix: the original released download_lock between the
            # capacity check and the queue pop, so two iterations (or a
            # concurrent enqueue) could race. Hold the lock across the whole
            # check-then-pop sequence instead.
            with download_lock:
                if (download_tracker['active_downloads'] < MAX_CONCURRENT_DOWNLOADS
                        and download_queue):
                    # Get next video that is not already being downloaded
                    for i, (queued_id, _, _) in enumerate(download_queue):
                        if queued_id not in download_tracker['in_progress']:
                            video_info = download_queue.pop(i)
                            threading.Thread(target=download_and_upload_video,
                                             args=(video_info[0], video_info[1], video_info[2])).start()
                            break
            time.sleep(1)
        except Exception as e:
            logger.error(f"Error in download worker: {e}")
            time.sleep(5)  # Back off in case of error
worker_thread = threading.Thread(target=download_worker, daemon=True)
worker_thread.start()
def is_video_in_cdn(video_id):
    """Return True when the video is already available on the CDN.

    Checks the disk cache first; on a miss, issues an S3 HEAD request.
    """
    cached_flag = cache.get(f"{video_id}_cdn") if cache is not None else None
    if cached_flag:
        logger.debug(f"{video_id}: Already uploaded to CDN (cached)")
        return True
    if s3_client and S3_BUCKET_NAME:
        try:
            s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=f"niconico/{video_id}.mp4")
        except Exception:
            return False
        return True
    logger.warning("S3 client not configured. Cannot check if video exists in CDN.")
    return False
def is_video_being_downloaded(video_id):
    """Return True while a worker thread is actively downloading this video."""
    with download_lock:
        active_ids = download_tracker['in_progress']
        return video_id in active_ids
def get_cdn_url(video_id):
    """Build the public CDN URL for a downloaded video."""
    return "{}/niconico/{}.mp4".format(CDN_BASE_URL, video_id)
def get_video_resolution(params):
    """Return (width, height) of the first domand video, or (None, None)
    when no server-response params are available."""
    if not params:
        return None, None
    first_video = params['media']['domand']['videos'][0]
    return first_video['width'], first_video['height']
def get_video_quality(params, quality_level_threshold=3):
    """Get the code of the best video quality available (optionally below a certain threshold)"""
    all_videos = params['media']['domand']['videos']
    candidates = [v for v in all_videos if v['qualityLevel'] < quality_level_threshold]
    if not candidates:
        return None
    # Pick the first candidate with the strictly highest quality level
    # (same tie-breaking as max()).
    best = None
    for candidate in candidates:
        if best is None or int(candidate['qualityLevel']) > int(best['qualityLevel']):
            best = candidate
    return str(best['id'])
def get_data(video_id, real_url):
    """Get the server response for a given video ID"""
    # Returns a (params, soup) pair: `params` is the parsed "server-response"
    # JSON payload (or None when missing/unparseable) and `soup` is the parsed
    # HTML (or None on request failure).
    try:
        logger.debug(f"{video_id}: Fetching content from URL: {real_url}")
        r = s.get(real_url, timeout=10)
        # r.raise_for_status()
    except requests.RequestException as e:
        # NOTE(review): diff residue — the first two lines below are the
        # REMOVED error path (returning a Flask Response breaks the (params,
        # soup) contract); the next two are its replacement.
        logger.error(f"Error fetching the page for video ID '{video_id}': {e}")
        return Response(status=500)
        logger.error(f"{video_id}: Error fetching the page ('{real_url}'): {e}")
        return None, None
    soup = BeautifulSoup(r.text, "html.parser")
    thumbnail_url = None
    try:
        # niconico embeds its API payload in a <meta name="server-response"> tag
        server_response = soup.find("meta", {"name": "server-response"})
        if server_response:
            params = json.loads(server_response["content"])["data"]["response"] # type: ignore
            # Prefer the largest thumbnail variant available
            thumbnail_url = (
                params["video"]["thumbnail"].get("ogp") or
                params["video"]["thumbnail"].get("player") or
                params["video"]["thumbnail"].get("largeUrl") or
                params["video"]["thumbnail"].get("middleUrl") or
                params["video"]["thumbnail"].get("url")
            )
            return params, soup
    except (KeyError, json.JSONDecodeError) as e:
        # NOTE(review): diff residue — old and new warning lines both present.
        logger.warning(f"Failed to extract thumbnail info for video ID '{video_id}': {e}")
        logger.warning(f"{video_id}: Failed to extract thumbnail info: {e}")
        pass
    # NOTE(review): diff residue — this og_tags lookup belongs to the removed
    # version; the current caller extracts OG tags itself.
    og_tags = soup.find_all("meta", property=lambda x: x) # type: ignore
    return None, soup
def human_format(num):
    """Format a number in a human-readable way (e.g., 1K, 2M, etc.).

    Returns None when `num` is None. The value is rounded to three
    significant figures and suffixed with K/M/B/T.

    Bug fix: values of 1000T (1e15) and above are clamped to the 'T'
    suffix instead of raising IndexError when the magnitude ran past the
    suffix list.
    """
    if num is None:
        return None
    suffixes = ['', 'K', 'M', 'B', 'T']
    num = float('{:.3g}'.format(num))
    magnitude = 0
    # Stop at the largest suffix so huge values cannot index past the list
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    return '{}{}'.format('{:f}'.format(num).rstrip('0').rstrip('.'), suffixes[magnitude])
def get_oembed_url(params):
    """Get the oEmbed (/owoembed) URL based on the given params (server response)"""
    if not params:
        return None
    owner = params.get('owner') or {}
    author_id = owner.get('id')
    author_name = owner.get('nickname')
    video = params.get('video', {})
    video_id = video.get('id')
    if not video_id:
        return None
    counts = video.get('count', {})
    # Each counter falls back to "n/a" when absent
    view_count = human_format(counts.get('view')) or "n/a"
    comment_count = human_format(counts.get('comment')) or "n/a"
    like_count = human_format(counts.get('like')) or "n/a"
    mylist_count = human_format(counts.get('mylist')) or "n/a"
    stats_line = f"👁️ {view_count} 💬 {comment_count} ❤️ {like_count} 📝 {mylist_count}"
    encoded_author = urllib.parse.quote(author_name) if author_name else ""
    encoded_stats = urllib.parse.quote(stats_line)
    return (
        f"{HOST}/owoembed?"
        f"author_id={author_id if author_id else ''}&"
        f"author_name={encoded_author}&"
        f"video_id={video_id}&"
        f"provider={encoded_stats}"
    )
@app.route("/watch/<video_id>")
def proxy(video_id):
    """Serve a minimal HTML page carrying Open Graph / Twitter meta tags for
    a niconico video, tailored to the requesting bot and cached per
    user-agent class (generic / Twitterbot / Discordbot)."""
    logger.info(f"{video_id}: Received request")
    # Each bot class gets its own cached HTML variant
    cache_html_suffix = "_html"
    request_user_agent = request.headers.get('User-Agent', '').lower()
    if 'twitterbot' in request_user_agent:
        cache_html_suffix = "_html_twitterbot"
    elif 'discordbot' in request_user_agent:
        cache_html_suffix = "_html_discordbot"
    if cache is not None:
        logger.debug(f"{video_id}: Checking cache")
        cached_html = cache.get(f"{video_id}{cache_html_suffix}")
        if cached_html is not None:
            logger.info(f"{video_id}: Returning cached response")
            return Response(cached_html, mimetype="text/html") # type: ignore
        logger.debug(f"{video_id}: Cache miss - fetching")
    # Not in cache or cache expired; fetch from nicovideo.jp
    real_url = f"https://www.nicovideo.jp/watch/{video_id}"
    params, soup = get_data(video_id, real_url)
    if not params or not soup:
        logger.error(f"{video_id}: Failed to fetch data")
        return Response("Video not found", status=404)
    # niconico reports hidden/deleted videos via reasonCode
    reason_code = params.get('reasonCode', '').upper()
    if reason_code in ['HIDDEN_VIDEO', 'ADMINISTRATOR_DELETE_VIDEO', 'RIGHT_HOLDER_DELETE_VIDEO', 'DELETED_VIDEO']:
        logger.warning(f"{video_id}: Video is hidden or deleted ({reason_code}) - returning 404")
        return Response("Video not found", status=404)
    # Prefer the largest thumbnail variant available
    thumbnail_url = (
        params["video"]["thumbnail"].get("ogp") or
        params["video"]["thumbnail"].get("player") or
        params["video"]["thumbnail"].get("largeUrl") or
        params["video"]["thumbnail"].get("middleUrl") or
        params["video"]["thumbnail"].get("url")
    ) if params else None
    video_width, video_height = get_video_resolution(params) if params else (None, None)
    download_allowed = True
    # NOTE(review): download_allowed is always True at this point, so the
    # first conjunct is redundant — only Discordbot requests allow a download.
    if download_allowed and 'discordbot' not in request_user_agent:
        logger.info(f"{video_id}: Video download ignored due to user agent ({request_user_agent})")
        download_allowed = False
    if params['video']['duration'] > 60 * 20: # 20 minutes
        logger.info(f"{video_id}: Video download ignored due to duration ({params['video']['duration']} seconds)")
        download_allowed = False
    video_quality = get_video_quality(params) if params else None
    if download_allowed and video_quality is not None:
        video_in_cdn = is_video_in_cdn(video_id)
        video_in_progress = is_video_being_downloaded(video_id)
        if not video_in_cdn and not video_in_progress and s3_client:
            with download_lock:
                # Add to queue if not already in it
                queue_video_ids = [item[0] for item in download_queue]
                if video_id not in queue_video_ids:
                    download_queue.append((video_id, real_url, video_quality))
                    logger.info(f"{video_id}: Queued for download")
    cdn_video_url = get_cdn_url(video_id)
    og_tags = soup.find_all("meta", attrs={"property": True})
    if len(og_tags) == 0:
        logger.warning(f"{video_id}: No Open Graph tags found")
    og_title = None
    og_description = None
    og_category = None
    for tag in og_tags:
        # Remove attribute(s) added by niconico
        if 'data-server' in tag.attrs:
            del tag.attrs['data-server']
        # Set title
        if tag.get("property") == "og:title":
            og_title = tag["content"]
        # Set description
        if tag.get("property") == "og:description":
            og_description = tag["content"]
            if og_description and og_title:
                # The description is formatted like "Title [Category] Description"
                # Extract category (just in case this is useful later), and keep only the description part.
                match = re.search(rf"^{re.escape(og_title)}(\s+\[(.*?)\])?\s+(.*)", og_description)
                if match:
                    og_category = match.group(2) if match.group(2) else None
                    og_description = match.group(3)
                    tag["content"] = og_description
        # Fix thumbnail
        if tag.get("property") == "og:image" and thumbnail_url:
            tag["content"] = thumbnail_url
        # Fix video URL
        if tag.get("property") == "og:video:url" or tag.get("property") == "og:video:secure_url":
            tag["content"] = cdn_video_url
    # NOTE(review): diff residue — the next assignment and the html_response
    # f-string below are the REMOVED versions; the current ones follow.
    og_tags_str = "\n".join(str(tag) for tag in og_tags)
    html_response = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
{og_tags_str}
</head>
<body>
</body>
</html>
"""
    # Current tag assembly: drop the original og:site_name, re-add a fixed one,
    # and advertise the oEmbed endpoint.
    og_tags_str = "\n".join(str(tag) for tag in og_tags if tag.get("property") not in ["og:site_name"])
    og_tags_str += '\n<meta content="ニコニコ動画" property="og:site_name"/>'
    og_tags_str += f'\n<link rel="alternate" href="{get_oembed_url(params)}" type="application/json+oembed" title="{video_id}"/>'
    # NOTE(review): diff residue — removed cache write (old key and old TTL).
    if cache:
        logging.info(f"Caching response for video ID: {video_id}")
        cache.set(video_id, html_response, expire=CACHE_EXPIRATION_SECONDS)
    # Discord seems to ignore video URLs when Twitter meta tags are present,
    # so in addition to including these when the User Agent is a Twitterbot,
    # we also include them when the video is too long to download in order to remove the play button.
    if 'twitterbot' in request_user_agent or not download_allowed:
        if 'twitterbot' in request_user_agent:
            logger.info(f"{video_id}: Twitterbot detected - adding Twitter tags")
        elif not download_allowed:
            logger.info(f"{video_id}: Video too long to download - will not show play button")
        og_tags_str += f'\n<meta content="{thumbnail_url}" property="twitter:image"/>'
        og_tags_str += '\n<meta content="summary_large_image" property="twitter:card"/>'
        og_tags_str += '\n<meta content="www.nicovideo.gay" name="twitter:domain"/>'
        og_tags_str += f'\n<meta content="{request.url}" name="twitter:url"/>'
        if og_title:
            og_tags_str += f'\n<meta content="{og_title}" name="twitter:title"/>'
        if og_description:
            og_tags_str += f'\n<meta content="{og_description}" name="twitter:description"/>'
        # og_tags_str += '\n<meta content="video/mp4" property="twitter:player:stream:content_type"/>'
        # og_tags_str += f'\n<meta content="{cdn_video_url}" property="twitter:player:stream"/>'
        # if video_width:
        #     og_tags_str += f'\n<meta content="{video_width}" property="twitter:player:width"/>'
        # if video_height:
        #     og_tags_str += f'\n<meta content="{video_height}" property="twitter:player:height"/>'
    html_response = f"""<!DOCTYPE html>
<!--
niconico proxy - brought to you by https://mmaker.moe
this service is intended to be used by social media open graph embed generators and discordbot.
please do not abuse! the videos returned by the CDN are lower quality and intended to only be proxied by discord, not hotlinked.
if you want to download videos, please consider using a tool like nndownload: https://github.com/AlexAplin/nndownload
-->
<html lang="en"><head><meta charset="UTF-8">
{og_tags_str}
</head><body></body></html>"""
    if cache is not None:
        logger.info(f"{video_id}: Caching HTML response")
        cache.set(f"{video_id}{cache_html_suffix}", html_response, expire=CACHE_EXPIRATION_HTML)
    logger.info(f"{video_id}: Returning response")
    logger.debug(f"{video_id}: HTML response:\n----------\n{html_response}\n----------")
    return Response(html_response, mimetype="text/html")
@app.route("/owoembed")
def owoembed():
    """
    Handles oEmbed requests with parameters in the URL
    Returns JSON payload in oEmbed format
    """
    logger.info("Received request for oEmbed endpoint")
    # Pull parameters off the query string, decoding the percent-encoded ones
    query = request.args
    author_id = query.get('author_id', '')
    video_id = query.get('video_id', '')
    author_name_decoded = urllib.parse.unquote(query.get('author_name', ''))
    provider_decoded = urllib.parse.unquote(query.get('provider', ''))
    # Assemble the oEmbed payload; author links to the uploader, provider
    # links back to the original video page.
    oembed_response = {
        "author_name": author_name_decoded,
        "author_url": f"https://www.nicovideo.jp/user/{author_id}",
        "provider_name": provider_decoded,
        "provider_url": f"https://www.nicovideo.jp/watch/{video_id}",
        "title": "Embed",
        "type": "link",
        "version": "1.0"
    }
    logger.info(f"{video_id}: Returning oEmbed response")
    logger.debug(f"{video_id}: oEmbed response:\n----------\n{json.dumps(oembed_response, indent=2)}\n----------")
    return jsonify(oembed_response)

87
clean.py Normal file
View File

@@ -0,0 +1,87 @@
import os
import sys
import datetime
import argparse
import logging
import boto3
from botocore.client import Config as BotoConfig
from dotenv import load_dotenv
# Root logger configuration for the cleanup script: timestamped INFO output.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
logger = logging.getLogger(__name__)
def cleanup_old_files(dry_run=False, days=7, directory_prefix="niconico"):
    """Delete S3 objects under `directory_prefix/` older than `days` days.

    Args:
        dry_run: When True, only report what would be deleted.
        days: Age threshold in days, compared against S3 LastModified.
        directory_prefix: Key prefix (without trailing slash) to scan.

    Exits the process with status 1 on missing configuration or S3 errors.
    """
    required_env_vars = [
        'NICONICOGAY_S3_ACCESS_KEY',
        'NICONICOGAY_S3_SECRET_KEY',
        'NICONICOGAY_S3_BUCKET_NAME',
        'NICONICOGAY_S3_REGION'
    ]
    missing_vars = [var for var in required_env_vars if not os.environ.get(var)]
    if missing_vars:
        logger.error(f"Missing required environment variables: {', '.join(missing_vars)}")
        sys.exit(1)
    try:
        s3_session = boto3.Session()
        s3_client = s3_session.client(
            's3',
            aws_access_key_id=os.environ['NICONICOGAY_S3_ACCESS_KEY'],
            aws_secret_access_key=os.environ['NICONICOGAY_S3_SECRET_KEY'],
            region_name=os.environ['NICONICOGAY_S3_REGION'],
            # DigitalOcean Spaces S3-compatible endpoint
            endpoint_url=f"https://{os.environ['NICONICOGAY_S3_REGION']}.digitaloceanspaces.com",
            config=BotoConfig(s3={'addressing_style': 'virtual'}),
        )
        bucket_name = os.environ['NICONICOGAY_S3_BUCKET_NAME']
        cutoff_date = datetime.datetime.now(datetime.timezone.utc) - datetime.timedelta(days=days)
        paginator = s3_client.get_paginator('list_objects_v2')
        page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=f"{directory_prefix}/")
        total_files = 0
        objects_to_delete = []
        for page in page_iterator:
            # Empty pages carry no 'Contents' key
            for obj in page.get('Contents', []):
                total_files += 1
                if obj['LastModified'] < cutoff_date:  # type: ignore
                    objects_to_delete.append({'Key': obj['Key']})  # type: ignore
        if len(objects_to_delete) == 0:
            logger.info("No files to delete")
            return
        if dry_run:
            logger.info(f"DRY RUN: Would delete {len(objects_to_delete)} out of {total_files} files")
        else:
            deleted = 0
            # Delete files in batches of 1000 (the DeleteObjects per-request limit)
            for i in range(0, len(objects_to_delete), 1000):
                batch = objects_to_delete[i:i+1000]
                response = s3_client.delete_objects(
                    Bucket=bucket_name,
                    Delete={'Objects': batch}
                )
                deleted += len(batch)
                # Bug fix: DeleteObjects can partially fail; the original
                # ignored the response's Errors list and always reported
                # full success.
                for err in response.get('Errors', []):
                    deleted -= 1
                    logger.error(f"Failed to delete {err.get('Key')}: {err.get('Message')}")
            logger.info(f"Successfully deleted {deleted} out of {total_files} files")
    except Exception as e:
        logger.error(f"Error: {e}")
        sys.exit(1)
if __name__ == "__main__":
    load_dotenv()
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--dry-run",
        action="store_true",
        help="Show what would be deleted without actually deleting anything"
    )
    # Generalization: expose cleanup_old_files' existing knobs on the CLI.
    # Defaults match the function's own defaults, so behavior is unchanged
    # for existing invocations.
    parser.add_argument(
        "--days",
        type=int,
        default=7,
        help="Delete files older than this many days (default: 7)"
    )
    parser.add_argument(
        "--prefix",
        default="niconico",
        help="S3 key prefix to clean up (default: niconico)"
    )
    args = parser.parse_args()
    cleanup_old_files(dry_run=args.dry_run, days=args.days, directory_prefix=args.prefix)

View File

@@ -1,3 +1,7 @@
beautifulsoup4==4.12.3
Flask==3.1.0
Requests==2.32.3
diskcache==5.6.3
nndownload==1.19
boto3
python-dotenv