# niconico-gay/app.py
#
# 445 lines
# 18 KiB
# Python

import os
import http.cookiejar
import json
import requests
from bs4 import BeautifulSoup
from flask import Flask, Response, request, jsonify, send_file, make_response
from diskcache import Cache
from io import BytesIO
import logging
import threading
import time
import tempfile
import nndownload
import boto3
from botocore.client import Config as BotoConfig
import urllib.parse
from dotenv import load_dotenv
# Load settings from a .env file (python-dotenv) before any os.environ
# lookups further down in this module.
load_dotenv()
# Root logging configuration: INFO and above, timestamps with milliseconds.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s.%(msecs)03d - %(levelname)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S'
)
# Only let errors through from urllib3's connection-pool logger.
logging.getLogger("urllib3.connectionpool").setLevel(logging.ERROR)
# Module-level logger used by everything below.
logger = logging.getLogger(__name__)
# Flask application object; the route decorators below register on it.
app = Flask(__name__)
# Base URL of this service, used to build absolute /owoembed and /cdn links.
HOST = os.environ.get('NICONICOGAY_HOST', 'https://nicovideo.gay')
# S3 bucket, S3 region and public CDN base URL; each is None when its
# environment variable is not set.
S3_BUCKET_NAME = os.environ.get('NICONICOGAY_S3_BUCKET_NAME')
S3_REGION = os.environ.get('NICONICOGAY_S3_REGION')
CDN_BASE_URL = os.environ.get('NICONICOGAY_CDN_BASE_URL')
# Limit compared against download_tracker['active_downloads'] by download_worker.
MAX_CONCURRENT_DOWNLOADS = 3
# Expiry (seconds) for cached HTML pages and cached "video is in CDN" flags.
CACHE_EXPIRATION_HTML = 60 * 60 # 1 hour
CACHE_EXPIRATION_CDN = 60 * 60 * 24 * 7 # 1 week
# Size limit passed to the disk cache.
CACHE_SIZE_LIMIT = 100 * 1024 * 1024 # 100 MB
# Optional placeholder clip held in memory; stays None when the file cannot
# be read. The /cdn/<video_id>.mp4 route serves it while a video is not yet
# available in the CDN.
placeholder_video = None
PLACEHOLDER_VIDEO_PATH = os.getenv('NICONICOGAY_PLACEHOLDER_VIDEO', 'placeholder.mp4')
try:
    with open(PLACEHOLDER_VIDEO_PATH, mode='rb') as placeholder_file:
        placeholder_video = BytesIO(placeholder_file.read())
    logger.debug("Loaded placeholder video")
except FileNotFoundError:
    # A missing placeholder is tolerated; only a warning is logged.
    logger.warning(f"Placeholder video file '{PLACEHOLDER_VIDEO_PATH}' not found")
except Exception as load_error:
    logger.error(f"Error loading placeholder video: {load_error}")
# Disk-backed cache shared by the request handlers and the downloader.
# It stays None when NICONICOGAY_DISABLE_CACHE is exactly "1"; every user of
# `cache` checks for None first.
cache = None
if os.environ.get('NICONICOGAY_DISABLE_CACHE', '') == '1':
    logger.info("Disk cache disabled")
else:
    cache = Cache("disk_cache", size_limit=CACHE_SIZE_LIMIT)
    logger.debug("Using disk cache")
# Value of the "user_session" cookie from cookies.txt, or None when the file
# or the cookie is absent. The downloader passes it to nndownload.
user_session = None
cookie_jar = http.cookiejar.MozillaCookieJar('cookies.txt')
try:
    cookie_jar.load(ignore_discard=True, ignore_expires=True)
    # Take the first cookie named "user_session", if there is one.
    for stored_cookie in cookie_jar:
        if stored_cookie.name == 'user_session':
            user_session = stored_cookie.value
            break
except FileNotFoundError:
    logger.info("cookies.txt not found, starting with empty cookie jar")

# Shared HTTP session used for page fetches; it carries the cookie jar and a
# configurable User-Agent.
s = requests.Session()
s.headers["User-Agent"] = os.environ.get('NICONICOGAY_USER_AGENT', 'Twitterbot/1.0')
s.cookies = cookie_jar  # type: ignore
# S3 client used to upload to / probe the CDN bucket. It stays None unless
# the access key, the secret key AND the region are all set to non-empty
# values: the endpoint URL is derived from S3_REGION, so a client built
# without a region could never reach the storage service.
s3_client = None
_s3_access_key = os.environ.get('NICONICOGAY_S3_ACCESS_KEY')
_s3_secret_key = os.environ.get('NICONICOGAY_S3_SECRET_KEY')
if _s3_access_key and _s3_secret_key and S3_REGION:
    s3_session = boto3.Session()
    s3_client = s3_session.client(
        's3',
        aws_access_key_id=_s3_access_key,
        aws_secret_access_key=_s3_secret_key,
        region_name=S3_REGION,
        endpoint_url=f"https://{S3_REGION}.digitaloceanspaces.com",
        config=BotoConfig(s3={'addressing_style': 'virtual'}),
    )
elif _s3_access_key and _s3_secret_key:
    logger.warning("S3 credentials provided but NICONICOGAY_S3_REGION is not set. Videos will not be downloaded.")
else:
    logger.info("S3 credentials not provided. Videos will not be downloaded.")
# Shared download bookkeeping. The downloader thread, the worker loop and the
# request handlers all access these objects while holding download_lock.
download_tracker = {
    'active_downloads': 0,  # incremented/decremented by download_and_upload_video
    'in_progress': set(),  # video IDs currently handled by a download thread
}
download_lock = threading.Lock()
# Pending jobs as (video_id, url, video_quality) tuples: appended by the
# /watch handler, popped by download_worker.
download_queue = []
def download_and_upload_video(video_id, url, video_quality):
    """Download one video with nndownload and upload it to the CDN bucket.

    While running, the video is registered in ``download_tracker`` (active
    counter and in-progress set); both are reverted when the function ends,
    whatever the outcome.

    Args:
        video_id: ID used for logging, the cache keys and the S3 object key.
        url: Watch-page URL handed to nndownload.
        video_quality: Quality identifier passed as ``--video-quality``.

    Returns:
        True when the file was downloaded and uploaded, False on any failure
        (download error, missing output file, S3 not configured, upload error).
    """
    try:
        with download_lock:
            download_tracker['active_downloads'] += 1
            download_tracker['in_progress'].add(video_id)

        # The temporary file only reserves a unique ".mp4" path: it is removed
        # when this context exits, and nndownload writes to the path later.
        with tempfile.NamedTemporaryFile(suffix='.mp4', delete=True) as scratch_file:
            temp_path = scratch_file.name

        try:
            logger.info(f"{video_id}: Starting download")
            cli_args = [
                "--user-agent", "Googlebot/2.1",
                "--video-quality", video_quality,
                "--output-path", temp_path,
                url,
            ]
            if user_session:
                # With a session cookie available, log in with it.
                cli_args += ["--session-cookie", user_session]
            else:
                # Otherwise run anonymously.
                cli_args.insert(0, "--no-login")
            nndownload.execute(*cli_args)

            if not (os.path.exists(temp_path) and s3_client and S3_BUCKET_NAME):
                logger.error(f"{video_id}: Failed to download or S3 client not configured")
                return False

            logger.info(f"{video_id}: Downloaded, uploading to CDN")
            try:
                object_key = f"niconico/{video_id}.mp4"
                s3_client.upload_file(
                    temp_path,
                    S3_BUCKET_NAME,
                    object_key,
                    ExtraArgs={'ContentType': 'video/mp4', 'ACL': 'public-read'},
                )
                logger.info(f"{video_id}: Upload successful to CDN")
                if cache is not None:
                    cache.set(f"{video_id}_cdn", True, expire=CACHE_EXPIRATION_CDN)
                    # Drop the cached page so the next view is rebuilt from fresh data.
                    cache.delete(f"{video_id}_html")
                    logger.debug(f"{video_id}: Cleared HTML cache")
                return True
            except Exception as upload_error:
                logger.error(f"{video_id}: Error uploading to CDN: {upload_error}")
                return False
        finally:
            # Always remove the downloaded file, uploaded or not.
            if os.path.exists(temp_path):
                os.unlink(temp_path)
                logger.debug(f"Removed temporary file: {temp_path}")
    except Exception as process_error:
        logger.error(f"{video_id}: Error in download process: {process_error}")
        return False
    finally:
        with download_lock:
            download_tracker['active_downloads'] -= 1
            download_tracker['in_progress'].discard(video_id)
def download_worker():
    """Loop forever, starting queued downloads while capacity is available.

    Roughly once per second the loop checks whether the queue is non-empty
    and fewer than ``MAX_CONCURRENT_DOWNLOADS`` downloads are active. If so,
    it pops the first queued job whose video is not already in progress and
    runs ``download_and_upload_video`` for it in a new thread. At most one
    job is started per iteration. Errors are logged and followed by a
    5-second pause.
    """
    while True:
        try:
            with download_lock:
                has_capacity = download_tracker['active_downloads'] < MAX_CONCURRENT_DOWNLOADS
                has_pending = bool(download_queue)

            if has_pending and has_capacity:
                with download_lock:
                    for position, (queued_id, queued_url, queued_quality) in enumerate(download_queue):
                        if queued_id in download_tracker['in_progress']:
                            # Same video is already being handled; try the next job.
                            continue
                        download_queue.pop(position)
                        job_thread = threading.Thread(
                            target=download_and_upload_video,
                            args=(queued_id, queued_url, queued_quality),
                        )
                        job_thread.start()
                        break

            time.sleep(1)
        except Exception as worker_error:
            logger.error(f"Error in download worker: {worker_error}")
            time.sleep(5)  # Back off in case of error
# Start the queue-processing loop in a daemon thread when this module is loaded.
worker_thread = threading.Thread(target=download_worker, daemon=True)
worker_thread.start()
def is_video_in_cdn(video_id):
    """Check whether the video's MP4 already exists in the CDN bucket.

    The disk cache is consulted first; otherwise the bucket is probed with a
    ``head_object`` call. A positive probe is stored in the cache so later
    checks do not need another request.

    Args:
        video_id: Video ID; the probed object key is ``niconico/<video_id>.mp4``.

    Returns:
        True when the object is known to exist. False when it does not exist,
        when S3 is not configured, or when the probe fails for any other
        reason (that failure is logged rather than silently ignored).
    """
    if cache is not None and cache.get(f"{video_id}_cdn"):
        logger.debug(f"{video_id}: Already uploaded to CDN (cached)")
        return True
    if not s3_client or not S3_BUCKET_NAME:
        logger.warning("S3 client not configured. Cannot check if video exists in CDN.")
        return False

    # Imported locally so the file's top-level import block stays untouched;
    # botocore is already a dependency of this module.
    from botocore.exceptions import ClientError

    try:
        s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=f"niconico/{video_id}.mp4")
    except ClientError as e:
        error_code = str(e.response.get('Error', {}).get('Code', ''))
        # A missing object is the expected "not uploaded yet" case; anything
        # else (permissions, throttling, ...) deserves a log line.
        if error_code not in ('404', 'NoSuchKey', 'NotFound'):
            logger.warning(f"{video_id}: Unexpected S3 error while checking CDN: {e}")
        return False
    except Exception as e:
        # Network or configuration problems: still report "not in CDN", as
        # before, but leave a trace in the logs.
        logger.warning(f"{video_id}: Could not check CDN: {e}")
        return False

    if cache is not None:
        cache.set(f"{video_id}_cdn", True, expire=CACHE_EXPIRATION_CDN)
    return True
def is_video_being_downloaded(video_id):
    """Tell whether ``video_id`` is in the in-progress set of the download tracker."""
    with download_lock:
        currently_downloading = video_id in download_tracker['in_progress']
    return currently_downloading
def get_cdn_url(video_id):
    """Build the CDN URL of a video: ``<CDN_BASE_URL>/niconico/<video_id>.mp4``."""
    object_key = f"niconico/{video_id}.mp4"
    return f"{CDN_BASE_URL}/{object_key}"
def get_video_resolution(params):
    """Return ``(width, height)`` of the first entry in ``media.domand.videos``.

    Args:
        params: Server-response dict of a video, or None.

    Returns:
        A ``(width, height)`` tuple. ``(None, None)`` is returned when
        ``params`` is empty or the ``media -> domand -> videos`` path is
        missing, null or empty; a missing ``width``/``height`` key yields
        None for that element.
    """
    if not params:
        return None, None
    try:
        first_video = params['media']['domand']['videos'][0]
    except (KeyError, IndexError, TypeError):
        # Any node on the path may be absent or null; treat that as
        # "resolution unknown" instead of raising.
        return None, None
    if not isinstance(first_video, dict):
        return None, None
    return first_video.get('width'), first_video.get('height')
def get_video_quality(params, quality_level_threshold=3):
    """Get the ID of the best video quality strictly below a quality level.

    Args:
        params: Server-response dict of a video.
        quality_level_threshold: Only entries whose ``qualityLevel`` is
            strictly less than this value are considered.

    Returns:
        The ``id`` (as a string) of the eligible entry with the highest
        ``qualityLevel``, or None when there is no eligible entry or the
        ``media -> domand -> videos`` path is missing or null.
    """
    try:
        videos = params['media']['domand']['videos']
    except (KeyError, TypeError):
        return None

    best_level = None
    best_video = None
    for video in videos or []:
        try:
            # Convert once so filtering and ranking use the same numeric value.
            level = int(video['qualityLevel'])
            video['id']
        except (KeyError, TypeError, ValueError):
            continue  # skip malformed entries
        if level >= quality_level_threshold:
            continue
        # Strict ">" keeps the first entry on ties, as max() would.
        if best_level is None or level > best_level:
            best_level, best_video = level, video

    if best_video is None:
        return None
    return str(best_video['id'])
def get_data(video_id, real_url):
    """Fetch a watch page and extract its embedded server response.

    Args:
        video_id: Video ID, used for logging only.
        real_url: URL of the page to fetch.

    Returns:
        A ``(params, soup)`` tuple. ``params`` is the ``data.response`` object
        decoded from the ``<meta name="server-response">`` tag and ``soup`` is
        the parsed page. ``(None, None)`` is returned when the request fails;
        ``(None, soup)`` when the page has no usable server-response tag.
    """
    try:
        logger.debug(f"{video_id}: Fetching content from URL: {real_url}")
        # raise_for_status() is not called, so non-2xx pages are parsed like
        # any other response.
        r = s.get(real_url, timeout=10)
    except requests.RequestException as e:
        logger.error(f"{video_id}: Error fetching the page ('{real_url}'): {e}")
        return None, None

    soup = BeautifulSoup(r.text, "html.parser")
    try:
        server_response = soup.find("meta", {"name": "server-response"})
        if server_response:
            params = json.loads(server_response["content"])["data"]["response"]  # type: ignore
            return params, soup
    except (KeyError, TypeError, json.JSONDecodeError) as e:
        # TypeError covers JSON whose "data" node is null or not an object.
        logger.warning(f"{video_id}: Failed to parse server response metadata: {e}")
    return None, soup
def human_format(num):
    """Format a number in a human-readable way (e.g., 1K, 2M, etc.).

    The value is rounded to three significant digits and scaled by powers of
    1000 with the suffixes K, M, B and T. Values beyond the largest suffix
    keep the ``T`` suffix (e.g. ``1000T``) instead of raising.

    Args:
        num: Number to format, or None.

    Returns:
        The formatted string, or None when ``num`` is None.
    """
    if num is None:
        return None
    suffixes = ('', 'K', 'M', 'B', 'T')
    num = float('{:.3g}'.format(num))
    magnitude = 0
    # Stop at the last suffix so very large (or infinite) values can neither
    # index past the table nor loop forever.
    while abs(num) >= 1000 and magnitude < len(suffixes) - 1:
        magnitude += 1
        num /= 1000.0
    # Strip trailing zeros, then a dangling decimal point ("1.500000" -> "1.5").
    digits = '{:f}'.format(num).rstrip('0').rstrip('.')
    return '{}{}'.format(digits, suffixes[magnitude])
def get_oembed_url(params):
    """Build the /owoembed URL for a video from its server response.

    The URL's query string carries the owner's ID and nickname, the video ID
    and a statistics line (views, comments, likes, mylists).

    Args:
        params: Server-response dict of a video, or None.

    Returns:
        The URL as a string, or None when ``params`` is empty or contains no
        video ID.
    """
    if not params:
        return None

    owner = params.get('owner')
    author_id = owner.get('id') if owner else None
    author_name = owner.get('nickname') if owner else None

    video_id = params.get('video', {}).get('id')
    if not video_id:
        return None

    def formatted_count(kind):
        # Missing counters are shown as "n/a".
        return human_format(params.get('video', {}).get('count', {}).get(kind)) or "n/a"

    view_count = formatted_count('view')
    comment_count = formatted_count('comment')
    like_count = formatted_count('like')
    mylist_count = formatted_count('mylist')
    provider_stats = f"👁️ {view_count} 💬 {comment_count} ❤️ {like_count} 📝 {mylist_count}"

    query_parts = [
        f"author_id={author_id or ''}",
        f"author_name={urllib.parse.quote(author_name) if author_name else ''}",
        f"video_id={video_id}",
        f"provider={urllib.parse.quote(provider_stats)}",
    ]
    return f"{HOST}/owoembed?" + "&".join(query_parts)
@app.route("/cdn/<video_id>.mp4")
def cdn_redirect(video_id):
    """
    Redirect to the CDN copy of a video, or serve the placeholder video.

    Requests whose User-Agent does not contain "discordbot" get a 404.
    Otherwise the response is a 302 redirect to the CDN URL when the video is
    in the CDN (or when no placeholder video is loaded); in the remaining
    case the placeholder video itself is returned as video/mp4.
    """
    logger.info(f"{video_id}: CDN redirect request received")
    request_user_agent = request.headers.get('User-Agent', '').lower()
    if 'discordbot' not in request_user_agent:
        logger.info(f"{video_id}: Video CDN redirect ignored due to user agent ({request_user_agent})")
        return Response("Video not found", status=404)

    if placeholder_video is None or is_video_in_cdn(video_id):
        cdn_url = get_cdn_url(video_id)
        logger.info(f"{video_id}: Redirecting to CDN URL: {cdn_url}")
        return Response("", status=302, headers={"Location": cdn_url})

    logger.info(f"{video_id}: Video not found in CDN, returning placeholder")
    # Hand send_file a fresh BytesIO for every request. The module-level
    # buffer must not be streamed directly: its read position would be shared
    # between responses, and the response's file wrapper closes the object it
    # was given once the response is finished.
    placeholder_bytes = placeholder_video.getvalue()
    response = make_response(send_file(BytesIO(placeholder_bytes), mimetype="video/mp4"))
    response.headers['Content-Length'] = str(len(placeholder_bytes))
    return response
@app.route("/watch/<video_id>")
def proxy(video_id):
    """Serve a minimal Open Graph page for a niconico video.

    The page is built from the data returned by ``get_data`` for
    ``https://www.nicovideo.jp/watch/<video_id>``: the upstream
    ``<meta property=...>`` tags are copied, the thumbnail and video URLs are
    rewritten, and an oEmbed ``<link>`` is appended. When the request comes
    from Discordbot and the video is at most 20 minutes long, the video is
    also queued for download to the CDN.

    Args:
        video_id: Video ID taken from the URL path.

    Returns:
        A ``text/html`` response (from the disk cache when available), or a
        404 response when the video data is unavailable or the video is
        hidden/deleted.
    """
    # Local import: used to escape values interpolated into HTML attributes.
    from html import escape

    max_download_duration = 60 * 20  # 20 minutes

    logger.info(f"{video_id}: Received request")
    if cache is not None:
        logger.debug(f"{video_id}: Checking cache")
        cached_html = cache.get(f"{video_id}_html")
        if cached_html is not None:
            logger.info(f"{video_id}: Returning cached response")
            return Response(cached_html, mimetype="text/html")  # type: ignore
        logger.debug(f"{video_id}: Cache miss - fetching")

    # Not in cache or cache expired; fetch from nicovideo.jp
    real_url = f"https://www.nicovideo.jp/watch/{video_id}"
    params, soup = get_data(video_id, real_url)
    if not params or not soup:
        logger.error(f"{video_id}: Failed to fetch data")
        return Response("Video not found", status=404)

    # reasonCode may be absent or null; normalise to an upper-case string.
    reason_code = str(params.get('reasonCode') or '').upper()
    if reason_code in ['HIDDEN_VIDEO', 'ADMINISTRATOR_DELETE_VIDEO']:
        logger.warning(f"{video_id}: Video is hidden or deleted ({reason_code}) - returning 404")
        return Response("Video not found", status=404)

    # Responses without a "video" object (e.g. error payloads) cannot be
    # rendered; answer 404 instead of failing with a server error.
    video_info = params.get('video')
    if not isinstance(video_info, dict):
        logger.warning(f"{video_id}: Server response contains no video data - returning 404")
        return Response("Video not found", status=404)

    thumbnails = video_info.get("thumbnail") or {}
    thumbnail_url = (
        thumbnails.get("ogp") or
        thumbnails.get("player") or
        thumbnails.get("largeUrl") or
        thumbnails.get("middleUrl") or
        thumbnails.get("url")
    )

    download_allowed = True
    duration = video_info.get('duration')
    # An unknown duration is treated like an over-long video: no download.
    if not isinstance(duration, (int, float)) or duration > max_download_duration:
        logger.info(f"{video_id}: Video download ignored due to duration ({duration} seconds)")
        download_allowed = False
    request_user_agent = request.headers.get('User-Agent', '').lower()
    if download_allowed and 'discordbot' not in request_user_agent:
        logger.info(f"{video_id}: Video download ignored due to user agent ({request_user_agent})")
        download_allowed = False

    try:
        video_quality = get_video_quality(params)
    except (KeyError, IndexError, TypeError, ValueError):
        # Media information can be missing from the response; without it
        # there is nothing to download, but the page can still be served.
        video_quality = None

    if download_allowed and video_quality is not None:
        video_in_cdn = is_video_in_cdn(video_id)
        video_in_progress = is_video_being_downloaded(video_id)
        if not video_in_cdn and not video_in_progress and s3_client:
            with download_lock:
                # Add to queue if not already in it
                if all(item[0] != video_id for item in download_queue):
                    download_queue.append((video_id, real_url, video_quality))
                    logger.info(f"{video_id}: Queued for download")

    # With a placeholder video available, point at our own /cdn endpoint
    # (which decides between redirect and placeholder); otherwise link the
    # CDN object directly.
    cdn_video_url = f"{HOST}/cdn/{video_id}.mp4" if placeholder_video else get_cdn_url(video_id)

    og_tags = soup.find_all("meta", attrs={"property": True})
    for tag in og_tags:
        # Remove attribute(s) added by niconico
        if 'data-server' in tag.attrs:
            del tag.attrs['data-server']
        # Fix thumbnail
        if tag.get("property") == "og:image" and thumbnail_url:
            tag["content"] = thumbnail_url
        # Fix video URL
        if tag.get("property") in ("og:video:url", "og:video:secure_url"):
            tag["content"] = cdn_video_url

    # The upstream og:site_name is dropped and replaced by our own below.
    # twitter:player:* card tags are currently not emitted.
    og_tags_str = "\n".join(str(tag) for tag in og_tags if tag.get("property") not in ["og:site_name"])
    og_tags_str += '\n<meta content="ニコニコ動画" property="og:site_name"/>'

    # Only advertise oEmbed when a URL could be built, and escape everything
    # that ends up inside an attribute value (video_id comes from the
    # request path).
    oembed_url = get_oembed_url(params)
    if oembed_url:
        og_tags_str += (
            f'\n<link rel="alternate" href="{escape(oembed_url, quote=True)}" '
            f'type="application/json+oembed" title="{escape(video_id, quote=True)}"/>'
        )

    html_response = f"""<!DOCTYPE html>
<!--
niconico proxy - brought to you by https://mmaker.moe
this service is intended to be used by social media open graph embed generators and discordbot.
please do not abuse! the videos returned by the CDN are lower quality and intended to only be proxied by discord, not hotlinked.
if you want to download videos, please consider using a tool like nndownload: https://github.com/AlexAplin/nndownload
-->
<html lang="en"><head><meta charset="UTF-8">
{og_tags_str}
</head><body></body></html>"""

    if cache is not None:
        logger.info(f"{video_id}: Caching HTML response")
        cache.set(f"{video_id}_html", html_response, expire=CACHE_EXPIRATION_HTML)
    logger.info(f"{video_id}: Returning response")
    return Response(html_response, mimetype="text/html")
@app.route("/owoembed")
def owoembed():
    """
    Handles oEmbed requests with parameters in the URL
    Returns JSON payload in oEmbed format

    Query parameters read: author_id, author_name, video_id, provider
    (each defaults to an empty string).
    """
    logger.info("Received request for oEmbed endpoint")

    # Get parameters from query string. Values from request.args are already
    # percent-decoded, so they are used as-is; decoding them a second time
    # would corrupt text that legitimately contains a "%XX" sequence.
    author_id = request.args.get('author_id', '')
    author_name = request.args.get('author_name', '')
    video_id = request.args.get('video_id', '')
    provider = request.args.get('provider', '')

    # Create the author_url and provider_url. The IDs are caller-supplied,
    # so they are quoted as single path segments.
    author_url = f"https://www.nicovideo.jp/user/{urllib.parse.quote(author_id, safe='')}"
    video_url = f"https://www.nicovideo.jp/watch/{urllib.parse.quote(video_id, safe='')}"

    # Create oEmbed response
    oembed_response = {
        "author_name": author_name,
        "author_url": author_url,
        "provider_name": provider,
        "provider_url": video_url,
        "title": "Embed",
        "type": "link",
        "version": "1.0"
    }
    logger.info(f"{video_id}: Returning oEmbed response")
    return jsonify(oembed_response)