Files
slideshow/downloader.py

245 lines
7.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import io
import math
import os
import shutil
import sys
import argparse
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from dotenv import load_dotenv
from db import init_db, image_exists, insert_image
def parse_e621(post):
    """Extract ``(post_id, file_url, tags)`` from one e621 post dict.

    Returns ``None`` when the post carries no file URL. The tag lists found
    under every category of ``post['tags']`` are flattened, in order, into a
    single space-separated string. The post id is returned as ``str``.
    """
    url = post.get('file', {}).get('url')
    if url:
        # e621 groups tags by category; the category names are discarded.
        flat_tags = []
        for names in post.get('tags', {}).values():
            flat_tags.extend(names)
        return str(post['id']), url, ' '.join(flat_tags)
    return None
def parse_moebooru(post):
    """Extract ``(post_id, file_url, tags)`` from a flat post dict.

    Reads ``file_url`` and ``tags`` directly from the top level of ``post``.
    Returns ``None`` when ``file_url`` is missing or empty; ``tags`` defaults
    to an empty string. The post id is returned as ``str``.
    """
    url = post.get('file_url')
    if url:
        return str(post['id']), url, post.get('tags', '')
    return None
# Settings shared by the sites whose posts are parsed with parse_moebooru.
_MOEBOORU_DEFAULTS = {
    'endpoint': 'post.json',
    'per_page': 100,
    'threads': 20,
    'envelope': None,
    'parse': parse_moebooru,
}

# Per-site adapter settings, keyed by site name.
#   base_url + endpoint  -> URL of the post-listing API
#   per_page             -> page size sent as the ``limit`` query parameter
#   threads              -> worker count for parallel downloads
#   envelope             -> key of the JSON object holding the post list,
#                           or None when the response itself is the list
#   page_param           -> name of the paging query parameter (default 'page')
#   page_start           -> number of the first page (default 1)
#   parse                -> callable turning a raw post into (id, url, tags)
#   post_url_fmt         -> template for the human-facing post page URL
SITES = {
    'e621': {
        'base_url': 'https://e621.net/',
        'endpoint': 'posts.json',
        'per_page': 100,
        'threads': 8,
        'envelope': 'posts',
        'parse': parse_e621,
        'post_url_fmt': 'https://e621.net/posts/{post_id}',
    },
    'konachan': {
        **_MOEBOORU_DEFAULTS,
        'base_url': 'https://konachan.com/',
        'post_url_fmt': 'https://konachan.com/post/show/{post_id}',
    },
    'yandere': {
        **_MOEBOORU_DEFAULTS,
        'base_url': 'https://yande.re/',
        'post_url_fmt': 'https://yande.re/post/show/{post_id}',
    },
    'rule34': {
        **_MOEBOORU_DEFAULTS,
        'base_url': 'https://rule34.xxx/',
        'endpoint': 'index.php?page=dapi&s=post&q=index&json=1',
        'page_param': 'pid',
        'page_start': 0,
        'post_url_fmt': 'https://rule34.xxx/index.php?page=post&s=view&id={post_id}',
    },
}
def fetch_all_posts(adapter, query, auth, limit):
    """Page through a site's post listing and return up to ``limit`` posts.

    Args:
        adapter: Site settings dict. Reads ``base_url``, ``endpoint``,
            ``per_page`` and ``envelope``, plus the optional ``page_param``
            (default ``'page'``) and ``page_start`` (default ``1``).
        query: Tag string sent as the ``tags`` query parameter.
        auth: Optional dict of extra query parameters merged into every
            request, or a falsy value for none.
        limit: Maximum number of posts to return.

    Returns:
        A list of raw post objects as decoded from the JSON responses,
        truncated to ``limit`` entries.

    Raises:
        requests.HTTPError: If any page request returns an error status.
    """
    page_param = adapter.get('page_param', 'page')
    page = adapter.get('page_start', 1)
    per_page = adapter['per_page']
    envelope = adapter['envelope']
    url = adapter['base_url'] + adapter['endpoint']
    posts = []
    # The context manager releases pooled connections even when a request fails.
    with requests.Session() as session:
        session.headers['User-Agent'] = 'booru-downloader/2.0 (personal archiver)'
        while len(posts) < limit:
            params = {'tags': query, 'limit': per_page, page_param: page}
            if auth:
                params.update(auth)
            r = session.get(url, params=params, timeout=30)
            r.raise_for_status()
            # An empty body cannot be decoded as JSON; treat it as "no more
            # results" instead of letting r.json() raise.
            if not r.content.strip():
                break
            data = r.json()
            batch = data[envelope] if envelope else data
            if not batch:
                break
            posts.extend(batch)
            # A short page means the listing is exhausted.
            if len(batch) < per_page:
                break
            page += 1
    return posts[:limit]
def make_preview(src_path, previews_dir, max_bytes=1_048_576):
    """Create a preview of ``src_path`` inside ``previews_dir``.

    Files no larger than ``max_bytes`` are copied unchanged. Larger files are
    opened with Pillow, flattened onto white if they are RGBA, scaled by
    ``sqrt(max_bytes / file_size)`` in each dimension and re-encoded as JPEG.
    The JPEG quality is stepped down from 85 to 25 until the result fits in
    ``max_bytes``; if no step fits, the last (smallest-quality) attempt is
    written anyway. Files Pillow cannot open or decode are copied unchanged.

    Args:
        src_path: Path of the downloaded file.
        previews_dir: Existing directory that receives the preview.
        max_bytes: Target upper bound for the preview size in bytes.

    Returns:
        The file name (not the full path) of the preview that was written.
    """
    src_name = os.path.basename(src_path)
    file_size = os.path.getsize(src_path)

    def copy_original():
        # Fallback used for small files and for anything Pillow cannot read.
        shutil.copy2(src_path, os.path.join(previews_dir, src_name))
        return src_name

    if file_size <= max_bytes:
        return copy_original()

    # Imported only on the path that needs it, so plain copies work without it.
    from PIL import Image

    try:
        # Image.open() is lazy: pixel data is decoded on first use. Keeping the
        # resize inside this try block ensures that a truncated or corrupt file
        # falls back to a plain copy, and the `with` closes the file handle.
        with Image.open(src_path) as src:
            if src.mode == 'RGBA':
                # JPEG has no alpha channel; composite onto a white background.
                img = Image.new('RGB', src.size, (255, 255, 255))
                img.paste(src, mask=src.split()[3])
            elif src.mode not in ('RGB', 'L'):
                img = src.convert('RGB')
            else:
                img = src
            # Scaling both sides by sqrt(ratio) scales the pixel count by ratio.
            scale = math.sqrt(max_bytes / file_size)
            w, h = img.size
            img = img.resize(
                (max(1, int(w * scale)), max(1, int(h * scale))), Image.LANCZOS
            )
    except Exception:
        return copy_original()

    preview_name = os.path.splitext(src_name)[0] + '.jpg'
    dest_path = os.path.join(previews_dir, preview_name)
    for quality in range(85, 19, -10):
        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=quality, optimize=True)
        if buf.tell() <= max_bytes:
            break
    with open(dest_path, 'wb') as f:
        f.write(buf.getvalue())
    return preview_name
def download_one(post, site_name, adapter, pictures_dir, previews_dir, session):
    """Download one post's file, build its preview and record it.

    Args:
        post: Raw post object, passed to ``adapter['parse']``.
        site_name: Site key; used as the file-name prefix and in the record.
        adapter: Site settings dict; reads ``parse`` and ``post_url_fmt``.
        pictures_dir: Directory that receives the downloaded file.
        previews_dir: Directory that receives the preview.
        session: Object whose ``get(url, timeout=...)`` performs the download.

    Returns:
        ``'skip:no_url'`` when the post cannot be parsed,
        ``'skip:dup:<id>'`` when ``image_exists`` reports it as known,
        otherwise ``'ok:<filename>'`` after the file has been stored.
    """
    fields = adapter['parse'](post)
    if not fields:
        return 'skip:no_url'

    post_id, file_url, tags = fields
    if image_exists(site_name, post_id):
        return f'skip:dup:{post_id}'

    # Take the extension from the URL path, ignoring any query string.
    url_path, _, _ = file_url.partition('?')
    suffix = os.path.splitext(url_path)[1]
    filename = f'{site_name}_{post_id}{suffix}'
    dest = os.path.join(pictures_dir, filename)

    response = session.get(file_url, timeout=60)
    response.raise_for_status()
    with open(dest, 'wb') as out:
        out.write(response.content)

    post_url = adapter['post_url_fmt'].format(post_id=post_id)
    preview_filename = make_preview(dest, previews_dir)
    insert_image(
        post_id, site_name, filename, tags, file_url, post_url, preview_filename
    )
    return f'ok:{filename}'
def main():
    """Command-line entry point: scan a site for new posts and download them.

    Parses ``--site``, ``--query`` and ``--limit``, scans up to ten times
    ``--limit`` posts, keeps the first ``--limit`` that ``image_exists`` does
    not already know, and downloads them in a thread pool.

    When stdout is not a terminal, machine-readable lines are also printed:
    ``status:<msg>``, ``total:<n>``, ``progress:<done>/<total>`` per finished
    download, and a final ``done``. Failed downloads are reported on stderr.

    Raises:
        KeyError: If ``--site e621`` is used without ``E621_LOGIN`` and
            ``E621_API_KEY`` in the environment.
    """
    load_dotenv()
    parser = argparse.ArgumentParser(
        description='Download booru images to unified store'
    )
    parser.add_argument('--site', required=True, choices=SITES.keys())
    parser.add_argument('--query', required=True, help='Space-separated tags to search')
    parser.add_argument('--limit', type=int, default=100)
    args = parser.parse_args()

    init_db()
    adapter = SITES[args.site]

    auth = None
    if args.site == 'e621':
        # Credentials are mandatory for e621.
        login = os.environ.get('E621_LOGIN')
        api_key = os.environ.get('E621_API_KEY')
        if not login or not api_key:
            raise KeyError(
                'E621_LOGIN and E621_API_KEY must be set in .env — see .env.example'
            )
        auth = {'login': login, 'api_key': api_key}
    elif args.site == 'rule34':
        # Credentials are optional here; sent only when both are present.
        api_key = os.environ.get('RULE34_API_KEY')
        user_id = os.environ.get('RULE34_USER_ID')
        if api_key and user_id:
            auth = {'api_key': api_key, 'user_id': user_id}

    base_dir = os.path.dirname(os.path.abspath(__file__))
    pictures_dir = os.path.join(base_dir, 'Pictures')
    previews_dir = os.path.join(base_dir, 'Previews')
    os.makedirs(pictures_dir, exist_ok=True)
    os.makedirs(previews_dir, exist_ok=True)

    piped = not sys.stdout.isatty()

    # Scan more than requested so that already-stored posts can be skipped
    # while still collecting up to --limit new ones.
    scan_cap = args.limit * 10
    print(f'Scanning up to {scan_cap} posts from {args.site} for query: {args.query!r}')
    all_posts = fetch_all_posts(adapter, args.query, auth, scan_cap)
    total_api = len(all_posts)

    new_posts, skipped = [], 0
    for post in all_posts:
        parsed = adapter['parse'](post)
        if not parsed:
            continue
        post_id, _, _ = parsed
        if image_exists(args.site, post_id):
            skipped += 1
        else:
            new_posts.append(post)
            if len(new_posts) >= args.limit:
                break

    total = len(new_posts)
    start = skipped + 1
    end = skipped + total
    # The range needs a separator; without it 1 and 5 would read as "15".
    status_msg = f'skipped {skipped} | fetching {start}-{end} of {total_api}'
    print(f'Scan done: {status_msg}')
    if piped:
        print(f'status:{status_msg}', flush=True)
        print(f'total:{total}', flush=True)

    done = 0
    failed = 0
    # The pool is the inner context so all workers finish before the shared
    # session is closed.
    with requests.Session() as session:
        session.headers['User-Agent'] = 'booru-downloader/2.0 (personal archiver)'
        with ThreadPoolExecutor(max_workers=adapter['threads']) as pool:
            futures = {
                pool.submit(
                    download_one, p, args.site, adapter,
                    pictures_dir, previews_dir, session,
                ): p
                for p in new_posts
            }
            for future in tqdm(as_completed(futures), total=total, file=sys.stderr):
                done += 1
                # Retrieve the worker's exception so failures are not dropped
                # silently; one bad download must not abort the others.
                error = future.exception()
                if error is not None:
                    failed += 1
                    tqdm.write(
                        f'download failed for post {futures[future].get("id")}: {error!r}',
                        file=sys.stderr,
                    )
                if piped:
                    print(f'progress:{done}/{total}', flush=True)

    if failed:
        print(f'{failed} of {total} downloads failed', file=sys.stderr)
    if piped:
        print('done', flush=True)
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()