"""Download booru images into a local picture store with size-capped previews.

Posts are listed through each site's JSON API, posts already recorded in the
database are skipped, and the remaining files are downloaded in parallel.
"""

import argparse
import io
import math
import os
import shutil
import sys
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime, timezone

import requests
from dotenv import load_dotenv
from tqdm import tqdm

from db import init_db, image_exists, insert_image

# Sent with every API and file request.
USER_AGENT = 'booru-downloader/2.0 (personal archiver)'


def parse_e621(post):
    """Extract ``(post_id, file_url, tags, created_at)`` from an e621 post.

    Returns ``None`` when the post carries no file URL. ``tags`` is every tag
    of every category joined with single spaces. ``created_at`` is a Unix
    timestamp, or ``0`` when the field is missing or cannot be parsed.
    """
    file_url = post.get('file', {}).get('url')
    if not file_url:
        return None
    tags = ' '.join(t for cat in post.get('tags', {}).values() for t in cat)
    try:
        created_at = int(datetime.fromisoformat(post['created_at']).timestamp())
    except (KeyError, TypeError, ValueError, OverflowError, OSError):
        # Best effort: a missing or odd timestamp must not drop the post.
        created_at = 0
    return str(post['id']), file_url, tags, created_at


def parse_moebooru(post):
    """Extract ``(post_id, file_url, tags, created_at)`` from a flat post dict.

    Returns ``None`` when the post has no ``file_url``. ``created_at`` is read
    from ``created_at`` and falls back to ``change``; it becomes ``0`` when
    neither converts to an integer.
    """
    file_url = post.get('file_url')
    if not file_url:
        return None
    try:
        created_at = int(post.get('created_at') or post.get('change', 0))
    except (TypeError, ValueError, OverflowError):
        created_at = 0
    return str(post['id']), file_url, post.get('tags', ''), created_at


# Per-site adapter settings.
#   base_url + endpoint  listing URL requested by fetch_all_posts
#   per_page             page size requested; a shorter page ends pagination
#   threads              worker count for parallel downloads
#   envelope             key wrapping the post list, or None for a bare list
#   page_param           name of the paging query parameter (default 'page')
#   page_start           first page index (default 1)
#   parse                function turning a raw post into a 4-tuple or None
#   post_url_fmt         template for the human-facing post URL
SITES = {
    'e621': {
        'base_url': 'https://e621.net/',
        'endpoint': 'posts.json',
        'per_page': 100,
        'threads': 8,
        'envelope': 'posts',
        'parse': parse_e621,
        'post_url_fmt': 'https://e621.net/posts/{post_id}',
    },
    'konachan': {
        'base_url': 'https://konachan.com/',
        'endpoint': 'post.json',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://konachan.com/post/show/{post_id}',
    },
    'yandere': {
        'base_url': 'https://yande.re/',
        'endpoint': 'post.json',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://yande.re/post/show/{post_id}',
    },
    'rule34': {
        'base_url': 'https://api.rule34.xxx/',
        'endpoint': 'index.php?page=dapi&s=post&q=index&json=1',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        'page_param': 'pid',
        'page_start': 0,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://rule34.xxx/index.php?page=post&s=view&id={post_id}',
    },
}


def fetch_all_posts(adapter, query, auth, limit):
    """Page through a site's listing API and return at most ``limit`` posts.

    Args:
        adapter: One entry of ``SITES``.
        query: Tag query sent as the ``tags`` parameter.
        auth: Optional dict of extra query parameters (credentials), or None.
        limit: Maximum number of raw posts to return.

    Returns:
        A list of raw post dicts, truncated to ``limit``.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        RuntimeError: If the API payload is not a list of posts.
    """
    page_param = adapter.get('page_param', 'page')
    page = adapter.get('page_start', 1)
    url = adapter['base_url'] + adapter['endpoint']
    posts = []
    # The context manager releases pooled connections once paging is done.
    with requests.Session() as session:
        session.headers['User-Agent'] = USER_AGENT
        while len(posts) < limit:
            params = {'tags': query, 'limit': adapter['per_page'], page_param: page}
            if auth:
                params.update(auth)
            r = session.get(url, params=params, timeout=30)
            r.raise_for_status()
            data = r.json()
            batch = data[adapter['envelope']] if adapter['envelope'] else data
            if not isinstance(batch, list):
                raise RuntimeError(f'API returned unexpected response: {batch!r}')
            if not batch:
                break
            posts.extend(batch)
            # A short page means the listing is exhausted.
            if len(batch) < adapter['per_page']:
                break
            page += 1
    return posts[:limit]


def _copy_as_preview(src_path, previews_dir, src_name):
    """Copy the source file unchanged into ``previews_dir``; return its name."""
    shutil.copy2(src_path, os.path.join(previews_dir, src_name))
    return src_name


def make_preview(src_path, previews_dir, max_bytes=1_048_576):
    """Create a preview of ``src_path`` in ``previews_dir``.

    Files no larger than ``max_bytes`` are copied as-is. Larger files are
    decoded, flattened to RGB where needed (RGBA is composited on white),
    downscaled, and saved as JPEG with the quality lowered in steps of 10
    from 85 to 25 until the result fits. If it still does not fit, the
    lowest-quality attempt is written anyway. Files that cannot be opened or
    decoded as images are copied unchanged.

    Args:
        src_path: Path of the downloaded file.
        previews_dir: Directory receiving the preview.
        max_bytes: Target upper bound for the preview size in bytes.

    Returns:
        The preview's file name (not the full path).
    """
    from PIL import Image

    src_name = os.path.basename(src_path)
    name_no_ext, _ext = os.path.splitext(src_name)
    file_size = os.path.getsize(src_path)
    if file_size <= max_bytes:
        return _copy_as_preview(src_path, previews_dir, src_name)

    # Shrink the pixel area by the same ratio as the byte overshoot.
    scale = math.sqrt(max_bytes / file_size)
    try:
        # Pillow decodes lazily, so conversion and resize stay inside the
        # try: a corrupt file then takes the copy fallback instead of
        # raising. The with-block closes the underlying file handle.
        with Image.open(src_path) as opened:
            if opened.mode == 'RGBA':
                img = Image.new('RGB', opened.size, (255, 255, 255))
                img.paste(opened, mask=opened.split()[3])
            elif opened.mode not in ('RGB', 'L'):
                img = opened.convert('RGB')
            else:
                img = opened
            w, h = img.size
            img = img.resize(
                (max(1, int(w * scale)), max(1, int(h * scale))),
                Image.LANCZOS,
            )
    except Exception:
        # Deliberately broad: anything that is not a decodable image
        # (video, flash, truncated file) is kept verbatim as its own preview.
        return _copy_as_preview(src_path, previews_dir, src_name)

    preview_name = name_no_ext + '.jpg'
    dest_path = os.path.join(previews_dir, preview_name)
    buf = io.BytesIO()
    for quality in range(85, 19, -10):
        buf = io.BytesIO()
        img.save(buf, format='JPEG', quality=quality, optimize=True)
        if buf.tell() <= max_bytes:
            break
    with open(dest_path, 'wb') as f:
        f.write(buf.getvalue())
    return preview_name


def download_one(post, site_name, adapter, pictures_dir, previews_dir, session):
    """Download one post's file, build its preview and record it in the DB.

    Args:
        post: Raw post dict as returned by the site's API.
        site_name: Key of the site in ``SITES``; used as the file name prefix.
        adapter: The ``SITES`` entry for ``site_name``.
        pictures_dir: Directory receiving the downloaded file.
        previews_dir: Directory receiving the preview.
        session: ``requests.Session`` used for the download.

    Returns:
        ``'skip:no_url'`` if the post has no file URL,
        ``'skip:dup:<post_id>'`` if the image is already recorded, otherwise
        ``'ok:<filename>'``.

    Raises:
        requests.HTTPError: If the file request returns an error status.
    """
    parsed = adapter['parse'](post)
    if not parsed:
        return 'skip:no_url'
    post_id, file_url, tags, created_at = parsed
    if image_exists(site_name, post_id):
        return f'skip:dup:{post_id}'

    # Drop any query string before taking the extension.
    clean_url = file_url.split('?')[0]
    ext = os.path.splitext(clean_url)[1]
    filename = f'{site_name}_{post_id}{ext}'
    dest = os.path.join(pictures_dir, filename)

    r = session.get(file_url, timeout=60)
    r.raise_for_status()
    with open(dest, 'wb') as f:
        f.write(r.content)

    post_url = adapter['post_url_fmt'].format(post_id=post_id)
    preview_filename = make_preview(dest, previews_dir)
    insert_image(post_id, site_name, filename, tags, file_url, post_url,
                 preview_filename, created_at)
    return f'ok:{filename}'


def main():
    """Parse the command line, scan the site and download new posts.

    Up to ten times ``--limit`` posts are scanned so that already-stored
    posts can be skipped while still collecting ``--limit`` new ones. When
    stdout is not a TTY, machine-readable ``status:``, ``total:``,
    ``progress:`` and ``done`` lines are printed to stdout. Download failures
    are reported on stderr and do not stop the remaining downloads.

    Raises:
        KeyError: If ``--site e621`` is used without ``E621_LOGIN`` and
            ``E621_API_KEY`` in the environment.
    """
    load_dotenv()
    parser = argparse.ArgumentParser(
        description='Download booru images to unified store'
    )
    parser.add_argument('--site', required=True, choices=SITES.keys())
    parser.add_argument('--query', required=True,
                        help='Space-separated tags to search')
    parser.add_argument('--limit', type=int, default=100)
    args = parser.parse_args()

    init_db()
    adapter = SITES[args.site]

    auth = None
    if args.site == 'e621':
        login = os.environ.get('E621_LOGIN')
        api_key = os.environ.get('E621_API_KEY')
        if not login or not api_key:
            raise KeyError(
                'E621_LOGIN and E621_API_KEY must be set in .env — see .env.example'
            )
        auth = {'login': login, 'api_key': api_key}
    elif args.site == 'rule34':
        # rule34 credentials are optional; used only when both are present.
        api_key = os.environ.get('RULE34_API_KEY')
        user_id = os.environ.get('RULE34_USER_ID')
        if api_key and user_id:
            auth = {'api_key': api_key, 'user_id': user_id}

    base_dir = os.path.dirname(os.path.abspath(__file__))
    pictures_dir = os.path.join(base_dir, 'Pictures')
    previews_dir = os.path.join(base_dir, 'Previews')
    os.makedirs(pictures_dir, exist_ok=True)
    os.makedirs(previews_dir, exist_ok=True)

    piped = not sys.stdout.isatty()
    scan_cap = args.limit * 10
    print(f'Scanning up to {scan_cap} posts from {args.site} for query: {args.query!r}')
    all_posts = fetch_all_posts(adapter, args.query, auth, scan_cap)
    total_api = len(all_posts)

    new_posts, skipped = [], 0
    for post in all_posts:
        parsed = adapter['parse'](post)
        if not parsed:
            continue
        post_id, _, _, _ = parsed
        if image_exists(args.site, post_id):
            skipped += 1
        else:
            new_posts.append(post)
            if len(new_posts) >= args.limit:
                break

    posts = new_posts
    total = len(posts)
    start = skipped + 1
    end = skipped + total
    status_msg = f'skipped {skipped} | fetching {start}–{end} of {total_api}'
    print(f'Scan done: {status_msg}')
    if piped:
        print(f'status:{status_msg}', flush=True)
        print(f'total:{total}', flush=True)

    done = 0
    failed = 0
    with requests.Session() as session:
        session.headers['User-Agent'] = USER_AGENT
        with ThreadPoolExecutor(max_workers=adapter['threads']) as pool:
            futures = {
                pool.submit(download_one, p, args.site, adapter,
                            pictures_dir, previews_dir, session): p
                for p in posts
            }
            for future in tqdm(as_completed(futures), total=total, file=sys.stderr):
                done += 1
                try:
                    # result() re-raises whatever download_one raised; if it
                    # is never called, failures disappear silently.
                    future.result()
                except Exception as exc:
                    failed += 1
                    failed_id = futures[future].get('id')
                    tqdm.write(f'error: post {failed_id}: {exc!r}', file=sys.stderr)
                if piped:
                    print(f'progress:{done}/{total}', flush=True)

    if failed:
        print(f'{failed} of {total} downloads failed', file=sys.stderr)
    if piped:
        print('done', flush=True)


if __name__ == '__main__':
    main()