245 lines
7.6 KiB
Python
245 lines
7.6 KiB
Python
import io
|
||
import math
|
||
import os
|
||
import shutil
|
||
import sys
|
||
import argparse
|
||
import requests
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from tqdm import tqdm
|
||
from dotenv import load_dotenv
|
||
from db import init_db, image_exists, insert_image
|
||
|
||
|
||
def parse_e621(post):
    """Extract ``(post_id, file_url, tags)`` from an e621-style post dict.

    Args:
        post: Decoded JSON object for a single post. The file URL is read
            from ``post['file']['url']`` and the tags from the lists stored
            under each key of ``post['tags']``.

    Returns:
        A ``(str(post['id']), file_url, tags)`` tuple, where ``tags`` is all
        tag lists flattened into one space-separated string, or ``None``
        when the post carries no file URL.

    Raises:
        KeyError: If the post has a file URL but no ``'id'`` key.
    """
    # ``or {}`` covers an explicit JSON null as well as a missing key;
    # ``.get(key, {})`` alone would hand back None and fail on ``.get``.
    file_url = (post.get('file') or {}).get('url')
    if not file_url:
        return None

    tag_groups = (post.get('tags') or {}).values()
    tags = ' '.join(tag for group in tag_groups for tag in (group or ()))
    return str(post['id']), file_url, tags
|
||
|
||
|
||
def parse_moebooru(post):
    """Extract ``(post_id, file_url, tags)`` from a flat post dict.

    Used for the adapters whose posts expose ``file_url`` and ``tags`` as
    top-level keys.

    Args:
        post: Decoded JSON object for a single post.

    Returns:
        A ``(str(post['id']), file_url, tags)`` tuple, or ``None`` when the
        post has no ``file_url``. ``tags`` is passed through as stored in the
        post; a missing or null value becomes ``''``.

    Raises:
        KeyError: If the post has a file URL but no ``'id'`` key.
    """
    file_url = post.get('file_url')
    if not file_url:
        return None
    # ``or ''`` also normalises an explicit null, which ``.get('tags', '')``
    # would have returned unchanged as None.
    return str(post['id']), file_url, post.get('tags') or ''
|
||
|
||
|
||
# Per-site adapter table, keyed by the value accepted for ``--site``.
#
# Keys read elsewhere in this file:
#   base_url + endpoint  concatenated to form the search URL
#   per_page             sent as the ``limit`` request parameter; a shorter
#                        batch is treated as the last page
#   threads              worker count for the download thread pool
#   envelope             key holding the post list in the JSON response, or
#                        None when the response itself is the list
#   page_param           (optional) name of the paging parameter; 'page' if absent
#   page_start           (optional) first page index; 1 if absent
#   parse                callable turning a raw post into
#                        (post_id, file_url, tags) or None
#   post_url_fmt         format string taking ``post_id``
SITES = {
    'e621': {
        'base_url': 'https://e621.net/',
        'endpoint': 'posts.json',
        'per_page': 100,
        'threads': 8,
        'envelope': 'posts',
        'parse': parse_e621,
        'post_url_fmt': 'https://e621.net/posts/{post_id}',
    },
    'konachan': {
        'base_url': 'https://konachan.com/',
        'endpoint': 'post.json',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://konachan.com/post/show/{post_id}',
    },
    'yandere': {
        'base_url': 'https://yande.re/',
        'endpoint': 'post.json',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://yande.re/post/show/{post_id}',
    },
    'rule34': {
        'base_url': 'https://rule34.xxx/',
        'endpoint': 'index.php?page=dapi&s=post&q=index&json=1',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        'page_param': 'pid',
        'page_start': 0,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://rule34.xxx/index.php?page=post&s=view&id={post_id}',
    },
}
|
||
|
||
|
||
def fetch_all_posts(adapter, query, auth, limit):
    """Page through a site's search API and collect up to ``limit`` raw posts.

    Paging stops as soon as ``limit`` posts have been gathered, a page comes
    back empty, or a page holds fewer than ``adapter['per_page']`` posts.

    Args:
        adapter: Site configuration dict providing ``base_url``, ``endpoint``,
            ``per_page`` and ``envelope``, plus optional ``page_param``
            (default ``'page'``) and ``page_start`` (default ``1``).
        query: Tag string sent as the ``tags`` request parameter.
        auth: Optional dict of extra query parameters merged into every
            request; any falsy value means none.
        limit: Maximum number of posts to return.

    Returns:
        A list of at most ``limit`` post objects exactly as decoded from the
        JSON responses.

    Raises:
        requests.HTTPError: If a page request returns an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    if limit <= 0:
        # Nothing requested: skip session setup and any network traffic.
        return []

    page_param = adapter.get('page_param', 'page')
    page = adapter.get('page_start', 1)
    url = adapter['base_url'] + adapter['endpoint']
    per_page = adapter['per_page']
    envelope = adapter['envelope']

    posts = []
    # Context manager closes the pooled connections when paging is done.
    with requests.Session() as session:
        session.headers['User-Agent'] = 'booru-downloader/2.0 (personal archiver)'
        while len(posts) < limit:
            params = {'tags': query, 'limit': per_page, page_param: page}
            if auth:
                params.update(auth)
            r = session.get(url, params=params, timeout=30)
            r.raise_for_status()
            if not r.content.strip():
                # An empty body would make r.json() raise; treat it the same
                # as an empty result page.
                break
            data = r.json()
            batch = data[envelope] if envelope else data
            if not batch:
                break
            posts.extend(batch)
            if len(batch) < per_page:
                # Short page: the API has no further results.
                break
            page += 1
    return posts[:limit]
|
||
|
||
|
||
def make_preview(src_path, previews_dir, max_bytes=1_048_576):
    """Write a preview of ``src_path`` into ``previews_dir``.

    Files no larger than ``max_bytes`` are copied unchanged. Larger files are
    opened with Pillow, flattened onto a white background when they have an
    alpha channel, downscaled, and re-encoded as JPEG at decreasing quality
    (85 down to 25 in steps of 10) until the result fits. If Pillow cannot
    open or process the file, the original is copied instead.

    The size limit is best effort: if even the lowest quality step is larger
    than ``max_bytes``, that last encoding is still written.

    Args:
        src_path: Path of the downloaded file.
        previews_dir: Existing directory that receives the preview.
        max_bytes: Size budget for the preview in bytes (default 1 MiB).

    Returns:
        The file name (not the full path) of the preview inside
        ``previews_dir``.

    Raises:
        ImportError: If the file exceeds ``max_bytes`` and Pillow is not
            installed.
        OSError: If reading the source or writing the preview fails.
    """
    src_name = os.path.basename(src_path)
    file_size = os.path.getsize(src_path)

    def copy_original():
        shutil.copy2(src_path, os.path.join(previews_dir, src_name))
        return src_name

    if file_size <= max_bytes:
        return copy_original()

    # Imported lazily so small files, which are only copied, never need Pillow.
    from PIL import Image

    try:
        # Image.open defers decoding, so corrupt data may only surface in
        # convert/resize/save; keep all of them inside the fallback guard.
        with Image.open(src_path) as opened:
            if opened.mode == 'RGBA':
                # JPEG has no alpha channel: composite onto white.
                flat = Image.new('RGB', opened.size, (255, 255, 255))
                flat.paste(opened, mask=opened.split()[3])
            elif opened.mode in ('RGB', 'L'):
                flat = opened
            else:
                flat = opened.convert('RGB')

            # Scaling each side by sqrt(budget / size) shrinks the pixel
            # count by the same ratio as the byte budget.
            scale = math.sqrt(max_bytes / file_size)
            w, h = flat.size
            img = flat.resize(
                (max(1, int(w * scale)), max(1, int(h * scale))),
                Image.LANCZOS,
            )

        encoded = b''
        for quality in range(85, 19, -10):  # 85, 75, ..., 25
            buf = io.BytesIO()
            img.save(buf, format='JPEG', quality=quality, optimize=True)
            encoded = buf.getvalue()
            if len(encoded) <= max_bytes:
                break
    except Exception:
        # Deliberate best effort: anything Pillow cannot handle is kept as a
        # full-size copy rather than failing the whole download.
        return copy_original()

    preview_name = os.path.splitext(src_name)[0] + '.jpg'
    with open(os.path.join(previews_dir, preview_name), 'wb') as f:
        f.write(encoded)
    return preview_name
|
||
|
||
|
||
def download_one(post, site_name, adapter, pictures_dir, previews_dir, session):
    """Download one post's file, build its preview and record it.

    Args:
        post: Raw post object as returned by the site's API.
        site_name: Site key; used as the file-name prefix and passed to
            ``image_exists`` / ``insert_image``.
        adapter: Site configuration dict providing ``parse`` and
            ``post_url_fmt``.
        pictures_dir: Directory receiving the downloaded file.
        previews_dir: Directory receiving the preview.
        session: ``requests.Session`` used for the download.

    Returns:
        ``'skip:no_url'`` when the post cannot be parsed into a file URL,
        ``'skip:dup:<post_id>'`` when ``image_exists`` reports the post, or
        ``'ok:<filename>'`` after a successful download.

    Raises:
        requests.HTTPError: If the file request returns an error status.
        requests.RequestException: On connection problems or timeouts.
        OSError: If writing the file or the preview fails.
    """
    parsed = adapter['parse'](post)
    if not parsed:
        return 'skip:no_url'
    post_id, file_url, tags = parsed
    if image_exists(site_name, post_id):
        return f'skip:dup:{post_id}'

    # Take the extension from the URL with any query string stripped.
    ext = os.path.splitext(file_url.split('?')[0])[1]
    filename = f'{site_name}_{post_id}{ext}'
    dest = os.path.join(pictures_dir, filename)
    partial = dest + '.part'

    try:
        # Stream to disk so large files are not held in memory by every
        # worker thread at once.
        with session.get(file_url, timeout=60, stream=True) as r:
            r.raise_for_status()
            with open(partial, 'wb') as f:
                for chunk in r.iter_content(chunk_size=65536):
                    f.write(chunk)
        # Rename only once complete, so an interrupted download never leaves
        # a truncated file under the final name.
        os.replace(partial, dest)
    finally:
        try:
            os.remove(partial)
        except FileNotFoundError:
            pass  # already renamed into place, or never created

    post_url = adapter['post_url_fmt'].format(post_id=post_id)
    preview_filename = make_preview(dest, previews_dir)
    insert_image(post_id, site_name, filename, tags, file_url, post_url, preview_filename)
    return f'ok:{filename}'
|
||
|
||
|
||
def main():
    """Command-line entry point: scan a site for a query and download new posts.

    Workflow:
      1. Load ``.env``, parse ``--site``, ``--query`` and ``--limit``.
      2. Initialise the database and collect site credentials from the
         environment (required for e621, optional for rule34).
      3. Fetch up to ``limit * 10`` posts and keep the first ``limit`` that
         ``image_exists`` does not already report.
      4. Download the kept posts on a thread pool into ``Pictures`` and
         ``Previews`` next to this file.

    When stdout is not a TTY, the lines ``status:…``, ``total:…``,
    ``progress:n/total`` and ``done`` are additionally printed to stdout.
    Download failures are reported on stderr only, so that stdout protocol is
    unchanged.

    Raises:
        KeyError: If ``--site e621`` is used without ``E621_LOGIN`` and
            ``E621_API_KEY`` in the environment.
    """
    load_dotenv()
    parser = argparse.ArgumentParser(
        description='Download booru images to unified store'
    )
    parser.add_argument('--site', required=True, choices=list(SITES))
    parser.add_argument('--query', required=True, help='Space-separated tags to search')
    parser.add_argument('--limit', type=int, default=100)
    args = parser.parse_args()

    init_db()
    adapter = SITES[args.site]

    auth = None
    if args.site == 'e621':
        login = os.environ.get('E621_LOGIN')
        api_key = os.environ.get('E621_API_KEY')
        if not login or not api_key:
            # Exception type kept as-is for any caller relying on it.
            raise KeyError(
                'E621_LOGIN and E621_API_KEY must be set in .env — see .env.example'
            )
        auth = {'login': login, 'api_key': api_key}
    elif args.site == 'rule34':
        api_key = os.environ.get('RULE34_API_KEY')
        user_id = os.environ.get('RULE34_USER_ID')
        if api_key and user_id:
            auth = {'api_key': api_key, 'user_id': user_id}

    base_dir = os.path.dirname(os.path.abspath(__file__))
    pictures_dir = os.path.join(base_dir, 'Pictures')
    previews_dir = os.path.join(base_dir, 'Previews')
    os.makedirs(pictures_dir, exist_ok=True)
    os.makedirs(previews_dir, exist_ok=True)

    piped = not sys.stdout.isatty()

    # Over-scan so that ``limit`` new posts can still be found when many of
    # the first results are already stored.
    scan_cap = args.limit * 10
    print(f'Scanning up to {scan_cap} posts from {args.site} for query: {args.query!r}')
    all_posts = fetch_all_posts(adapter, args.query, auth, scan_cap)
    total_api = len(all_posts)

    new_posts, skipped = [], 0
    for post in all_posts:
        parsed = adapter['parse'](post)
        if not parsed:
            continue
        post_id, _, _ = parsed
        if image_exists(args.site, post_id):
            skipped += 1
        else:
            new_posts.append(post)
            if len(new_posts) >= args.limit:
                break

    posts = new_posts
    total = len(posts)
    start = skipped + 1
    end = skipped + total
    status_msg = f'skipped {skipped} | fetching {start}–{end} of {total_api}'
    print(f'Scan done: {status_msg}')
    if piped:
        print(f'status:{status_msg}', flush=True)
        print(f'total:{total}', flush=True)

    done = 0
    failed = 0
    # The session is listed first so it is closed only after the pool has
    # shut down and every worker has finished using it.
    with requests.Session() as session, \
            ThreadPoolExecutor(max_workers=adapter['threads']) as pool:
        session.headers['User-Agent'] = 'booru-downloader/2.0 (personal archiver)'
        futures = {
            pool.submit(download_one, p, args.site, adapter, pictures_dir, previews_dir, session): p
            for p in posts
        }
        for future in tqdm(as_completed(futures), total=total, file=sys.stderr):
            done += 1
            try:
                # Without this call an exception raised in the worker would
                # be dropped silently.
                future.result()
            except Exception as exc:
                failed += 1
                post_ref = futures[future].get('id')
                tqdm.write(f'download failed for post {post_ref}: {exc!r}', file=sys.stderr)
            if piped:
                print(f'progress:{done}/{total}', flush=True)

    if failed:
        print(f'{failed} of {total} downloads failed', file=sys.stderr)
    if piped:
        print('done', flush=True)
|
||
|
||
|
||
# Run the downloader only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
|