You're welcome. Now behave.

CHANGES:
- Added rule34.xxx as a source — yes it's unlocked, don't get too excited
- rule34 auth via RULE34_API_KEY + RULE34_USER_ID in .env, because you don't get in without permission
- Fixed pagination for Gelbooru-style APIs (pid, 0-indexed) — the old page param was just embarrassing
- Download limit now defaults to 100 per request — you don't get unlimited, you get what you're given
- Downloader now scans 10x the limit first, skips what's already owned, then fetches only fresh ones — efficient, like you should be
- Progress bar now shows scan status: "skipped N | fetching X–Y of Z" — full transparency, no excuses
- File size shown top-right of the image in small text — size matters and now you can see it
192 lines
5.9 KiB
Python
192 lines
5.9 KiB
Python
import os
|
||
import sys
|
||
import argparse
|
||
import requests
|
||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||
from tqdm import tqdm
|
||
from dotenv import load_dotenv
|
||
from db import init_db, image_exists, insert_image
|
||
|
||
|
||
def parse_e621(post):
    """Extract ``(post_id, file_url, tags)`` from one e621 post record.

    The download URL is read from ``post['file']['url']``. Tags are read
    from ``post['tags']``, a mapping whose values are iterables of tag
    names; every value is flattened, in mapping order, into one
    space-separated string.

    Returns:
        A ``(str, str, str)`` tuple, or ``None`` when the record has no
        file URL.
    """
    url = post.get('file', {}).get('url')
    if url:
        flattened = []
        for names in post.get('tags', {}).values():
            flattened.extend(names)
        return str(post['id']), url, ' '.join(flattened)
    return None
|
||
|
||
|
||
def parse_moebooru(post):
    """Extract ``(post_id, file_url, tags)`` from a flat post record.

    Reads the top-level ``file_url``, ``id`` and ``tags`` keys; ``tags``
    is passed through unchanged and defaults to an empty string.

    Returns:
        A ``(str, file_url, tags)`` tuple, or ``None`` when ``file_url``
        is missing or empty.
    """
    url = post.get('file_url')
    return (str(post['id']), url, post.get('tags', '')) if url else None
|
||
|
||
|
||
# Per-site adapter table, keyed by the value accepted for --site.
#
# Fields of each adapter:
#   base_url, endpoint      concatenated to form the listing URL
#   envelope                key holding the post list inside the JSON
#                           response, or None when the response itself
#                           is the list
#   page_param, page_start  optional; name and first value of the paging
#                           query parameter (fetch_all_posts falls back
#                           to 'page' and 1 when they are absent)
#   per_page                value sent as the 'limit' query parameter
#   threads                 worker count for the download thread pool
#   parse                   callable turning a raw post into
#                           (post_id, file_url, tags) or None
#   post_url_fmt            template for the post's page URL
SITES = {
    'e621': {
        'base_url': 'https://e621.net/',
        'endpoint': 'posts.json',
        'envelope': 'posts',
        'per_page': 100,
        'threads': 8,
        'parse': parse_e621,
        'post_url_fmt': 'https://e621.net/posts/{post_id}',
    },
    'konachan': {
        'base_url': 'https://konachan.com/',
        'endpoint': 'post.json',
        'envelope': None,
        'per_page': 100,
        'threads': 20,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://konachan.com/post/show/{post_id}',
    },
    'yandere': {
        'base_url': 'https://yande.re/',
        'endpoint': 'post.json',
        'envelope': None,
        'per_page': 100,
        'threads': 20,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://yande.re/post/show/{post_id}',
    },
    'rule34': {
        'base_url': 'https://rule34.xxx/',
        'endpoint': 'index.php?page=dapi&s=post&q=index&json=1',
        'envelope': None,
        # Paged with 'pid', counting from 0, instead of the default.
        'page_param': 'pid',
        'page_start': 0,
        'per_page': 100,
        'threads': 20,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://rule34.xxx/index.php?page=post&s=view&id={post_id}',
    },
}
|
||
|
||
|
||
def fetch_all_posts(adapter, query, auth, limit):
    """Page through a site's listing API and return up to ``limit`` posts.

    Pages are requested one at a time until ``limit`` posts have been
    collected, a page comes back empty, or a page is shorter than
    ``adapter['per_page']`` (taken to be the last page).

    Args:
        adapter: Site entry from ``SITES``. Reads ``base_url``,
            ``endpoint``, ``per_page``, ``envelope`` and the optional
            ``page_param`` / ``page_start`` (defaulting to ``'page'``
            and ``1``).
        query: Tag search string, sent as the ``tags`` parameter.
        auth: Optional dict of extra query parameters (credentials)
            merged into every request, or ``None``.
        limit: Maximum number of raw post records to return.

    Returns:
        A list of raw post records as decoded from JSON, at most
        ``limit`` long. Empty when ``limit`` is not positive.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        ValueError: If the decoded payload is not a list of posts
            (for example an error message returned as JSON).
    """
    if limit <= 0:
        # Nothing to collect; skip opening a connection altogether.
        return []

    page_param = adapter.get('page_param', 'page')
    page = adapter.get('page_start', 1)
    url = adapter['base_url'] + adapter['endpoint']
    envelope = adapter['envelope']
    per_page = adapter['per_page']
    posts = []

    # The context manager closes the pooled connections when paging ends,
    # including when a request raises.
    with requests.Session() as session:
        session.headers['User-Agent'] = 'booru-downloader/2.0 (personal archiver)'
        while len(posts) < limit:
            params = {'tags': query, 'limit': per_page, page_param: page}
            if auth:
                params.update(auth)
            r = session.get(url, params=params, timeout=30)
            r.raise_for_status()
            if not r.content.strip():
                # Gelbooru-style endpoints may answer a page with no
                # results with an empty body instead of '[]'; treat it as
                # the end of the listing rather than a JSON decode error.
                break
            data = r.json()
            batch = data[envelope] if envelope else data
            if not batch:
                break
            if not isinstance(batch, list):
                # Extending with a str/dict would silently add characters
                # or keys as "posts"; fail here with the payload instead.
                raise ValueError(
                    f'Unexpected response from {url}: {batch!r}'
                )
            posts.extend(batch)
            if len(batch) < per_page:
                break
            page += 1

    return posts[:limit]
|
||
|
||
|
||
def download_one(post, site_name, adapter, pictures_dir, session):
    """Download one post's file into ``pictures_dir`` and record it.

    Args:
        post: Raw post record, handed to ``adapter['parse']``.
        site_name: Site key; used as the filename prefix and for the
            duplicate lookup.
        adapter: Site entry providing ``parse`` and ``post_url_fmt``.
        pictures_dir: Directory the file is written to.
        session: Object whose ``get(url, timeout=...)`` performs the
            download.

    Returns:
        ``'skip:no_url'`` when the post cannot be parsed,
        ``'skip:dup:<id>'`` when ``image_exists`` reports it as stored,
        otherwise ``'ok:<filename>'`` after the file is written and
        ``insert_image`` has been called.
    """
    parsed = adapter['parse'](post)
    if not parsed:
        return 'skip:no_url'

    post_id, file_url, tags = parsed
    if image_exists(site_name, post_id):
        return f'skip:dup:{post_id}'

    # Take the extension from the URL with any query string removed.
    url_without_query, _, _ = file_url.partition('?')
    _, extension = os.path.splitext(url_without_query)
    filename = f'{site_name}_{post_id}{extension}'
    target_path = os.path.join(pictures_dir, filename)

    response = session.get(file_url, timeout=60)
    response.raise_for_status()
    with open(target_path, 'wb') as out:
        out.write(response.content)

    insert_image(
        post_id,
        site_name,
        filename,
        tags,
        file_url,
        adapter['post_url_fmt'].format(post_id=post_id),
    )
    return f'ok:{filename}'
|
||
|
||
|
||
def _resolve_auth(site):
    """Return the auth query parameters for ``site`` from the environment.

    e621 requires both E621_LOGIN and E621_API_KEY; rule34 credentials
    are used only when both RULE34_API_KEY and RULE34_USER_ID are set.
    Returns ``None`` when no credentials apply.

    Raises:
        KeyError: If ``site`` is ``'e621'`` and a credential is missing.
    """
    if site == 'e621':
        login = os.environ.get('E621_LOGIN')
        api_key = os.environ.get('E621_API_KEY')
        if not login or not api_key:
            raise KeyError(
                'E621_LOGIN and E621_API_KEY must be set in .env — see .env.example'
            )
        return {'login': login, 'api_key': api_key}
    if site == 'rule34':
        api_key = os.environ.get('RULE34_API_KEY')
        user_id = os.environ.get('RULE34_USER_ID')
        if api_key and user_id:
            return {'api_key': api_key, 'user_id': user_id}
    return None


def _select_new_posts(adapter, site, all_posts, limit):
    """Pick the first ``limit`` posts that are not stored yet.

    Posts the adapter cannot parse are ignored. Scanning stops as soon
    as ``limit`` new posts have been found.

    Returns:
        ``(new_posts, skipped)`` where ``skipped`` counts the
        already-stored posts passed over before scanning stopped.
    """
    new_posts, skipped = [], 0
    for post in all_posts:
        parsed = adapter['parse'](post)
        if not parsed:
            continue
        if image_exists(site, parsed[0]):
            skipped += 1
            continue
        new_posts.append(post)
        if len(new_posts) >= limit:
            break
    return new_posts, skipped


def main():
    """Command-line entry point: scan a site and download new images.

    Scans up to ten times ``--limit`` posts for the query, keeps the
    first ``--limit`` that are not already stored, and downloads them
    with a thread pool into the ``Pictures`` directory next to this file.

    When stdout is not a terminal, machine-readable lines are also
    printed to stdout: ``status:<msg>``, ``total:<n>``,
    ``progress:<done>/<total>`` and finally ``done``. Download failures
    are reported on stderr only, so that stdout output is unchanged.

    Raises:
        KeyError: If e621 is selected and its credentials are missing.
    """
    load_dotenv()
    parser = argparse.ArgumentParser(
        description='Download booru images to unified store'
    )
    parser.add_argument('--site', required=True, choices=SITES.keys())
    parser.add_argument('--query', required=True, help='Space-separated tags to search')
    parser.add_argument('--limit', type=int, default=100)
    args = parser.parse_args()

    init_db()
    adapter = SITES[args.site]
    auth = _resolve_auth(args.site)

    pictures_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Pictures')
    os.makedirs(pictures_dir, exist_ok=True)

    piped = not sys.stdout.isatty()

    # Over-scan so that already-stored posts can be skipped while still
    # finding up to --limit new ones.
    scan_cap = args.limit * 10
    print(f'Scanning up to {scan_cap} posts from {args.site} for query: {args.query!r}')
    all_posts = fetch_all_posts(adapter, args.query, auth, scan_cap)
    total_api = len(all_posts)

    posts, skipped = _select_new_posts(adapter, args.site, all_posts, args.limit)
    total = len(posts)
    start = skipped + 1
    end = skipped + total
    status_msg = f'skipped {skipped} | fetching {start}–{end} of {total_api}'
    print(f'Scan done: {status_msg}')
    if piped:
        print(f'status:{status_msg}', flush=True)
        print(f'total:{total}', flush=True)

    failures = []
    done = 0
    # The pool is the inner context so all workers finish before the
    # shared session is closed.
    with requests.Session() as session:
        session.headers['User-Agent'] = 'booru-downloader/2.0 (personal archiver)'
        with ThreadPoolExecutor(max_workers=adapter['threads']) as pool:
            futures = {
                pool.submit(download_one, p, args.site, adapter, pictures_dir, session): p
                for p in posts
            }
            for future in tqdm(as_completed(futures), total=total, file=sys.stderr):
                done += 1
                # An exception raised inside a worker is only visible
                # through the future; without this check a failed
                # download would be indistinguishable from a success.
                error = future.exception()
                if error is not None:
                    failures.append((futures[future].get('id'), error))
                if piped:
                    print(f'progress:{done}/{total}', flush=True)

    # Reported after the progress bar has finished so the messages do
    # not interleave with it.
    if failures:
        print(f'{len(failures)} of {total} downloads failed:', file=sys.stderr)
        for post_id, error in failures:
            print(f'  post {post_id}: {error!r}', file=sys.stderr)

    if piped:
        print('done', flush=True)
|
||
|
||
|
||
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
|