Files
slideshow/downloader.py
Johannes 45af8a2ace rule34 support, smarter downloads, file size display
You're welcome. Now behave.

CHANGES:
- Added rule34.xxx as a source — yes it's unlocked, don't get too excited
- rule34 auth via RULE34_API_KEY + RULE34_USER_ID in .env, because you don't get in without permission
- Fixed pagination for Gelbooru-style APIs (pid, 0-indexed) — the old page param was just embarrassing
- Default download limit capped at 100 per request — you don't get unlimited, you get what you're given
- Downloader now scans 10x the limit first, skips what's already owned, then fetches only fresh ones — efficient, like you should be
- Progress bar now shows scan status: "skipped N | fetching X–Y of Z" — full transparency, no excuses
- File size shown top-right of the image in small text — size matters and now you can see it
2026-05-06 00:56:02 +02:00

192 lines
5.9 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import os
import sys
import argparse
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from dotenv import load_dotenv
from db import init_db, image_exists, insert_image
def parse_e621(post):
    """Extract ``(post_id, file_url, tags)`` from one e621 post object.

    Args:
        post: A decoded post dict. This function reads ``post['id']``,
            ``post['file']['url']`` and ``post['tags']`` (a mapping whose
            values are iterables of tag strings).

    Returns:
        A ``(str(id), file_url, space_joined_tags)`` tuple, or ``None`` when
        the post has no usable file URL.

    Raises:
        KeyError: If the post has a file URL but no ``'id'`` key.
    """
    # Treat a present-but-null ``file`` / ``tags`` / category the same as a
    # missing one instead of failing with AttributeError/TypeError on None.
    file_info = post.get('file') or {}
    file_url = file_info.get('url')
    if not file_url:
        return None
    tags_by_category = post.get('tags') or {}
    tags = ' '.join(
        tag
        for category in tags_by_category.values()
        for tag in (category or ())
    )
    return str(post['id']), file_url, tags
def parse_moebooru(post):
    """Extract ``(post_id, file_url, tags)`` from a flat post object.

    Used for every site in ``SITES`` whose posts expose ``file_url`` and
    ``tags`` at the top level of the post dict.

    Args:
        post: A decoded post dict with ``'id'``, and optionally
            ``'file_url'`` and ``'tags'``.

    Returns:
        A ``(str(id), file_url, tags)`` tuple, or ``None`` when the post has
        no usable ``file_url``.

    Raises:
        KeyError: If the post has a file URL but no ``'id'`` key.
    """
    file_url = post.get('file_url')
    if not file_url:
        return None
    # A null ``tags`` value is normalized to '' so callers always get a str.
    return str(post['id']), file_url, post.get('tags') or ''
# Per-site adapter table, keyed by the value accepted for ``--site``.
#
# Keys read by the rest of this module:
#   base_url + endpoint  -> URL queried for post listings
#   per_page             -> value sent as the ``limit`` query parameter
#   threads              -> worker count for the download thread pool
#   envelope             -> key wrapping the post list in the JSON response,
#                           or None when the response is used directly
#   page_param           -> (optional) name of the paging parameter,
#                           defaults to 'page'
#   page_start           -> (optional) first page index, defaults to 1
#   parse                -> callable turning a raw post into
#                           (post_id, file_url, tags) or None
#   post_url_fmt         -> format string for the post's web page
SITES = {
    'e621': {
        'base_url': 'https://e621.net/',
        'endpoint': 'posts.json',
        'per_page': 100,
        'threads': 8,
        'envelope': 'posts',
        'parse': parse_e621,
        'post_url_fmt': 'https://e621.net/posts/{post_id}',
    },
    'konachan': {
        'base_url': 'https://konachan.com/',
        'endpoint': 'post.json',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://konachan.com/post/show/{post_id}',
    },
    'yandere': {
        'base_url': 'https://yande.re/',
        'endpoint': 'post.json',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://yande.re/post/show/{post_id}',
    },
    'rule34': {
        'base_url': 'https://rule34.xxx/',
        # The endpoint already carries a query string; paging/tag parameters
        # are appended to it by the HTTP client.
        'endpoint': 'index.php?page=dapi&s=post&q=index&json=1',
        'per_page': 100,
        'threads': 20,
        'envelope': None,
        # Pages are addressed with ``pid`` and start at 0 for this site.
        'page_param': 'pid',
        'page_start': 0,
        'parse': parse_moebooru,
        'post_url_fmt': 'https://rule34.xxx/index.php?page=post&s=view&id={post_id}',
    },
}
def fetch_all_posts(adapter, query, auth, limit):
    """Page through a site's listing endpoint and collect raw post objects.

    Args:
        adapter: One entry of ``SITES``. Reads ``base_url``, ``endpoint``,
            ``per_page``, ``envelope`` and the optional ``page_param``
            (default ``'page'``) and ``page_start`` (default ``1``).
        query: Tag query string, sent as the ``tags`` parameter.
        auth: Optional dict of extra query parameters (credentials) merged
            into every request, or ``None``.
        limit: Maximum number of posts to return.

    Returns:
        A list of at most ``limit`` raw post objects, in the order the API
        returned them.

    Raises:
        requests.HTTPError: On a non-2xx response.
        ValueError: If the response is not valid JSON or the decoded payload
            is not a list of posts.
    """
    page_param = adapter.get('page_param', 'page')
    page = adapter.get('page_start', 1)
    per_page = adapter['per_page']
    envelope = adapter['envelope']
    url = adapter['base_url'] + adapter['endpoint']
    posts = []

    # Context manager so pooled connections are released when paging ends,
    # including when a request raises.
    with requests.Session() as session:
        session.headers['User-Agent'] = 'booru-downloader/2.0 (personal archiver)'
        while len(posts) < limit:
            params = {'tags': query, 'limit': per_page, page_param: page}
            if auth:
                params.update(auth)
            r = session.get(url, params=params, timeout=30)
            r.raise_for_status()

            # An empty body cannot be decoded as JSON; treat it as "no more
            # results" rather than crashing.
            # NOTE(review): presumably what Gelbooru-style endpoints send for
            # an empty result page — confirm against the live API.
            if not r.content.strip():
                break

            data = r.json()
            batch = data[envelope] if envelope else data
            if not batch:
                break
            # An error payload (dict or string) would otherwise be spread
            # into ``posts`` as keys/characters and fail much later.
            if not isinstance(batch, list):
                raise ValueError(
                    f'unexpected response from {url}: expected a list of '
                    f'posts, got {type(batch).__name__}'
                )
            posts.extend(batch)
            # A short page means the listing is exhausted.
            if len(batch) < per_page:
                break
            page += 1

    return posts[:limit]
def download_one(post, site_name, adapter, pictures_dir, session):
    """Download one post's file and record it in the database.

    Args:
        post: Raw post object as returned by the site's API.
        site_name: Key of the site in ``SITES``; used as filename prefix and
            passed to the database helpers.
        adapter: The ``SITES`` entry for ``site_name``; ``parse`` and
            ``post_url_fmt`` are read.
        pictures_dir: Directory the file is written into.
        session: HTTP session used to fetch the file.

    Returns:
        ``'skip:no_url'`` when the post has no file URL,
        ``'skip:dup:<post_id>'`` when ``image_exists`` reports it as already
        stored, otherwise ``'ok:<filename>'``.

    Raises:
        requests.HTTPError: On a non-2xx download response.
        OSError: If the file cannot be written.
    """
    parsed = adapter['parse'](post)
    if not parsed:
        return 'skip:no_url'
    post_id, file_url, tags = parsed
    if image_exists(site_name, post_id):
        return f'skip:dup:{post_id}'

    # Take the extension from the URL path only, ignoring any query string.
    clean_url = file_url.split('?')[0]
    ext = os.path.splitext(clean_url)[1]
    filename = f'{site_name}_{post_id}{ext}'
    dest = os.path.join(pictures_dir, filename)

    r = session.get(file_url, timeout=60)
    r.raise_for_status()

    # Write to a sibling temp file and rename into place so an interrupted
    # write never leaves a truncated image under its final name.
    tmp_path = dest + '.part'
    try:
        with open(tmp_path, 'wb') as f:
            f.write(r.content)
        os.replace(tmp_path, dest)
    finally:
        # Only still present if the write or rename failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

    post_url = adapter['post_url_fmt'].format(post_id=post_id)
    insert_image(post_id, site_name, filename, tags, file_url, post_url)
    return f'ok:{filename}'
def main():
    """CLI entry point: scan a site for a tag query and download new posts.

    Flow:
      1. Parse ``--site``, ``--query`` and ``--limit`` and load ``.env``.
      2. Build per-site credentials from environment variables.
      3. Scan up to ``10 * limit`` posts, drop those ``image_exists`` already
         knows, and keep at most ``limit`` new ones.
      4. Download the new posts on a thread pool.

    When stdout is not a TTY, machine-readable lines are printed to stdout:
    ``status:<msg>``, ``total:<n>``, ``progress:<done>/<total>`` and ``done``.
    The tqdm bar and download error messages go to stderr.

    Raises:
        KeyError: If ``--site e621`` is used without ``E621_LOGIN`` and
            ``E621_API_KEY`` in the environment.
    """
    load_dotenv()
    parser = argparse.ArgumentParser(
        description='Download booru images to unified store'
    )
    parser.add_argument('--site', required=True, choices=SITES.keys())
    parser.add_argument('--query', required=True, help='Space-separated tags to search')
    parser.add_argument('--limit', type=int, default=100)
    args = parser.parse_args()

    init_db()
    adapter = SITES[args.site]

    auth = None
    if args.site == 'e621':
        login = os.environ.get('E621_LOGIN')
        api_key = os.environ.get('E621_API_KEY')
        if not login or not api_key:
            raise KeyError(
                'E621_LOGIN and E621_API_KEY must be set in .env — see .env.example'
            )
        auth = {'login': login, 'api_key': api_key}
    elif args.site == 'rule34':
        # Credentials are only sent when both values are configured.
        api_key = os.environ.get('RULE34_API_KEY')
        user_id = os.environ.get('RULE34_USER_ID')
        if api_key and user_id:
            auth = {'api_key': api_key, 'user_id': user_id}

    pictures_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'Pictures')
    os.makedirs(pictures_dir, exist_ok=True)

    piped = not sys.stdout.isatty()

    # Over-scan so that already-stored posts can be skipped while still
    # finding up to ``limit`` new ones.
    scan_cap = args.limit * 10
    print(f'Scanning up to {scan_cap} posts from {args.site} for query: {args.query!r}')
    all_posts = fetch_all_posts(adapter, args.query, auth, scan_cap)
    total_api = len(all_posts)

    new_posts, skipped = [], 0
    for post in all_posts:
        parsed = adapter['parse'](post)
        if not parsed:
            continue
        post_id, _, _ = parsed
        if image_exists(args.site, post_id):
            skipped += 1
        else:
            new_posts.append(post)
            if len(new_posts) >= args.limit:
                break

    posts = new_posts
    total = len(posts)
    start = skipped + 1
    end = skipped + total
    # The range separator is required: without it 1 and 10 render as "110".
    status_msg = f'skipped {skipped} | fetching {start}–{end} of {total_api}'
    print(f'Scan done: {status_msg}')
    if piped:
        print(f'status:{status_msg}', flush=True)
        print(f'total:{total}', flush=True)

    done = 0
    failed = 0
    with requests.Session() as session:
        session.headers['User-Agent'] = 'booru-downloader/2.0 (personal archiver)'
        with ThreadPoolExecutor(max_workers=adapter['threads']) as pool:
            futures = [
                pool.submit(download_one, p, args.site, adapter, pictures_dir, session)
                for p in posts
            ]
            for future in tqdm(as_completed(futures), total=total, file=sys.stderr):
                done += 1
                # Retrieve the result so a failed download is reported
                # instead of vanishing with the future; one bad post must
                # not abort the remaining downloads.
                try:
                    future.result()
                except Exception as exc:
                    failed += 1
                    tqdm.write(f'download failed: {exc}', file=sys.stderr)
                if piped:
                    print(f'progress:{done}/{total}', flush=True)

    if failed:
        print(f'{failed} of {total} downloads failed', file=sys.stderr)
    if piped:
        print('done', flush=True)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()