initial diashow webapp
This commit is contained in:
91
app.py
Normal file
91
app.py
Normal file
@@ -0,0 +1,91 @@
|
||||
import re
|
||||
import random
|
||||
import requests
|
||||
from flask import Flask, request, jsonify, send_from_directory
|
||||
|
||||
# Flask app serving the JSON search API plus the static frontend from ./static.
app = Flask(__name__, static_folder="static")

# Pool of realistic desktop browser User-Agent strings; one is picked at
# random per request (see get_headers) to look less like a bot.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36",
]
|
||||
|
||||
def get_headers():
    """Build a browser-like header set with a randomly chosen User-Agent.

    Returns:
        dict: HTTP request headers suitable for ``requests.get``.
    """
    user_agent = random.choice(USER_AGENTS)
    return {
        "User-Agent": user_agent,
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.5",
        "Accept-Encoding": "gzip, deflate, br",
        "DNT": "1",
        "Connection": "keep-alive",
        "Upgrade-Insecure-Requests": "1",
    }
|
||||
|
||||
def scrape_images(query, page=0, max_results=20):
    """Scrape direct image URLs from Google Images for *query*.

    Args:
        query: Search term.
        page: Zero-based results page; each page advances 20 results.
        max_results: Upper bound on returned URLs (default 20, matching
            the previous hard-coded cap, so existing callers are unchanged).

    Returns:
        tuple: ``(images, error)`` where ``images`` is a list of
        ``{"url": <str>}`` dicts and ``error`` is ``None`` on success or
        a human-readable message (with an empty list) on request failure.
    """
    url = "https://www.google.com/search"
    params = {
        "q": query,
        "tbm": "isch",  # image-search vertical
        "ijn": str(page),
        "start": str(page * 20),
        # The "ichunk" async endpoint returns a lighter HTML fragment.
        "asearch": "ichunk",
        "async": "_id:rg_s,_pms:s,_fmt:pc",
    }
    try:
        resp = requests.get(url, params=params, headers=get_headers(), timeout=10)
        resp.raise_for_status()
    except requests.RequestException as e:
        return [], str(e)

    # Extract original image URLs: Google embeds them as "ou":"https://..."
    ou_matches = re.findall(r'"ou"\s*:\s*"(https?://[^"]+)"', resp.text)

    # Fallback: look for bare URLs ending in a known image extension
    # (skipping Google's encrypted-tbn thumbnail host).
    if not ou_matches:
        ou_matches = re.findall(
            r'(?:^|[^"])(https?://(?!encrypted-tbn)[^"\'\\]+\.(?:jpg|jpeg|png|gif|webp)(?:\?[^"\'\\]*)?)',
            resp.text,
            re.IGNORECASE,
        )

    seen = set()
    images = []
    for img_url in ou_matches:
        if img_url in seen:
            continue
        seen.add(img_url)
        # Skip Google's own thumbnails / tracking pixels.
        if "google.com" in img_url or "gstatic.com" in img_url:
            continue
        images.append({"url": img_url})
        if len(images) >= max_results:
            break

    return images, None
|
||||
|
||||
|
||||
@app.route("/api/search")
def search():
    """JSON API: ``/api/search?q=<term>&page=<n>``.

    Returns 400 on a missing query or a non-integer ``page``, 502 when the
    upstream scrape fails, otherwise ``{"images": [...], "page": n, "query": q}``.
    """
    q = request.args.get("q", "").strip()
    # Bug fix: a non-numeric ?page= used to raise ValueError -> HTTP 500.
    # Reject it explicitly as a client error instead.
    try:
        page = max(0, int(request.args.get("page", 0)))
    except ValueError:
        return jsonify({"error": "page must be an integer"}), 400
    if not q:
        return jsonify({"error": "missing query"}), 400
    images, err = scrape_images(q, page)
    if err:
        # Upstream (Google) request failed; surface as a bad-gateway error.
        return jsonify({"error": err}), 502
    return jsonify({"images": images, "page": page, "query": q})
|
||||
|
||||
|
||||
@app.route("/")
def index():
    """Serve the single-page frontend shell."""
    frontend = "index.html"
    return send_from_directory("static", frontend)
|
||||
|
||||
|
||||
@app.route("/<path:path>")
def static_files(path):
    """Serve any other asset (JS/CSS/images) out of the static folder."""
    directory = "static"
    return send_from_directory(directory, path)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Dev entry point: bind on all interfaces so the app is reachable
    # from other machines on the network (port 5000).
    app.run(host="0.0.0.0", port=5000)
|
||||
Reference in New Issue
Block a user