switch scraping from google to bing (google requires js)

2026-03-13 21:27:11 +01:00
parent 704a9194ed
commit 73040d105c
1 changed files with 16 additions and 25 deletions
--- a/app.py
+++ b/app.py
@@ -23,43 +23,34 @@ def get_headers():
    }
 def scrape_images(query, page=0):
     """Fetch one page of image results for *query* from Bing image search.

     Args:
         query: Search terms, sent as the ``q`` request parameter.
         page: Zero-based page index; sent as a result offset of
             ``page * 35`` in the ``first`` request parameter.

     Returns:
         A ``(images, error)`` tuple. On success ``images`` is a list of
         ``{"url": ...}`` dicts (de-duplicated, in response order, at most
         30 entries) and ``error`` is ``None``. If the HTTP request fails,
         ``images`` is an empty list and ``error`` is the exception message.
     """
     # Function-scope import: only this parser needs entity decoding.
     import html

     # Bing image search async endpoint — returns HTML with embedded image URLs
     try:
         resp = requests.get(
             "https://www.bing.com/images/async",
             params={
                 "q": query,
                 "first": str(page * 35),
                 "count": "35",
                 "mmasync": "1",
             },
             headers=get_headers(),
             timeout=10,
         )
         resp.raise_for_status()
     except requests.RequestException as e:
         return [], str(e)

     # Bing encodes original image URLs as murl&quot;:&quot;https://...&quot;
     # The value is entity-encoded, so an "&" inside a URL shows up as
     # "&amp;". Match lazily up to the closing &quot; instead of stopping at
     # the first "&", otherwise URLs with query strings are dropped entirely.
     raw_urls = re.findall(r'murl&quot;:&quot;(https?://.+?)&quot;', resp.text)

     seen = set()
     images = []
     for raw_url in raw_urls:
         # Decode entities (e.g. "&amp;" -> "&") to recover the real URL.
         img_url = html.unescape(raw_url)
         if img_url in seen:
             continue
         seen.add(img_url)
         images.append({"url": img_url})
         if len(images) >= 30:
             break
     return images, None