Switch scraping from Google to Bing (Google requires JS)

This commit is contained in:
Johannes
2026-03-13 21:27:11 +01:00
parent 704a9194ed
commit 73040d105c

39
app.py
View File

@@ -23,43 +23,34 @@ def get_headers():
} }
def scrape_images(query, page=0):
    """Scrape original image URLs for *query* from Bing image search.

    Args:
        query: Search terms, sent as the ``q`` request parameter.
        page: Zero-based result page; sent as the ``first`` offset in
            steps of 35 results.

    Returns:
        A ``(images, error)`` tuple. On success ``images`` is a list of at
        most 30 ``{"url": ...}`` dicts (duplicates removed, first-seen order
        kept) and ``error`` is ``None``. If the HTTP request fails,
        ``images`` is empty and ``error`` is the exception message.
    """
    import html  # local import: only needed here for entity decoding

    # Bing image search async endpoint — returns HTML with embedded image URLs
    try:
        resp = requests.get(
            "https://www.bing.com/images/async",
            params={
                "q": query,
                "first": str(page * 35),
                "count": "35",
                "mmasync": "1",
            },
            headers=get_headers(),
            timeout=10,
        )
        resp.raise_for_status()
    except requests.RequestException as e:
        return [], str(e)

    # The original image URL sits in a `murl":"https://..."` field. The
    # surrounding markup may be HTML-entity-encoded (quotes as &quot;, "&" in
    # query strings as &amp;), so decode entities first; the pattern then
    # works on both encoded and plain responses.
    text = html.unescape(resp.text)

    # Stop the capture at the closing quote. The previous `[^&]+` class could
    # run past it and swallow neighbouring fields, or cut URLs at their first
    # "&" in entity-encoded text.
    raw_urls = re.findall(r'murl"\s*:\s*"(https?://[^"]+)"', text)

    # dict.fromkeys de-duplicates while preserving first-seen order.
    unique_urls = list(dict.fromkeys(raw_urls))
    images = [{"url": img_url} for img_url in unique_urls[:30]]
    return images, None