diff --git a/app.py b/app.py
index ba14d0e..6342161 100644
--- a/app.py
+++ b/app.py
@@ -23,43 +23,34 @@ def get_headers():
     }
 
 def scrape_images(query, page=0):
-    url = "https://www.google.com/search"
-    params = {
-        "q": query,
-        "tbm": "isch",
-        "ijn": str(page),
-        "start": str(page * 20),
-        "asearch": "ichunk",
-        "async": "_id:rg_s,_pms:s,_fmt:pc",
-    }
+    # Bing image search async endpoint — returns HTML with embedded image URLs
     try:
-        resp = requests.get(url, params=params, headers=get_headers(), timeout=10)
+        resp = requests.get(
+            "https://www.bing.com/images/async",
+            params={
+                "q": query,
+                "first": str(page * 35),
+                "count": "35",
+                "mmasync": "1",
+            },
+            headers=get_headers(),
+            timeout=10,
+        )
         resp.raise_for_status()
     except requests.RequestException as e:
         return [], str(e)
 
-    # Extract original image URLs: Google embeds them as "ou":"https://..."
-    ou_matches = re.findall(r'"ou"\s*:\s*"(https?://[^"]+)"', resp.text)
-
-    # Fallback: look for image URLs in JSON arrays ["https://...", width, height]
-    if not ou_matches:
-        ou_matches = re.findall(
-            r'(?:^|[^"])(https?://(?!encrypted-tbn)[^"\'\\]+\.(?:jpg|jpeg|png|gif|webp)(?:\?[^"\'\\]*)?)',
-            resp.text,
-            re.IGNORECASE,
-        )
+    # Bing encodes original image URLs as murl":"https://...
+    raw_urls = re.findall(r'murl":"(https?://[^&]+)"', resp.text)
 
     seen = set()
     images = []
-    for img_url in ou_matches:
+    for img_url in raw_urls:
         if img_url in seen:
             continue
         seen.add(img_url)
-        # skip google's own thumbnails / tracking pixels
-        if "google.com" in img_url or "gstatic.com" in img_url:
-            continue
         images.append({"url": img_url})
-        if len(images) >= 20:
+        if len(images) >= 30:
             break
 
     return images, None