switch scraping from google to bing (google requires js)
This commit is contained in:
39
app.py
39
app.py
@@ -23,43 +23,34 @@ def get_headers():
|
||||
}
|
||||
|
||||
def scrape_images(query, page=0):
|
||||
url = "https://www.google.com/search"
|
||||
# Bing image search async endpoint — returns HTML with embedded image URLs
|
||||
try:
|
||||
resp = requests.get(
|
||||
"https://www.bing.com/images/async",
|
||||
params={
|
||||
"q": query,
|
||||
"tbm": "isch",
|
||||
"ijn": str(page),
|
||||
"start": str(page * 20),
|
||||
"asearch": "ichunk",
|
||||
"async": "_id:rg_s,_pms:s,_fmt:pc",
|
||||
}
|
||||
try:
|
||||
resp = requests.get(url, params=params, headers=get_headers(), timeout=10)
|
||||
"first": str(page * 35),
|
||||
"count": "35",
|
||||
"mmasync": "1",
|
||||
},
|
||||
headers=get_headers(),
|
||||
timeout=10,
|
||||
)
|
||||
resp.raise_for_status()
|
||||
except requests.RequestException as e:
|
||||
return [], str(e)
|
||||
|
||||
# Extract original image URLs: Google embeds them as "ou":"https://..."
|
||||
ou_matches = re.findall(r'"ou"\s*:\s*"(https?://[^"]+)"', resp.text)
|
||||
|
||||
# Fallback: look for image URLs in JSON arrays ["https://...", width, height]
|
||||
if not ou_matches:
|
||||
ou_matches = re.findall(
|
||||
r'(?:^|[^"])(https?://(?!encrypted-tbn)[^"\'\\]+\.(?:jpg|jpeg|png|gif|webp)(?:\?[^"\'\\]*)?)',
|
||||
resp.text,
|
||||
re.IGNORECASE,
|
||||
)
|
||||
# Bing encodes original image URLs as murl":"https://..."
|
||||
raw_urls = re.findall(r'murl":"(https?://[^&]+)"', resp.text)
|
||||
|
||||
seen = set()
|
||||
images = []
|
||||
for img_url in ou_matches:
|
||||
for img_url in raw_urls:
|
||||
if img_url in seen:
|
||||
continue
|
||||
seen.add(img_url)
|
||||
# skip google's own thumbnails / tracking pixels
|
||||
if "google.com" in img_url or "gstatic.com" in img_url:
|
||||
continue
|
||||
images.append({"url": img_url})
|
||||
if len(images) >= 20:
|
||||
if len(images) >= 30:
|
||||
break
|
||||
|
||||
return images, None
|
||||
|
||||
Reference in New Issue
Block a user