Switch scraping from Google to Bing (Google requires JavaScript)
This commit is contained in:
41
app.py
41
app.py
@@ -23,43 +23,34 @@ def get_headers():
|
|||||||
}
|
}
|
||||||
|
|
||||||
def scrape_images(query, page=0):
|
def scrape_images(query, page=0):
|
||||||
url = "https://www.google.com/search"
|
# Bing image search async endpoint — returns HTML with embedded image URLs
|
||||||
params = {
|
|
||||||
"q": query,
|
|
||||||
"tbm": "isch",
|
|
||||||
"ijn": str(page),
|
|
||||||
"start": str(page * 20),
|
|
||||||
"asearch": "ichunk",
|
|
||||||
"async": "_id:rg_s,_pms:s,_fmt:pc",
|
|
||||||
}
|
|
||||||
try:
|
try:
|
||||||
resp = requests.get(url, params=params, headers=get_headers(), timeout=10)
|
resp = requests.get(
|
||||||
|
"https://www.bing.com/images/async",
|
||||||
|
params={
|
||||||
|
"q": query,
|
||||||
|
"first": str(page * 35),
|
||||||
|
"count": "35",
|
||||||
|
"mmasync": "1",
|
||||||
|
},
|
||||||
|
headers=get_headers(),
|
||||||
|
timeout=10,
|
||||||
|
)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
except requests.RequestException as e:
|
except requests.RequestException as e:
|
||||||
return [], str(e)
|
return [], str(e)
|
||||||
|
|
||||||
# Extract original image URLs: Google embeds them as "ou":"https://..."
|
# Bing encodes original image URLs as "murl":"https://..."
|
||||||
ou_matches = re.findall(r'"ou"\s*:\s*"(https?://[^"]+)"', resp.text)
|
raw_urls = re.findall(r'murl":"(https?://[^&]+)"', resp.text)
|
||||||
|
|
||||||
# Fallback: look for image URLs in JSON arrays ["https://...", width, height]
|
|
||||||
if not ou_matches:
|
|
||||||
ou_matches = re.findall(
|
|
||||||
r'(?:^|[^"])(https?://(?!encrypted-tbn)[^"\'\\]+\.(?:jpg|jpeg|png|gif|webp)(?:\?[^"\'\\]*)?)',
|
|
||||||
resp.text,
|
|
||||||
re.IGNORECASE,
|
|
||||||
)
|
|
||||||
|
|
||||||
seen = set()
|
seen = set()
|
||||||
images = []
|
images = []
|
||||||
for img_url in ou_matches:
|
for img_url in raw_urls:
|
||||||
if img_url in seen:
|
if img_url in seen:
|
||||||
continue
|
continue
|
||||||
seen.add(img_url)
|
seen.add(img_url)
|
||||||
# skip google's own thumbnails / tracking pixels
|
|
||||||
if "google.com" in img_url or "gstatic.com" in img_url:
|
|
||||||
continue
|
|
||||||
images.append({"url": img_url})
|
images.append({"url": img_url})
|
||||||
if len(images) >= 20:
|
if len(images) >= 30:
|
||||||
break
|
break
|
||||||
|
|
||||||
return images, None
|
return images, None
|
||||||
|
|||||||
Reference in New Issue
Block a user