"""Scrape camera snapshot image URLs from insecam.org, grouped by camera type.

Results accumulate in the module-level ``data`` dict and are re-written to
/tmp/insecam_scrape.json after every page, so a crash mid-crawl loses nothing.
Output shape: {"<type>": ["<image-url>", ...], ...}.
"""
import json
import os
import sys
import time
from urllib.parse import urljoin

import requests
from lxml import html

# Present a desktop-browser User-Agent; the site rejects default client UAs.
USER_AGENT = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) "
    "Gecko/20100101 Firefox/45.0"
)

BASE_URL = "https://www.insecam.org/"
OUTPUT_PATH = "/tmp/insecam_scrape.json"

# Accumulates results across all pages: camera type -> list of image URLs.
data = {}


def get(url):
    """GET *url* with a browser User-Agent and return the requests.Response.

    A timeout is set so a dead camera/page cannot hang the crawl forever.
    """
    return requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=30,
    )


def get_page(cam_type, src):
    """Record every matching image URL from parsed page *src* under *cam_type*.

    *src* is an lxml HTML tree.  After each page the whole accumulated data
    set is persisted to OUTPUT_PATH, then we sleep briefly to be polite.
    """
    data.setdefault(cam_type, [])
    for image in src.xpath(".//img[contains(@id, 'image')]"):
        url = image.attrib["src"]
        print(url)
        data[cam_type].append(url)
    # Context manager guarantees the file is flushed and closed each time.
    with open(OUTPUT_PATH, "w") as fh:
        json.dump(data, fh)
    time.sleep(1)


def main():
    """Crawl every 'bytype' category linked from the front page."""
    tree = html.fromstring(get(BASE_URL).content)

    # Category links live in the dropdown menus; keep only the by-type ones.
    # The leading "/" is stripped so the path joins cleanly onto BASE_URL.
    paths = [
        a.attrib["href"][1:]
        for a in tree.xpath(
            ".//*[@id='insecam-target']//*[@class='dropdown-menu']/li/a"
        )
        if "bytype" in a.attrib["href"]
    ]

    for path in paths:
        # e.g. "en/bytype/Kitchen/" -> "Kitchen"
        cam_type = path.replace("en/bytype", "").replace("/", "")
        link = urljoin(BASE_URL, path)

        tree = html.fromstring(get(link + "?page=1").content)
        # The page count only appears inside an inline script of the form
        #   pagenavigator("?page=", <total>, 1);
        # so strip the surrounding call text and parse the remaining integer.
        nav_script = tree.xpath("//script[contains(text(),'pagenavigator')]")[0]
        total_pages = int(
            nav_script.text.strip()
            .replace('pagenavigator("?page=", ', "")
            .replace(", 1);", "")
        )

        get_page(cam_type, tree)
        for page_no in range(2, total_pages + 1):
            tree = html.fromstring(get("{}?page={}".format(link, page_no)).content)
            get_page(cam_type, tree)
        # Longer pause between whole categories to keep the crawl gentle.
        time.sleep(5)


if __name__ == "__main__":
    main()