diff options
Diffstat (limited to 'web/req/insecam_scrape.py')
-rw-r--r-- | web/req/insecam_scrape.py | 45 |
1 files changed, 45 insertions, 0 deletions
# web/req/insecam_scrape.py
"""Collect camera image URLs from insecam.org, grouped by camera type.

The script fetches the site's front page, follows every "bytype" link found
in the dropdown menu under the element with id ``insecam-target``, walks all
result pages of each type and records the ``src`` of every ``<img>`` whose id
contains ``image``.  The accumulated mapping ``{type: [src, ...]}`` is
rewritten to ``/tmp/insecam_scrape.json`` after every page, so a partial run
still leaves usable output behind.
"""
import json
import os
import re
import sys
import time

import requests
from lxml import html

USER_AGENT = "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"
OUTPUT_PATH = "/tmp/insecam_scrape.json"
# requests never times out unless told to; without this a stalled connection
# would hang the whole run.
REQUEST_TIMEOUT = 30

baseurl = "https://www.insecam.org/"

# Maps camera type -> list of image URLs collected so far.
data = dict()

# Matches the inline call ``pagenavigator("?page=", <total>, 1);`` and
# captures <total>.
_PAGE_COUNT_RE = re.compile(r'pagenavigator\("\?page=",\s*(\d+)')


def get(url):
    """Fetch *url* with a browser-like User-Agent and return the response.

    Raises whatever ``requests.get`` raises on connection failure or when
    the request exceeds ``REQUEST_TIMEOUT`` seconds.
    """
    return requests.get(
        url,
        headers={"User-Agent": USER_AGENT},
        timeout=REQUEST_TIMEOUT,
    )


def get_page(type, src):
    """Record every camera image URL found in one parsed result page.

    ``type`` is the key under which URLs are stored in ``data`` (the name
    shadows the builtin but is kept so existing callers keep working);
    ``src`` is the lxml element tree of the page.  Each URL is printed as it
    is found, then the whole of ``data`` is written to ``OUTPUT_PATH`` and
    the function sleeps for one second to throttle the crawl.
    """
    urls = data.setdefault(type, [])
    for image in src.xpath(".//img[contains(@id, 'image')]"):
        image_url = image.attrib.get("src")
        if image_url is None:
            # An <img> without a src carries nothing worth recording.
            continue
        # Single-argument print() behaves the same on Python 2 and 3.
        print(image_url)
        urls.append(image_url)
    # Rewrite the full snapshot after every page; the context manager makes
    # sure the handle is flushed and closed even if serialisation fails.
    with open(OUTPUT_PATH, "w") as out:
        out.write(json.dumps(data))
    time.sleep(1)


def _total_pages(tree):
    """Return the page count announced by the page's pagenavigator script.

    Falls back to 1 when no such script is present or its text does not
    match the expected call, so the page already fetched is still scraped.
    """
    for script in tree.xpath("//script[contains(text(),'pagenavigator')]"):
        match = _PAGE_COUNT_RE.search(script.text or "")
        if match:
            return int(match.group(1))
    return 1


def _type_links(tree):
    """Return the site-relative "bytype" paths from the dropdown menu."""
    anchors = tree.xpath(
        ".//*[@id='insecam-target']//*[@class='dropdown-menu']/li/a"
    )
    hrefs = (anchor.attrib.get("href", "") for anchor in anchors)
    # Drop the leading slash so the path can be appended to ``baseurl``,
    # which already ends with one.
    return [href.lstrip("/") for href in hrefs if "bytype" in href]


def main():
    """Crawl every camera type listed on the front page, page by page."""
    front = html.fromstring(get(baseurl).content)
    for link in _type_links(front):
        cam_type = link.replace("en/bytype", "").replace("/", "")
        # Plain concatenation: os.path.join is a filesystem helper and would
        # insert backslashes on Windows.
        url = baseurl + link
        tree = html.fromstring(get(url + "?page=1").content)
        total_pages = _total_pages(tree)
        get_page(cam_type, tree)
        for page_number in range(2, total_pages + 1):
            page = get(url + "?page={}".format(page_number))
            get_page(cam_type, html.fromstring(page.content))
        # Longer pause between categories, on top of the per-page delay.
        time.sleep(5)


if __name__ == "__main__":
    main()