1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
import json
import os
import re
import sys
import time

import requests
from lxml import html
def get(url, timeout=30):
    """Fetch *url* and return the ``requests.Response``.

    Sends a desktop-browser User-Agent (the site blocks obvious bots).

    url:     absolute URL to fetch.
    timeout: seconds before the request is aborted; without one,
             ``requests.get`` can block forever on a stalled connection.
    """
    return requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"},
        timeout=timeout,
    )
# Scraped image URLs, keyed by camera type; snapshot persisted after each page.
data = dict()


def get_page(type, src):
    """Collect camera image URLs of one category from a parsed listing page.

    type: category key under which the URLs accumulate in the global ``data``.
    src:  lxml HTML tree of one listing page.

    Side effects: appends to ``data``, rewrites the JSON snapshot on disk,
    prints each URL, and sleeps 1s to rate-limit the scrape.
    """
    urls = data.setdefault(type, list())
    for image in src.xpath(".//img[contains(@id, 'image')]"):
        url = image.attrib["src"]
        print(url)
        urls.append(url)
    # Rewrite the full snapshot after every page so a crash loses at most one
    # page of work.  ``with`` closes the handle (the original leaked it).
    with open("/tmp/insecam_scrape.json", "w") as fh:
        fh.write(json.dumps(data))
    time.sleep(1)
baseurl = "https://www.insecam.org/"

# Gather every "bytype" category link from the landing page's dropdown menus.
page = get(baseurl)
tree = html.fromstring(page.content)
category_paths = list()
for anchor in tree.xpath(".//*[@id='insecam-target']//*[@class='dropdown-menu']/li/a"):
    href = anchor.attrib["href"]
    if "bytype" in href:
        # Drop the leading "/" so the urljoin below appends to the base URL.
        category_paths.append(href[1:])

for path in category_paths:
    # e.g. "en/bytype/City/" -> "City"
    cam_type = path.replace("en/bytype", "").replace("/", "")
    # urljoin handles URLs correctly on every platform; os.path.join would
    # build "\\"-separated paths on Windows.
    link = requests.compat.urljoin(baseurl, path)
    page = get(link + "?page=1")
    tree = html.fromstring(page.content)
    # The page count is embedded in an inline script of the form:
    #     pagenavigator("?page=", <total>, 1);
    # An anchored regex is far less brittle than chained str.replace calls.
    script_text = tree.xpath("//script[contains(text(),'pagenavigator')]")[0].text
    match = re.search(r'pagenavigator\("\?page=",\s*(\d+)', script_text)
    # Fall back to a single page if the markup changed, rather than crashing.
    total_pages = int(match.group(1)) if match else 1
    get_page(cam_type, tree)
    for page_no in range(2, total_pages + 1):
        page = get(link + "?page={}".format(page_no))
        tree = html.fromstring(page.content)
        get_page(cam_type, tree)
        time.sleep(5)
|