1 files changed, 45 insertions, 0 deletions
diff --git a/web/req/insecam_scrape.py b/web/req/insecam_scrape.py
new file mode 100644
index 0000000..4690d0f
--- /dev/null
+++ b/web/req/insecam_scrape.py
@@ -0,0 +1,45 @@
+import requests
+import os
+from lxml import html
+import sys
+import time
+import json
+
+def get(url):
+	return requests.get(url, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"})
+
+data = dict()
+
+def get_page(type, src):
+	if not data.get(type):
+		data[type] = list()
+	images = src.xpath(".//img[contains(@id, 'image')]")
+	for image in images:
+		print image.attrib["src"]
+		data[type].append(image.attrib["src"])
+	open("/tmp/insecam_scrape.json", "w").write(json.dumps(data))
+	time.sleep(1)
+
+baseurl = "https://www.insecam.org/"
+page = get(baseurl)
+tree = html.fromstring(page.content)
+
+
+lnks = list()
+links = tree.xpath(".//*[@id='insecam-target']//*[@class='dropdown-menu']/li/a")
+for link in links:
+	if "bytype" in link.attrib["href"]:
+		lnks.append(link.attrib["href"][1:])
+
+for link in lnks:
+	type = link.replace("en/bytype", "").replace("/", "")
+	link = os.path.join(baseurl, link)
+	page = get(link + "?page=1")
+	tree = html.fromstring(page.content)
+	total_pages = int(tree.xpath("//script[contains(text(),'pagenavigator')]")[0].text.strip().replace("pagenavigator(\"?page=\", ", "").replace(", 1);", ""))
+	get_page(type, tree)
+	for x in range(2, total_pages+1):
+		page = get(link + "?page={}".format(x))
+		tree = html.fromstring(page.content)
+		get_page(type, tree)
+	time.sleep(5)