summary | refs | log | tree | commit | diff
path: root/web/req/insecam_scrape.py
diff options
context:
space:
mode:
Diffstat (limited to 'web/req/insecam_scrape.py')
-rw-r--r-- web/req/insecam_scrape.py 45
1 files changed, 45 insertions, 0 deletions
diff --git a/web/req/insecam_scrape.py b/web/req/insecam_scrape.py
new file mode 100644
index 0000000..4690d0f
--- /dev/null
+++ b/web/req/insecam_scrape.py
@@ -0,0 +1,45 @@
+import requests
+import os
+from lxml import html
+import sys
+import time
+import json
+
+def get(url):
+ return requests.get(url, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"})
+
+data = dict()
+
+def get_page(type, src):
+ if not data.get(type):
+ data[type] = list()
+ images = src.xpath(".//img[contains(@id, 'image')]")
+ for image in images:
+ print image.attrib["src"]
+ data[type].append(image.attrib["src"])
+ open("/tmp/insecam_scrape.json", "w").write(json.dumps(data))
+ time.sleep(1)
+
+baseurl = "https://www.insecam.org/"
+page = get(baseurl)
+tree = html.fromstring(page.content)
+
+
+lnks = list()
+links = tree.xpath(".//*[@id='insecam-target']//*[@class='dropdown-menu']/li/a")
+for link in links:
+ if "bytype" in link.attrib["href"]:
+ lnks.append(link.attrib["href"][1:])
+
+for link in lnks:
+ type = link.replace("en/bytype", "").replace("/", "")
+ link = os.path.join(baseurl, link)
+ page = get(link + "?page=1")
+ tree = html.fromstring(page.content)
+ total_pages = int(tree.xpath("//script[contains(text(),'pagenavigator')]")[0].text.strip().replace("pagenavigator(\"?page=\", ", "").replace(", 1);", ""))
+ get_page(type, tree)
+ for x in range(2, total_pages+1):
+ page = get(link + "?page={}".format(x))
+ tree = html.fromstring(page.content)
+ get_page(type, tree)
+ time.sleep(5)