1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
|
import json
import os
import re
import sys
import time

import requests
from lxml import html
def get(url, timeout=30):
    """Fetch *url* and return the ``requests.Response``.

    Sends a desktop-browser User-Agent (the site blocks obvious bots).

    url:     absolute URL to fetch.
    timeout: seconds before the request is aborted; without one,
             ``requests.get`` can block forever on a stalled connection.
    """
    return requests.get(
        url,
        headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:45.0) Gecko/20100101 Firefox/45.0"},
        timeout=timeout,
    )
# Scraped image URLs, keyed by camera type; snapshot persisted after each page.
data = dict()


def get_page(type, src):
    """Collect camera image URLs of one category from a parsed listing page.

    type: category key under which the URLs accumulate in the global ``data``.
    src:  lxml HTML tree of one listing page.

    Side effects: appends to ``data``, rewrites the JSON snapshot on disk,
    prints each URL, and sleeps 1s to rate-limit the scrape.
    """
    urls = data.setdefault(type, list())
    for image in src.xpath(".//img[contains(@id, 'image')]"):
        url = image.attrib["src"]
        print(url)
        urls.append(url)
    # Rewrite the full snapshot after every page so a crash loses at most one
    # page of work.  ``with`` closes the handle (the original leaked it).
    with open("/tmp/insecam_scrape.json", "w") as fh:
        fh.write(json.dumps(data))
    time.sleep(1)
baseurl = "https://www.insecam.org/"

# Gather every "bytype" category link from the landing page's dropdown menus.
page = get(baseurl)
tree = html.fromstring(page.content)
category_paths = list()
for anchor in tree.xpath(".//*[@id='insecam-target']//*[@class='dropdown-menu']/li/a"):
    href = anchor.attrib["href"]
    if "bytype" in href:
        # Drop the leading "/" so the urljoin below appends to the base URL.
        category_paths.append(href[1:])

for path in category_paths:
    # e.g. "en/bytype/City/" -> "City"
    cam_type = path.replace("en/bytype", "").replace("/", "")
    # urljoin handles URLs correctly on every platform; os.path.join would
    # build "\\"-separated paths on Windows.
    link = requests.compat.urljoin(baseurl, path)
    page = get(link + "?page=1")
    tree = html.fromstring(page.content)
    # The page count is embedded in an inline script of the form:
    #     pagenavigator("?page=", <total>, 1);
    # An anchored regex is far less brittle than chained str.replace calls.
    script_text = tree.xpath("//script[contains(text(),'pagenavigator')]")[0].text
    match = re.search(r'pagenavigator\("\?page=",\s*(\d+)', script_text)
    # Fall back to a single page if the markup changed, rather than crashing.
    total_pages = int(match.group(1)) if match else 1
    get_page(cam_type, tree)
    for page_no in range(2, total_pages + 1):
        page = get(link + "?page={}".format(page_no))
        tree = html.fromstring(page.content)
        get_page(cam_type, tree)
        time.sleep(5)
|