Zone-h.org

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from time import sleep
import csv
import requests
from bs4 import BeautifulSoup
import datetime
import os

def parse(html):
    """Append the defacement rows of one saved archive page to today's CSV.

    The page is expected to contain a ``<table id="ldeface">``. Each ``<tr>``
    with at least as many ``<td>`` cells as there are columns becomes one
    record; the first such row is dropped (presumably the table's own heading
    row -- confirm against a live page) and at most 25 of the following rows
    are appended to ``csv/zone-h-data-collected-at-<ddmmYYYY>.csv``.

    Args:
        html: Path of the HTML file to read.

    Raises:
        IndexError: If the page has no table with id ``ldeface``.
        IOError/OSError: If the input cannot be read or the CSV not written.
    """
    header = ['Date', 'Notifier', 'H', 'M', 'R', 'L', 'Star', 'Domain', 'OS', 'View']
    rows_per_page = 25

    # Read bytes and let BeautifulSoup detect the encoding; the context
    # manager makes sure the handle is closed.
    with open(html, 'rb') as page:
        html_doc = page.read().rstrip()

    soup = BeautifulSoup(html_doc, 'html.parser')
    tables = soup.findAll("table", {"id" : "ldeface" })
    if not tables:
        raise IndexError('no table with id "ldeface" in {}'.format(html))

    # Build complete records row by row. Collecting per-column lists (and
    # skipping cells that fail) would let one bad cell shift a whole column
    # relative to the others.
    rows = []
    for item in tables[0].findAll('tr'):
        cells = item.findAll('td')
        if len(cells) < len(header):
            # Not a data row (e.g. a row with a single spanning cell).
            continue
        row = []
        # zip() ignores any cells beyond the known columns.
        for name, cell in zip(header, cells):
            text = cell.getText().rstrip()
            if not isinstance(text, str):
                # Python 2: getText() yields unicode; encode explicitly so
                # non-ASCII text is kept instead of raising.
                text = text.encode('utf-8')
            print('%s %s' % (name, text))
            row.append(text)
        rows.append(row)

    # Skip the first record, then keep at most one page worth of entries;
    # slicing copes with pages that hold fewer rows.
    records = rows[1:1 + rows_per_page]

    if not os.path.isdir('csv'):
        os.makedirs('csv')
    unique = datetime.datetime.now().strftime('%d%m%Y')
    with open('csv/zone-h-data-collected-at-{}.csv'.format(unique), 'a') as csvfile:
        author = csv.writer(csvfile, delimiter=',', quotechar='|', quoting=csv.QUOTE_MINIMAL)
        author.writerows(records)

# Number of the last archive result page to visit.
LAST_PAGE = 50
# Text typed into the archive's "domaininput" filter field.
DOMAIN_FILTER = 'id'


def _solve_captcha(driver, field_name):
    """Prompt the operator for captcha answers until the page shows none.

    Loops while the page source contains 'captcha', typing each answer read
    from the console into the input named *field_name*.
    """
    while 'captcha' in driver.page_source:
        field = driver.find_element_by_name(field_name)
        field.clear()
        # raw_input: this script targets Python 2.
        field.send_keys(raw_input('> '))
        field.send_keys(Keys.RETURN)
        sleep(5)


def _is_page_link(href, page):
    """Return True if *href* contains '/page=<page>' as a whole number.

    A plain substring test would also accept '/page=20' when looking for
    page 2, so the character following the match must not be a digit.
    """
    if not href:
        return False
    marker = '/page={}'.format(page)
    pos = href.find(marker)
    if pos < 0:
        return False
    return not href[pos + len(marker):][:1].isdigit()


def _save_and_parse(driver, page):
    """Write the browser's current page to /tmp and hand the file to parse()."""
    unique = '{}-{}'.format(page, datetime.datetime.now().strftime('%d%m%Y'))
    filename = '/tmp/zone-h-archive-page-{}.html'.format(unique)
    source = driver.page_source
    if not isinstance(source, bytes):
        # Encode explicitly: writing unicode with non-ASCII characters to a
        # text-mode file raises UnicodeEncodeError on Python 2.
        source = source.encode('utf-8')
    with open(filename, 'wb') as f:
        f.write(source)
    parse(filename)


# Start from a clean slate. Only this script's own page dumps are removed from
# /tmp, and -f keeps the command from failing when nothing matches.
os.system('rm -f csv/* /tmp/zone-h-archive-page-*.html')

driver = webdriver.Firefox()
try:
    driver.set_window_size(768, 512)
    driver.get("http://www.zone-h.org/archive")
    _solve_captcha(driver, "captcha")

    # Follow the first link that points at the filtered archive view.
    for elem in driver.find_elements_by_xpath("//a[@href]"):
        if '/archive/filter=1' in elem.get_attribute("href"):
            elem.click()
            sleep(5)
            break

    # Submit the domain filter.
    elem = driver.find_element_by_class_name("domaininput")
    elem.clear()
    elem.send_keys(DOMAIN_FILTER)
    elem.send_keys(Keys.RETURN)
    sleep(5)
    _solve_captcha(driver, "archivecaptcha")

    # The first result page is already displayed.
    _save_and_parse(driver, 1)

    for i in range(2, LAST_PAGE + 1):
        sleep(5)
        for elem in driver.find_elements_by_xpath("//a[@href]"):
            try:
                if not _is_page_link(elem.get_attribute("href"), i):
                    continue
                elem.click()
                sleep(5)
                _save_and_parse(driver, i)
            except Exception as exc:
                # Report what went wrong and move on to the next page number.
                # KeyboardInterrupt is deliberately not caught, so Ctrl+C
                # still stops the run.
                print('[!] ERROR on page {}: {!r}'.format(i, exc))
            # Stop scanning links after the first match or failure: the old
            # element handles are no longer usable once the page has changed.
            break

    sleep(5)
finally:
    # Always shut the browser session down, even when a step above failed.
    driver.quit()

results matching ""

    No results matching ""