# FFL_Scrape/FFLCrawler.py (patrickFalvey/FFL_Scrape, master branch)

import requests
from bs4 import BeautifulSoup
import urllib2
import urlparse
import thread
import time

# Scraped results, shared module-wide: scrapeGrab appends to both lists and
# the database section at the bottom of the file reads them.
products, prices = [], []

# Crawl entry points. `hurl` is only referenced by the commented-out
# scrapeHyatt code further down.
url = 'http://grabagun.com/firearms/'
hurl = 'http://www.hyattgunstore.com/'


def scrapeGrab(url, max_pages=5000, timeout=30):
    """Crawl the site rooted at *url* breadth-first, collecting product data.

    Every fetched page is parsed with BeautifulSoup; the text of each
    ``<h2 class="product-name">`` is appended to the module-level ``products``
    list and the text of each ``<span class="price">`` to the module-level
    ``prices`` list (duplicates are skipped). Links whose absolute form
    contains *url* are queued for later visits.

    NOTE(review): products and prices are de-duplicated independently, so the
    two lists are not guaranteed to stay index-aligned (e.g. two products
    sharing one price). Pairing them per product needs knowledge of the page
    markup that is not visible here.

    Args:
        url: Start URL; also the substring a link must contain to be followed.
        max_pages: Upper bound on the number of URLs taken from the queue.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        None. Results are accumulated in ``products`` and ``prices``.
    """
    from collections import deque

    urls = deque([url])  # FIFO queue of pages still to fetch
    seen = {url}         # every URL ever queued, so each is fetched once
    print('Length of urls = ' + str(len(urls)))

    for _ in range(max_pages):
        # Stop cleanly once the crawl frontier is exhausted instead of
        # indexing into an empty queue.
        if not urls:
            break

        current = urls.popleft()
        try:
            html = requests.get(current, timeout=timeout)
        except requests.RequestException:
            # Skip pages that cannot be fetched; falling through would parse
            # an undefined or stale response.
            print('failed to fetch = ' + current)
            continue

        print('visiting = ' + current)
        soup = BeautifulSoup(html.text, 'html5lib')
        print('Length of urls = ' + str(len(urls)))

        for heading in soup.findAll('h2', {'class': 'product-name'}):
            product = heading.text
            if product not in products:
                products.append(product)

        for span in soup.findAll('span', {'class': 'price'}):
            price = span.text
            if price not in prices:
                prices.append(price)

        for anchor in soup.findAll('a', href=True):
            # Resolve relative hrefs against the page they appear on, not the
            # start URL, so links on nested pages point to the right place.
            link = urlparse.urljoin(current, anchor['href'])
            if url in link and link not in seen:
                seen.add(link)
                urls.append(link)


# Run the crawl as soon as the module is executed. It fills the module-level
# `products` and `prices` lists that the database section below writes out.
scrapeGrab(url)
##def scrapeHyatt(hurl):
##    hurls = [hurl]
##    hvisited = [hurl]
##
##    print 'Length of urls = ' + str(len(hurls))
##    for i in range(10):
##        try:
##            html = requests.get(hurls[0])
##        except:
##            print urls[0]
##
##        print 'visiting = ' + hurls[0]
##        soup = BeautifulSoup(html.text, 'html5lib')
##        hurls.pop(0)
##        print 'Length of urls = ' + str(len(hurls))
##
##        tag = soup.select('a strong')
##        for i in tag:
##            product = i.text
##            if product not in products:
##                products.append(product)
##
##        money = soup.findAll('span', {'class':'currency'})
##        for i in money:
##            price = i.text
##            if price not in prices:
##                prices.append(price)
##
##        for tag in soup.findAll('a',href=True):
##            tag = urlparse.urljoin(hurl,tag['href'])
##            if hurl in tag and tag not in hvisited:
##                print tag
##                hurls.append(tag)
##                hvisited.append(tag)


##thread.start_new_thread(scrapeHyatt,(hurl,))
##    time.sleep(1)
##thread.start_new_thread(scrapeGrab, (url,))


import mysql.connector

# Persist the scraped data: rebuild the `inventory` table from scratch and
# store one row per (product, price) pair produced by zip(products, prices).
#
# NOTE(review): the database credentials are hard-coded in source; they should
# be moved to configuration or environment variables.
conn = mysql.connector.connect(user = 'patrick', password = 'Password2011!', host = 'localhost', database = 'test')
try:
    cursor = conn.cursor()
    # Use the same spelling as the CREATE below: MySQL table names can be
    # case-sensitive, in which case dropping `Inventory` leaves `inventory`
    # in place and the CREATE fails on every run after the first.
    cursor.execute('DROP TABLE IF EXISTS inventory')
    try:
        cursor.execute(
            """CREATE TABLE inventory(
                Id  INT(10) PRIMARY KEY AUTO_INCREMENT,
                product VARCHAR(100),
                price VARCHAR(100))""")

        # zip() stops at the shorter list, so unmatched trailing items are
        # not stored. list() keeps this working where zip() is lazy.
        rows = list(zip(products, prices))
        cursor.executemany(
            " INSERT INTO inventory(product, price) VALUES(%s, %s)", rows)
        conn.commit()
    except Exception as e:
        # Best-effort export: report the failure and carry on.
        print(e)
    finally:
        cursor.close()
finally:
    # Always release the connection, including when a statement fails.
    conn.close()