-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathFFLCrawler.py
More file actions
109 lines (93 loc) · 3.02 KB
/
FFLCrawler.py
File metadata and controls
109 lines (93 loc) · 3.02 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import requests
from bs4 import BeautifulSoup
import urllib2
import urlparse
import thread
import time
# Start pages for the two storefronts this script knows about.
hurl = 'http://www.hyattgunstore.com/'
url = 'http://grabagun.com/firearms/'

# Module-level accumulators: the crawler appends scraped text to these and
# the database step at the end of the file reads them back.
products, prices = [], []
def scrapeGrab(url, max_pages=5000, timeout=30):
    """Crawl pages reachable from ``url`` and collect product names and prices.

    Starting at ``url``, pages are fetched in first-in-first-out order. On
    each page the text of every ``<h2 class="product-name">`` is appended to
    the module-level ``products`` list and the text of every
    ``<span class="price">`` to the module-level ``prices`` list, skipping
    values already present. Every ``<a href>`` is resolved against ``url``
    and queued if the resolved link contains ``url`` as a substring and has
    not been queued before.

    Args:
        url: Start page; also the substring a link must contain to be
            followed.
        max_pages: Upper bound on fetch attempts (failed fetches count too).
        timeout: Seconds passed to ``requests.get`` so that a stalled server
            cannot hang the crawl indefinitely.

    Returns:
        None. Results accumulate in ``products`` and ``prices``.
    """
    # Local import so this block does not depend on the file's import header.
    from collections import deque

    urls = deque([url])
    # Every URL ever queued (visited or still pending). A set gives O(1)
    # membership tests where the original scanned two lists per link.
    seen = {url}

    print('Length of urls = ' + str(len(urls)))
    for _ in range(max_pages):
        if not urls:
            # Frontier exhausted before the page budget: stop instead of
            # raising IndexError on an empty queue.
            break

        current = urls.popleft()
        try:
            html = requests.get(current, timeout=timeout)
        except requests.RequestException:
            # Skip pages that cannot be fetched. Falling through here would
            # parse an unbound (or the previous page's) response.
            print(current)
            continue

        print('visiting = ' + current)
        soup = BeautifulSoup(html.text, 'html5lib')
        print('Length of urls = ' + str(len(urls)))

        # NOTE(review): names and prices are de-duplicated independently, so
        # index k of `products` need not belong with index k of `prices`,
        # yet the insert loop at the bottom of this file zips them together.
        for heading in soup.findAll('h2', {'class': 'product-name'}):
            product = heading.text
            if product not in products:
                products.append(product)

        for span in soup.findAll('span', {'class': 'price'}):
            price = span.text
            if price not in prices:
                prices.append(price)

        for anchor in soup.findAll('a', href=True):
            link = urlparse.urljoin(url, anchor['href'])
            # Substring test keeps the crawl under the start URL.
            if url in link and link not in seen:
                seen.add(link)
                urls.append(link)


scrapeGrab(url)
##def scrapeHyatt(hurl):
## hurls = [hurl]
## hvisited = [hurl]
##
## print 'Length of urls = ' + str(len(hurls))
## for i in range(10):
## try:
## html = requests.get(hurls[0])
## except:
## print urls[0]
##
## print 'visiting = ' + hurls[0]
## soup = BeautifulSoup(html.text, 'html5lib')
## hurls.pop(0)
## print 'Length of urls = ' + str(len(hurls))
##
## tag = soup.select('a strong')
## for i in tag:
## product = i.text
## if product not in products:
## products.append(product)
##
## money = soup.findAll('span', {'class':'currency'})
## for i in money:
## price = i.text
## if price not in prices:
## prices.append(price)
##
## for tag in soup.findAll('a',href=True):
## tag = urlparse.urljoin(hurl,tag['href'])
## if hurl in tag and tag not in hvisited:
## print tag
## hurls.append(tag)
## hvisited.append(tag)
##thread.start_new_thread(scrapeHyatt,(hurl,))
## time.sleep(1)
##thread.start_new_thread(scrapeGrab, (url,))
import mysql.connector

# Persist the scraped data: rebuild the `inventory` table from scratch and
# insert one row per (product, price) pair produced by zipping the two
# module-level lists.
#
# NOTE(review): the credentials below are hard-coded in source; consider
# loading them from the environment or a config file kept out of version
# control.
conn = mysql.connector.connect(user = 'patrick', password = 'Password2011!', host = 'localhost', database = 'test')
try:
    cursor = conn.cursor()
    # Use the same spelling as the CREATE below. On servers with
    # case-sensitive table names, dropping `Inventory` leaves `inventory` in
    # place and the CREATE then fails on every run after the first.
    cursor.execute('DROP TABLE IF EXISTS inventory')
    try:
        cursor.execute(
            """CREATE TABLE inventory(
            Id INT(10) PRIMARY KEY AUTO_INCREMENT,
            product VARCHAR(100),
            price VARCHAR(100))""")
        for row in zip(products, prices):
            # Parameterized insert: the driver handles quoting of the
            # scraped text.
            cursor.execute(" INSERT INTO inventory(product, price) VALUES(%s, %s)", row)
        conn.commit()
    except Exception as e:
        # Best-effort export: report the failure and carry on.
        print(e)
finally:
    # Always release the connection, including when an insert fails.
    conn.close()