1. URLsParser.py:
from sgmllib import SGMLParser, SGMLParseError
import urllib
import urlparse

class URLLister(SGMLParser):
    """Collects link targets (<a href>, <img src> and WML <go href>) from a page."""
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
    def start_img(self, attrs):
        src = [v for k, v in attrs if k == 'src']
        if src:
            self.urls.extend(src)
    def start_go(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
def URLFormat(current, relative):
    """Turn a link found on page `current` into an absolute URL."""
    currenturl = 'http://' + urlparse.urlsplit(current)[1]
    relativeurl = relative.strip()
    if relativeurl[:3] != '../':
        if relativeurl[:7] == 'http://':
            return relativeurl                       # already absolute
        elif relativeurl[:2] == './':
            return currenturl + relativeurl[1:]
        else:
            return currenturl + '/' + relativeurl
    # Walk up one directory level for every leading '../'.
    currenturl = current[:current.rfind('/') + 1]
    while relativeurl[:3] == '../':
        relativeurl = relativeurl[3:]
        parent = currenturl.rstrip('/')
        parent = parent[:parent.rfind('/') + 1]
        if parent == 'http://':
            break
        currenturl = parent
    return currenturl + relativeurl
def URLsParser(url):
    """Fetch `url` and return a list of absolute URLs found on it.

    Returns -1 if the page cannot be fetched, -2 if it cannot be parsed
    or contains no links."""
    result = []
    # if urlparse.urlsplit(url)[1] == 'bbs.roboo.com':
    #     return result
    parser = URLLister()
    try:
        usock = urllib.urlopen(url)
    except:
        print 'Cannot connect to the page:', url
        return -1
    try:
        parser.feed(usock.read())
    except SGMLParseError:
        return -2
    usock.close()
    if not parser.urls:
        return -2
    for curl in parser.urls:
        result.append(URLFormat(url, curl))
    return result
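For reference, here is a quick way to try the two helpers by hand. This is only a sketch: it assumes URLsParser.py sits in a lib/ package (as the main program's import suggests), and the page and link values are made up for illustration.

from lib.URLsParser import URLFormat, URLsParser

# Resolve links against the page they were found on.
print URLFormat('http://www.roboo.com/news/today.html', 'http://other.com/a.html')
# -> http://other.com/a.html   (already absolute, returned unchanged)
print URLFormat('http://www.roboo.com/news/today.html', './pic/logo.gif')
# -> http://www.roboo.com/pic/logo.gif

# Fetch a page and list everything it links to; an int return means failure.
links = URLsParser('http://www.roboo.com')
if isinstance(links, list):
    for link in links:
        print link
else:
    print 'fetch/parse failed, error code:', links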
2. Main program Spider.py:
import threading, urlparse
from lib.URLsParser import URLsParser
from Queue import Queue
from time import sleep
import os

URLSTRAPPED = []            # pages that have already been crawled
URLS = []                   # every URL collected so far
DEADURLS = []               # (url, parent) pairs that could not be fetched
WORKFLOW = Queue()          # (url, parent) pairs waiting to be crawled
WIPCount = 0                # number of items still to be processed
lock = threading.RLock()
ROOT = 'http://www.roboo.com'
HOST = urlparse.urlsplit(ROOT)[1].split('.', 1)[1:][0]   # e.g. 'roboo.com'
Finished = False

class ThreadSpider(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self, name = name)
    def run(self):
        global URLS, URLSTRAPPED, WORKFLOW, DEADURLS, WIPCount, Finished
        if WIPCount > 0:
            _urlandparent = WORKFLOW.get()
            lock.acquire()
            WIPCount -= 1
            lock.release()
        else:
            print 'All queued work has been finished.'
            Finished = True
            return
        AllLinks = URLsParser(_urlandparent[0])
        if type(AllLinks) == int and AllLinks == -1:
            # rootdir is set at module level in the __main__ block below.
            fhandle = open(rootdir + '/Report/urls_dead.txt', 'a')
            fhandle.write(str(_urlandparent[0]) + ',' + str(_urlandparent[1]) + '\n')
            fhandle.close()
            DEADURLS.append((_urlandparent[0], _urlandparent[1]))
            return
        elif type(AllLinks) == list:
            URLS += AllLinks
        else:
            return
        # Queue every same-host link found on this page.
        for url in AllLinks:
            try:
                host = urlparse.urlsplit(url)[1].split('.', 1)[1:][0]
            except:
                host = HOST
            if host == HOST and _urlandparent[0] not in URLSTRAPPED:
                lock.acquire()
                WIPCount += 1
                lock.release()
                WORKFLOW.put((url, _urlandparent[0]))
        lock.acquire()
        URLSTRAPPED.append(_urlandparent[0])
        lock.release()
class Spider:
    def __init__(self, rooturl, maxthreads = 20):
        self.root = rooturl
        self.host = HOST
        self._MAXTHREADS = maxthreads
    def Trapping(self):
        global WIPCount, URLS, URLSTRAPPED
        n = 0
        # Seed the queue with every same-host link found on the root page.
        AllLinks = URLsParser(self.root)
        if type(AllLinks) != list:
            print 'Could not crawl the root page:', self.root
            return
        URLS += AllLinks
        for url in AllLinks:
            try:
                host = urlparse.urlsplit(url)[1].split('.', 1)[1:][0]
            except:
                host = self.host
            if host == self.host:
                WORKFLOW.put((url, self.root))
                WIPCount += 1
        i = 0
        while not Finished:
            print i, 'Threads in use:', threading.activeCount(), 'WIP:', WIPCount, 'Trapped URLs:', len(URLSTRAPPED), 'Dead URLs:', len(DEADURLS)
            i = i % 100 + 1
            if WIPCount < 1:
                break
            elif threading.activeCount() > self._MAXTHREADS:
                sleep(3)
                continue
            else:
                n = n % self._MAXTHREADS + 1
                tspider = ThreadSpider('threadSpider' + str(n))
                tspider.start()
                tspider.join()
        print 'All tasks done.'
if __name__ == '__main__':
    rootdir = os.path.dirname(__file__)
    if not os.path.isdir(rootdir + '/Report'):
        os.chdir(rootdir)
        os.mkdir('Report')
    spider = Spider(ROOT, maxthreads = 20)
    spider.Trapping()
    fhandle = open(rootdir + '/Report/urls_trapped.txt', 'w')
    for every in URLSTRAPPED:
        fhandle.write(every + '\n')
    fhandle.close()
    fhandle = open(rootdir + '/Report/urls.txt', 'w')
    for url in URLS:
        fhandle.write(url + '\n')
    fhandle.close()
    print '~_^: Action: Statistics saved.'
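A note on how the crawl stays on one site: HOST drops the first label of the root's netloc ('www.roboo.com' becomes 'roboo.com'), and a discovered link is queued only if its own netloc, stripped the same way, matches. The snippet below simply replays that rule on a few sample URLs (the sample hosts are illustrative only).

import urlparse

def host_suffix(url):
    # Same rule as Spider.py: drop the first label of the netloc.
    return urlparse.urlsplit(url)[1].split('.', 1)[1:][0]

print host_suffix('http://www.roboo.com')          # roboo.com
print host_suffix('http://bbs.roboo.com/x.html')   # roboo.com   -> same site, queued
print host_suffix('http://www.example.com/')       # example.com -> skipped

After a run, the Report/ directory holds urls_trapped.txt (pages actually crawled), urls.txt (every link collected) and urls_dead.txt (url,parent pairs that could not be fetched).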
