1. URLsParser.py:
from sgmllib import SGMLParser, SGMLParseError
import urllib
import urlparse
import time, sys

class URLLister(SGMLParser):

    def reset(self):
        SGMLParser.reset(self)
        self.urls = []

    def start_a(self, attrs):
        # Collect href attributes from <a> tags.
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)

    def start_img(self, attrs):
        # Collect src attributes from <img> tags.
        href = [v for k, v in attrs if k == 'src']
        if href:
            self.urls.extend(href)

    def start_go(self, attrs):
        # Collect href attributes from WML <go> tags.
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
            
def URLFormat(current, relative):
    # Resolve a link found on page 'current' into an absolute URL.
    currenturl = 'http://' + urlparse.urlsplit(current)[1]
    #current[:current.rfind('/')]
    relativeurl = relative.strip()

    if relativeurl[:3] != '../':
        if relativeurl[:7] == 'http://':
            return relativeurl
        elif relativeurl[:2] == './':
            return currenturl + relativeurl[1:]
        else:
            return currenturl + '/' + relativeurl

    # The link starts with '../': step up one directory level per '../' segment.
    while relativeurl[:3] == '../':
        url = current[:current.rfind('/') + 1]
        relativeurl = relativeurl[3:]
        if url == 'http://':
            break
        current = current[:current.rfind('/')]
        currenturl = url

    return currenturl + relativeurl
            
def URLsParser(url):
    result = []
#    if urlparse.urlsplit(url)[1] == 'bbs.roboo.com':
#        return result

    parser = URLLister()
    try:
        usock = urllib.urlopen(url)
    except:
        print 'Cannot connect to the page:', url
        return -1
    try:
        parser.feed(usock.read())
    except SGMLParseError, message:
        #print url, ':::', message
        return -2

    usock.close()

    if not parser.urls:
        return -2

    # Rewrite every harvested link into an absolute URL.
    for curl in parser.urls:
        curl = URLFormat(url, curl)
        result.append(curl)

    return result
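
For reference, a minimal usage sketch of this module (Python 2). The target URL is only a placeholder, and the printed messages are my own way of reporting the -1/-2 error codes returned above:

# Hypothetical usage of URLsParser; any URL works the same way.
from lib.URLsParser import URLsParser

links = URLsParser('http://www.roboo.com')
if isinstance(links, list):
    for link in links:
        print link        # already rewritten to absolute URLs by URLFormat
elif links == -1:
    print 'page could not be fetched'
else:                     # -2: parse error, or no links found
    print 'page could not be parsed, or it contains no links'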
            


2. Main program Spider.py:
import re, urllib, threading, urlparse
from lib.URLsParser import URLsParser
from Queue import Queue
from time import sleep
import csv, os

URLSTRAPPED = []            # URLs whose pages have already been crawled
URLS = []                   # every URL discovered so far
DEADURLS = []               # (url, parent) pairs that could not be fetched
WORKFLOW = Queue()          # work queue of (url, parent) pairs
WIPCount = 0                # number of queued items not yet processed
lock = threading.RLock()
ROOT = 'http://www.roboo.com'
HOST = urlparse.urlsplit(ROOT)[1].split('.', 1)[1:][0]   # e.g. 'roboo.com'
Finished = False

class ThreadSpider(threading.Thread):

    def __init__(self, name):
        threading.Thread.__init__(self, name = name)

    def run(self):
        global URLS, URLSTRAPPED, WORKFLOW, DEADURLS, WIPCount, Finished

        if WIPCount > 0:
            _urlandparent = WORKFLOW.get()
            lock.acquire()
            WIPCount -= 1
            lock.release()
        else:
            print 'Work in process has been finished.'
            Finished = True
            return

        AllLinks = URLsParser(_urlandparent[0])
        if type(AllLinks) == int and AllLinks == -1:
            # The page could not be fetched: record it as a dead link.
            fhandle = open(rootdir + '/Report/urls_dead.txt', 'a')
            fhandle.write(str(_urlandparent[0]) + ',' + str(_urlandparent[1]) + '\n')
            fhandle.close()
            DEADURLS.append((_urlandparent[0], _urlandparent[1]))
            return
        elif type(AllLinks) == list:
            URLS += AllLinks
        else:
            return

        for url in AllLinks:
            try:
                host = urlparse.urlsplit(url)[1].split('.', 1)[1:][0]
            except:
                host = HOST
            # Only follow links on the same host, and only if the current
            # page has not been crawled before.
            if host == HOST and _urlandparent[0] not in URLSTRAPPED:
                lock.acquire()
                WIPCount += 1
                lock.release()
                WORKFLOW.put((url, _urlandparent[0]))

        lock.acquire()
        URLSTRAPPED.append(_urlandparent[0])
        lock.release()

class Spider():

    def __init__(self, rooturl, maxthreads = 20):
        _mainhost = urlparse.urlsplit(rooturl)[1]
        self.root = rooturl
        self.host = HOST
        self._MAXTHREADS = maxthreads

    def Trapping(self):
        global WIPCount, URLS, URLSTRAPPED
        threadpool = []
        n = 0
        # Seed the work queue with the links found on the root page.
        AllLinks = URLsParser(self.root)
        try:
            URLS += AllLinks
        except:
            pass

        for url in AllLinks:
            try:
                host = urlparse.urlsplit(url)[1].split('.', 1)[1:][0]
            except:
                host = self.host
            if host == self.host:
                WORKFLOW.put((url, self.root))
                WIPCount += 1

        i = 0
        while not Finished:
            print i, 'Threads in use:', threading.activeCount(), 'WIP:', WIPCount, 'Trapped URLs:', len(URLSTRAPPED), 'Dead URLs:', len(DEADURLS)
            i = i % 100 + 1
            if WIPCount < 1:
                break
            elif threading.activeCount() > self._MAXTHREADS:
                sleep(3)
                continue
            else:
                n = n % self._MAXTHREADS + 1
                tspider = ThreadSpider('threadSpider' + str(n))
                tspider.start()
                tspider.join()

        print 'All tasks done.'
    
if __name__ == '__main__':
    rootdir = os.path.dirname(os.path.abspath(__file__))
    if not os.path.isdir(rootdir + '/Report'):
        os.chdir(rootdir)
        os.mkdir('Report')

    spider = Spider(ROOT, maxthreads = 20)
    spider.Trapping()

    # Save the crawl results under Report/.
    fhandle = open(rootdir + '/Report/urls_trapped.txt', 'w')
    for every in URLSTRAPPED:
        fhandle.write(every + '\n')
    fhandle.close()

    fhandle = open(rootdir + '/Report/urls.txt', 'w')
    for url in URLS:
        fhandle.write(url + '\n')
    fhandle.close()

    print '~_^: Action: Statistics saved.'
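
After a run, the Report directory holds urls_trapped.txt (pages that were actually crawled), urls.txt (every link discovered) and urls_dead.txt (unreachable pages together with the page that linked to them). To point the spider at another site, only the ROOT constant in Spider.py needs to change, since HOST is derived from it; a hypothetical sketch, with www.example.com as a placeholder domain:

# Hypothetical: edit these two module-level lines in Spider.py to crawl
# another host; everything else stays the same.
ROOT = 'http://www.example.com'
HOST = urlparse.urlsplit(ROOT)[1].split('.', 1)[1:][0]   # 'example.com'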