1. URLsParser.py:
from sgmllib import SGMLParser, SGMLParseError
import urllib
import urlparse

class URLLister(SGMLParser):
    """Collects link targets (<a href>, <img src> and WML <go href>) from a page."""
    def reset(self):
        SGMLParser.reset(self)
        self.urls = []
    def start_a(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
    def start_img(self, attrs):
        src = [v for k, v in attrs if k == 'src']
        if src:
            self.urls.extend(src)
    def start_go(self, attrs):
        href = [v for k, v in attrs if k == 'href']
        if href:
            self.urls.extend(href)
def URLFormat(current, relative):
    """Turn a link found on page `current` into an absolute URL."""
    currenturl = 'http://' + urlparse.urlsplit(current)[1]
    relativeurl = relative.strip()
    if relativeurl[:3] != '../':
        if relativeurl[:7] == 'http://':
            return relativeurl                       # already absolute
        elif relativeurl[:2] == './':
            return currenturl + relativeurl[1:]
        else:
            return currenturl + '/' + relativeurl
    # Walk up one directory level for every leading '../'.
    currenturl = current[:current.rfind('/') + 1]
    while relativeurl[:3] == '../':
        relativeurl = relativeurl[3:]
        parent = currenturl.rstrip('/')
        parent = parent[:parent.rfind('/') + 1]
        if parent == 'http://':
            break
        currenturl = parent
    return currenturl + relativeurl
def URLsParser(url):
    """Fetch `url` and return a list of absolute URLs found on it.

    Returns -1 if the page cannot be fetched, -2 if it cannot be parsed
    or contains no links."""
    result = []
    # if urlparse.urlsplit(url)[1] == 'bbs.roboo.com':
    #     return result
    parser = URLLister()
    try:
        usock = urllib.urlopen(url)
    except:
        print 'Cannot connect to the page:', url
        return -1
    try:
        parser.feed(usock.read())
    except SGMLParseError:
        return -2
    usock.close()
    if not parser.urls:
        return -2
    for curl in parser.urls:
        result.append(URLFormat(url, curl))
    return result
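For reference, here is a quick way to try the two helpers by hand. This is only a sketch: it assumes URLsParser.py sits in a lib/ package (as the main program's import suggests), and the page and link values are made up for illustration.

from lib.URLsParser import URLFormat, URLsParser

# Resolve links against the page they were found on.
print URLFormat('http://www.roboo.com/news/today.html', 'http://other.com/a.html')
# -> http://other.com/a.html   (already absolute, returned unchanged)
print URLFormat('http://www.roboo.com/news/today.html', './pic/logo.gif')
# -> http://www.roboo.com/pic/logo.gif

# Fetch a page and list everything it links to; an int return means failure.
links = URLsParser('http://www.roboo.com')
if isinstance(links, list):
    for link in links:
        print link
else:
    print 'fetch/parse failed, error code:', links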
2. Main program Spider.py:
import threading, urlparse
from lib.URLsParser import URLsParser
from Queue import Queue
from time import sleep
import os

URLSTRAPPED = []            # pages that have already been crawled
URLS = []                   # every URL collected so far
DEADURLS = []               # (url, parent) pairs that could not be fetched
WORKFLOW = Queue()          # (url, parent) pairs waiting to be crawled
WIPCount = 0                # number of items still to be processed
lock = threading.RLock()
ROOT = 'http://www.roboo.com'
HOST = urlparse.urlsplit(ROOT)[1].split('.', 1)[1:][0]   # e.g. 'roboo.com'
Finished = False

class ThreadSpider(threading.Thread):
    def __init__(self, name):
        threading.Thread.__init__(self, name = name)
    def run(self):
        global URLS, URLSTRAPPED, WORKFLOW, DEADURLS, WIPCount, Finished
        if WIPCount > 0:
            _urlandparent = WORKFLOW.get()
            lock.acquire()
            WIPCount -= 1
            lock.release()
        else:
            print 'All queued work has been finished.'
            Finished = True
            return
        AllLinks = URLsParser(_urlandparent[0])
        if type(AllLinks) == int and AllLinks == -1:
            # rootdir is set at module level in the __main__ block below.
            fhandle = open(rootdir + '/Report/urls_dead.txt', 'a')
            fhandle.write(str(_urlandparent[0]) + ',' + str(_urlandparent[1]) + '\n')
            fhandle.close()
            DEADURLS.append((_urlandparent[0], _urlandparent[1]))
            return
        elif type(AllLinks) == list:
            URLS += AllLinks
        else:
            return
        # Queue every same-host link found on this page.
        for url in AllLinks:
            try:
                host = urlparse.urlsplit(url)[1].split('.', 1)[1:][0]
            except:
                host = HOST
            if host == HOST and _urlandparent[0] not in URLSTRAPPED:
                lock.acquire()
                WIPCount += 1
                lock.release()
                WORKFLOW.put((url, _urlandparent[0]))
        lock.acquire()
        URLSTRAPPED.append(_urlandparent[0])
        lock.release()
class Spider:
    def __init__(self, rooturl, maxthreads = 20):
        self.root = rooturl
        self.host = HOST
        self._MAXTHREADS = maxthreads
    def Trapping(self):
        global WIPCount, URLS, URLSTRAPPED
        n = 0
        # Seed the queue with every same-host link found on the root page.
        AllLinks = URLsParser(self.root)
        if type(AllLinks) != list:
            print 'Could not crawl the root page:', self.root
            return
        URLS += AllLinks
        for url in AllLinks:
            try:
                host = urlparse.urlsplit(url)[1].split('.', 1)[1:][0]
            except:
                host = self.host
            if host == self.host:
                WORKFLOW.put((url, self.root))
                WIPCount += 1
        i = 0
        while not Finished:
            print i, 'Threads in use:', threading.activeCount(), 'WIP:', WIPCount, 'Trapped URLs:', len(URLSTRAPPED), 'Dead URLs:', len(DEADURLS)
            i = i % 100 + 1
            if WIPCount < 1:
                break
            elif threading.activeCount() > self._MAXTHREADS:
                sleep(3)
                continue
            else:
                n = n % self._MAXTHREADS + 1
                tspider = ThreadSpider('threadSpider' + str(n))
                tspider.start()
                tspider.join()
        print 'All tasks done.'
if __name__ == '__main__':
    rootdir = os.path.dirname(__file__)
    if not os.path.isdir(rootdir + '/Report'):
        os.chdir(rootdir)
        os.mkdir('Report')
    spider = Spider(ROOT, maxthreads = 20)
    spider.Trapping()
    fhandle = open(rootdir + '/Report/urls_trapped.txt', 'w')
    for every in URLSTRAPPED:
        fhandle.write(every + '\n')
    fhandle.close()
    fhandle = open(rootdir + '/Report/urls.txt', 'w')
    for url in URLS:
        fhandle.write(url + '\n')
    fhandle.close()
    print '~_^: Action: Statistics saved.'
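A note on how the crawl stays on one site: HOST drops the first label of the root's netloc ('www.roboo.com' becomes 'roboo.com'), and a discovered link is queued only if its own netloc, stripped the same way, matches. The snippet below simply replays that rule on a few sample URLs (the sample hosts are illustrative only).

import urlparse

def host_suffix(url):
    # Same rule as Spider.py: drop the first label of the netloc.
    return urlparse.urlsplit(url)[1].split('.', 1)[1:][0]

print host_suffix('http://www.roboo.com')          # roboo.com
print host_suffix('http://bbs.roboo.com/x.html')   # roboo.com   -> same site, queued
print host_suffix('http://www.example.com/')       # example.com -> skipped

After a run, the Report/ directory holds urls_trapped.txt (pages actually crawled), urls.txt (every link collected) and urls_dead.txt (url,parent pairs that could not be fetched).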
