The following code samples are released under the terms of the GPL 2.0 licence.
It's a URL link checker. It isn't fully recursive yet (I'm working on it; they don't teach recursive algorithms on my course), but it checks all the anchor tags on the given page and returns warnings and the like depending on what it finds.
Usage: just run it like so: "python [scriptname]".
It will then ask for the URL of the page you wish to check and then print a report.
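The exact report depends on the page, but judging from the format strings in the script it looks roughly like this (the URL, link names and status codes here are made up for illustration):

python [scriptname]
Url you wish to check:www.example.com/index.html
Parsing file...
3 links found
SUCCESS |Home returned 200
WARNING |About returned 301, (/about.html)
FAILURE |Old post returned 404, (/2007/old-post.html)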
Python
#Link Checker
#Brazen attempt to write one in 15 minutes
# Jonathan Dalrymple
# July 15,2008
# Start 11:52
# End 12:35
import httplib
import sgmllib
import re
#HTML Parser
class LinkChecker( sgmllib.SGMLParser ):

    def __init__( self, verbose = 0 ):
        sgmllib.SGMLParser.__init__( self, verbose )
        self.linkList = []
        self.inLink = False
        self.lastHref = None
        self.hostName = None
        print 'Parsing file...'

    def parse( self, fileStr ):
        self.feed( fileStr )
        self.close()

    def start_a( self, attr ):
        #Entered an anchor tag; remember its href
        self.inLink = True
        for name, val in attr:
            if name == 'href':
                self.lastHref = val

    def end_a( self ):
        self.inLink = False

    def handle_data( self, data ):
        #Text inside an anchor becomes the link's display name
        if self.inLink and self.lastHref:
            tmp = self.__parseUrl( self.lastHref )
            self.linkList.append( (data, tmp['host'], tmp['path']) )

    #Split a url into a host part and a path part
    def __parseUrl( self, urlStr ):
        ret = {}
        #slice off a preceding http://
        if urlStr[0:7] == 'http://':
            urlStr = urlStr[7:]
        #Extract the host with regex "\w+\.(\w*)\.(\w{2}\.)?(\w{2,3})"
        m = re.compile( r"\w+\.(\w*)\.(\w{2}\.)?(\w{2,3})" ).match( urlStr )
        if m is None:
            #Regex failed: treat the whole string as a path on the current host
            ret['host'] = None
            ret['path'] = urlStr
        else:
            ret['host'] = urlStr[0:m.end()]
            if m.end() != len( urlStr ):
                ret['path'] = urlStr[m.end():]
            else:
                ret['path'] = None
        return ret

    #Check a single link and describe the result
    def __checkLink( self, displayName, host = None, path = '/' ):
        if host is None:
            host = self.hostName
        if path is None:
            path = '/'
        reqObj = httplib.HTTPConnection( host, 80 )
        reqObj.request( 'GET', path )
        response = reqObj.getresponse()
        if response.status == 200:
            retVal = "SUCCESS |%s returned %d" % ( displayName, response.status )
        elif response.status == 404:
            retVal = "FAILURE |%s returned %d, (%s)" % ( displayName, response.status, path )
        else:
            retVal = "WARNING |%s returned %d, (%s)" % ( displayName, response.status, path )
        return retVal

    def testUrlParser( self, urls ):
        for u in urls:
            print '--' + str( self.__parseUrl( u ) )

    def runReport( self, urlStr ):
        urlDict = self.__parseUrl( urlStr )
        if urlDict is not None:
            self.hostName = urlDict['host']
            req = httplib.HTTPConnection( urlDict['host'], 80 )
            req.request( 'GET', urlDict['path'] or '/' )
            response = req.getresponse()
            if response.status == 200:
                htmlStr = response.read()
                self.parse( htmlStr )
                print "%d links found" % len( self.linkList )
                for v in self.linkList:
                    print self.__checkLink( v[0], v[1], v[2] )
            else:
                print "Download Request for %s failed: %d" % ( response.reason, response.status )

def main():
    urlStr = raw_input( 'Url you wish to check:' )
    if urlStr != None:
        bar = LinkChecker()
        bar.runReport( urlStr )

if __name__ == '__main__':
    main()
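The whole host/path split hinges on that one regular expression, so here is a small illustration of my own (the example URLs are made up, and this snippet isn't part of the checker itself) showing how it divides a string once any leading http:// has been sliced off:

import re

#Same pattern the checker compiles in __parseUrl
pattern = re.compile( r"\w+\.(\w*)\.(\w{2}\.)?(\w{2,3})" )
for url in ( 'www.example.com/index.html', 'docs.python.org/lib/lib.html', 'www.example.co.uk' ):
    m = pattern.match( url )
    if m:
        #Everything up to the end of the match is the host, the rest is the path
        print url[:m.end()], '=>', url[m.end():] or '(no path)'
    else:
        print url, '=> regex did not match'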
Quote of the Day
I now finally understand why those guys buy £100,000 cars, only to sit in traffic. It's because they remember the days of running for the train, only to find that it's standing room only and their new best friend is a significantly taller gentleman's (or lady's!!) armpit and/or sweaty back.