Wednesday, July 23, 2008

OSS: Python webpage link checker

Another piece of Python to go. Nothing special, but something fairly useful that I've thrown together.
The following code is released under the terms of the GPL 2.0.

It's a URL link checker. It's not fully recursive yet (I'm working on it; they don't teach recursive algorithms on my course), but it checks all the anchor tags on the given page, returning warnings and the like depending on what it finds. A rough sketch of how recursion could be bolted on appears after the listing.

Usage: to use this, just run it like so: "python [scriptname]".
It will then ask for the URL of the page you wish to check, and then it will print a report.
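Something along these lines, say, against an imaginary page with three links (the names and paths here are made up for illustration):

Url you wish to check:www.example.com/
Parsing file...
3 links found
SUCCESS |Home returned 200
WARNING |About returned 301, (/about)
FAILURE |Old post returned 404, (/2007/old-post.html)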

Python

#Link Checker
#Brazen attempt to write one in 15 minutes
# Jonathan Dalrymple
# July 15,2008
# Start 11:52
# End 12:35

import httplib
import sgmllib
import re

#HTML Parser
class LinkChecker( sgmllib.SGMLParser ):
    def __init__( self, verbose = 0 ):
        sgmllib.SGMLParser.__init__( self, verbose )

        self.linkList = []
        self.inLink = False
        self.lastHref = None
        self.hostName = None

        print 'Parsing file...'

    def parse( self, fileStr ):
        self.feed( fileStr )
        self.close()

    #Called by SGMLParser for every opening <a> tag
    def start_a( self, attr ):

        #Show user that the parser is working
        #print '*'

        self.inLink = True

        for name, val in attr:
            if name == 'href':
                self.lastHref = val

    def end_a( self ):
        self.inLink = False

    #Called for the text between tags; record it if we're inside an anchor
    #that actually has an href (named anchors are skipped)
    def handle_data( self, str ):

        if self.inLink and self.lastHref is not None:

            tmp = self.__parseUrl( self.lastHref )
            self.linkList.append( (str, tmp['host'], tmp['path']) )

    #Split a URL into its host and path parts
    def __parseUrl( self, str ):
        ret = {}

        #Slice off the preceding http:// so the host regex can match
        if str[0:7] == 'http://':
            str = str[7:]

        #Extract the host: name, domain, optional country code, 2-3 letter TLD
        m = re.compile( "\w+\.(\w*)\.(\w{2}\.)?(\w{2,3})" ).match( str )

        if m is None:
            #No host matched; treat the whole string as a relative path
            ret['host'] = None
            ret['path'] = str
        else:
            ret['host'] = m.string[0:m.end()]

            if m.end() < len( str ):
                ret['path'] = str[ m.end(): ]
            else:
                ret['path'] = None

        return ret
    #Request each link and classify the response by its status code
    def __checkLink( self, displayName, host = None, path = None ):
        if host is None:
            host = self.hostName
        if path is None:
            path = '/'

        reqObj = httplib.HTTPConnection( host, 80 )

        #print '---Requesting %s' % ( host + path )
        reqObj.request( 'GET', path )

        response = reqObj.getresponse()

        if response.status == 200:
            retVal = "SUCCESS |%s returned %d" % (displayName, response.status)
        elif response.status == 404:
            retVal = "FAILURE |%s returned %d, (%s)" % (displayName, response.status, path)
        else:
            retVal = "WARNING |%s returned %d, (%s)" % (displayName, response.status, path)

        return retVal

    def testUrlParser( self, list ):
        for u in list:
            print '--' + str( self.__parseUrl( u ) )
    def runReport( self, urlStr ):
        urlDict = self.__parseUrl( urlStr )

        if urlDict['host'] is not None:
            self.hostName = urlDict['host']
            req = httplib.HTTPConnection( urlDict['host'], 80 )

            req.request( 'GET', urlDict['path'] or '/' )
            response = req.getresponse()

            if response.status == 200:

                htmlStr = response.read()

                self.parse( htmlStr )

                print "%d links found" % len( self.linkList )

                for v in self.linkList:
                    #print v
                    print self.__checkLink( v[0], v[1], v[2] )
            else:
                print "Download request for %s failed: %d (%s)" % ( urlStr, response.status, response.reason )


def main():

    #urlStr = "http://www.google.co.uk/search?hl=en&q=bar&btnG=Google+Search&meta="
    urlStr = raw_input( 'Url you wish to check:' )
    if urlStr:
        bar = LinkChecker()
        #foo = ('www.google.com','http://www.google.com','http://docs.python.org/lib/lib.html','docs.python.org/test')
        #bar.testUrlParser( foo )
        bar.runReport( urlStr )

if __name__ == '__main__':
    main()
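And since I mentioned recursion above, here's roughly how I imagine bolting it on: re-run the parser on every same-host page that comes back, with a visited set and a depth cap so it can't loop forever. The crawl function below is an untested sketch, not part of the script above.

#Untested sketch: a recursive crawl built on top of LinkChecker
def crawl( host, path = '/', visited = None, depth = 2 ):
    if visited is None:
        visited = set()
    if depth == 0 or (host, path) in visited:
        return
    visited.add( (host, path) )

    #Check this page and collect its links
    checker = LinkChecker()
    checker.runReport( host + path )

    #Follow only the links that stay on the same host
    for name, linkHost, linkPath in checker.linkList:
        if linkHost is None or linkHost == host:
            crawl( host, linkPath or '/', visited, depth - 1 )

With depth capped at 2 it only goes one page beyond the start page, which keeps it polite while I work out the proper way to do this.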


Quote of the Day
I now finally understand why those guys buy £100,000 cars, only to sit in traffic. It's because they remember the days of running for the train, only to find that it's standing room only and their new best friend is a significantly taller gentleman's (or lady's!!) armpit and/or sweaty back.

Realizations. Andorra, here I come.
