#!/usr/bin/env python
"""
Super sloppy parser and analysis

Copyright 2003 J o s e p h R e a g l e
Licensed under the W3C Software License:
http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231

Reads two ranked link listings (an older and a newer snapshot), then reports
how many of the top-ranked sites in the newer snapshot are absent from the
older one and how far each remaining site moved in rank.

The code deliberately sticks to constructs that run unchanged on both
Python 2 and Python 3 (no print statement, no tuple parameters, no cmp()).
"""

import getopt  # noqa: F401 -- unused in this file; retained from the original
import operator
import re
import sys

# One listing entry per line.  Groups, in order: url, title, inbound.
# NOTE(review): the pattern literal in the file as received was truncated to
# '(?P.*?)</a> \((?P<inbound>.*?)\)', which does not compile and has only one
# usable group although three values are unpacked from it.  The leading
# anchor tag below is a reconstruction that assumes lines of the form
#     <a href="URL">TITLE</a> (INBOUND)
# -- confirm against the real snapshot files.
_ENTRY_PATTERN = re.compile(
    r'<a href="(?P<url>.*?)">(?P<title>.*?)</a> \((?P<inbound>.*?)\)'
)

# How many of the newest ranks are examined by main().
TOP_N = 500

OLD_SNAPSHOT = "linked-20020731.html"
NEW_SNAPSHOT = "linked-20030212.html"


def parse(data, infd):
    """Fill the two dictionaries in *data* from the lines of *infd*.

    Args:
        data: a ``(site, ranking)`` pair of dictionaries, updated in place.
            ``ranking`` maps a 1-based rank (the position of the line in
            *infd*) to a url; ``site`` maps a url to
            ``(rank, title, inbound)``.  ``inbound`` is stored as the
            matched text, not converted to a number.
        infd: any iterable of text lines, e.g. an open file.

    Returns:
        The same ``(site, ranking)`` dictionaries that were passed in.

    Raises:
        SystemExit: after printing "regexp failed", as soon as a line does
            not match the entry pattern.
    """
    site, ranking = data
    for rank, line in enumerate(infd, 1):
        match = _ENTRY_PATTERN.search(line)
        if match is None:
            print("regexp failed")
            sys.exit(1)
        url, title, inbound = match.group("url", "title", "inbound")
        ranking[rank] = url
        # A url that occurs more than once keeps its last occurrence here.
        site[url] = (rank, title, inbound)
    return site, ranking


def cmpDist(a, b):
    """Old-style comparison of two ``(key, distance)`` pairs by distance.

    Returns -1, 0 or 1, like the Python 2 builtin ``cmp`` applied to the two
    distances.  main() sorts with a key function instead; this function is
    kept so that existing callers continue to work.
    """
    _, d1 = a
    _, d2 = b
    return (d1 > d2) - (d1 < d2)


def main():
    """Compare the two snapshots and print the report to standard output."""
    site_old, ranking_old = {}, {}
    site_now, ranking_now = {}, {}
    snapshots = (
        ((site_old, ranking_old), OLD_SNAPSHOT),
        ((site_now, ranking_now), NEW_SNAPSHOT),
    )

    for data, path in snapshots:
        try:
            infd = open(path)
        except IOError:  # open() reports a missing file as IOError/OSError
            print("No such file")
            sys.exit(1)
        with infd:
            parse(data, infd)

    distance = {}
    new = 0
    # Ranks are 1-based; never look past what the newer snapshot contains.
    last_rank = min(TOP_N, len(ranking_now))
    for rank in range(1, last_rank + 1):
        url = ranking_now[rank]
        # All urls go on one line, separated by single spaces; the summary
        # text printed after the loop continues that same line.
        sys.stdout.write(url + " ")
        if url in site_old:
            # Negative: the site climbed towards rank 1; positive: it fell.
            distance[url] = site_now[url][0] - site_old[url][0]
        else:
            new += 1

    print("Total number of new sites in the top %d %d" % (TOP_N, new))
    print("*******************")

    for url, step in sorted(distance.items(), key=operator.itemgetter(1)):
        print("%s\t%d" % (url, step))


if __name__ == "__main__":
    main()