#!/usr/bin/env python
''' Super sloppy parser and analysis
Copyright 2003 J o s e p h R e a g l e
Licensed under the W3C Software License:
http://www.w3.org/Consortium/Legal/2002/copyright-software-20021231
'''
def parse(data, infd):
    '''Read a ranked listing of links from infd into the given tables.

    data is a (site, ranking) pair of dictionaries, filled in place:
        ranking maps rank -> url (rank is 1 for the first line read and
                counts up by one per line)
        site    maps url -> (rank, title, inbound); title and inbound are
                stored as the matched text, not converted
    infd is any iterable of text lines, e.g. an open file.

    Returns the same (site, ranking) pair that was passed in.

    Every line must contain an entry; on the first line that does not,
    "regexp failed" is printed and SystemExit is raised with status 1.
    '''
    import re
    import sys

    site, ranking = data

    # Expected layout of one entry:  <a href="URL">TITLE</a> (INBOUND)
    # NOTE(review): the anchor markup and group names were reconstructed to
    # supply the three values unpacked below -- confirm against the real
    # input files.
    pattern = re.compile(
        r'<a href="(?P<url>.*?)">(?P<title>.*?)</a> \((?P<inbound>.*?)\)')

    for rank, line in enumerate(infd, 1):
        match = pattern.search(line)
        if not match:
            print("regexp failed")
            # Non-zero status so a calling shell can see the failure.
            sys.exit(1)
        url, title, inbound = match.group('url', 'title', 'inbound')
        ranking[rank] = url
        site[url] = (rank, title, inbound)
    return site, ranking
def cmpDist(a, b):
    '''Three-way comparison of two (key, distance) pairs by distance only.

    a and b are each a 2-item sequence (key, distance); the keys are
    ignored.  Returns -1, 0 or 1 when a's distance is respectively less
    than, equal to, or greater than b's, so it can be used as an old-style
    comparison function for sorting.
    '''
    _key1, d1 = a
    _key2, d2 = b
    # Same result as the cmp() builtin, without relying on that builtin.
    return (d1 > d2) - (d1 < d2)
if __name__ == "__main__":
    # Script entry point: load two ranked link listings (an older and a
    # newer one), then report, for the leading ranks of the newer listing,
    # which sites are new and how many ranks every other site moved.
    import functools
    import getopt
    import sys

    TOP_N = 500  # how many leading ranks of the newer listing to examine

    site_old = {}      # url -> (rank, title, inbound), older listing
    site_now = {}      # url -> (rank, title, inbound), newer listing
    ranking_old = {}   # rank -> url, older listing
    ranking_now = {}   # rank -> url, newer listing
    distance = {}      # url -> (rank now - rank old), sites in both listings

    file_old = ((site_old, ranking_old), "linked-20020731.html")
    file_now = ((site_now, ranking_now), "linked-20030212.html")
    files = file_old, file_now

    for data, path in files:
        try:
            infd = open(path)
        except IOError:
            # open() signals a missing or unreadable file with IOError
            # (the same class as OSError on Python 3).
            print("No such file")
            sys.exit(1)
        try:
            # parse() fills the two dictionaries in `data` in place.
            parse(data, infd)
        finally:
            infd.close()

    new = 0
    # Ranks start at 1; stop early if the newer listing is shorter than TOP_N.
    last_rank = min(TOP_N, len(ranking_now))
    for rank in range(1, last_rank + 1):
        url = ranking_now[rank]
        # URLs are emitted space-separated on a single line; the total
        # printed below continues that same line.
        sys.stdout.write("%s " % url)
        if url in site_old:
            distance[url] = site_now[url][0] - site_old[url][0]
        else:
            new += 1
    print("Total number of new sites in the top 500 %d" % new)
    print("*******************")

    # List the sites present in both listings, ordered by rank change.
    distance_list = list(distance.items())
    distance_list.sort(key=functools.cmp_to_key(cmpDist))
    for site, step in distance_list:
        print("%s\t%d" % (site, step))