Toolserver:User:Valhallasw/enwikitslinks

Generation
wget enwiki-20121201-externallinks.sql

import re,sys

# Quoted URLs whose host starts with the reversed form "org.toolserver".
# The dot between "org" and "toolserver" is escaped so it only matches a
# literal dot; the trailing lazy ".*?" stops at the closing single quote.
TOOLSERVER_LINK_RE = re.compile(r"'(http://org\.toolserver.*?)'")

# Emit a progress message to stderr once per this many input lines.
PROGRESS_INTERVAL = 100


def find_toolserver_links(line):
    """Return every single-quoted toolserver URL found in *line*.

    The result is a list of the URL strings (without the surrounding
    quotes), in the order they occur; it is empty when nothing matches.
    """
    return TOOLSERVER_LINK_RE.findall(line)


def _extract_links_from_dump():
    """Scan the file named by ``sys.argv[1]`` and print matching URLs.

    One URL is written per line on stdout.  Progress ("line N") goes to
    stderr so that it does not mix with the extracted links when stdout
    is redirected to a file or a pipe.
    """
    if len(sys.argv) < 2:
        sys.exit("usage: %s <externallinks.sql>" % sys.argv[0])

    # The context manager closes the dump even if processing is interrupted.
    with open(sys.argv[1]) as dump:
        for i, line in enumerate(dump):
            if i % PROGRESS_INTERVAL == 0:
                sys.stderr.write("line %i\n" % i)
            entries = find_toolserver_links(line)
            if entries:
                sys.stdout.write("\n".join(entries) + "\n")


if __name__ == "__main__":
    _extract_links_from_dump()

import sys, itertools, collections

# NOTE(review): this list is not referenced anywhere in the visible code; it
# is kept unchanged in case another part of the page relies on it.
scripts = ['http://org.toolserver./~tparis/pcount/', 'http://org.toolserver./%7Edispenser/cgi-bin/dab_solver.py/']

# Reporting stops after the first row whose count falls below this value.
DEFAULT_THRESHOLD = 10000


def count_path_prefixes(lines):
    """Count how many input URLs fall under each '/'-delimited prefix.

    For every non-blank line the query string (everything from the first
    '?') is dropped, '%7E' is rewritten to '~', and each cumulative prefix
    of the remaining URL, split on '/', is incremented exactly once.

    Surrounding whitespace (including the trailing newline) is removed
    first, so a URL is counted under the same keys whether or not it
    carried a query string.

    Returns a ``collections.defaultdict`` mapping prefix -> count.
    """
    counter = collections.defaultdict(int)
    for line in lines:
        path = line.strip().split('?')[0].replace('%7E', '~')
        if not path:
            continue
        pieces = []
        for sub in path.split('/'):
            pieces.append(sub)
            # The last prefix is the full path itself, so no separate
            # increment for the whole path is needed afterwards.
            counter["/".join(pieces)] += 1
    return counter


def iter_top_rows(counter, threshold=DEFAULT_THRESHOLD):
    """Yield formatted report rows, highest count first.

    Rows are ordered by descending count; ties are ordered by key so the
    output is deterministic.  Iteration stops right after the first row
    whose count is below *threshold* -- that row is still yielded, which
    keeps the print-then-break order of the original loop.
    """
    ranked = sorted(counter.items(), key=lambda item: (-item[1], item[0]))
    for key, value in ranked:
        # NOTE(review): the whitespace-only separators are reproduced as
        # found; confirm they were not originally table markup.
        yield "  %s    %s  " % (key.strip(), value)
        if value < threshold:
            break


def _print_prefix_report():
    """Read URLs from stdin and write the prefix report to stdout."""
    counter = count_path_prefixes(sys.stdin)
    for row in iter_top_rows(counter):
        sys.stdout.write(row + "\n")


if __name__ == "__main__":
    _print_prefix_report()