Manual talk:Pywikibot/weblinkchecker.py/LQT Archive 1

Future feature
It would be pretty neat if one day this could also check whether a page is saved in the Internet Archive. That should be pretty easy to do: all you have to do is append the address of the page you are checking to "http://web.archive.org/web/" and see whether it returns "$PAGE is not available in the Wayback Machine.".

Even neater would be if one could take a positive result and have the bot automatically insert in the article a link to the most recent archived copy, but that would not be as easy. --129.21.121.171 21:16, 10 May 2006 (UTC)

Is it possible to feed the output from this into replace.py or some other script? ST47 19:02, 6 December 2006 (UTC)

Environment proxy variables (http_proxy, no_proxy) support
Weblinkchecker now uses httplib, which does not honor the environment proxy variables http_proxy and no_proxy. So if you are behind a proxy and your wiki is NOT in the outer world (i.e. it is on your internal network), there is no way to check the links normally. Here's a patch which adds support for these variables, though the resulting proxy support is very buggy - it works only in Python 2.6 with your wiki address first in the no_proxy variable, like no_proxy="www.wiki.local, *.local". It would be great to merge this patch (or a fixed version of it) into your SVN...

Index: weblinkchecker.py

=
====================================================== --- weblinkchecker.py	(revision 6937) +++ weblinkchecker.py	(working copy) @@ -95,6 +95,7 @@ import wikipedia, config, pagegenerators import sys, re +import os import codecs, pickle import httplib, socket, urlparse, urllib, urllib2 import threading, time @@ -297,6 +298,16 @@        resolveRedirect. This is needed to detect redirect loops. """        self.url = url +        proxy = os.environ.get("http_proxy").replace('http://',,1) +        self.noproxy = re.compile('\s*(?:,\s*)+').split(os.environ.get("no_proxy").replace('http://',)) +        self.noproxy = map(lambda s: re.escape(s).replace('\\*','.*'), self.noproxy) +        self.noproxy = '|'.join(self.noproxy) +        self.noproxy = re.compile(self.noproxy) +        if proxy and re.search(':', proxy): +            self.proxy, self.proxyport = proxy.split(':') +        else: +            self.proxy = proxy +            self.proxyport = 3128         self.serverEncoding = serverEncoding         self.header = {             # 'User-agent': wikipedia.useragent, @@ -315,30 +326,16 @@         self.HTTPignore = HTTPignore     def getConnection(self): -        if self.scheme == 'http': +        if self.proxy and not self.noproxy.match(self.host): +            return httplib.HTTPConnection(self.proxy, self.proxyport) +       elif self.scheme == 'http': return httplib.HTTPConnection(self.host) elif self.scheme == 'https': return httplib.HTTPSConnection(self.host) def getEncodingUsedByServer(self): -       if not self.serverEncoding: -           try: -               wikipedia.output(u'Contacting server %s to find out its default encoding...' 
% self.host) -               conn = self.getConnection -               conn.request('HEAD', '/', None, self.header) -               response = conn.getresponse +       return 'utf-8' -               self.readEncodingFromResponse(response) -           except: -               pass -           if not self.serverEncoding: -               # TODO: We might also load a page, then check for an encoding -               # definition in a HTML meta tag. -               wikipedia.output(u'Error retrieving server\'s default charset. Using ISO 8859-1.') -               # most browsers use ISO 8859-1 (Latin-1) as the default. -               self.serverEncoding = 'iso8859-1' -       return self.serverEncoding -    def readEncodingFromResponse(self, response): if not self.serverEncoding: try: @@ -367,6 +364,7 @@            encoding = self.getEncodingUsedByServer self.path = unicode(urllib.quote(self.path.encode(encoding))) self.query = unicode(urllib.quote(self.query.encode(encoding), '=&')) +       self.url = urlparse.urlunparse([ self.scheme, self.host, self.path, '', self.query, urllib.quote(self.fragment) ]) def resolveRedirect(self, useHEAD = False): ''' @@ -379,9 +377,9 @@        conn = self.getConnection try: if useHEAD: -               conn.request('HEAD', '%s%s' % (self.path, self.query), None, self.header) +               conn.request('HEAD', self.url, None, self.header) else: -               conn.request('GET', '%s%s' % (self.path, self.query), None, self.header) +               conn.request('GET', self.url, None, self.header) response = conn.getresponse # read the server's encoding, in case we need it later self.readEncodingFromResponse(response) @@ -446,7 +444,8 @@            if isinstance(error, basestring): msg = error else: -               msg = error[1] +               try: msg = error[1] +               except: msg = error[0] # TODO: decode msg. On Linux, it's encoded in UTF-8. # How is it encoded in Windows? Or can we somehow just # get the English message? 
@@ -483,7 +482,7 @@            except httplib.error, error: return False, u'HTTP Error: %s' % error.__class__.__name__ try: -               conn.request('GET', '%s%s' % (self.path, self.query), None, self.header) +               conn.request('GET', self.url, None, self.header) except socket.error, error: return False, u'Socket Error: %s' % repr(error[1]) try: @@ -789,6 +788,7 @@    # that are also used by other scripts and that determine on which pages # to work on. genFactory = pagegenerators.GeneratorFactory +   global day day = 7 for arg in wikipedia.handleArgs: if arg == '-talk': @@ -805,7 +805,6 @@        elif arg.startswith('-ignore:'): HTTPignore.append(int(arg[8:])) elif arg.startswith('-day:'): -           global day day = int(arg[5:]) else: if not genFactory.handleArg(arg):