User:Alterego/ExtensionMatrix/Source code

From MediaWiki.org
Jump to: navigation, search

Contents

[edit] Extension Matrix Source

This is the source code for User:Alterego/ExtensionMatrix

# Credentials handed to site.login() further down. Both are empty strings in
# this published copy of the script.
username = ''
password = ''
 
from re import sub
from sys import path
from dateutil.parser import parse
import datetime
path.append('/usr/local/mwclient')
path.append('/usr/local/mwclient/simplejson')
import client as mwclient
 
# Open the wiki and log in with the credentials defined at the top.
site = mwclient.Site('www.mediawiki.org', path='/w/')
site.login(username, password)

# Category whose members are walked to find the extension pages.
all_extensions = site.categories["All extensions"]

# extension name -> raw template wikitext ('' when none could be extracted)
extensions = {}
# extension name -> cleaned-up dict of template parameters
extensions_dicts = {}

# Lookup tables mapping a type / status / MediaWiki version to the list of
# extension names filed under it.
extensions_by_type = {}
extensions_by_status = {}
extensions_by_mw_version = {}
# Initialised here; not filled in by any code in this script.
extensions_by_creation_date = {}

# Rankings of (year, month, day, extension name) tuples, filled in later.
recently_edited = []
recently_discussed = []
recently_updated = []
recently_created = []
 
# converts an extension dict back into template format
def BuildTemplate(extension_dict):
    """Render a parameter dict as an ``{{ExtensionMatrix}}`` template call.

    Every key/value pair becomes one ``|key=value`` line, in the dict's own
    iteration order. Values containing an HTML comment marker (``<!-`` or
    ``-->``) are left out entirely.

    Returns the template wikitext, ending with ``}}`` and a newline.
    """
    pieces = ['{{ExtensionMatrix\n']
    for name in extension_dict:
        content = extension_dict[name]
        # Comment markers inside a value caused trouble, so skip such values.
        if '<!-' in content or '-->' in content:
            continue
        pieces.append('|' + name + '=' + content + '\n')
    pieces.append('}}\n')
    return ''.join(pieces)
 
##########################################
# Download the template for each extension
##########################################

for this_extension in all_extensions:
    # Titles look like "Extension:Name". Split on the first colon only, so a
    # name that itself contains a colon is kept whole.
    title_parts = this_extension.name.split(':', 1)
    if len(title_parts) < 2:
        # No namespace prefix at all - nothing we can look up.
        continue
    extension_name = title_parts[1]

    # Keep empty vals around to create a list of poorly formatted extensions
    extensions[extension_name] = ''

    try:
        # Extract the wikitext. Normally wouldn't be this simple but
        # the extensions are well formatted, each ending with
        # \n}}. Could recursively look for sub templates to be more
        # sure we're at the end.
        wikitext = site.Pages['Extension:' + extension_name].edit()

        template_start = wikitext.find('{{Extension')
        if template_start == -1:
            template_start = wikitext.find('{{extension')
        if template_start == -1:
            # No infobox on the page: leave the empty placeholder in place.
            continue

        template_end = wikitext.find('\n}}', template_start)
        if template_end == -1:
            # Template never closes the way we expect; treat as unparseable.
            continue

        # +3 keeps the closing "\n}}" inside the captured text.
        template = wikitext[template_start:template_end + 3]
        extensions[extension_name] = template

    except Exception:
        # If someone did something stupid, not worth breaking the bot.
        # (Ctrl-C and SystemExit are deliberately NOT swallowed.)
        continue
 
# Turn each downloaded template into a dictionary of its parameters and
# clean the values up. This relies on the template ending with \n}} and on
# every parameter sitting on its own line that starts with a pipe.

for extension in extensions:

    # Don't allow subpage extensions
    if '/' in extension:
        continue

    extension_dict = {}
    hooks, tags, types = [], [], []

    # Some people like to have funky spacing: drop ANY run of spaces on
    # either side of a pipe (chained str.replace calls cannot do this,
    # because the single-space replacement consumes part of a double space).
    template = sub(r' *\| *', '|', extensions[extension])

    # This hacks off {{Extension and }}, and has the convenient side effect
    # of nuking |templatemode= when it shows up on the first line
    template = template.split('\n')[1:-1]

    # Can't allow newlines inside values - saw way too many crazy template
    # values. Only lines that begin with a pipe are treated as parameters.
    template = [line for line in template if line.startswith('|')]

    for param in template:
        if '=' not in param:
            continue  # Not a key=value line. Not my fault.
        key, value = param.split('=', 1)
        key = key.replace('|', '').strip()
        value = value.strip()

        if not value:
            continue

        if 'name' in key:
            # Sometimes the name field doesn't contain the actual name of the extension
            value = extension

        # hook1/hook2/..., tag1/..., type1/... are gathered into lists and
        # merged into a single parameter after the loop.
        if 'hook' in key:
            hooks.append(value)
            continue
        if 'tag' in key:
            tags.append(value)
            continue
        if 'type' in key:
            types.append(value)
            continue

        # Have a look at LocalisationUpdate for nested templateness that is
        # just not ok: skip values with unbalanced template braces.
        if ('{{' in value) != ('}}' in value):
            continue
        # These always turn out to be copy/paste jobs from the prototype template
        if '<!-' in value or '-->' in value:
            continue

        value = value.replace('<ref>', ' ').replace('</ref>', ' ')
        extension_dict[key] = value

    for merged_key, collected in (('hooks', hooks),
                                  ('tags', tags),
                                  ('types', types)):
        if collected:
            collected.sort()
            extension_dict[merged_key] = '<br/>'.join(collected)

    # Sometimes the name isn't specified at all
    if 'name' not in extension_dict:
        extension_dict['name'] = extension

    # Don't allow empty templates, or templates with just one parameter
    if len(extension_dict) <= 1:
        continue

    extensions_dicts[extension] = extension_dict
 
# sorted list of the full matrix for later use
sorted_matrix = sorted(extensions_dicts)

# Convert every parseable 'update' value into a common, wikitable-sortable
# "D Mon YYYY" format; values that cannot be parsed are removed.
months = ['Jan','Feb','Mar','Apr','May','Jun','Jul','Aug','Sep','Oct','Nov','Dec']
for extension in extensions_dicts:
    params = extensions_dicts[extension]
    if 'update' not in params:
        continue
    try:
        this_date = parse(params['update'])
        params['update'] = '%d %s %d' % (this_date.day,
                                         months[this_date.month - 1],
                                         this_date.year)
    except Exception:
        # Free-form text the date parser cannot handle. Catching Exception
        # (not everything) lets Ctrl-C / SystemExit still stop the bot.
        del params['update']
 
##########################################
# figure out what versions of mediawiki this extension works on
# this just looks for a string match of the version. i personally
# don't trust the +,>=,etc.. sign people like to use, for example, 1.12+.
# that generally means that they tested it on 1.12, but not the
# versions that came afterwards.
#
# A version counts as mentioned when it is
#   - not preceded by a digit or a dot (so "1.0" is no longer found inside
#     "1.11.0+", nor "1.2" inside "5.1.2"), and
#   - followed by a space, newline, "(", ".", "+" or the end of the value
#     (so a value that is just "1.12" is now recognised).
#
# TODO: Still hacky. Unrelated version numbers in the same value are picked
# up too, e.g. "mediawiki 1.16 <= accesscontrol 1.3" also reports 1.3.
##########################################

import re

# (version string, compiled matcher) for 1.0 .. 1.19 and 2.0 .. 2.19
version_patterns = []
for major_version in range(1, 3):
    for version in range(0, 20):
        this_version = str(major_version) + '.' + str(version)
        version_patterns.append((
            this_version,
            re.compile(r'(?<![\d.])' + re.escape(this_version) + r'(?=[ \n(.+]|$)'),
        ))

for extension in extensions_dicts:
    if 'mediawiki' not in extensions_dicts[extension]:
        continue

    supported_versions = []
    version_text = extensions_dicts[extension]['mediawiki']

    for this_version, matcher in version_patterns:
        if matcher.search(version_text):
            supported_versions.append(this_version)
            extensions_by_mw_version.setdefault(this_version, []).append(extension)

    # Replace the free-form text with the list of versions that were found.
    extensions_dicts[extension]['mediawiki'] = ', '.join(supported_versions)
 
##########################################
# For every extension record when its page and its talk page were last
# touched, and when the page was created
##########################################


def _wiki_date(stamp):
    """Format a value exposing tm_mday / tm_mon / tm_year as 'D Mon YYYY'."""
    return (str(stamp.tm_mday) + ' ' +
            str(months[stamp.tm_mon - 1]) + ' ' +
            str(stamp.tm_year))


for extension in extensions_dicts:
    entry = extensions_dicts[extension]

    this_extension = site.Pages["Extension:" + extension]
    if this_extension.exists:  # should never fail!
        entry['lastupdated'] = _wiki_date(this_extension.touched)

        # The whole revision listing is materialised and its last item is
        # taken as the page's first edit.
        oldest_revision = list(this_extension.revisions())[-1]
        entry['created'] = _wiki_date(oldest_revision['timestamp'])

    this_extension = site.Pages["Extension_talk:" + extension]
    if this_extension.exists:
        entry['lastupdatedtalk'] = _wiki_date(this_extension.touched)
 
 
##########################################
# Build the rankings of most recently edited, discussed, updated and
# created extensions. Each entry is a (year, month, day, name) tuple, so a
# plain sort orders the entries by date.
##########################################

for extension in extensions_dicts:
    entry = extensions_dicts[extension]
    for date_field, ranking in (('lastupdated', recently_edited),
                                ('lastupdatedtalk', recently_discussed),
                                ('update', recently_updated),
                                ('created', recently_created)):
        if date_field in entry:
            this_date = parse(entry[date_field])
            ranking.append((this_date.year, this_date.month, this_date.day, extension))

# Newest first.
for ranking in (recently_edited, recently_discussed,
                recently_updated, recently_created):
    ranking.sort()
    ranking.reverse()
 
##########################################
# Group extensions by type; extensions without any type go under 'notype'
##########################################

for extension in extensions_dicts:
    this_extension = extensions_dicts[extension]

    if 'types' not in this_extension:
        extensions_by_type.setdefault('notype', []).append(extension)
        continue

    # 'types' holds the individual type values joined with <br/>.
    for this_type in this_extension['types'].split('<br/>'):
        this_type = this_type.lower()
        if '--' in this_type:
            # Cut off a trailing HTML comment, if there is one.
            this_type = this_type.split('<!--')[0]
        extensions_by_type.setdefault(this_type, []).append(extension)

# Discard types used by fewer than five extensions. Iterate over a snapshot
# of the keys because entries are deleted along the way.
for this_type in list(extensions_by_type):
    if len(extensions_by_type[this_type]) < 5:
        del extensions_by_type[this_type]
 
##########################################
# Group extensions by their (lower-cased) status
##########################################
extensions_by_status = {}
for extension in extensions_dicts:
    this_extension = extensions_dicts[extension]
    if 'status' not in this_extension:
        continue

    this_status = this_extension['status'].lower()
    # Sanity check: only single-word statuses (no space anywhere) are kept.
    if ' ' in this_status:
        continue

    extensions_by_status.setdefault(this_status, []).append(extension)
 
##########################################
# Start the landing page text and publish the complete matrix subpage
##########################################
extension_matrix = ''
prefix = 'Extension Matrix'

# The timestamp is the local clock; the zone label is a fixed string.
updated = 'Last updated: %s MST. ' % datetime.datetime.now().strftime("%Y-%m-%d %H:%M")

num_listed = 'Listing %d out of %d members of [[:Category:Extensions]]<br/>' % (
    len(extensions_dicts), len(extensions))

extension_matrix = ''.join([
    updated, num_listed, '\n',
    '== Entire Extension Matrix ==\n',
    '* [[', prefix, '/AllExtensions|View all extensions]] (very large!)\n',
])

# One template call per listed extension, in alphabetical order, wrapped in
# the header and footer templates.
matrix_rows = [BuildTemplate(extensions_dicts[extension]) for extension in sorted_matrix]
entire_matrix = '{{ExtensionMatrixHeader}}' + ''.join(matrix_rows) + '{{ExtensionMatrixFooter}}'
page = site.Pages[prefix + "/AllExtensions"]
page.save(entire_matrix)
 
 
# Landing page section plus one subpage per MediaWiki version that at least
# one extension mentions.
extension_matrix += '== By explicitly supported MediaWiki version ==\n* '

for major_version in range(1, 3):
    for minor_version in range(0, 20):
        version = '%d.%d' % (major_version, minor_version)
        if version not in extensions_by_mw_version:
            continue

        members = extensions_by_mw_version[version]
        num_extensions = str(len(members))
        extension_matrix += '[[%s/%s|%s]] (%s), ' % (prefix, version, version, num_extensions)

        # Matrix page listing only the extensions filed under this version.
        rows = ''.join([BuildTemplate(extensions_dicts[extension]) for extension in members])
        this_version_matrix = '{{ExtensionMatrixHeader}}' + rows + '{{ExtensionMatrixFooter}}'
        page = site.Pages[prefix + '/' + version]
        page.save(this_version_matrix)

extension_matrix += '\n'
 
# Landing page section plus one subpage per status, in alphabetical order.
extension_matrix += '== By status of extension ==\n*'
status_keys = sorted(extensions_by_status)
for this_status in status_keys:
    members = extensions_by_status[this_status]
    num_extensions = str(len(members))
    extension_matrix += '[[%s/%s|%s]] (%s), ' % (prefix, this_status, this_status, num_extensions)

    # Matrix page listing only the extensions with this status.
    rows = ''.join([BuildTemplate(extensions_dicts[extension]) for extension in members])
    this_status_matrix = '{{ExtensionMatrixHeader}}' + rows + '{{ExtensionMatrixFooter}}'
    page = site.Pages[prefix + '/' + this_status]
    page.save(this_status_matrix)
extension_matrix += '\n'
 
# Landing page section plus one subpage per extension type, alphabetically.
extension_matrix += '== By type of extension ==\n* '
type_keys = sorted(extensions_by_type)
# The first key in sort order is always skipped: the original author used
# this to get rid of a weird 'Alterego/ExtensionMatrix' type.
for this_type in type_keys[1:]:
    members = extensions_by_type[this_type]
    num_extensions = str(len(members))
    extension_matrix += '[[%s/%s|%s]] (%s), ' % (prefix, this_type, this_type, num_extensions)

    # Matrix page listing only the extensions of this type.
    rows = ''.join([BuildTemplate(extensions_dicts[extension]) for extension in members])
    this_type_matrix = '{{ExtensionMatrixHeader}}' + rows + '{{ExtensionMatrixFooter}}'
    page = site.Pages[prefix + '/' + this_type]
    page.save(this_type_matrix)
extension_matrix += '\n'
 
# Four "most recent" sections on the landing page. Each row gives: section
# heading, ranking list (newest first), the parameter holding the date that
# is displayed, and the namespace prefix to link to.
recent_sections = (
    ('== 500 most recently created extensions ==\n* ',
     recently_created, 'created', 'Extension:'),
    ('== 500 most recently edited extension pages ==\n* ',
     recently_edited, 'lastupdated', 'Extension:'),
    ('== 500 most recently edited extension talk pages ==\n* ',
     recently_discussed, 'lastupdatedtalk', 'Extension_talk:'),
    ('== 500 most recently updated extensions ==\n* ',
     recently_updated, 'update', 'Extension:'),
)

for heading, ranking, date_field, namespace in recent_sections:
    extension_matrix += heading
    # Slice instead of indexing 0..499: a ranking with fewer than 500
    # entries must not abort the run with an IndexError.
    for entry in ranking[:500]:
        extension_name = entry[3]
        extension_date = extensions_dicts[extension_name][date_field]
        extension_matrix += ('[[' + namespace + extension_name + '|' +
                             extension_name + ']] (' + extension_date + '), ')
    extension_matrix += '\n'

# Publish the landing page itself.
page = site.Pages[prefix]
page.save(extension_matrix)

[edit] Extension Matrix Hooks Source

#!/usr/bin/python
from pysvn import Client
from urllib2 import urlopen
from glob import glob
from os import walk
from pprint import pprint
 
################################
# Check out every extension from the subversion trunk.
# Gets 676 extensions at this time, which is about 33% of the extension matrix.
# TODO: Just update the previous checkout, put in sane /usr/local directory
################################
svn_url = "http://svn.wikimedia.org/svnroot/mediawiki/trunk/extensions"
checkout_dir = "/tmp/mw_extensions"
client = Client()
client.checkout(svn_url, checkout_dir)
 
################################
# Read the hook names out of the MediaWiki hook documentation. Every line
# containing "':" is taken to be a 'HookName': entry; the name is whatever
# precedes the first colon, with the quotes removed.
################################
hooks_txt = urlopen("https://gerrit.wikimedia.org/r/gitweb?p=mediawiki/core.git;a=blob_plain;f=docs/hooks.txt").read()
hooks = [
    line.split(":")[0].replace("'", '')
    for line in hooks_txt.split("\n")
    if "':" in line
]
 
################################
# Get rid of all hooks whose entire name occurs in another hook
# Hacky, but prevents us from needing to parse PHP.
# Maybe we just need to parse enough to get the $wgHooks array?
# At the time of running it gets rid of the following 37 / 411 hooks
# NOTE: Manually excluding the User hook
# -----
# AlternateEdit AlternateEditPreview
# ArticleAfterFetchContent ArticleAfterFetchContentObject
# ArticleDelete ArticleDeleteComplete
# ArticleEditUpdates ArticleEditUpdatesDeleteFromRecentchanges
# ArticleProtect ArticleProtectComplete
# ArticleSave ArticleSaveComplete
# ArticleUndelete ArticleUndeleteLogEntry
# BlockIp BlockIpComplete
# EditFilter EditFilterMerged
# EditFilterMerged EditFilterMergedContent
# EditPage EditPageBeforeConflictDiff
# EditPage EditPageBeforeConflictDiff
# EditPage EditPageBeforeConflictDiff
# EditPage EditPageBeforeConflictDiff
# EditPage EditPageBeforeConflictDiff
# EditSectionLink DoEditSectionLink
# EmailUser EmailUserCC
# ExtensionTypes SpecialVersionExtensionTypes
# getUserPermissionsErrors getUserPermissionsErrorsExpensive
# Language LanguageGetNamespaces
# LinksUpdate LinksUpdateComplete
# LocalFile LocalFilePurgeThumbnails
# MarkPatrolled MarkPatrolledComplete
# PageContentSave PageContentSaveComplete
# SearchGetNearMatch SearchGetNearMatchBefore
# ShowSearchHit ShowSearchHitTitle
# SpecialListusersHeader SpecialListusersHeaderForm
# UnwatchArticle UnwatchArticleComplete
# UploadForm UploadFormInitDescriptor
# UploadForm UploadFormInitDescriptor
# UploadComplete SpecialUploadComplete
# UploadComplete SpecialUploadComplete
# User AlternateUserMailer
# UserGetEmail UserGetEmailAuthenticationTimestamp
# UserLogout UserLogoutComplete
# UserSetEmail UserSetEmailAuthenticationTimestamp
# WatchArticle WatchArticleComplete
# -----
################################
# Keep only the hooks whose name is not contained in the name of some other
# (differently named) hook.
hooks_keep = [
    candidate for candidate in hooks
    if not any(candidate != other and candidate in other for other in hooks)
]
hooks = hooks_keep
 
################################
# Find every .php file anywhere below the checkout directory
################################
files = []
for root, _subdirs, _names in walk("/tmp/mw_extensions"):
    files += glob(root + "/*.php")

################################
# Group the files per extension: extension name -> list of file paths
################################
extension_files = {}
for f in files:
    # Paths start with /tmp/mw_extensions/, so component 3 of the
    # "/"-split path is the first path element below the checkout directory.
    extension = f.split("/")[3]
    extension_files.setdefault(extension, []).append(f)
 
################################
# For each extension get a list of all the hooks used.
# And the reverse.
# 639 / 676 extension used hooks
# 270 / 375 hooks used by extensions
################################
# extension name -> hooks found in its code, and hook -> extensions using it
extension_hooks, hook_extensions = {}, {}

for extension in extension_files:
    # Concatenate all of the extension's PHP files into one searchable string.
    sources = []
    for f in extension_files[extension]:
        # "with" closes the file even if reading it fails.
        with open(f) as d:
            sources.append(d.read())
    code = ''.join(sources)

    for hook in hooks:
        # Plain substring search - no PHP parsing.
        # TODO: User hook is spammy
        if hook in code:
            extension_hooks.setdefault(extension, []).append(hook)
            hook_extensions.setdefault(hook, []).append(extension)
 
################################
# Print both mappings as wikitext bullet lists, keys in sorted order
################################
for heading, mapping in (("== Extensions --> Hooks ==", extension_hooks),
                         ("== Hooks --> Extensions ==", hook_extensions)):
    print(heading)
    for key in sorted(mapping):
        # The list is shown in its Python repr, wrapped in <nowiki>.
        print("* '" + key + "': <nowiki>" + str(mapping[key]) + '</nowiki>')