User:Alterego/ExtensionMatrix/Source code
From MediaWiki.org
Contents |
[edit] Extension Matrix Source
This is the Python 2 source code for the bot that generates User:Alterego/ExtensionMatrix.
# Extension Matrix bot (Python 2, old mwclient API).
#
# Reads the {{Extension}} infobox of every page in "All extensions" on
# mediawiki.org, normalises its parameters, and writes a set of
# "Extension Matrix" overview pages back to the wiki.

username = ''
password = ''

import re
from re import sub
from sys import path
from dateutil.parser import parse
import datetime

path.append('/usr/local/mwclient')
path.append('/usr/local/mwclient/simplejson')
import client as mwclient

site = mwclient.Site('www.mediawiki.org', path='/w/')
site.login(username, password)
all_extensions = site.categories["All extensions"]

extensions, extensions_dicts, extensions_by_type, extensions_by_status = {}, {}, {}, {}
extensions_by_mw_version, extensions_by_creation_date = {}, {}
recently_edited, recently_discussed, recently_updated, recently_created = [], [], [], []

# Month abbreviations used for the common, wikitable-sortable date format.
months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
          'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

# Number of entries shown in each "most recently ..." section.
RECENT_LIMIT = 500


def BuildTemplate(extension_dict):
    """Convert an extension dict back into {{ExtensionMatrix}} template text.

    Each key/value pair becomes one ``|key=value`` line. Values containing
    HTML comment markers are skipped.
    """
    lines = ['{{ExtensionMatrix\n']
    for key in extension_dict:
        value = extension_dict[key]
        # This guy giving me a hard time for some reason
        if '<!-' in value or '-->' in value:
            continue
        lines.append('|' + key + '=' + value + '\n')
    lines.append('}}\n')
    return ''.join(lines)


def format_date(day, month, year):
    """Return a date as 'D Mon YYYY'; *month* is 1-based."""
    return str(day) + ' ' + months[month - 1] + ' ' + str(year)


def find_supported_versions(version_text):
    """Return the MediaWiki versions (1.0 .. 2.19) mentioned in *version_text*.

    This is a plain textual match; the +, >= etc. signs people like to use
    are not interpreted (1.12+ usually just means "tested on 1.12").

    A version counts when it is followed by one of `` \\n(.+`` or by the end
    of the text, and is not the tail of a longer number, so "1.11.0+" yields
    1.11 and no longer also 1.0. Versions of *other* software mentioned in
    the text (e.g. "accesscontrol 1.3") are still picked up.
    """
    supported = []
    for major in range(1, 3):
        for minor in range(0, 20):
            candidate = str(major) + '.' + str(minor)
            pattern = (r'(?<!\d)(?<!\d\.)' + re.escape(candidate) +
                       r'(?=[ \n(.+]|$)')
            if re.search(pattern, version_text):
                supported.append(candidate)
    return supported


def build_matrix_page(extension_names):
    """Return the wikitext of a matrix page listing *extension_names*."""
    parts = ['{{ExtensionMatrixHeader}}']
    for name in extension_names:
        parts.append(BuildTemplate(extensions_dicts[name]))
    parts.append('{{ExtensionMatrixFooter}}')
    return ''.join(parts)


def recent_links(entries, date_key, namespace):
    """Return wiki links for the first RECENT_LIMIT *entries*.

    *entries* holds (year, month, day, extension_name) tuples, newest first.
    Slicing (rather than indexing 0..499) keeps this working when fewer than
    RECENT_LIMIT entries exist.
    """
    links = []
    for entry in entries[:RECENT_LIMIT]:
        name = entry[3]
        links.append('[[' + namespace + ':' + name + '|' + name + ']] (' +
                     extensions_dicts[name][date_key] + '), ')
    return ''.join(links)


##########################################
# Download the template for each extension
##########################################
for this_extension in all_extensions:
    try:
        # Split only on the first colon so names containing ':' survive.
        extension_name = this_extension.name.split(':', 1)[1]
        # Keep empty vals around to create a list of poorly formatted extensions
        extensions[extension_name] = ''
        # Extract the wikitext. Normally wouldn't be this simple but
        # the extensions are well formatted, each ending with
        # \n}}. Could recursively look for sub templates to be more
        # sure we're at the end.
        wikitext = site.Pages['Extension:' + extension_name].edit()
        template_start = wikitext.find('{{Extension')
        if template_start == -1:
            template_start = wikitext.find('{{extension')
        if template_start == -1:
            # No infobox on this page; leave the empty placeholder in place.
            continue
        template_end = template_start + wikitext[template_start:].find('\n}}')
        extensions[extension_name] = wikitext[template_start:template_end + 3]
    except Exception:
        # If someone did something stupid, not worth breaking the bot
        continue

##########################################
# With just a little work we can turn the template into a dictionary
# and then do some cleanup processing of its parameters. This bot
# is definitely relying on the fact that the template ends with \n}}
##########################################
for extension in extensions:
    extension_dict = {}
    hooks, tags, types = [], [], []
    template = extensions[extension]
    # Some people like to have funky spacing. Double up just in case
    template = template.replace(' |', '|').replace(' |', '|').replace('| ', '|').replace('| ', '|')
    # This hacks off {{Extension and }}, and has the convenient side effect
    # of nuking |templatemode= when it shows up on the first line
    template = template.split('\n')[1:-1]
    # Can't allow newlines - saw way too many crazy template values. Only
    # lines whose first character is a pipe are kept.
    template = [line for line in template if line and line[0] == '|']

    # Don't allow subpage extensions
    if '/' in extension:
        continue

    for param in template:
        pieces = param.split('=', 1)
        if len(pieces) != 2:
            # No '=' on this line. Can't do this? Not my fault.
            continue
        key = pieces[0].replace('|', '').strip()
        value = pieces[1].strip()
        if not value:
            continue
        if 'name' in key:
            # Sometimes the name field doesn't contain the actual name of the extension
            value = extension
        if 'hook' in key:
            hooks.append(value)
            continue
        if 'tag' in key:
            tags.append(value)
            continue
        if 'type' in key:
            types.append(value)
            continue
        # Have a look at LocalisationUpdate for nested templateness that is just not ok.
        if ('{{' in value) != ('}}' in value):
            continue
        # These always turn out to be copy/paste jobs from the prototype template
        if '<!-' in value or '-->' in value:
            continue
        value = value.replace('<ref>', ' ').replace('</ref>', ' ')
        extension_dict[key] = value

    if hooks:
        extension_dict['hooks'] = '<br/>'.join(sorted(hooks))
    if tags:
        extension_dict['tags'] = '<br/>'.join(sorted(tags))
    if types:
        extension_dict['types'] = '<br/>'.join(sorted(types))

    # Sometimes the name isn't specified at all
    if 'name' not in extension_dict:
        extension_dict['name'] = extension

    # Don't allow empty templates, or templates with just one parameter
    if len(extension_dict) <= 1:
        continue
    extensions_dicts[extension] = extension_dict

# sorted list of the full matrix for later use
sorted_matrix = sorted(extensions_dicts)

# convert all parseable dates into a common wikitable-sortable format
for extension in extensions_dicts:
    if 'update' in extensions_dicts[extension]:
        try:
            this_date = parse(extensions_dicts[extension]['update'])
            extensions_dicts[extension]['update'] = format_date(
                this_date.day, this_date.month, this_date.year)
        except Exception:
            # Unparseable date: drop it rather than show garbage.
            del extensions_dicts[extension]['update']

##########################################
# figure out what versions of mediawiki this extension works on
# (see find_supported_versions for the matching rules)
##########################################
for extension in extensions_dicts:
    if 'mediawiki' in extensions_dicts[extension]:
        supported_versions = find_supported_versions(
            extensions_dicts[extension]['mediawiki'])
        for this_version in supported_versions:
            extensions_by_mw_version.setdefault(this_version, []).append(extension)
        extensions_dicts[extension]['mediawiki'] = ', '.join(supported_versions)

##########################################
# Get the last day that each extension and its talk page were edited
# and the creation date of the extension
##########################################
for extension in extensions_dicts:
    this_extension = site.Pages["Extension:" + extension]
    if this_extension.exists:  # should never fail!
        touched = this_extension.touched
        extensions_dicts[extension]['lastupdated'] = format_date(
            touched.tm_mday, touched.tm_mon, touched.tm_year)
        # The last revision in the listing is taken as the first edit.
        first_edit = list(this_extension.revisions())[-1]['timestamp']
        extensions_dicts[extension]['created'] = format_date(
            first_edit.tm_mday, first_edit.tm_mon, first_edit.tm_year)
    this_extension = site.Pages["Extension_talk:" + extension]
    if this_extension.exists:
        touched = this_extension.touched
        extensions_dicts[extension]['lastupdatedtalk'] = format_date(
            touched.tm_mday, touched.tm_mon, touched.tm_year)

##########################################
# Create lists the most recently edited, discussed, updated and created extensions
# A bit redundant with above code, but its more clear to break it out
# Key to sorting by date is a tuple with (year,month,day). easy peasy.
##########################################
for date_key, target in (('lastupdated', recently_edited),
                         ('lastupdatedtalk', recently_discussed),
                         ('update', recently_updated),
                         ('created', recently_created)):
    for extension in extensions_dicts:
        if date_key in extensions_dicts[extension]:
            this_date = parse(extensions_dicts[extension][date_key])
            target.append((this_date.year, this_date.month, this_date.day, extension))
    # Newest first.
    target.sort()
    target.reverse()

##########################################
# extensions by type
##########################################
for extension in extensions_dicts:
    this_extension = extensions_dicts[extension]
    if 'types' in this_extension:
        for this_type in this_extension['types'].split('<br/>'):
            this_type = this_type.lower()
            # Cut off a trailing HTML comment, if any.
            if '--' in this_type:
                this_type = this_type.split('<!--')[0]
            extensions_by_type.setdefault(this_type, []).append(extension)
    else:
        extensions_by_type.setdefault('notype', []).append(extension)

# Drop types used by fewer than 5 extensions. Iterate over a copy of the
# keys because entries are deleted during the loop.
for this_type in list(extensions_by_type):
    if len(extensions_by_type[this_type]) < 5:
        del extensions_by_type[this_type]

##########################################
# extensions by status
##########################################
extensions_by_status = {}
for extension in extensions_dicts:
    this_extension = extensions_dicts[extension]
    if 'status' in this_extension:
        this_status = this_extension['status'].lower()
        # Make sure this is a single word status - sanity check
        if ' ' not in this_status:
            extensions_by_status.setdefault(this_status, []).append(extension)

##########################################
# Create main extension matrix output page
##########################################
prefix = 'Extension Matrix'
updated = 'Last updated: ' + \
    datetime.datetime.now().strftime("%Y-%m-%d %H:%M") + ' MST. '
num_listed = 'Listing ' + str(len(extensions_dicts)) + \
    ' out of ' + str(len(extensions)) + \
    ' members of [[:Category:Extensions]]<br/>'

matrix_parts = [
    updated + num_listed + '\n',
    '== Entire Extension Matrix ==\n',
    '* [[' + prefix + '/AllExtensions|View all extensions]] (very large!)\n',
]

# Create the entire extension matrix
page = site.Pages[prefix + "/AllExtensions"]
page.save(build_matrix_page(sorted_matrix))

# One subpage for each version of mediawiki that has extensions which mention it
matrix_parts.append('== By explicitly supported MediaWiki version ==\n* ')
for major_version in range(1, 3):
    for minor_version in range(0, 20):
        version = str(major_version) + '.' + str(minor_version)
        if version in extensions_by_mw_version:
            num_extensions = str(len(extensions_by_mw_version[version]))
            matrix_parts.append('[[' + prefix + '/' + version + '|' + version +
                                ']] (' + num_extensions + '), ')
            # Create an extension matrix for each version
            page = site.Pages[prefix + '/' + version]
            page.save(build_matrix_page(extensions_by_mw_version[version]))
matrix_parts.append('\n')

# One subpage for each type of status
matrix_parts.append('== By status of extension ==\n*')
for this_status in sorted(extensions_by_status):
    num_extensions = str(len(extensions_by_status[this_status]))
    matrix_parts.append('[[' + prefix + '/' + this_status + '|' + this_status +
                        ']] (' + num_extensions + '), ')
    page = site.Pages[prefix + '/' + this_status]
    page.save(build_matrix_page(extensions_by_status[this_status]))
matrix_parts.append('\n')

# One subpage for each extension type
matrix_parts.append('== By type of extension ==\n* ')
# [1:] gets rid of weird 'Alterego/ExtensionMatrix' type
# NOTE(review): this blindly drops whichever key sorts first - confirm it
# is still the bogus one.
for this_type in sorted(extensions_by_type)[1:]:
    num_extensions = str(len(extensions_by_type[this_type]))
    matrix_parts.append('[[' + prefix + '/' + this_type + '|' + this_type +
                        ']] (' + num_extensions + '), ')
    page = site.Pages[prefix + '/' + this_type]
    page.save(build_matrix_page(extensions_by_type[this_type]))
matrix_parts.append('\n')

matrix_parts.append('== 500 most recently created extensions ==\n* ')
matrix_parts.append(recent_links(recently_created, 'created', 'Extension'))
matrix_parts.append('\n')

matrix_parts.append('== 500 most recently edited extension pages ==\n* ')
matrix_parts.append(recent_links(recently_edited, 'lastupdated', 'Extension'))
matrix_parts.append('\n')

matrix_parts.append('== 500 most recently edited extension talk pages ==\n* ')
matrix_parts.append(recent_links(recently_discussed, 'lastupdatedtalk', 'Extension_talk'))
matrix_parts.append('\n')

matrix_parts.append('== 500 most recently updated extensions ==\n* ')
matrix_parts.append(recent_links(recently_updated, 'update', 'Extension'))
matrix_parts.append('\n')

extension_matrix = ''.join(matrix_parts)
page = site.Pages[prefix]
page.save(extension_matrix)
[edit] Extension Matrix Hooks Source
#!/usr/bin/python
# Extension Matrix hooks cross-reference (Python 2).
#
# Checks out all extensions from SVN, reads the hook names from MediaWiki's
# docs/hooks.txt, and prints (as wikitext) which extension mentions which
# hook and vice versa. Matching is a plain substring search over PHP source.
import os.path
from pysvn import Client
from urllib2 import urlopen
from glob import glob
from os import walk
from pprint import pprint

CHECKOUT_URL = "http://svn.wikimedia.org/svnroot/mediawiki/trunk/extensions"
CHECKOUT_DIR = "/tmp/mw_extensions"
HOOKS_URL = "https://gerrit.wikimedia.org/r/gitweb?p=mediawiki/core.git;a=blob_plain;f=docs/hooks.txt"

################################
# Checkout all the extensions
# Gets 676 extensions at this time, which is about 33% of the extension matrix.
# TODO: Just update the previous checkout, put in sane /usr/local directory
################################
client = Client()
client.checkout(CHECKOUT_URL, CHECKOUT_DIR)

################################
# Get a list of all hooks out of mw docs. Relies on consistent
# ^'HookName': formatting.
################################
response = urlopen(HOOKS_URL)
try:
    hooks_txt = response.read()
finally:
    # Close the connection even if the read fails.
    response.close()

hooks = []
for line in hooks_txt.split("\n"):
    if "':" in line:
        hooks.append(line.split(":")[0].replace("'", ''))

################################
# Get rid of all hooks whose entire name occurs in another hook
# Hacky, but prevents us from needing to parse PHP.
# Maybe we just need to parse enough to get the $wgHooks array?
# At the time of running this removed 37 of 411 hooks, e.g.
# ArticleSave (inside ArticleSaveComplete), EditPage (inside
# EditPageBeforeConflictDiff) and User (inside AlternateUserMailer).
################################
hooks_keep = []
for candidate in hooks:
    shadowed = False
    for other in hooks:
        if candidate != other and candidate in other:
            shadowed = True
            break
    if not shadowed:
        hooks_keep.append(candidate)
hooks = hooks_keep

################################
# Get a list of all php files in the extension directory
################################
files = []
for root, dirnames, filenames in walk(CHECKOUT_DIR):
    files.extend(glob(root + "/*.php"))

################################
# Get all the files for each extension in a list in a dict
################################
extension_files = {}
for f in files:
    # The extension is the first path component below the checkout
    # directory, independent of how deep the checkout directory itself is.
    extension = os.path.relpath(f, CHECKOUT_DIR).split(os.sep)[0]
    extension_files.setdefault(extension, []).append(f)

################################
# For each extension get a list of all the hooks used.
# And the reverse.
# 639 / 676 extension used hooks
# 270 / 375 hooks used by extensions
################################
extension_hooks, hook_extensions = {}, {}
for extension in extension_files:
    chunks = []
    for f in extension_files[extension]:
        with open(f) as source_file:
            chunks.append(source_file.read())
    code = ''.join(chunks)
    for hook in hooks:
        # TODO: User hook is spammy
        if hook in code:
            extension_hooks.setdefault(extension, []).append(hook)
            hook_extensions.setdefault(hook, []).append(extension)

################################
# Print
################################
print("== Extensions --> Hooks ==")
for key in sorted(extension_hooks):
    print("* '" + key + "': <nowiki>" + str(extension_hooks[key]) + '</nowiki>')

print("== Hooks --> Extensions ==")
for key in sorted(hook_extensions):
    print("* '" + key + "': <nowiki>" + str(hook_extensions[key]) + '</nowiki>')