User:Gizmhail/LinkDetection
From MediaWiki.org
[edit] Purpose
This extension adds a special page to improve articles in a category by detecting links to articles in another category.
[edit] Status
Beta: please use this extension with care. Its purpose is to automatically change a large number of pages, and it has not been tested much yet, so backing up your database before using it is a good idea.
[edit] Code
Note :
- The text might not have been properly escaped. If you encounter any problems, please use the source of this page (between the pre tags).
- There is a problem with the title of the page. It is a known bug.
- To access and use the page, a user must have the linkdetection right.
linkdetection.php
<?php
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * @author Sebastien Poivre <gizmhail@gmail.com>
 * @copyright Copyright (C) 2008 Sebastien Poivre
 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
 */

/*
 * Setup file of the LinkDetection extension: registers the extension credits,
 * the autoloader entry for the special page class and the special page itself.
 *
 * The full "<?php" open tag is used on purpose: the short "<?" form is only
 * parsed when short_open_tag is enabled, otherwise this file would be sent to
 * the browser as plain text.
 */

// Refuse to run when the file is requested directly instead of being included
// by MediaWiki.
if ( !defined( 'MEDIAWIKI' ) ) {
	die( "This file is a MediaWiki extension and is not a valid entry point.\n" );
}

// The extension only provides a special page, so it is listed under the
// 'specialpage' credits group.
$wgExtensionCredits['specialpage'][] = array(
	'name' => 'LinkDetection',
	'author' => 'Orange Labs (Sebastien Poivre)',
	'url' => 'http://www.mediawiki.org/wiki/User:Gizmhail/LinkDetection',
	'description' => 'Automatically detects links.',
	'version' => 0.1
);

$dir = dirname(__FILE__) . '/';
$wgAutoloadClasses['LinkDetection'] = $dir . 'linkdetection_body.php'; # Tell MediaWiki to load the extension body.
//$wgExtensionMessagesFiles['LinkDetection'] = $dir . 'LinkDetection.i18n.php';
$wgSpecialPages['LinkDetection'] = 'LinkDetection'; # Let MediaWiki know about your new special page.
linkdetection_body.php
<?php
/*
 * Body of the LinkDetection special page: for every page of a "target"
 * category, occurrences of the titles of the pages of a "source" category
 * are turned into wiki links.
 */

/**
 * usort() comparator that orders title objects by the byte length of their
 * text, longest first, so that long titles are linked before shorter titles
 * they may contain.
 *
 * The parameters are taken by value: a by-reference comparator triggers a
 * warning when usort() calls it on recent PHP versions.
 *
 * @param object $a Object exposing getText()
 * @param object $b Object exposing getText()
 * @return int -1, 0 or 1
 */
function compareTitleLengh($a, $b){
	$aLen = strlen($a->getText());
	$bLen = strlen($b->getText());
	if ($aLen == $bLen) {
		return 0;
	}
	return ($aLen > $bLen) ? -1 : 1;
}

class LinkDetection extends SpecialPage {

	/**
	 * Registers the special page under the name "LinkDetection", restricted
	 * to users holding the 'linkdetection' right.
	 *
	 * NOTE(review): this is a PHP 4 style constructor calling the PHP 4 style
	 * parent constructor; whether the parent also offers __construct() depends
	 * on the MediaWiki version, so it is left untouched here.
	 */
	function LinkDetection() {
		SpecialPage::SpecialPage("LinkDetection",'linkdetection');
		//wfLoadExtensionMessages('LinkDetection');
	}

	/**
	 * Builds the HTML of the parameter form.
	 *
	 * All values are HTML-escaped (quotes included, because the attributes are
	 * single-quoted) since they can come straight from the request.
	 *
	 * @param string $source Name of the category holding the pages to detect
	 * @param string $target Name of the category holding the pages to improve
	 * @param int|string $startTarget Index of the first target page to process
	 * @param int|string $endTarget Index of the last target page to process
	 * @return string HTML fragment
	 */
	function formText($source,$target,$startTarget=0,$endTarget=30){
		$source = htmlspecialchars($source, ENT_QUOTES);
		$target = htmlspecialchars($target, ENT_QUOTES);
		$startTarget = htmlspecialchars($startTarget, ENT_QUOTES);
		$endTarget = htmlspecialchars($endTarget, ENT_QUOTES);
		return "
<form method='POST' name='LinkDetection'>
Target: Category containing pages to improve<input name='target' value='$target'/><br/>
Source: Categories containing pages to detect<input name='source' value='$source'/><br/>
Start: <input name='startTarget' value='$startTarget'/><br/>
End: <input name='endTarget' value='$endTarget'/><br/>
Fake changes: <input name='fake' value=''/><br/>
<input type='submit' value='Ok'/>
</form>
<hr/>
";
	}

	/**
	 * Entry point of the special page.
	 *
	 * Without parameters the form is shown with default values. Otherwise the
	 * target pages whose index lies in [startTarget, endTarget] are processed:
	 * in "fake" mode the old and new texts are only displayed, else the new
	 * text is saved and the form (pre-filled with the next window) submits
	 * itself after a delay.
	 *
	 * NOTE(review): parameters are read with getText(), so a plain GET request
	 * also triggers the edits and no edit token is checked — consider requiring
	 * a POST with a valid token.
	 *
	 * @param string|null $par Sub-page parameter (unused)
	 */
	function execute( $par ) {
		global $wgRequest, $wgOut,$wgUser;
		$defaultSource = 'Term';
		$defaultTarget = 'Term';
		$defaultRange = 30;
		if( !$wgUser->isAllowed( 'linkdetection' ) ) {
			$wgOut->addHTML("Sorry, you don't have the needed rights (linkdetection) to acceed this page.");
			return;
		}
		$this->setHeaders();

		# Get request data
		$source = $wgRequest->getText('source');
		$target = $wgRequest->getText('target');
		$fake = ($wgRequest->getText('fake') !== '');
		$startText = $wgRequest->getText('startTarget');
		$endText = $wgRequest->getText('endTarget');

		$paramReceived = ($source !== '' || $target !== '');

		// A missing category defaults to the other one. This has to happen
		// before the pages are loaded, otherwise one of the two lists would
		// stay undefined.
		if($target===''){
			$target = $source;
		}
		if($source===''){
			$source = $target;
		}

		// The range is forced to integers: it is used in comparisons,
		// arithmetic and HTML output.
		$startTarget = ($startText === '') ? 0 : max(0, intval($startText));
		if(($endText==='')||(intval($endText)<$startTarget)){
			$endTarget = $startTarget + $defaultRange;
		}else{
			$endTarget = intval($endText);
		}

		$targetPages = array();
		$sourcePages = array();
		if($paramReceived){
			$targetPages = $this->getTitlesFromCategory(Title::newFromText($target));
			if($source === $target){
				$sourcePages = $targetPages;
			}else{
				$sourcePages = $this->getTitlesFromCategory(Title::newFromText($source));
			}
		}

		// The start index walks the target pages, so the end of the run is
		// reached once it is past the number of target pages.
		if(!$paramReceived||(count($targetPages)<$startTarget)){
			$wgOut->addHTML($this->formText($defaultSource,$defaultTarget,0,$defaultRange));
			return;
		}

		if(!$fake){
			// The window is inclusive on both ends: the next one starts right
			// after the current end and keeps the same width.
			$windowSize = $endTarget - $startTarget + 1;
			$wgOut->addHTML($this->formText($source,$target,$endTarget+1,$endTarget+$windowSize));
		}else{
			$wgOut->addHTML($this->formText($source,$target,$startTarget,$endTarget));
		}
		$wgOut->addHTML("<h1>Editing pages $startTarget to $endTarget (on ".count($targetPages).")</h1>");
		$timeout = 10;
		if(!$fake){
			$wgOut->addHTML("<p><i>Process will continue after $timeout seconds, or on submit</i></p>"
				."<script language='JavaScript' type='text/javascript'>\n"
				."setTimeout('document.LinkDetection.submit()',{$timeout}000);\n"
				."</script>\n");
		}

		// Sorting to have longest titles first
		usort($sourcePages,"compareTitleLengh");

		$targetCount = 0;
		foreach($targetPages as $page){
			if($targetCount > $endTarget){
				// Nothing left to do in this window
				break;
			}
			if($startTarget<=$targetCount){
				$articleContent = $this->getArticleContent($page);
				$newArticleContent = $this->getImprovedArticleContent($page,$sourcePages,1);
				$pageName = htmlspecialchars($page->getText());
				if(strcmp($articleContent,$newArticleContent)!=0){
					$wgOut->addHTML( "Changed ".$pageName."<br/>\n");
					if(!$fake){
						$this->saveRevision($page,$newArticleContent);
					}else{
						// Wikitext is shown as text, never interpreted as HTML
						$wgOut->addHTML(htmlspecialchars($articleContent)."<hr/>".htmlspecialchars($newArticleContent)."<br/><br/>\n");
					}
				}else{
					if($fake){
						$wgOut->addHTML( " --- nothing changed for ".$pageName."<br/>\n" );
					}
				}
			}
			$targetCount++;
		}
	}

	/**
	 * preg_replace() wrapper that never loses the subject: when the PCRE call
	 * fails (it then returns null, e.g. on a backtrack limit or invalid UTF-8)
	 * the untouched subject is returned and the count is set to 0.
	 *
	 * @param string $pattern
	 * @param string $replacement
	 * @param string $subject
	 * @param int $limit Maximum number of replacements, -1 for no limit
	 * @param int $count Receives the number of replacements done
	 * @return string
	 */
	private function replaceOrKeep($pattern, $replacement, $subject, $limit = -1, &$count = null){
		$result = preg_replace($pattern, $replacement, $subject, $limit, $count);
		if($result === null){
			$count = 0;
			return $subject;
		}
		return $result;
	}

	/**
	 * Returns the text of $page with the titles of $sourcePages turned into links.
	 *
	 * For each source title: occurrences inside templates ({{ ... }}) are
	 * optionally protected, existing simple links to the title are removed,
	 * occurrences inside other links are protected, and then occurrences
	 * standing as a word (a trailing 's' is accepted) are linked. Protection
	 * works by surrounding the term with a marker that is removed at the end.
	 *
	 * All patterns use the 'u' modifier so that \p{L} is evaluated on UTF-8
	 * characters instead of single bytes.
	 *
	 * @param object $page Title of the page to improve
	 * @param array $sourcePages Titles to detect, longest first
	 * @param int $limitReplacement -1: replaces all occurrences, 1: only
	 *  replaces the first occurrence
	 * @param bool $noChangeInTemplates Whether terms inside templates are left alone
	 * @return string New text of the page
	 */
	function getImprovedArticleContent(&$page,&$sourcePages,$limitReplacement = -1,$noChangeInTemplates = true){
		$protectionTag = "LINKDETECTIONTAG";
		$articleContent = $this->getArticleContent($page);
		foreach($sourcePages as $sourcePage){
			if(strcasecmp($sourcePage->getText(),$page->getText())==0){
				// No link to self
				continue;
			}
			$escTitle = $this->escapedTitleForRegExpSearch($sourcePage);

			$forceTemplateCleaning = false;
			if($forceTemplateCleaning){
				//Cleaning link in templates : internal development need, to fix an error
				$articleContent = $this->replaceOrKeep("/(\{\{.*)\[\[($escTitle)\]\](.*\}\})/sUu","$1$2$3",$articleContent);
			}

			if($noChangeInTemplates){
				// Term in templates should be protected. One pass protects at
				// most one occurrence per template, hence the loop; a protected
				// term is preceded by the marker (letters) and cannot match again.
				$replacementDone = 1;
				while($replacementDone>0){
					$articleContent = $this->replaceOrKeep("/(\{\{.*)([^\p{L}]|^)($escTitle)(s)?([^\p{L}]|$)(.*\}\})/sUu","$1$2$protectionTag$3$protectionTag$4$5$6",$articleContent,-1,$replacementDone);
				}
			}

			//Removing simple links to existing pages
			$articleContent = $this->replaceOrKeep("/\[\[($escTitle)\]\]/isUu","$1",$articleContent);
			// Protecting term inside other links
			$articleContent = $this->replaceOrKeep("/\[\[([^\]]*)($escTitle)([^\]]*)\]\]/isUu","[[$1$protectionTag$2$protectionTag$3]]",$articleContent);

			// Term as a word (trailing 's' accepted)
			$replacementDone = 0;
			$articleContent = $this->replaceOrKeep("/([^\p{L}]|^)($escTitle)(s)?([^\p{L}]|$)/sUu","$1[[$2]]$3$4",$articleContent,$limitReplacement,$replacementDone);
		}
		$articleContent = str_replace($protectionTag,'',$articleContent);
		return $articleContent;
	}

	/**
	 * Builds the regular expression fragment matching the text of $page, with
	 * the first character accepted in lower or upper case.
	 *
	 * The title is escaped with preg_quote() so that characters such as
	 * ( ) + . ? are matched literally and cannot add capture groups. The
	 * fragment itself only uses a non-capturing group, so the numbering of the
	 * groups of the enclosing pattern is not affected.
	 *
	 * @param object $page Title object
	 * @return string Fragment to embed in a pattern delimited by '/'
	 */
	function escapedTitleForRegExpSearch(&$page){
		$title = $page->getText();
		if($title === ''){
			// An empty fragment would match everywhere: return one that never matches
			return '(?!)';
		}
		// Split off the first UTF-8 character (not the first byte)
		if(!preg_match('/^(.)(.*)$/su', $title, $parts)){
			// Not valid UTF-8: no case handling, plain literal match
			return preg_quote($title, '/');
		}
		$first = $parts[1];
		$rest = $parts[2];
		if(function_exists('mb_strtolower')){
			$lower = mb_strtolower($first, 'UTF-8');
			$upper = mb_strtoupper($first, 'UTF-8');
		}elseif(strlen($first) == 1){
			$lower = strtolower($first);
			$upper = strtoupper($first);
		}else{
			// Multi-byte character and no mbstring: keep it as it is
			$lower = $first;
			$upper = $first;
		}
		// An alternation is used instead of a character class because a case
		// conversion may yield more than one character.
		return "(?:".preg_quote($lower, '/')."|".preg_quote($upper, '/').")".preg_quote($rest, '/');
	}

	/**
	 * Lists the pages of a category.
	 *
	 * @param object|null $title Title whose DB key is the category name; a
	 *  null value (invalid name) yields an empty list
	 * @return array Title objects, one per row of categorylinks for that category
	 */
	function getTitlesFromCategory( $title ) {
		$pages = array();
		if( !$title ) {
			return $pages;
		}

		$name = $title->getDBKey();

		$dbr = wfGetDB( DB_SLAVE );

		list( $page, $categorylinks ) = $dbr->tableNamesN( 'page', 'categorylinks' );
		$sql = "SELECT page_namespace, page_title FROM $page " .
			"JOIN $categorylinks ON cl_from = page_id " .
			"WHERE cl_to = " . $dbr->addQuotes( $name );

		$res = $dbr->query( $sql, __METHOD__ );
		while ( $row = $dbr->fetchObject( $res ) ) {
			// The row already holds a namespace number and a DB key: build the
			// title from them directly instead of re-parsing a text form.
			$pages[] = Title::makeTitle( intval( $row->page_namespace ), $row->page_title );
		}
		$dbr->freeResult($res);

		return $pages;
	}

	/**
	 * Returns the text of the revision found for $title by
	 * Revision::newFromTitle(), or an empty string when there is no title or
	 * no revision.
	 *
	 * @param object|null $title
	 * @return string
	 */
	function getArticleContent(&$title){
		$articleContent = "";
		if($title){
			$rev = Revision::newFromTitle( $title );
			if($rev){
				$articleContent = $rev->getText();
			}
		}
		return $articleContent;
	}

	/**
	 * Saves $articleContent as a new edit of $title, flagged EDIT_FORCE_BOT,
	 * with the summary "Automatic link detection".
	 *
	 * @param object $title
	 * @param string $articleContent
	 */
	function saveRevision(&$title,$articleContent){
		$flags = EDIT_FORCE_BOT;
		$article = new Article($title);
		$summary = "Automatic link detection";
		$article->doEdit( $articleContent, $summary, $flags );
	}
}