User:Gizmhail/LinkDetection

Purpose
This extension adds a special page to improve articles in a category by detecting links to articles in another category.

Status
Beta: please use this extension with care. Its purpose is to automatically change a large number of pages, so back up your database before using it while it is still largely untested.

Code
''Note :
 * the text might not have been properly escaped. If you encounter any problems, please use the source of this page (between the pre tags)
 * There is a problem with the title of the page. It is a known bug.
 * To access and use the page, a user must have the linkdetection right''

linkdetection.php
<?php
/*
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * @author Sebastien Poivre
 * @copyright Copyright (C) 2008 Sebastien Poivre
 * @license http://www.gnu.org/copyleft/gpl.html GNU General Public License 2.0 or later
 */

// Extension credits shown on Special:Version. The extension provides a
// special page (see $wgSpecialPages below), so it is listed under
// 'specialpage' rather than 'parserhook'.
$wgExtensionCredits['specialpage'][] = array(
    'name' => 'LinkDetection',
    'author' => 'Orange Labs (Sebastien Poivre)',
    'url' => 'http://www.mediawiki.org/wiki/User:Gizmhail/LinkDetection',
    'description' => 'Automatically detects links.',
    'version' => 0.1,
);

// Directory of this setup file; the class file is expected next to it.
$dir = dirname( __FILE__ ) . '/';

# Tell MediaWiki to load the extension body.
$wgAutoloadClasses['LinkDetection'] = $dir . 'linkdetection_body.php';
// No message file is shipped yet, so the registration below stays disabled.
//$wgExtensionMessagesFiles['LinkDetection'] = $dir . 'LinkDetection.i18n.php';
# Let MediaWiki know about your new special page.
# This must be a statement of its own: when it shares a line with a '#'
# comment it is commented out and the page is never registered.
$wgSpecialPages['LinkDetection'] = 'LinkDetection';

linkdetection_body.php
<?php

/**
 * usort() comparator that orders title objects by descending title length,
 * so that longer titles are sorted first.
 *
 * The arguments are taken by value: usort() passes values to its callback,
 * and the function never modifies them.
 *
 * @param object $a Object exposing getText(), which returns the title text.
 * @param object $b Object exposing getText(), which returns the title text.
 * @return int -1 when $a has the longer title, 1 when $b has, 0 when equal.
 */
function compareTitleLengh( $a, $b ) {
    // getText() must be *called*; reading it as a property yields nothing
    // and makes every pair compare as equal.
    $aLen = strlen( $a->getText() );
    $bLen = strlen( $b->getText() );
    if ( $aLen == $bLen ) {
        return 0;
    }
    return ( $aLen > $bLen ) ? -1 : 1;
}

/**
 * Special page that walks the pages of a "target" category and turns plain
 * occurrences of the titles found in a "source" category into wiki links.
 *
 * Pages are processed in batches; outside "fake" (dry-run) mode the page
 * re-submits itself to continue with the next batch.
 *
 * NOTE(review): the published copy of this code lost its HTML markup, its
 * empty call parentheses and the "[[...]]" parts of the regex replacement
 * strings. Those parts are reconstructed here from the surrounding logic;
 * confirm them against an original copy if one is available.
 */
class LinkDetection extends SpecialPage {

    /**
     * Registers the page under the name "LinkDetection", restricted to users
     * holding the 'linkdetection' right.
     *
     * NOTE(review): this is a PHP 4-style constructor, kept because the parent
     * is invoked through SpecialPage::SpecialPage(). PHP 8 no longer treats
     * same-named methods as constructors, so this needs migrating to
     * __construct() before running on a newer stack.
     */
    public function LinkDetection() {
        SpecialPage::SpecialPage( "LinkDetection", 'linkdetection' );
        //wfLoadExtensionMessages('LinkDetection');
    }

    /**
     * Builds the HTML form used to choose the categories and the batch range.
     *
     * The form is named "LinkDetection" because execute() auto-submits it via
     * document.LinkDetection.submit(). The submit button is deliberately left
     * unnamed so that it does not shadow the form's submit() method.
     *
     * NOTE(review): the markup is reconstructed; only the labels and the
     * request field names (target, source, startTarget, endTarget, fake) are
     * taken from the original code.
     *
     * @param string $source Source category name (pages whose titles are detected).
     * @param string $target Target category name (pages to improve).
     * @param int $startTarget Index of the first target page of the batch.
     * @param int $endTarget Index of the last target page of the batch.
     * @return string HTML of the form.
     */
    public function formText( $source, $target, $startTarget = 0, $endTarget = 30 ) {
        // Category names come from the request: escape them for attribute context.
        $source = htmlspecialchars( $source, ENT_QUOTES );
        $target = htmlspecialchars( $target, ENT_QUOTES );
        $startTarget = intval( $startTarget );
        $endTarget = intval( $endTarget );

        return "<form name=\"LinkDetection\" method=\"post\" action=\"\">\n"
            . "<p>Target: Category containing pages to improve "
            . "<input type=\"text\" name=\"target\" value=\"$target\" /></p>\n"
            . "<p>Source: Categories containing pages to detect "
            . "<input type=\"text\" name=\"source\" value=\"$source\" /></p>\n"
            . "<p>Start: <input type=\"text\" name=\"startTarget\" value=\"$startTarget\" /></p>\n"
            . "<p>End: <input type=\"text\" name=\"endTarget\" value=\"$endTarget\" /></p>\n"
            // Left unchecked: the auto-submitted continuation must stay a real run.
            . "<p>Fake changes: <input type=\"checkbox\" name=\"fake\" value=\"1\" /></p>\n"
            . "<p><input type=\"submit\" value=\"OK\" /></p>\n"
            . "</form>\n";
    }

    /**
     * Entry point of the special page: shows the form and, when categories
     * were submitted, processes one batch of target pages.
     *
     * @param string|null $par Sub-page parameter supplied by MediaWiki (unused).
     */
    public function execute( $par ) {
        global $wgRequest, $wgOut, $wgUser;

        $defaultSource = 'Term';
        $defaultTarget = 'Term';
        $defaultRange = 30;

        if ( !$wgUser->isAllowed( 'linkdetection' ) ) {
            $wgOut->addHTML( "Sorry, you don't have the needed rights (linkdetection) to acceed this page." );
            return;
        }

        $this->setHeaders();

        # Get request data
        $source = $wgRequest->getText( 'source' );
        $target = $wgRequest->getText( 'target' );
        $startText = $wgRequest->getText( 'startTarget' );
        $endText = $wgRequest->getText( 'endTarget' );
        // Any non-empty value of 'fake' turns the run into a dry run.
        $fake = ( $wgRequest->getText( 'fake' ) !== '' );

        $paramReceived = ( $source !== '' || $target !== '' );

        // A single category given means "link the category to itself". This is
        // resolved *before* fetching so that both page lists always exist.
        if ( $target === '' ) {
            $target = $source;
        }
        if ( $source === '' ) {
            $source = $target;
        }

        // Compare the range as integers rather than as request strings.
        $startTarget = ( $startText === '' ) ? 0 : max( 0, intval( $startText ) );
        $endTarget = ( $endText === '' ) ? $startTarget + $defaultRange : intval( $endText );
        if ( $endTarget < $startTarget ) {
            $endTarget = $startTarget + $defaultRange;
        }

        $sourcePages = array();
        $targetPages = array();
        if ( $paramReceived ) {
            $targetPages = $this->getTitlesFromCategory( Title::newFromText( $target ) );
            $sourcePages = $this->getTitlesFromCategory( Title::newFromText( $source ) );
        }

        // Nothing requested, or the batch starts past the last *target* page
        // (the batch range indexes target pages): show the blank form and stop.
        if ( !$paramReceived || count( $targetPages ) < $startTarget ) {
            $wgOut->addHTML( $this->formText( $defaultSource, $defaultTarget, 0, $defaultRange ) );
            return;
        }

        if ( !$fake ) {
            // The range is inclusive, so the next batch starts right after
            // $endTarget and keeps the same size; no page is repeated or skipped.
            $nextStart = $endTarget + 1;
            $nextEnd = $nextStart + ( $endTarget - $startTarget );
            $wgOut->addHTML( $this->formText( $source, $target, $nextStart, $nextEnd ) );
        } else {
            $wgOut->addHTML( $this->formText( $source, $target, $startTarget, $endTarget ) );
        }

        $wgOut->addHTML( "<p> Editing pages $startTarget to $endTarget (on " . count( $targetPages ) . ") </p>\n" );

        $timeout = 10;
        if ( !$fake ) {
            $wgOut->addHTML(
                "<p> Process will continue after $timeout seconds, or on submit </p>\n"
                . "<script type=\"text/javascript\">\n"
                . "setTimeout('document.LinkDetection.submit()',{$timeout}000);\n"
                . "</script>\n"
            );
        }

        // Sorting to have longest titles first
        usort( $sourcePages, "compareTitleLengh" );

        // Inclusive slice [$startTarget, $endTarget] of the target pages.
        $batch = array_slice( $targetPages, $startTarget, $endTarget - $startTarget + 1 );
        foreach ( $batch as $page ) {
            $articleContent = $this->getArticleContent( $page );
            $newArticleContent = $this->getImprovedArticleContent( $page, $sourcePages, 1 );
            $pageName = htmlspecialchars( $page->getText() );

            if ( strcmp( $articleContent, $newArticleContent ) != 0 ) {
                $wgOut->addHTML( "Changed " . $pageName . " <br />\n" );
                if ( !$fake ) {
                    $this->saveRevision( $page, $newArticleContent );
                } else {
                    // Dry run: show the text before and after. Wikitext is
                    // untrusted content and must be escaped before output.
                    $wgOut->addHTML(
                        "<pre>" . htmlspecialchars( $articleContent ) . "</pre>\n"
                        . "<pre>" . htmlspecialchars( $newArticleContent ) . "</pre>\n"
                    );
                }
            } elseif ( $fake ) {
                $wgOut->addHTML( " --- nothing changed for " . $pageName . " <br />\n" );
            }
        }
    }

    /**
     * Returns the text of $page with occurrences of the source pages' titles
     * turned into wiki links.
     *
     * For each source title: terms found between "{{" and "}}" are optionally
     * protected, existing plain links to the title are unwrapped, occurrences
     * inside other links are protected, and the remaining occurrences are
     * linked (a trailing "s" is left outside the link brackets).
     *
     * NOTE(review): the template pattern spans from any "{{" to any later
     * "}}", so text lying between two separate templates is protected too.
     *
     * @param Title $page Page whose content is improved.
     * @param array $sourcePages Title objects whose titles are searched for.
     * @param int $limitReplacement -1: replaces all occurrences; 1: only
     *        replaces the first occurrence.
     * @param bool $noChangeInTemplates Whether terms inside templates are left alone.
     * @return string The improved wikitext.
     */
    public function getImprovedArticleContent( &$page, &$sourcePages, $limitReplacement = -1, $noChangeInTemplates = true ) {
        $protectionTag = "LINKDETECTIONTAG";
        $articleContent = $this->getArticleContent( $page );
        $ownTitle = mb_strtolower( $page->getText(), 'UTF-8' );

        // Internal development switch: strips links found inside templates.
        $forceTemplateCleaning = false;

        foreach ( $sourcePages as $sourcePage ) {
            if ( !$sourcePage ) {
                continue;
            }
            if ( mb_strtolower( $sourcePage->getText(), 'UTF-8' ) === $ownTitle ) {
                // No link to self
                continue;
            }
            $escTitle = $this->escapedTitleForRegExpSearch( $sourcePage );
            if ( $escTitle === '' ) {
                // An empty pattern would match at every position.
                continue;
            }

            if ( $forceTemplateCleaning ) {
                // Cleaning link in templates : internal development need, to fix an error
                $articleContent = $this->replaceOrKeep(
                    '/(\{\{.*)\[\[(' . $escTitle . ')\]\](.*\}\})/sUu',
                    '${1}${2}${3}',
                    $articleContent
                );
            }

            if ( $noChangeInTemplates ) {
                // Term in templates should be protected. Each pass protects at
                // most one occurrence per match, so repeat until none is left;
                // a protected term is preceded by the tag's final letter and
                // therefore cannot match again.
                $replacementDone = 1;
                while ( $replacementDone > 0 ) {
                    $articleContent = $this->replaceOrKeep(
                        '/(\{\{.*)([^\p{L}]|^)(' . $escTitle . ')(s)?([^\p{L}]|$)(.*\}\})/sUu',
                        '${1}${2}' . $protectionTag . '${3}' . $protectionTag . '${4}${5}${6}',
                        $articleContent,
                        -1,
                        $replacementDone
                    );
                }
            }

            // Removing simple links to existing pages
            $articleContent = $this->replaceOrKeep(
                '/\[\[(' . $escTitle . ')\]\]/isUu',
                '${1}',
                $articleContent
            );

            // Protecting term inside other links; the surrounding brackets are
            // written back so the other link itself stays intact.
            $articleContent = $this->replaceOrKeep(
                '/\[\[([^\]]*)(' . $escTitle . ')([^\]]*)\]\]/isUu',
                '[[${1}' . $protectionTag . '${2}' . $protectionTag . '${3}]]',
                $articleContent
            );

            // Term as a word (trailing 's' accepted): this is the step that
            // actually creates the link.
            $articleContent = $this->replaceOrKeep(
                '/([^\p{L}]|^)(' . $escTitle . ')(s)?([^\p{L}]|$)/sUu',
                '${1}[[${2}]]${3}${4}',
                $articleContent,
                $limitReplacement
            );
        }

        return str_replace( $protectionTag, '', $articleContent );
    }

    /**
     * preg_replace() wrapper that returns the subject unchanged when PCRE
     * reports an error, instead of the null preg_replace() yields. Without it
     * a failing pattern would blank the article text and the blank text would
     * then be saved.
     *
     * @param string $pattern Regular expression, delimiters included.
     * @param string $replacement Replacement string.
     * @param string $subject Text to transform.
     * @param int $limit Maximum number of replacements, -1 for no limit.
     * @param int|null &$count Receives the number of replacements done.
     * @return string The transformed text, or $subject on failure.
     */
    private function replaceOrKeep( $pattern, $replacement, $subject, $limit = -1, &$count = null ) {
        $result = preg_replace( $pattern, $replacement, $subject, $limit, $count );
        if ( $result === null ) {
            $count = 0;
            return $subject;
        }
        return $result;
    }

    /**
     * Builds a regex fragment matching the page title literally, with the
     * first letter accepted in either case.
     *
     * The title is split on characters rather than bytes and every part goes
     * through preg_quote(), so multi-byte first letters and titles containing
     * regex metacharacters or the "/" delimiter are matched literally.
     *
     * @param Title $page Page whose title is searched for.
     * @return string Regex fragment, or '' for an empty title.
     */
    public function escapedTitleForRegExpSearch( &$page ) {
        $title = $page->getText();
        if ( $title === '' || $title === null ) {
            return '';
        }

        $first = mb_substr( $title, 0, 1, 'UTF-8' );
        $rest = mb_substr( $title, 1, mb_strlen( $title, 'UTF-8' ), 'UTF-8' );
        $lower = mb_strtolower( $first, 'UTF-8' );
        $upper = mb_strtoupper( $first, 'UTF-8' );

        if ( $lower === $upper ) {
            // No case variants (digit, punctuation...): match it literally.
            $firstPattern = preg_quote( $first, '/' );
        } else {
            $firstPattern = '[' . preg_quote( $lower, '/' ) . preg_quote( $upper, '/' ) . ']';
        }

        return $firstPattern . preg_quote( $rest, '/' );
    }

    /**
     * Lists the pages that belong to a category.
     *
     * @param Title|null $title Title whose DB key is used as the category name.
     * @return array Title objects of the category members; empty when $title
     *         is not a valid title.
     */
    public function getTitlesFromCategory( $title ) {
        $pages = array();
        if ( !$title ) {
            // Title::newFromText() yields no object for an invalid name.
            return $pages;
        }

        $name = $title->getDBkey();

        $dbr = wfGetDB( DB_SLAVE );

        list( $page, $categorylinks ) = $dbr->tableNamesN( 'page', 'categorylinks' );
        $sql = "SELECT page_namespace, page_title FROM $page " .
            "JOIN $categorylinks ON cl_from = page_id " .
            "WHERE cl_to = " . $dbr->addQuotes( $name );

        $res = $dbr->query( $sql, __METHOD__ );
        while ( $row = $dbr->fetchObject( $res ) ) {
            // Build the title straight from the stored namespace and DB key
            // instead of re-parsing a "Namespace:Title" string.
            $pages[] = Title::makeTitle( $row->page_namespace, $row->page_title );
        }
        $dbr->freeResult( $res );

        return $pages;
    }

    /**
     * Returns the text of the revision that Revision::newFromTitle() yields
     * for the given title.
     *
     * @param Title|null $title Page to read.
     * @return string The revision text, or "" when there is no title or no revision.
     */
    public function getArticleContent( &$title ) {
        $articleContent = "";
        if ( $title ) {
            $rev = Revision::newFromTitle( $title );
            if ( $rev ) {
                $articleContent = $rev->getText();
            }
        }
        return $articleContent;
    }

    /**
     * Saves new text for a page with the summary "Automatic link detection",
     * passing the EDIT_FORCE_BOT flag to Article::doEdit().
     *
     * @param Title $title Page to edit.
     * @param string $articleContent New wikitext of the page.
     */
    public function saveRevision( &$title, $articleContent ) {
        $flags = EDIT_FORCE_BOT;
        $article = new Article( $title );
        $summary = "Automatic link detection";
        $article->doEdit( $articleContent, $summary, $flags );
    }
}