Extension:RPED/RPEDGetDeletedAndRestoredPageTitles.pl

From MediaWiki.org
Jump to: navigation, search

RPEDGetDeletedAndRestoredPageTitles.pl[edit | edit source]

# RPEDGetDeletedAndRestoredPageTitles.pl by Tisane, http://www.mediawiki.org/wiki/User:Tisane
#
# This script is free software that is available under the terms of the Creative Commons
# Attribution 3.0 license and the current version of the GNU General Public License.
#
# The purpose of this script is to query the enwiki API for deleted and restored pages from the
# logs. Each new log entry (log ID, namespace, page title, action and timestamp) is added to two
# database tables. The script loops indefinitely, but sleeps for a fixed number of seconds
# between queries.
 
use strict;
use warnings;
use Mysql;
use DBI;
use LWP::Simple;
use LWP::UserAgent;
use HTTP::Request;
use HTTP::Response;
 
# Database connection settings. The credentials below are placeholders.
my $sql_login = 'wikiuser2';
my $sql_pass = 'password';
my $db_name = 'page_title_db';
my $db_host = 'localhost';
# Every new log entry is written to both tables; only the first one is
# consulted when checking whether an entry has already been stored.
my $table_name1 = 'page_title_update_table';
my $table_name2 = 'syndication_table';

# The DSN has the form DBI:mysql:<database>[:<host>].
my $conn_string = "DBI:mysql:$db_name";
if ($db_host) { $conn_string .= ":$db_host"; }
# Stop immediately with the driver's message when the connection fails;
# everything that follows calls methods on $dbh.
my $dbh = DBI->connect($conn_string,$sql_login,$sql_pass)
	or die "Cannot connect to $conn_string: $DBI::errstr\n";
 
# Make sure both tables exist, each with the same layout and one index per
# data column. IF NOT EXISTS together with inline index definitions keeps
# this step idempotent, so restarting the script no longer triggers a
# "table already exists" / "duplicate key name" error for every statement.
for my $table ($table_name1, $table_name2){
	my $sql = "CREATE TABLE IF NOT EXISTS ".$table."(
			p_ID int NOT NULL AUTO_INCREMENT,
			PRIMARY KEY(p_ID),
			logid VARCHAR(256),
			ns VARCHAR(256),
			page_title VARCHAR(256),
			rcid VARCHAR(256),
			action VARCHAR(256),
			timestamp VARCHAR(256),
			INDEX logid_ind (logid),
			INDEX ns_ind (ns),
			INDEX page_title_ind (page_title),
			INDEX rcid_ind (rcid),
			INDEX action_ind (action),
			INDEX timestamp_ind (timestamp)
		)";
	# do() returns undef on failure; report which table was affected but keep
	# going, as the script always has after a failed setup statement.
	$dbh->do($sql)
		or warn "Could not create table $table: ".$dbh->errstr."\n";
}
 
# User-Agent sent with every API request. The value is assembled from pieces
# so that it stays on a single line: a literal line break inside the string
# would put a newline and a tab into the HTTP header value.
my $agentName='User:Tisane (http://www.mediawiki.org/wiki/User:Tisane) grabbing some '
	.'page title data off Wikipedia using RPEDGetWP.pl (alpha)';
my $browser = LWP::UserAgent->new();
$browser->agent($agentName);
 
# The first query starts at $initialTimestamp; $lestart is the value sent as
# the lestart parameter and is moved forward by the polling loop.
my $initialTimestamp='20100305104000';
my $lestart=$initialTimestamp;

# Log IDs from the response being processed and from the previous response.
# Both arrays start out with 501 slots holding 0.
my @currentLineRecord=(0) x 501;
my @pastLineRecord=(0) x 501;

# For each field: the text that precedes its value in the API output, and the
# text that marks where the value ends.
my ($logidPreface,$logidEOL)=('[logid] => ','[pageid]');
my ($nsPreface,$nsEOL)=('[ns] => ','[title]');
my ($titlePreface,$titleEOL)=('[title] => ','[type]');
my ($actionPreface,$actionEOL)=('[action] => ','[timestamp]');
my ($timestampPreface,$timestampEOL)=('[timestamp] => ',')');

# Entries requested per query, and seconds to sleep between queries.
my $lelimit=500;
my $sleepNumber=12;
 
# Find "$preface<value>$terminator" in $text, searching from offset $from.
# Returns (value, offset): value has trailing spaces, tabs and newlines
# removed, offset points just past the preface. Returns an empty list when
# either marker cannot be found.
sub _extract_field {
	my ($text, $preface, $terminator, $from) = @_;
	my $start = index($text, $preface, $from);
	return if $start == -1;
	$start += length($preface);
	my $end = index($text, $terminator, $start);
	return if $end == -1;
	my $value = substr($text, $start, $end - $start);
	$value =~ s/[ \t\n]+\z//;
	return ($value, $start);
}

# SQL with placeholders, so values taken from the API response are always
# passed as bound parameters and never spliced into the statement text.
my $checkQuery="SELECT COUNT(*) FROM `".$table_name1."` WHERE `logid`=?";
my @insertQueries=map {
	"INSERT INTO $_ (logid,ns,page_title,action,timestamp) VALUES (?,?,?,?,?)"
} ($table_name1, $table_name2);

# Fields that follow the log ID inside one entry, in the order they are read.
my @fieldMarkers=(
	[$nsPreface,$nsEOL],
	[$titlePreface,$titleEOL],
	[$actionPreface,$actionEOL],
	[$timestampPreface,$timestampEOL],
);

# Polling loop: fetch up to $lelimit deletion-log entries starting at
# $lestart, store every entry not seen before in both tables, move $lestart
# to the timestamp of the last stored entry, sleep, repeat. Never returns.
while (1){
	# NOTE(review): this relies on the print_r-style layout of format=txt;
	# newer MediaWiki releases may no longer offer that format - confirm the
	# endpoint still answers with it.
	my $URL="http://en.wikipedia.org/w/api.php?action=query&list=logevents&letype=delete&lelimit="
	.$lelimit."&lestart=".$lestart."&leprop=title|timestamp|ids|type&ledir=newer&format=txt";
	my $request = HTTP::Request->new(GET => $URL);
	my $response = $browser->request($request);
	if ($response->is_error()) {
		# Report and retry later. The error body is not parsed and the
		# duplicate memory is left alone, so entries that were already
		# stored are still recognised on the next successful fetch.
		printf "%s\n", $response->status_line;
		sleep $sleepNumber;
		next;
	}
	my $contents = $response->content();

	# Log IDs seen in the previous response, for constant-time duplicate checks.
	my %seenLastPass=map { $_ => 1 } @pastLineRecord;
	my $currentPosition=0;
	my $lineCount=0;
	my $timestampName="";
	my $cleared=0;

	RECORD: while (1){
		my @logidHit=_extract_field($contents,$logidPreface,$logidEOL,$currentPosition);
		last RECORD unless @logidHit;
		my ($logidName,$logidPosition)=@logidHit;

		my $identical=0;
		# Until the cursor has moved off the initial timestamp, ask the
		# database whether the entry is already stored (e.g. after a
		# restart). The first entry that is not stored ends these lookups
		# for the rest of this response.
		if ($lestart eq $initialTimestamp && $cleared==0){
			my ($countResult)=$dbh->selectrow_array($checkQuery,undef,$logidName);
			$identical=defined $countResult ? $countResult : 0;
			if ($identical==0){
				$cleared=1;
			}
		}
		if (exists $seenLastPass{$logidName}){
			$identical=1;
		}

		if ($identical!=0){
			print "(Duplicate) ".$logidName."\n";
			$currentLineRecord[$lineCount]=$logidName;
			$lineCount++;
			$currentPosition=$logidPosition+1;
			next RECORD;
		}

		# Read the remaining fields in order, each search starting where the
		# previous field began. A missing marker means the response is
		# truncated or malformed: stop here without remembering this log ID,
		# so the entry is picked up again by a later query.
		my @values;
		my $position=$logidPosition;
		for my $markers (@fieldMarkers){
			my @hit=_extract_field($contents,@$markers,$position);
			if (!@hit){
				warn "Incomplete log entry for logid $logidName; ignoring rest of response\n";
				last RECORD;
			}
			push @values,$hit[0];
			$position=$hit[1];
		}
		my ($nsName,$titleName,$actionName);
		($nsName,$titleName,$actionName,$timestampName)=@values;
		print "$logidName $nsName $titleName $actionName $timestampName \n";

		$currentLineRecord[$lineCount]=$logidName;
		$lineCount++;
		$currentPosition=$position;

		for my $insertQuery (@insertQueries){
			$dbh->do($insertQuery,undef,
				$logidName,$nsName,$titleName,$actionName,$timestampName);
		}
	}

	if ($timestampName ne ""){
		$lestart=$timestampName;
	}
	# Rotate the duplicate memory only when this response contained entries;
	# an empty result must not erase what the previous response recorded.
	if ($lineCount>0){
		@pastLineRecord=@currentLineRecord;
		@currentLineRecord=('') x scalar(@pastLineRecord);
	}
	sleep $sleepNumber;
}