Extension:RPED/RPEDFileReader.pl

From MediaWiki.org
Jump to: navigation, search

RPEDFileReader.pl

# RPEDFileReader.pl by Tisane, http://www.mediawiki.org/wiki/User:Tisane
#
# This script is free software that is available under the terms of the Creative Commons
# Attribution 3.0 license and the current version of the GNU General Public License.
#
# The purpose of this script is to read a text file (specifically, the list of page titles from
# Wikipedia's data dump) and add each page title to a database table.
 
use strict;
use warnings;
use DBI;
# NOTE(review): nothing in the visible script calls the legacy Mysql module
# directly (all database access goes through DBI); kept for compatibility.
use Mysql;
 
# Database connection settings.
my $sql_login = 'wikiuser2';
my $sql_pass = 'password';
my $db_name = 'page_title_db';
my $db_host = 'localhost'; # or remote mysql server name
# if left blank, this defaults to localhost 
 
# Build the DBI data source name. The host component is appended only when
# $db_host is non-empty, so a blank host yields "DBI:mysql:<database>".
my $conn_string = "DBI:mysql:$db_name";
if ($db_host) { $conn_string .= ":$db_host"; }

# DBI->connect returns undef on failure. Stop here with the driver's own
# message instead of failing later on the first method call on $dbh.
my $dbh = DBI->connect($conn_string, $sql_login, $sql_pass)
	or die("Error: cannot connect to database '$db_name': $DBI::errstr\n");
 
# SELECT DB
#$dbh->selectdb($database);
 
# Create the destination table: an auto-incrementing primary key plus one
# page_title column.
my $sql = "CREATE TABLE page_title_table(
		p_ID int NOT NULL AUTO_INCREMENT,
		PRIMARY KEY(p_ID),
		page_title VARCHAR(256)
	)";
# do() returns undef when the statement fails (for example because the table
# already exists). Abort rather than bulk-loading into an unknown schema.
my $execute = $dbh->do($sql)
	or die("Error: cannot create table page_title_table: ".$dbh->errstr."\n");

# Index the title column so rows can be looked up by page title.
$sql="CREATE INDEX pageind on page_title_table (page_title)";
$execute = $dbh->do($sql)
	or die("Error: cannot create index pageind: ".$dbh->errstr."\n");
 
# Load the title dump into page_title_table, one row per input line.
my $filename='enwiki-20100116-all-titles-in-ns0';

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and $! reports why the open failed.
open(my $titles_fh, '<', $filename) or 
	die("Error: cannot open file '".$filename."': $!\n");

# Prepare the INSERT once and bind each title through the placeholder,
# instead of building, preparing and running a new SQL string for every line.
my $insert_sth = $dbh->prepare(
	"INSERT INTO page_title_table (page_title) VALUES (?)")
	or die("Error: cannot prepare insert statement: ".$dbh->errstr."\n");

my $lnum = 1;	# 1-based number of the line currently being inserted
while( my $line = <$titles_fh> ){
	chomp($line);
	# A failed row is reported with its line number and the load continues,
	# so a single bad title does not abort the whole import.
	$insert_sth->execute($line)
		or warn("Warning: insert failed at line $lnum of '$filename': "
			.$insert_sth->errstr."\n");
	$lnum++;
}
 
close($titles_fh);