# User:Atrox~mediawikiwiki/Phpwiki2Mediawiki/mwconverter.pl
#
# From mediawiki.org
# Copyright (c) 2010 Artjom Vassiljev <artjom@max.ee>, MAX 123 AS, Estonia
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. All advertising materials mentioning features or use of this software
#    must display the following acknowledgement:
#	This product includes software developed by Artjom Vassiljev.
# 4. Neither the name of the author nor the names of any co-contributors
#    may be used to endorse or promote products derived from this software
#    without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY Artjom Vassiljev AND "MAX 123" ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
#
# Some ideas taken from the script php2mediawiki.pl by Isaac Wilcox
# 
# Change the directories in the config to get this script to work
# I use file tree.txt to put there some info which I extract from the
# article headers and later use this file to pump articles into mediawiki.
# All this script does is just converts from one syntax to another and
# prepares files to be put to mediawiki by another script.
# It is not ideal, has bugs but it was enough to fulfill my needs.

# changelog:

# (09.Aug.2010), Silver
# * pluginListSubPages -- replace <ListSubPages/> with <splist/>

# (26.Jul.2010), Silver
# * syntaxconvert() -- bugfix: don't convert indents - it broke sub-lists
# * syntaxconvert() -- convert primitive sub-lists - mixed ones get broken
# * syntaxconvert() -- bugfix: more fixes to preserving underscores in links (now in non-forced http://... links too)
# * syntaxconvert() -- bugfix: fix hyperlink captions (separated by space instead of '|')

# (23.Jul.2010), Silver
# * pluginTable() -- bugfix: don't break links' captions within table cells

# (22.Jul.2010), Silver
# * pluginBacklinks() -- implement converting plugin BackLinks to DynamicPageList
# * syntaxconvert() -- bugfix: fix converting multiple categories, eg: OneCategoryTwo -> Category:One Category:Two
# * delete all '$_ =~ '
# * replace all: m/bla/ -> /bla/

# (21.Jul.2010), Silver
# * syntaxconvert() -- bugfix: replace separator of http-links' captions '|' with ' ', eg: [http://blabla|bla] -> [http://blabla bla]
# * syntaxconvert() -- convert [[Upload:...]] to [$phpwiki_uploads]

# (16.Jul.2010), Silver
# * pluginTable() -- bugfix: more fixes for converting table
# * pluginTable() -- convert rowspans
# * syntaxconvert() -- convert <verbatim> to <pre>
# * syntaxconvert() -- bugfix: fix converting links - replace multiple actions with only one universal one
# * syntaxconvert() -- bugfix: hack around replacing underscore within links

# (22.Jun.2010), Silver
# * pluginTable() -- many improvements for converting table
# * syntaxconvert() -- bugfix: substitute newlines (%%%) multiple times (g)
# * syntaxconvert() -- bugfix: substitute bold italics (_*) multiple times (g)

# (21.Jun.2010), Silver
# * removed unnecessary 'i' flag from regex substitutions in many places
# * pluginCreatePage() -- move replacing 'Template' to syntaxconvert()
# * syntaxconvert() -- substitute: HomePage -> MainPage

# (17.Jun.2010), Silver
# * pluginCreatePage() -- replace 'Template' with 'Template:' in the page name set as a template

# (16.Jun.2010), Silver
# * pluginCreatePage() -- new function to convert CreatePage plugin to CreateArticle (http://www.mediawiki.org/wiki/Extension:CreateArticle)
# * optimize regular expressions a bit: (\w+|\W+) -> ((\w|\W)+)

# (03.Jun.2010), Silver
# * parseheader() -- don't rely on header's line feed, force it instead
# * header() -- don't consider an empty line, but 'Content-Transfer-Encoding:' as the end of header
# * print progress numbers onto new lines, otherwise they are printed at once
# * syntaxconvert() -- leave centering text within table cell untouched
# * pluginTable() -- convert centering bold text within a cell to header
# * pluginListSubPages() -- new function to convert listing subpages
# * parseplugins() -- use pluginListSubPages() too


#!/usr/bin/perl -w

use strict;
use warnings;

# TODO:
# * history?
# * correctly convert sub-lists - function subLists()
# * convert PhpWiki plugins to MediaWiki alternatives:
#   * FullTextSearch
#   * ...

# config
# directory with phpwiki dump
my $input_dir = "./wikidump";
# output directory to store converted files
my $output_dir = "./converted";
# file with the page info (category, author, etc)
my $file_tree = "./tree.txt";
my $phpwiki_uploads = "http://mediawiki.mydomain/phpwiki_uploads";

###############
# SUBROUTINES #
###############

#####################
# Plugin converters #

sub pluginTable
{
	# Convert PhpWiki OldStyleTable markup into MediaWiki table markup.
	#
	# Argument: reference to an array of lines.
	# Returns:  a converted copy of the lines (the caller's array is not modified).
	#
	# The substitutions below depend on each other and on their order; each one
	# cleans up after the previous ones, so do not reorder them.
	my (@t) = @{ (shift) };
	# true between '<?plugin OldStyleTable' and the '?>' that closes it
	my $flag = 0;

	foreach (@t) {
		# plugin opening tag
		if (/<\?plugin OldStyleTable/) {
			s/<\?plugin OldStyleTable/\{\| border = "1"/i;
			# set the flag, so that we do not accidentally close the tag of some other plugin
			# TODO: can we have a table in a table?
			$flag = 1;
			next;
		}
		# plugin closing tag; a '?>' that belongs to another plugin is left alone
		if (/\?>/) {
			if ($flag) { s/\?>/\|\}/gi; $flag = 0; }
			next;
		}
		# table row: any line that starts with '|' and has at least one more character
		# (note: this is applied whether or not we are inside an OldStyleTable block)
		if (/^\|./s) {
			# let's start a new row in table
			chomp($_);
			$_ .= "\n|-\n";

			# double all separators...
			s/\|/||/g;
			# ...except the new rows...
			s/\|\|-/\|-/g;
			# ...and except the links in table cells, eg: ||[[link|caption]]||
			s/\[\[([^\]]*)\|\|([^\]]*)\]\]/[[$1|$2]]/g;

			# colspan - one-by-one: |||asd|asd -> |colspan="3"|asd||asd
			while (/\|{3,}/) {
				my ($run) = /(\|{3,})/;
				# every original separator was doubled above, so halve the run length
				my $count = ($run =~ tr/|//) / 2;
				s/\|{3,}/|colspan="$count"|/;
				# now it could be smth like this: !!|colspan="$count"|
				s/!!\|/!/;
			}

			# rowspan: |vvasd|asd -> |rowspan="3"|asd||asd
			while (/\|v/) {
				my ($run) = /(\|v+)/;
				# n 'v' markers mean the cell spans n+1 rows
				my $count = ($run =~ tr/v//) + 1;
				s/\|v+/|rowspan="$count"|/;
				# now it could be smth like this: !!|rowspan="$count"|
				s/!!\|/!/;
			}

			# replace potential header (centered bold cell) with a real header
			s/\|\^([^']*)'''([^']+)'''/!!$1$2/g;
			s/\|!!/!!/g;
			# it could be smth like this: |colspan="$n"!! - swap positions
			s/\|(col|row)(span="[0-9]*")(!+)/$3$1$2\|/g;

			# Mediawiki cannot understand the 1st-only ordinary cell, eg: || asd !! asdf !! asdf
			s/(^|\|)\|([^\|!]+)!!/$1|$2\n!/g;

			# fix row-headers, eg: !! asd || bsd || bsd
			s/(^|!)!([^\|]+)\|\|/$1!$2\n|/g;

			# fix the 1st '(|)!!'
			s/^\|?!!([^!]+)/!$1/g;

			# fix the 1st '||'
			s/^\|\|([^\|]+)/|$1/g;

			# align center
			s/(\||^|!)(\||!)\^/$1$2align="center"\|/g;
		}
	}

	return @t;
}

sub pluginToc
{
	# Replace the PhpWiki CreateToc plugin call with MediaWiki's __TOC__ magic word.
	#
	# Argument: reference to an array of lines.
	# Returns:  a converted copy of the lines.
	#
	# The call is accepted with or without whitespace/arguments before the
	# closing '?>' (eg: <?plugin CreateToc?>, <?plugin CreateToc ?>,
	# <?plugin CreateToc with_toclink||=1 ?>); the arguments are dropped.
	# Only the first call on each line is replaced.
	my (@t) = @{ (shift) };
	foreach (@t) {
		# non-greedy, so that another plugin call later on the line is not swallowed
		s/<\?plugin CreateToc\b.*?\?>/__TOC__/;
	}
	return @t;
}

sub pluginCalendar
{
	# Swap every PhpWiki Calendar plugin call for an empty <calendar> tag of the
	# MediaWiki calendar extension: http://www.mediawiki.org/wiki/Extension:Calendar_(Barrylb)
	# TODO: the plugin's arguments (calendar offsets, start days, etc) are dropped;
	# the right syntax depends on the mediawiki calendar plugin in use.
	#
	# Argument: reference to an array of lines.
	# Returns:  a converted copy of the lines.
	my $lines_ref = shift;
	my @lines = @{$lines_ref};

	for my $line (@lines) {
		# greedy on purpose (as before): everything up to the last '?>' on the line goes
		$line =~ s/<\?plugin Calendar.*\?>/<calendar><\/calendar>/;
	}

	return @lines;
}

sub pluginBackLinks
{
	# Turn a PhpWiki BackLinks plugin call into a <DynamicPageList> block.
	# The DynamicPageList extension is needed for the result to work:
	# http://www.mediawiki.org/wiki/Extension:DynamicPageList
	#
	# Argument: reference to an array of lines.
	# Returns:  a converted copy of the lines.
	my $lines_ref = shift;
	my @lines = @{$lines_ref};

	LINE:
	for my $line (@lines) {
		next LINE unless $line =~ /<\?plugin BackLinks/;

		# wrap the plugin's argument string into the new tag pair
		$line =~ s/<\?plugin BackLinks(.*)\?>/<DynamicPageList>$1<\/DynamicPageList>/;

		# work on a copy: put every key=value parameter and the closing tag
		# onto a line of its own
		(my $work = $line) =~ s/([^ ]+=[^ ]+|<\/DynamicPageList>)/\n$1/g;

		# rebuild the block piece by piece, converting the parameters
		my $block = "";
		for my $piece (split(/\n/, $work)) {
			if ($piece =~ /page=/) {
				$piece =~ s/page=\[\[Category:([^\]]*)\]\](.*)/category=$1/;
			}
			elsif ($piece =~ /sortby=pagename/) {
				# FIXME: how to preserve descending order?
				$piece =~ s/sortby=pagename/ordermethod=categorysortkey\norder=ascending/;
			}
			else {
				# delete other (unknown or unimportant) params
				$piece =~ s/.+=.+//;
			}
			# pieces that ended up empty are dropped
			$block .= "$piece\n" unless $piece =~ /^$/;
		}

		# finally put the rebuilt block in place of the one-line version
		$line =~ s/<DynamicPageList>.*<\/DynamicPageList>/$block/;
	}

	return @lines;
}

sub pluginListSubPages
{
	# Replace a PhpWiki ListSubpages plugin call (arguments included) with the
	# plain <splist/> tag of the SubPageList3 extension:
	# http://www.mediawiki.org/wiki/Extension:SubPageList3
	#
	# Argument: reference to an array of lines.
	# Returns:  a converted copy of the lines.
	my $lines_ref = shift;
	my @lines = @{$lines_ref};

	for my $line (@lines) {
		$line =~ s/<\?plugin ListSubpages.*\?>/<splist\/>/;
	}

	return @lines;
}

sub pluginCreatePage
{
	# Convert a PhpWiki 'plugin-form CreatePage template=...' call into a
	# <createarticle> block (CreateArticle extension:
	# http://www.mediawiki.org/wiki/Extension:CreateArticle).
	# Everything between 'template=' and the closing '?>' becomes the preload page.
	#
	# Argument: reference to an array of lines.
	# Returns:  a converted copy of the lines.
	my $lines_ref = shift;
	my @lines = @{$lines_ref};

	for my $line (@lines) {
		$line =~ s/<\?plugin-form CreatePage template=((\w|\W)+)\?>/<createarticle>\ntype=createarticle\npreload=$1\nbuttonlabel=Add\nalign=left\n<\/createarticle>/;
	}

	return @lines;
}

# TODO
sub pluginFullTextSearch
{
	# Placeholder: no conversion is implemented yet, the lines are handed back
	# as they came in.
	# The plan is to use the DynamicPageList extension for this:
	# http://www.mediawiki.org/wiki/Extension:DynamicPageList
	#
	# Argument: reference to an array of lines.
	# Returns:  a copy of the lines.
	my $lines_ref = shift;
	my @lines = @{$lines_ref};
	return @lines;
}

# sub subLists
# {
# 	my (@t) = @{ (shift) };
# 	foreach (@t) {
# 		my ($listType) = / *([\*#])/;
# 		if (/^$listType/) {
# 			next
# 		}
# 		if ($listType == "*") {
# 			my $otherType = "#"
# 		}
# 		if (/ *)
# 	}
# 	return @t;
# }

###################
# Plugins handler #

sub parseplugins
{
	# Run all plugin converters over the text, one after another.
	#
	# Argument: reference to an array of lines.
	# Returns:  the lines after every converter has processed them.
	my $lines_ref = shift;
	my @textl = @{$lines_ref};

	# The converters, in the order they are applied.
	my @converters = (
		\&pluginTable,          # OldStyleTable
		\&pluginToc,            # CreateToc
		\&pluginCalendar,       # Calendar
		\&pluginBackLinks,      # BackLinks
# 		\&pluginFullTextSearch, # FullTextSearch (not implemented yet)
		\&pluginListSubPages,   # plugin ListSubpages
		\&pluginCreatePage,     # plugin plugin-form CreatePage template=
	);

	for my $convert (@converters) {
		@textl = $convert->(\@textl);
	}

	return @textl;
}

#######################################
# convert phpwiki syntax to mediawiki #

sub syntaxconvert
{
	# Convert PhpWiki markup to MediaWiki markup, line by line.
	#
	# Argument: reference to an array of lines.
	# Returns:  a converted copy of the lines (the caller's array is not modified).
	#
	# Uses the file-level $phpwiki_uploads as the base URL for [[Upload:...]] links.
	# The conversions depend on their order (eg: bold before italic, links before
	# categories), so keep it when changing anything.
	my (@textl) = @{ (shift) };

	# Percent-escapes that get decoded. The replacements are written as UTF-8
	# byte sequences, so the output does not depend on the encoding this
	# source file happens to be saved in.
	my %unescaped = (
		'%2F' => '/',
		'%E4' => "\xC3\xA4",	# a-umlaut
		'%C4' => "\xC3\x84",	# A-umlaut
		'%20' => ' ',
		'%F6' => "\xC3\xB6",	# o-umlaut
		'%28' => '(',
		'%29' => ')',
		'%D6' => "\xC3\x96",	# O-umlaut
		'%F5' => "\xC3\xB5",	# o-tilde
		'%FC' => "\xC3\xBC",	# u-umlaut
		'%DC' => "\xC3\x9C",	# U-umlaut
		'%26' => '&',
	);

	foreach (@textl) {

		my $m;

		# convert special chars
		s/(%(?:2F|E4|C4|20|F6|28|29|D6|F5|FC|DC|26))/$unescaped{$1}/g;

		########
		# remove the extra newline ^M
		s/\r//;

		########
		# convert newlines
		s/%{3}/<br>/g;

		########
		# convert headings: !!! -> ==, !! -> ===, ! -> ====
		if (/(!{1,3})/) {
			$m = '=' x (5 - length($1));
			s/^!{1,3}\s*(.*)/$m$1$m/;
		}

		########
		# convert bold: __text__ -> '''text'''
		if (/_{2}(.+)_{2}/) {
			# FIXME: don't replace underscores within links
			# ('__' right after '[' or right before ']' is left alone).
			# Look-arounds instead of consuming the neighbouring characters, so
			# that bold at the very start of a line and one-character bold
			# (__a__) are converted too.
			s/(?<!\[)__(?!\])/'''/g;
		}

		# convert bold: <strong>text</strong> -> '''text'''
		s/<(\/?)strong>/'''/gi;

		# convert preformatted text: <verbatim>text</verbatim> -> <pre>text</pre>
		s/<(\/?)verbatim>/<$1pre>/gi;

		# convert bold: *text* -> '''text'''
		if (/\*(.+)\*/) {
			# a leading list bullet ('* ', '  ** ') is not bold markup:
			# take it off, convert the rest, put it back
			my $bullet = s/^(\s*\*+)(?=\s)// ? $1 : '';
			s/\*/'''/g if (/\*.+\*/);
			$_ = $bullet . $_;
		}

		########
		# convert italic: <em>text</em> -> ''text''
		s/<(\/?)em>/''/gi;

		########
		# convert italic: _text_ -> ''text''
		if (/_.+_/) {
			# don't replace underscores within links, eg: [http://somesite?some_page]
			# FIXME: now THIS is a hack!
			# <hack>
			# hide underscores inside [...] ...
			while (/\[[^\]]+_[^\]]+\]/) {
				s/(\[[^\]]*)_([^\]]*\])/$1<UNDERSCORE>$2/;
			}
			# ... and inside bare http... links ...
			while (/([^\[]+|^)http[^ ]*_[^ ]*/) {
				s/([^\[]+|^)(http[^ ]*)_([^ ]*)/$1$2<UNDERSCORE>$3/g;
			}
			s/_(.+)_/''$1''/g;
			# ... and bring them back after the italic conversion
			s/<UNDERSCORE>/_/g;
			# </hack>
		}

		########
		# convert bold italic: _*text*_ -> '''''text'''''
		if (/_\*(.+)\*_/) {
			s/_\*([^\*_]+)\*_/'''''$1'''''/g;
		}

		########
		# indents are deliberately NOT converted - it broke sub-lists

		########
		# convert sub-lists: every two leading spaces become one more list marker
		# FIXME: mixed lists get lost
		while (/^ {2,}[\*#]/) {
			s/^  ( *)([\*#])/$1$2$2/;
		}
		if (/^ [\*#]/) {
			s/^ ([\*#])/$1/;
		}

		########
		# convert hyperlinks: [caption|target] -> [[target|caption]]
		s/\[([^\|\]]+)(\|?)([^\]]*)\]/[[$3$2$1]]/g;
		# http-links are automated, eg: [[http://blabla|bla]] -> [http://blabla bla]
		s/\[\[(http[^\]\|]*)\|?([^\]]*)\]\]/[$1 $2]/g;
		# in case there was no '|' for caption, eg: [http://blabla ] -> [http://blabla]
		s/\[(http[^\]]*) \]/[$1]/g;

		########
		# convert links of uploaded files (proper uploads have to be redone)
		s/\[\[Upload:([^\]]*)\]\]/[$phpwiki_uploads\/$1]/g;

		########
		# '~' prevent hyperlinking
		# FIXME: does mediawiki have it?
		s/~//;

		########
		# convert PhpWiki categories, eg: OneCategoryTwo -> [[Category:One]] [[Category:Two]]
		if (/(\w+)Category(\w+)/) {
			s/(\w+)Category(\w+)/[[Category:$1]] [[Category:$2]]/g;
		}
		if (/Category(\w+)/) {
			s/Category(\w+)/[[Category:$1]]/g;
		}
		if (/(\w+)Category/) {
			s/(\w+)Category/[[Category:$1]]/g;
		}

		########
		# renamings done in bot.php (PHP-script that uploads pages into database afterwards)
		if (/([\W\s]+|^)Template\w+/) {
			s/([\W\s]+|^)Template(\w+)/$1Template:$2/gi;
		}
		if (/([\W\s]+|^)HomePage([\W\s]+|$)/) {
			s/([\W\s]+|^)HomePage([\W\s]+|$)/$1MainPage$2/g;
		}
	}

	return @textl;
}

#########
# Files #

sub parseheader
{
	# Extract page info from a PhpWiki dump header and append it as one line
	# to the tree file.
	#
	# Arguments:
	#   1. reference to an array with the header lines
	#   2. file handle of the tree file (opened for writing)
	#   3. wiki file name
	#
	# Structure of the line written to tree.txt:
	#   file name;page name;author id;page summary
	# The fields are written in the order their lines appear in the header.
	my (@t) = @{ (shift) };
	# tree.txt file handle
	my $f = shift;
	# wiki file name
	my $fn = shift;

	# Percent-escapes that get decoded. The replacements are written as UTF-8
	# byte sequences, so the output does not depend on the encoding this
	# source file happens to be saved in.
	my %unescaped = (
		'%2F' => '/',
		'%E4' => "\xC3\xA4",	# a-umlaut
		'%C4' => "\xC3\x84",	# A-umlaut
		'%20' => ' ',
		'%F6' => "\xC3\xB6",	# o-umlaut
		'%28' => '(',
		'%29' => ')',
		'%D6' => "\xC3\x96",	# O-umlaut
		'%F5' => "\xC3\xB5",	# o-tilde
		'%FC' => "\xC3\xBC",	# u-umlaut
		'%DC' => "\xC3\x9C",	# U-umlaut
		'%26' => '&',
	);

	print $f "$fn;";

	foreach (@t) {

		# be tolerant of holes in the header array
		next unless defined;

		# convert special chars
		s/(%(?:2F|E4|C4|20|F6|28|29|D6|F5|FC|DC|26))/$unescaped{$1}/g;

		# the value is everything after the first '=';
		# a line without '=' carries no value at all
		my ($value) = /=(.*)/s;
		next unless defined $value;

		if (/pagename/) {
			# the trailing ';' of the header value is kept - it separates the fields
			(my $s = $value) =~ s/\r//;
			chomp($s);
			print $f $s;
		}

		if (/author_id/) {
			(my $s = $value) =~ s/\r//g;
			chomp($s);
			print $f $s;
		}

		if (/summary/) {
			# last field: strip the trailing ';' together with the line ending
			(my $s = $value) =~ s/(;)?[\r\n]$//g;
			print $f $s;
		}

	}
	# force line break, don't rely on summary's line ending
	print $f "\n";

}

sub header
{
	# Split every file of the PhpWiki dump ($input_dir) into header and body:
	# the body is written to a file of the same name in $output_dir, the
	# header is parsed by parseheader() into one line of $file_tree.
	# Dies if the input directory, the tree file or an input file cannot be
	# opened; an output file that cannot be opened is reported and skipped.
	opendir(my $dh, $input_dir) or die("cannot open directory");

	my @files = readdir($dh);

	closedir($dh);

	print "Total files: " . @files . "\n" . "Parsing headers... \n";

	# status
	my $m = 0;

	# sort them
	my @fs = sort { $a cmp $b } @files;

	# open $tree file for the header info output
	open my $tree, '>', $file_tree or die ("error opening $file_tree: $!");

	foreach my $name (@fs) {

		next if ($name eq "." or $name eq "..");

		open(my $in, '<', "$input_dir/$name") or die ("error opening $name: $!");
		my @lines = <$in>;
		close($in);

		# find the 1st empty line which will indicate the end of header
		my $i = 0;
		foreach my $line (@lines) {
			# some (broken?) headers contain an empty line too, so sometimes it cannot be relied upon
			last if ($line =~ m/^[\n\r]?$/);
# 			last if ($line =~ m/^Content-Transfer-Encoding:/);
			$i++;
		}

		# copy everything except header to the new array
		# (empty if there is nothing after the header)
		my @new = @lines[$i+1 .. $#lines];

		# write data without the header
		if (open(my $out, '>', "$output_dir/$name")) {
			print $out @new;
			close($out) or print "could not write $name: $!\n";
		}
		else {
			print "could not open $name\n";
		}

		# get the header (including the empty line that ends it); if no empty
		# line was found the whole file is the header - don't run past its end
		my $last = $i < @lines ? $i : $#lines;
		my @header = @lines[0 .. $last];
		# parse it and write info to the tree
		parseheader(\@header, $tree, $name);

		# status
		print "$m\n" if ($m % 1000 == 0);
		$m++;

	}

	close $tree;
	print "Done!\n";

}

############
# Main     #

# remove headers and build tree
header();

# convert the header-less files in the output directory, in place
opendir(my $out_dh, $output_dir) or die("cannot open directory: $output_dir");
my @files = readdir($out_dh);
closedir($out_dh);

print "\nTotal files: " . @files . "\nConverting files... \n";

# status
my $m = 0;

foreach my $name (@files) {

	next if ($name eq "." or $name eq "..");

	my $path = "$output_dir/$name";

	# read the whole page; skip it if it cannot be read, so that it does not
	# get overwritten with an empty conversion below
	my $in;
	unless (open($in, '<', $path)) {
		print "error: $!\n";
		next;
	}
	my @inf = <$in>;
	close($in);

	# convert before the file is reopened (and truncated) for writing
	my @w = syntaxconvert(\@inf);
	my @t = parseplugins(\@w);

	my $out;
	unless (open($out, '>', $path)) {
		print "error: $!\n";
		next;
	}

	# the file name holds the page path, its parts separated by '%2F':
	# a sub-page is put into the category named after its parent
	my @cat = split(/%2F/, $name);
	my $li = $#cat;

	if ($li > 0) {
		my $category = $cat[$li-1];
		print $out "[[Category:$category]]\n";
	}
	#else { print $out "[[Category:$name]]\n"; }

	print $out @t;
	close($out) or print "error: $!\n";

	# status
	print "$m\n" if ($m % 1000 == 0);
	$m++;
}

print "Done!\n";

exit 0;