User:Atrox/Phpwiki2Mediawiki/mwconverter.pl
From MediaWiki.org
# Copyright (c) 2010 Artjom Vassiljev <artjom@max.ee>, MAX 123 AS, Estonia
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. All advertising materials mentioning features or use of this software
#    must display the following acknowledgement:
#    This product includes software developed by Artjom Vassiljev.
# 4. Neither the name of the author nor the names of any co-contributors
#    may be used to endorse or promote products derived from this software
#    without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY Artjom Vassiljev AND "MAX 123" ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
#
# Some ideas taken from the script php2mediawiki.pl by Isaac Wilcox
#
# Change the directories in the config to get this script to work
# I use file tree.txt to put there some info which I extract from the
# article headers and later use this file to pump articles into mediawiki.
# All this script does is just converts from one syntax to another and
# prepares files to be put to mediawiki by another script.
# It is not ideal, has bugs but it was enough to fulfill my needs.

# changelog:
# (09.Aug.2010), Silver
# * pluginListSubPages -- replace <ListSubPages/> with <splist/>
# (26.Jul.2010), Silver
# * syntaxconvert() -- bugfix: don't convert indents - it broke sub-lists
# * syntaxconvert() -- convert primitive sub-lists - mixed ones get broken
# * syntaxconvert() -- bugfix: more fixes to preserving underscores in links (now in non-forced http://... links too)
# * syntaxconvert() -- bugfix: fix hyperlink captions (separated by space instead of '|')
# (23.Jul.2010), Silver
# * pluginTable() -- bugfix: don't break links' captions within table cells
# (22.Jul.2010), Silver
# * pluginBacklinks() -- implement converting plugin BackLinks to DynamicPageList
# * syntaxconvert() -- bugfix: fix converting multiple categories, eg: OneCategoryTwo -> Category:One Category:Two
# * delete all '$_ =~ '
# * replace all: m/bla/ -> /bla/
# (21.Jul.2010), Silver
# * syntaxconvert() -- bugfix: replace separator of http-links' captions '|' with ' ', eg: [http://blabla|bla] -> [http://blabla bla]
# * syntaxconvert() -- convert [[Upload:...]] to [$phpwiki_uploads]
# (16.Jul.2010), Silver
# * pluginTable() -- bugfix: more fixes for converting table
# * pluginTable() -- convert rowspans
# * syntaxconvert() -- convert <verbatim> to <pre>
# * syntaxconvert() -- bugfix: fix converting links - replace multiple actions with only one universal one
# * syntaxconvert() -- bugfix: hack around replacing underscore within links
# (22.Jun.2010), Silver
# * pluginTable() -- many improvements for converting table
# * syntaxconvert() -- bugfix: substitute newlines (%%%) multiple times (g)
# * syntaxconvert() -- bugfix: substitute bold italics (_*) multiple times (g)
# (21.Jun.2010), Silver
# * removed unnecessary 'i' flag from regex substitutions in many places
# * pluginCreatePage() -- move replacing 'Template' to syntaxconvert()
# * syntaxconvert() -- substitute: HomePage -> MainPage
# (17.Jun.2010), Silver
# * pluginCreatePage() -- replace 'Template' with 'Template:' in the page name set as a template
# (16.Jun.2010), Silver
# * pluginCreatePage() -- new function to convert CreatePage plugin to CreateArticle (http://www.mediawiki.org/wiki/Extension:CreateArticle)
# * optimize regular expressions a bit: (\w+|\W+) -> ((\w|\W)+)
# (03.Jun.2010), Silver
# * parseheader() -- don't rely on header's line feed, force it instead
# * header() -- don't consider an empty line, but 'Content-Transfer-Encoding:' as the end of header
# * print progress numbers onto new lines, otherwise they are printed at once
# * syntaxconvert() -- leave centering text within table cell untouched
# * pluginTable() -- convert centering bold text within a cell to header
# * pluginListSubPages() -- new function to convert listing subpages
# * parseplugins() -- use pluginListSubPages() too

#!/usr/bin/perl -w
use strict;
use warnings;

# TODO:
# * history?
# * correctly convert sub-lists - function subLists()
# * convert PhpWiki plugins to MediaWiki alternatives:
#   * FullTextSearch
#   * ...
# config
# directory with phpwiki dump
my $input_dir = "./wikidump";
# output directory to store converted files
my $output_dir = "./converted";
# file with the page info (category, author, etc)
my $file_tree = "./tree.txt";
my $phpwiki_uploads = "http://mediawiki.mydomain/phpwiki_uploads";

# Percent-escapes (upper-case hex only) that are decoded in page text and in
# dump headers, mapped to the character they stand for.
my %special_char_for = (
    '2F' => '/',
    'E4' => 'ä',
    'C4' => 'Ä',
    '20' => ' ',
    'F6' => 'ö',
    '28' => '(',
    '29' => ')',
    'D6' => 'Ö',
    'F5' => 'õ',
    'FC' => 'ü',
    'DC' => 'Ü',
    '26' => '&',
);

###############
# SUBROUTINES #
###############

# Decode the known percent-escapes in a string.
#   $text   - string that may contain escapes such as %2F or %E4
# Returns a decoded copy; escapes that are not in %special_char_for are
# left untouched.
sub _decode_special_chars {
    my ($text) = @_;
    $text =~ s/%(2F|E4|C4|20|F6|28|29|D6|F5|FC|DC|26)/$special_char_for{$1}/g;
    return $text;
}

# Return everything that follows the first '=' of a header line, or an empty
# string when the line contains no '='.
sub _header_value {
    my ($line) = @_;
    my $pos = index($line, '=');
    return $pos < 0 ? '' : substr($line, $pos + 1);
}

#####################
# Plugin converters #

# Convert the PhpWiki OldStyleTable plugin to a MediaWiki table.
#   $lines  - array ref with the lines of one page
# Returns the converted lines as a list. The opening tag becomes
# '{| border = "1"', the matching '?>' becomes '|}' and every line that
# starts with '|' is rewritten as a table row (cells, colspan, rowspan,
# header cells and centering).
sub pluginTable {
    my ($lines) = @_;
    my @t = @{$lines};
    # true while we are between the opening tag and its closing '?>'
    my $in_table = 0;
    foreach (@t) {
        # plugin opening tag
        if (/<\?plugin OldStyleTable/) {
            s/<\?plugin OldStyleTable/\{\| border = "1"/i;
            # set the flag, so that we do not accidentally close the tag of
            # some other plugin
            # TODO: can we have a table in a table?
            $in_table = 1;
            next;
        }
        # plugin closing tag
        if (/\?>/) {
            if ($in_table) {
                s/\?>/\|\}/gi;
                $in_table = 0;
            }
            next;
        }
        # table row: a leading '|' followed by at least one more character
        next unless /^\|./s;

        # let's start a new row in table
        chomp($_);
        $_ .= "\n|-\n";
        # double all separators...
        s/\|/||/g;
        # ...except the new rows...
        s/\|\|-/\|-/g;
        # ...and except the links in table cells, eg: ||[[link|caption]]||
        s/\[\[([^\]]*)\|\|([^\]]*)\]\]/[[$1|$2]]/g;
        # colspan - one-by-one: |||asd|asd -> |colspan="3"|asd||asd
        while (/\|{3,}/) {
            my ($run) = /(\|{3,})/;
            # separators were doubled above, so half of the pipes is the span
            my $count = ($run =~ tr/|//) / 2;
            s/\|{3,}/|colspan="$count"|/;
            # now it could be smth like this: !!|colspan="$count"|
            s/!!\|/!/;
            # s/\| *!/!/;
        }
        # rowspan: |vvasd|asd -> |rowspan="3"|asd||asd
        while (/\|v/) {
            my ($run) = /(\|v+)/;
            my $count = ($run =~ tr/v//) + 1;
            s/\|v+/|rowspan="$count"|/;
            # now it could be smth like this: !!|rowspan="$count"|
            s/!!\|/!/;
        }
        # replace potential header with a real header
        s/\|\^([^']*)'''([^']+)'''/!!$1$2/g;
        s/\|!!/!!/g;
        # it could be smth like this: |colspan="$n"!! - swap positions
        s/\|(col|row)(span="[0-9]*")(!+)/$3$1$2\|/g;
        # Mediawiki cannot understand the 1st-only ordinary cell,
        # eg: || asd !! asdf !! asdf
        s/(^|\|)\|([^\|!]+)!!/$1|$2\n!/g;
        # fix row-headers, eg: !! asd || bsd || bsd
        s/(^|!)!([^\|]+)\|\|/$1!$2\n|/g;
        # fix the 1st '(|)!!'
        s/^\|?!!([^!]+)/!$1/g;
        # fix the 1st '||'
        s/^\|\|([^\|]+)/|$1/g;
        # align center
        s/(\||^|!)(\||!)\^/$1${2}align="center"\|/g;
    }
    return @t;
}

# Convert the CreateToc plugin to the MediaWiki __TOC__ magic word.
#   $lines  - array ref with the lines of one page
# Returns the converted lines as a list. The tag is matched with or without
# whitespace/arguments before the closing '?>'; arguments are dropped.
sub pluginToc {
    my ($lines) = @_;
    my @t = @{$lines};
    foreach (@t) {
        s/<\?plugin CreateToc\b.*?\?>/__TOC__/;
    }
    return @t;
}

# Convert the Calendar plugin to an empty <calendar> tag.
#   $lines  - array ref with the lines of one page
# Returns the converted lines as a list.
# TODO: syntax depends on the mediawiki calendar plugin, I didn't use
# calendar offsets, start days, etc
# this one just inserts the calendar:
# http://www.mediawiki.org/wiki/Extension:Calendar_(Barrylb)
sub pluginCalendar {
    my ($lines) = @_;
    my @t = @{$lines};
    foreach (@t) {
        s/<\?plugin Calendar.*\?>/<calendar><\/calendar>/;
    }
    return @t;
}

# Convert the BackLinks plugin to a <DynamicPageList> block.
#   $lines  - array ref with the lines of one page
# Returns the converted lines as a list. Only 'page=[[Category:...]]' and
# 'sortby=pagename' are translated, every other key=value argument is
# dropped.
# use DynamicPageList extension for this to work:
# http://www.mediawiki.org/wiki/Extension:DynamicPageList
sub pluginBackLinks {
    my ($lines) = @_;
    my @t = @{$lines};
    foreach my $line (@t) {
        next if $line !~ /<\?plugin BackLinks/;

        $line =~ s/<\?plugin BackLinks(.*)\?>/<DynamicPageList>$1<\/DynamicPageList>/;
        # params to separate lines
        my $spread = $line;
        $spread =~ s/([^ ]+=[^ ]+|<\/DynamicPageList>)/\n$1/g;
        my @params = split(/\n/, $spread);
        my $block = "";
        # convert parameters
        foreach my $param (@params) {
            if ($param =~ /page=/) {
                $param =~ s/page=\[\[Category:([^\]]*)\]\](.*)/category=$1/;
            }
            elsif ($param =~ /sortby=pagename/) {
                # FIXME: how to preserve descending order?
                $param =~ s/sortby=pagename/ordermethod=categorysortkey\norder=ascending/;
            }
            else {
                # delete other (unknown or unimportant) params
                $param =~ s/.+=.+//;
            }
            $block .= "$param\n" if $param !~ /^$/;
        }
        $line =~ s/<DynamicPageList>.*<\/DynamicPageList>/$block/;
    }
    return @t;
}

# Convert the ListSubpages plugin to <splist/>.
#   $lines  - array ref with the lines of one page
# Returns the converted lines as a list.
# convert to simple SubPageList3 extension:
# http://www.mediawiki.org/wiki/Extension:SubPageList3
sub pluginListSubPages {
    my ($lines) = @_;
    my @t = @{$lines};
    foreach (@t) {
        s/<\?plugin ListSubpages.*\?>/<splist\/>/;
    }
    return @t;
}

# Convert the plugin-form CreatePage to a <createarticle> block.
#   $lines  - array ref with the lines of one page
# Returns the converted lines as a list. Everything between 'template=' and
# the last '?>' of the line becomes the 'preload' value.
# convert to CreateArticle extension:
# http://www.mediawiki.org/wiki/Extension:CreateArticle
sub pluginCreatePage {
    my ($lines) = @_;
    my @t = @{$lines};
    foreach (@t) {
        s/<\?plugin-form CreatePage template=(.+)\?>/<createarticle>\ntype=createarticle\npreload=$1\nbuttonlabel=Add\nalign=left\n<\/createarticle>/s;
    }
    return @t;
}

# TODO
# Placeholder for the FullTextSearch plugin: currently returns the lines
# unchanged.
# use DynamicPageList extension for this to work:
# http://www.mediawiki.org/wiki/Extension:DynamicPageList
sub pluginFullTextSearch {
    my ($lines) = @_;
    my @t = @{$lines};
    foreach (@t) {
    }
    return @t;
}

# sub subLists
# {
#     my (@t) = @{ (shift) };
#     foreach (@t) {
#         my ($listType) = / *([\*#])/;
#         if (/^$listType/) {
#             next
#         }
#         if ($listType == "*") {
#             my $otherType = "#"
#         }
#         if (/ *)
#     }
#     return @t;
# }

###################
# Plugins handler #

# Run every plugin converter over one page, in a fixed order.
#   $lines  - array ref with the lines of one page
# Returns the converted lines as a list.
sub parseplugins {
    my ($lines) = @_;
    my @textl = @{$lines};

    ######
    # OldStyleTable
    @textl = pluginTable(\@textl);

    ######
    # CreateToc
    @textl = pluginToc(\@textl);

    ######
    # Calendar
    @textl = pluginCalendar(\@textl);

    ######
    # BackLinks
    @textl = pluginBackLinks(\@textl);

    ######
    # FullTextSearch
    # @textl = pluginFullTextSearch(\@textl);

    ######
    # plugin ListSubpages
    @textl = pluginListSubPages(\@textl);

    ######
    # plugin plugin-form CreatePage template=
    @textl = pluginCreatePage(\@textl);

    return @textl;
}

#######################################
# convert phpwiki syntax to mediawiki #

# Convert the inline PhpWiki markup of one page to MediaWiki markup.
#   $lines  - array ref with the lines of one page
# Returns the converted lines as a list. Each line is handled on its own, so
# markup that spans several lines is not converted.
sub syntaxconvert {
    my ($lines) = @_;
    my @textl = @{$lines};
    foreach (@textl) {
        # convert special chars
        $_ = _decode_special_chars($_);

        ########
        # remove the extra newline ^M
        s/\r//;

        ########
        # convert newlines
        s/%{3}/<br>/g;

        ########
        # convert headings: '!!!' is the biggest one and becomes '=='
        if (/^(!{1,3})/) {
            my $marks = '=' x (5 - length($1));
            s/^!{1,3}\s*(.*)/$marks$1$marks/;
        }

        ########
        # convert bold: __text__ -> '''text'''
        if (/_{2}(.+)_{2}/) {
            # FIXME: don't replace underscores within links
            # Lookarounds instead of consumed neighbours, so that '__' at the
            # start of the line and around one-character text is converted
            # too; '[__' and '__]' are still left alone.
            s/(?<!\[)__(?!\])/'''/g;
        }
        # convert bold: <strong>text</strong> -> '''text'''
        s/<(\/?)strong>/'''/gi;
        # convert preformatted text: <verbatim>text</verbatim> -> <pre>text</pre>
        s/<(\/?)verbatim>/<${1}pre>/gi;
        # convert bold: *text* -> '''text'''
        # Only a pair of asterisks that hugs its text is emphasis; list
        # bullets ('* item', '** item') must survive for the sub-list
        # conversion below.
        s/\*([^\s*](?:[^*]*[^\s*])?)\*/'''$1'''/g;

        ########
        # convert italic: <em>text</em> -> ''text''
        s/<(\/?)em>/''/gi;

        ########
        # convert italic: _text_ -> ''text''
        # while (/_(.+)_/) {
        if (/_.+_/) {
            # don't replace underscores within links, eg: [http://somesite?some_page]
            # FIXME: now THIS is a hack!
            # <hack>
            while (/\[[^\]]+_[^\]]+\]/) {
                s/(\[[^\]]*)_([^\]]*\])/$1<UNDERSCORE>$2/;
            }
            while (/([^\[]+|^)http[^ ]*_[^ ]*/) {
                s/([^\[]+|^)(http[^ ]*)_([^ ]*)/$1$2<UNDERSCORE>$3/g;
            }
            # non-greedy, so that '_a_ and _b_' gives two italic spans
            s/_(.+?)_/''$1''/g;
            s/<UNDERSCORE>/_/g;
            # </hack>
        }

        ########
        # convert bold italic: _*text*_ -> '''''text'''''
        if (/_\*(.+)\*_/) {
            s/_\*([^\*_]+)\*_/'''''$1'''''/g;
        }

        ########
        # convert indents
        # if (/^(\s{2,3})/) {
        #     $m = ':' x length($1);
        #     s/^\s{2,3}(.*)/$m/;
        # }

        ########
        # convert sub-lists: every leading space adds one list marker
        # FIXME: mixed lists get lost
        while (/^ {2,}[\*#]/) {
            s/^ ( *)([\*#])/$1$2$2/;
        }
        if (/^ [\*#]/) {
            s/^ ([\*#])/$1/;
        }

        ########
        # convert hyperlinks: [caption|target] -> [[target|caption]]
        s/\[([^\|\]]+)(\|?)([^\]]*)\]/[[$3$2$1]]/g;
        # http-links are automated, eg: [[http://blabla|bla]] -> [http://blabla bla]
        s/\[\[(http[^\]\|]*)\|?([^\]]*)\]\]/[$1 $2]/g;
        # in case there was no '|' for caption, eg: [[http://blabla|bla]] -> [http://blabla ]
        s/\[(http[^\]]*) \]/[$1]/g;

        ########
        # convert links of uploaded files (proper uploads have to be redone)
        s{\[\[Upload:([^\]]*)\]\]}{[$phpwiki_uploads/$1]}g;

        ########
        # '~' prevent hyperlinking
        # FIXME: does mediawiki have it?
        s/~//;

        ########
        # convert PhpWiki categories
        s/(\w+)Category(\w+)/[[Category:$1]] [[Category:$2]]/g;
        s/Category(\w+)/[[Category:$1]]/g;
        s/(\w+)Category/[[Category:$1]]/g;

        ########
        # renamings done in bot.php (PHP-script that uploads pages into database afterwards)
        # The guard is case-sensitive on purpose: the case-insensitive
        # substitution only runs on lines that contain a 'Template...' word.
        if (/([\W\s]+|^)Template\w+/) {
            s/([\W\s]+|^)Template(\w+)/$1Template:$2/gi;
        }
        # if (/[\w]{0}Template\w+/) {
        #     s/[\WS]{0}Template(\w+)/Template:$1/i;
        # }
        s/([\W\s]+|^)HomePage([\W\s]+|$)/$1MainPage$2/g;
    }
    return @textl;
}

#########
# Files #

# Write one record of the page info file.
#   $header - array ref with the header lines of one dump file
#   $f      - file handle of tree.txt, open for writing
#   $fn     - wiki file name
# Structure written to tree.txt:
#   file name;page name;author id;page summary
# The ';' after the page name and the author id is the one that already
# terminates the value in the dump header; it is only stripped from the
# summary, which is the last field.
sub parseheader {
    my ($header, $f, $fn) = @_;
    my @t = @{$header};

    print {$f} "$fn;";
    foreach my $line (@t) {
        # convert special chars
        $line = _decode_special_chars($line);

        if ($line =~ /pagename/) {
            my $value = _header_value($line);
            $value =~ s/\r//;
            chomp($value);
            print {$f} $value;
        }
        if ($line =~ /author_id/) {
            my $value = _header_value($line);
            $value =~ s/\r//g;
            chomp($value);
            print {$f} $value;
        }
        if ($line =~ /summary/) {
            my $value = _header_value($line);
            $value =~ s/(;)?[\r\n]$//g;
            print {$f} $value;
        }
    }
    # force line break, don't rely on summary's line ending
    print {$f} "\n";
    return;
}

# Split every dump file into header and body.
# The body is written to a file of the same name in $output_dir and the
# header is summarised into $file_tree by parseheader(). Dies when the input
# directory, the tree file or an input file cannot be opened; an output file
# that cannot be opened is reported and skipped.
sub header {
    opendir(my $dh, $input_dir) or die("cannot open directory");
    my @files = readdir($dh);
    closedir($dh);
    print "Total files: " . @files . "\n" . "Parsing headers... \n";

    # status
    my $done = 0;
    # open $tree file for the header info output
    open my $tree, '>', $file_tree or die ("error opening $file_tree: $!");
    # sort them
    foreach my $name (sort { $a cmp $b } @files) {
        next if $name eq "." or $name eq "..";

        open(my $in, '<', "$input_dir/$name") or die ("error opening $name: $!");
        my @lines = <$in>;
        close($in);

        # find the 1st empty line which will indicate the end of header
        # NOTE(review): the changelog entry of 03.Jun.2010 says the
        # 'Content-Transfer-Encoding:' line is used instead, but the
        # empty-line test is the active one here - confirm which is wanted.
        my $i = 0;
        foreach my $line (@lines) {
            # some (broken?) headers contain an empty line too, so sometimes
            # it cannot be relied upon
            last if $line =~ m/^[\n\r]?$/;
            # last if ($line =~ m/^Content-Transfer-Encoding:/);
            $i++;
        }
        # copy everything except header to the new array
        my @body = @lines[$i + 1 .. $#lines];
        # get the header; without an empty line the whole file is header, so
        # do not index past the last line
        my $header_end = $i < @lines ? $i : $#lines;
        my @header = @lines[0 .. $header_end];

        # write data without the header
        if (open(my $out, '>', "$output_dir/$name")) {
            print {$out} @body;
            close($out);
        }
        else {
            print "could not open $name\n";
        }

        # parse the header and write info to the tree
        parseheader(\@header, $tree, $name);

        # status
        print "$done\n" if ($done % 1000 == 0);
        $done++;
    }
    close $tree;
    print "Done!\n";
    return;
}

# Convert one file of $output_dir in place.
#   $name   - file name inside $output_dir
# The file is read completely before it is reopened for writing, and it is
# left untouched when it cannot be read. When the name holds a '%2F'
# separated path, the parent component is written first as a category link.
sub _convert_file {
    my ($name) = @_;
    my $path = "$output_dir/$name";

    my $in;
    if (!open($in, '<', $path)) {
        print "error: $!\n";
        return;
    }
    my @inf = <$in>;
    close($in);

    my @w = syntaxconvert(\@inf);
    my @t = parseplugins(\@w);

    # FIXME: stupid, I know
    my $out;
    if (!open($out, '>', $path)) {
        print "error: $!\n";
        return;
    }
    my @cat = split(/%2F/, $name);
    my $li = $#cat;
    if ($li > 0) {
        my $category = $cat[$li - 1];
        print {$out} "[[Category:$category]]\n";
    }
    #else { print FILE "[[Category:$_]]\n"; }
    print {$out} @t;
    close($out);
    return;
}

############
#   Main   #

# Strip the headers, build the tree file, then convert every file of
# $output_dir. Returns the process exit status.
sub main {
    # remove headers and build tree
    header();

    opendir(my $dh, $output_dir) or die("cannot open directory: $output_dir");
    my @files = readdir($dh);
    closedir($dh);
    print "\nTotal files: " . @files . "\nConverting files... \n";

    # status
    my $done = 0;
    foreach my $name (@files) {
        next if $name eq "." or $name eq "..";
        _convert_file($name);
        # status
        print "$done\n" if ($done % 1000 == 0);
        $done++;
    }
    print "Done!\n";
    return 0;
}

# Run the conversion only when executed as a script, so that the converters
# can be loaded (require/do) without touching any files.
exit(main()) unless caller;