# mwconverter.pl -- PhpWiki to MediaWiki syntax converter
# Originally published on MediaWiki.org as the page
# User:Atrox/Phpwiki2Mediawiki/mwconverter.pl
#
# Copyright (c) 2010 Artjom Vassiljev <artjom@max.ee>, MAX 123 AS, Estonia
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# 1. Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
# 3. All advertising materials mentioning features or use of this software
#    must display the following acknowledgement:
#       This product includes software developed by Artjom Vassiljev.
# 4. Neither the name of the author nor the names of any co-contributors
#    may be used to endorse or promote products derived from this software
#    without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY Artjom Vassiljev AND "MAX 123" ``AS IS'' AND
# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED.  IN NO EVENT SHALL Bill Paul OR THE VOICES IN HIS HEAD
# BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
# THE POSSIBILITY OF SUCH DAMAGE.
#
# Some ideas taken from the script php2mediawiki.pl by Isaac Wilcox
# 
# Change the directories in the config to get this script to work
# I use file tree.txt to put there some info which I extract from the
# article headers and later use this file to pump articles into mediawiki.
# All this script does is just converts from one syntax to another and
# prepares files to be put to mediawiki by another script.
# It is not ideal, has bugs but it was enough to fulfill my needs.
 
# changelog:
 
# (09.Aug.2010), Silver
# * pluginListSubPages -- replace <ListSubPages/> with <splist/>
 
# (26.Jul.2010), Silver
# * syntaxconvert() -- bugfix: don't convert indents - it broke sub-lists
# * syntaxconvert() -- convert primitive sub-lists - mixed ones get broken
# * syntaxconvert() -- bugfix: more fixes to preserving underscores in links (now in non-forced http://... links too)
# * syntaxconvert() -- bugfix: fix hyperlink captions (separated by space instead of '|')
 
# (23.Jul.2010), Silver
# * pluginTable() -- bugfix: don't break links' captions within table cells
 
# (22.Jul.2010), Silver
# * pluginBacklinks() -- implement converting plugin BackLinks to DynamicPageList
# * syntaxconvert() -- bugfix: fix converting multiple categories, eg: OneCategoryTwo -> Category:One Category:Two
# * delete all '$_ =~ '
# * replace all: m/bla/ -> /bla/
 
# (21.Jul.2010), Silver
# * syntaxconvert() -- bugfix: replace separator of http-links' captions '|' with ' ', eg: [http://blabla|bla] -> [http://blabla bla]
# * syntaxconvert() -- convert [[Upload:...]] to [$phpwiki_uploads]
 
# (16.Jul.2010), Silver
# * pluginTable() -- bugfix: more fixes for converting table
# * pluginTable() -- convert rowspans
# * syntaxconvert() -- convert <verbatim> to <pre>
# * syntaxconvert() -- bugfix: fix converting links - replace multiple actions with only one universal one
# * syntaxconvert() -- bugfix: hack around replacing underscore within links
 
# (22.Jun.2010), Silver
# * pluginTable() -- many improvements for converting table
# * syntaxconvert() -- bugfix: substitute newlines (%%%) multiple times (g)
# * syntaxconvert() -- bugfix: substitute bold italics (_*) multiple times (g)
 
# (21.Jun.2010), Silver
# * removed unnecessary 'i' flag from regex substitutions in many places
# * pluginCreatePage() -- move replacing 'Template' to syntaxconvert()
# * syntaxconvert() -- substitute: HomePage -> MainPage
 
# (17.Jun.2010), Silver
# * pluginCreatePage() -- replace 'Template' with 'Template:' in the page name set as a template
 
# (16.Jun.2010), Silver
# * pluginCreatePage() -- new function to convert CreatePage plugin to CreateArticle (http://www.mediawiki.org/wiki/Extension:CreateArticle)
# * optimize regular expressions a bit: (\w+|\W+) -> ((\w|\W)+)
 
# (03.Jun.2010), Silver
# * parseheader() -- don't rely on header's line feed, force it instead
# * header() -- don't consider an empty line, but 'Content-Transfer-Encoding:' as the end of header
# * print progress numbers onto new lines, otherwise they are printed at once
# * syntaxconvert() -- leave centering text within table cell untouched
# * pluginTable() -- convert centering bold text within a cell to header
# * pluginListSubPages() -- new function to convert listing subpages
# * parseplugins() -- use pluginListSubPages() too
 
 
#!/usr/bin/perl -w
 
use strict;
use warnings;
 
# TODO:
# * history?
# * correctly convert sub-lists - function subLists()
# * convert PhpWiki plugins to MediaWiki alternatives:
#   * FullTextSearch
#   * ...
 
# config
# directory with the phpwiki dump; header() reads every file in it
my $input_dir = "./wikidump";
# output directory: header() writes the header-less pages here and the
# main loop then converts those files in place
my $output_dir = "./converted";
# file with the page info; parseheader() writes one line per page:
# file name;page name;author id;page summary
my $file_tree = "./tree.txt";
# base URL that syntaxconvert() substitutes for [[Upload:...]] links
my $phpwiki_uploads = "http://mediawiki.mydomain/phpwiki_uploads";
 
###############
# SUBROUTINES #
###############
 
#####################
# Plugin converters #
 
# Convert PhpWiki OldStyleTable markup to MediaWiki table markup.
#
# Argument: reference to an array of lines.
# Returns:  the converted lines as a list; the lines are copied first, so
#           the caller's array is not modified.
#
# What is rewritten:
#   * '<?plugin OldStyleTable' becomes '{| border = "1"'
#   * the first '?>' seen after such an opening tag becomes '|}'; a '?>'
#     that belongs to some other plugin is left alone
#   * every line starting with '|' is turned into a MediaWiki row (this is
#     done for any such line, whether or not a table was opened before it)
sub pluginTable
{
        my (@t) = @{ (shift) };
        # true while we are between an OldStyleTable opening tag and its '?>'
        my $flag = 0;

        foreach (@t) {
                # plugin opening tag
                if (/<\?plugin OldStyleTable/) {
                        s/<\?plugin OldStyleTable/\{\| border = "1"/i;
                        # set the flag, so that we do not accidentally close the tag of some other plugin
                        # TODO: can we have a table in a table?
                        $flag = 1;
                        next;
                }
                # plugin closing tag
                if (/\?>/) {
                        if ($flag) { s/\?>/\|\}/gi; $flag = 0; }
                        next;
                }
                # table
                if (/^\|((\w|\W|\s)+)/) {
                        # let's start a new row in table
                        chomp($_);
                        $_ .= "\n|-\n";

                        # double all separators...
                        s/\|/||/g;
                        # ...except the new rows...
                        s/\|\|-/\|-/g;
                        # ...and except the links in table cells, eg: ||[[link|caption]]||
                        s/\[\[([^\]]*)\|\|([^\]]*)\]\]/[[$1|$2]]/g;

                        # colspan - one-by-one: |||asd|asd -> |colspan="3"|asd||asd
                        while (/\|{3,}/) {
                                my ($tmp) = /(\|{3,})/;
                                # every separator was doubled above, so halve the number of pipes
                                my $count = ($tmp =~ tr/|//) / 2;
                                s/\|{3,}/|colspan="$count"|/;
                                # now it could be smth like this: !!|colspan="$count"|
                                s/!!\|/!/;
#                               s/\| *!/!/;
                        }

                        # rowspan: |vvasd|asd -> |rowspan="3"|asd||asd
                        while (/\|v/) {
                                my ($tmp) = /(\|v+)/;
                                # n 'v' markers mean the cell spans n+1 rows
                                my $count = ($tmp =~ tr/v//) + 1;
                                s/\|v+/|rowspan="$count"|/;
                                # now it could be smth like this: !!|rowspan="$count"|
                                s/!!\|/!/;
                        }

                        # replace potential header (centered bold cell) with a real header
                        s/\|\^([^']*)'''([^']+)'''/!!$1$2/g;
                        s/\|!!/!!/g;
                        # it could be smth like this: |colspan="$n"!! - swap positions
                        s/\|(col|row)(span="[0-9]*")(!+)/$3$1$2\|/g;

                        # Mediawiki cannot understand the 1st-only ordinary cell, eg: || asd !! asdf !! asdf
                        s/(^|\|)\|([^\|!]+)!!/$1|$2\n!/g;

                        # fix row-headers, eg: !! asd || bsd || bsd
                        s/(^|!)!([^\|]+)\|\|/$1!$2\n|/g;

                        # fix the 1st '(|)!!'
                        s/^\|?!!([^!]+)/!$1/g;

                        # fix the 1st '||'
                        s/^\|\|([^\|]+)/|$1/g;

                        # align center
                        s/(\||^|!)(\||!)\^/$1$2align="center"\|/g;
                }
        }

        return @t;
}
 
# Convert the PhpWiki CreateToc plugin to MediaWiki's __TOC__ magic word.
#
# Argument: reference to an array of lines.
# Returns:  the converted lines as a list (the caller's array is untouched).
#
# The plugin call is matched with or without arguments; any arguments are
# dropped because __TOC__ takes none.
sub pluginToc
{
        my (@t) = @{ (shift) };
        # \b keeps other plugins whose name merely starts with "CreateToc"
        # out; the non-greedy .*? stops at the first closing '?>'
        foreach (@t) { s/<\?plugin CreateToc\b.*?\?>/__TOC__/; }
        return @t;
}
 
# Replace a PhpWiki Calendar plugin call with an empty <calendar> tag.
#
# Argument: reference to an array of lines.
# Returns:  a converted copy of the lines; the caller's array is untouched.
#
# TODO: the target syntax depends on the MediaWiki calendar extension in use;
# calendar offsets, start days etc. are not carried over. The tag emitted is
# the one of http://www.mediawiki.org/wiki/Extension:Calendar_(Barrylb)
sub pluginCalendar
{
        my ($lines_ref) = @_;
        my @converted;

        for my $line (@{$lines_ref}) {
                my $copy = $line;
                # everything between the plugin name and the last '?>' is dropped
                $copy =~ s/<\?plugin Calendar.*\?>/<calendar><\/calendar>/;
                push @converted, $copy;
        }

        return @converted;
}
 
# Convert the PhpWiki BackLinks plugin to a <DynamicPageList> block.
# Needs the DynamicPageList extension on the MediaWiki side:
# http://www.mediawiki.org/wiki/Extension:DynamicPageList
#
# Argument: reference to an array of lines.
# Returns:  a converted copy of the lines; the caller's array is untouched.
#
# Parameter handling:
#   page=[[Category:X]]  -> category=X
#   sortby=pagename      -> ordermethod=categorysortkey + order=ascending
#   any other key=value  -> dropped
sub pluginBackLinks
{
        my ($lines_ref) = @_;
        my @out = @{$lines_ref};

        for my $line (@out) {
                next unless $line =~ /<\?plugin BackLinks/;

                # wrap the plugin arguments in the target tag first
                $line =~ s/<\?plugin BackLinks(.*)\?>/<DynamicPageList>$1<\/DynamicPageList>/;

                # put every key=value pair and the closing tag on its own line
                my $marked = $line;
                $marked =~ s/([^ ]+=[^ ]+|<\/DynamicPageList>)/\n$1/g;
                my @pieces = split(/\n/, $marked);

                # rebuild the block, one converted parameter per line
                my $block = "";
                for my $piece (@pieces) {
                        if ($piece =~ /page=/) {
                                $piece =~ s/page=\[\[Category:([^\]]*)\]\](.*)/category=$1/;
                        }
                        elsif ($piece =~ /sortby=pagename/) {
                                # FIXME: how to preserve descending order?
                                $piece =~ s/sortby=pagename/ordermethod=categorysortkey\norder=ascending/;
                        }
                        else {
                                # unknown or unimportant params are emptied out...
                                $piece =~ s/.+=.+//;
                        }
                        # ...and empty pieces are left out of the result
                        next if $piece =~ /^$/;
                        $block .= "$piece\n";
                }

                $line =~ s/<DynamicPageList>.*<\/DynamicPageList>/$block/;
        }

        return @out;
}
 
# Replace a PhpWiki ListSubpages plugin call with the <splist/> tag of the
# SubPageList3 extension: http://www.mediawiki.org/wiki/Extension:SubPageList3
#
# Argument: reference to an array of lines.
# Returns:  a converted copy of the lines; the caller's array is untouched.
#           Plugin arguments are dropped.
sub pluginListSubPages
{
        my ($lines_ref) = @_;

        return map {
                my $line = $_;
                $line =~ s/<\?plugin ListSubpages.*\?>/<splist\/>/;
                $line;
        } @{$lines_ref};
}
 
# Convert the PhpWiki CreatePage plugin form to a <createarticle> block of
# the CreateArticle extension: http://www.mediawiki.org/wiki/Extension:CreateArticle
#
# Argument: reference to an array of lines.
# Returns:  a converted copy of the lines; the caller's array is untouched.
#
# Everything between 'template=' and the last '?>' on the line is used as
# the preload page name, unchanged (trailing blanks included).
sub pluginCreatePage
{
        my ($lines_ref) = @_;
        my @converted = @{$lines_ref};

        for my $line (@converted) {
                $line =~ s/<\?plugin-form CreatePage template=((\w|\W)+)\?>/<createarticle>\ntype=createarticle\npreload=$1\nbuttonlabel=Add\nalign=left\n<\/createarticle>/;
        }

        return @converted;
}
 
# TODO: not implemented yet -- placeholder for converting the PhpWiki
# FullTextSearch plugin (the DynamicPageList extension is the intended
# target: http://www.mediawiki.org/wiki/Extension:DynamicPageList).
#
# Argument: reference to an array of lines.
# Returns:  an unmodified copy of the lines.
sub pluginFullTextSearch
{
        my ($lines_ref) = @_;
        my @unchanged = @{$lines_ref};
        return @unchanged;
}
 
# sub subLists
# {
#       my (@t) = @{ (shift) };
#       foreach (@t) {
#               my ($listType) = / *([\*#])/;
#               if (/^$listType/) {
#                       next
#               }
#               if ($listType == "*") {
#                       my $otherType = "#"
#               }
#               if (/ *)
#       }
#       return @t;
# }
 
###################
# Plugins handler #
 
# Run every plugin converter over a page.
#
# Argument: reference to an array of lines.
# Returns:  the lines after all converters have been applied, as a list;
#           the caller's array is untouched.
sub parseplugins
{
        my ($lines_ref) = @_;
        my @textl = @{$lines_ref};

        # The converters are applied in this order; each one takes an array
        # reference and returns the converted list.
        my @converters = (
                \&pluginTable,          # OldStyleTable
                \&pluginToc,            # CreateToc
                \&pluginCalendar,       # Calendar
                \&pluginBackLinks,      # BackLinks
#               \&pluginFullTextSearch, # FullTextSearch (not implemented yet)
                \&pluginListSubPages,   # ListSubpages
                \&pluginCreatePage,     # plugin-form CreatePage template=
        );

        foreach my $convert (@converters) {
                @textl = $convert->(\@textl);
        }

        return @textl;
}
 
#######################################
# convert phpwiki syntax to mediawiki #
 
# Convert PhpWiki markup to MediaWiki markup, one line at a time.
#
# Argument: reference to an array of lines.
# Returns:  the converted lines as a list; the lines are copied first, so
#           the caller's array is not modified.
#
# The substitutions run in a fixed order and later rules work on the output
# of earlier ones (e.g. the http-link rules expect the generic link rule to
# have produced '[[...]]' already), so do not reorder them casually.
sub syntaxconvert
{
        my (@textl) = @{ (shift) };
 
        foreach (@textl) {
 
                # the run of '=' used as heading markup (set in the heading rule below)
                my $m;
 
                # convert special chars: decode a fixed set of percent-escapes
                s/%2F/\//g;
                s/%E4/ä/g;
                s/%C4/Ä/g;
                s/%20/ /g;
                s/%F6/ö/g;
                s/%28/(/g;
                s/%29/)/g;
                s/%D6/Ö/g;
                s/%F5/õ/g;
                s/%FC/ü/g;
                s/%DC/Ü/g;
                s/%26/&/g;
 
                ########
                # remove the extra newline ^M
                # (no /g: only the first carriage return of the line is removed)
                s/\r//;
 
                ########
                # convert newlines: %%% -> <br>
                s/%{3}/<br>/g;
 
                ########
                # convert headings: !text -> ====text====, !!text -> ===text===,
                # !!!text -> ==text==
                # The guard matches a '!' anywhere in the line, but the
                # substitution itself only fires when the line starts with '!'.
                if (/(!{1,3})/) {
                        $m = '=' x (5 - length($1));
                        s/^!{1,3}\s*(.*)/$m$1$m/;
                }
 
                ########
                # convert bold: __text__ -> '''text'''
                # A '__' is only replaced when it has a character other than '['
                # before it and a character other than ']' after it, so a '__'
                # at the very start of the line is left as it is.
                if (/_{2}(.+)_{2}/) {
                        # FIXME: don't replace underscores within links
                        s/([^\[])__([^\]])/$1'''$2/g;
                }
 
                # convert bold: <strong>text</strong> -> '''text'''
                s/<(\/?)strong>/'''/gi;
 
                # convert preformatted text: <verbatim>text</verbatim> -> <pre>text</pre>
                s/<(\/?)verbatim>/<$1pre>/gi;
 
                # convert bold: *text* -> '''text'''
                # NOTE(review): once a line holds two asterisks, EVERY '*' in it
                # is replaced -- including a leading list bullet. Looks
                # unintended; confirm against real pages.
                if (/\*(.+)\*/) {
                        s/\*/'''/g;
                }
 
                ########
                # convert italic: <em>text</em> -> ''text''
                s/<(\/?)em>/''/gi;
 
                ########
                # convert italic: _text_ -> ''text''
#               while (/_(.+)_/) {
                if (/_.+_/) {
                        # don't replace underscores within links, eg: [http://somesite?some_page]
                        # FIXME: now THIS is a hack!
                        # <hack>
                        # step 1: hide underscores inside [...] behind a placeholder
                        while (/\[[^\]]+_[^\]]+\]/) {
                                s/(\[[^\]]*)_([^\]]*\])/$1<UNDERSCORE>$2/;
                        }
                        # step 2: same for underscores inside bare http... words
                        while (/([^\[]+|^)http[^ ]*_[^ ]*/) {
                                s/([^\[]+|^)(http[^ ]*)_([^ ]*)/$1$2<UNDERSCORE>$3/g;
                        }
                        # step 3: the remaining underscores are italic markers; (.+) is
                        # greedy, so the first and the last underscore of the line pair up
                        s/_(.+)_/''$1''/g;
                        # step 4: put the hidden underscores back
                        s/<UNDERSCORE>/_/g;
                        # </hack>
                }
 
                ########        
                # convert bold italic: _*text*_ -> '''''text'''''
                # NOTE(review): the '*' and '_' rules above have already rewritten
                # these characters by the time this runs, so this rule looks
                # unreachable -- confirm.
                if (/_\*(.+)\*_/) {
                        s/_\*([^\*_]+)\*_/'''''$1'''''/g;
                }
 
                ########
                # convert indents
#               if (/^(\s{2,3})/) {
#                       $m = ':' x length($1);
#                       s/^\s{2,3}(.*)/$m/;
#               }
 
                ########
                # convert sub-lists
                # Every two leading blanks in front of a bullet ('*' or '#') are
                # traded for one more bullet character of the same kind.
                # FIXME: mixed lists get lost
                while (/^ {2,}[\*#]/) {
                        s/^  ( *)([\*#])/$1$2$2/;
                }
                # a single leftover leading blank in front of a bullet is dropped
                if (/^ [\*#]/) {
                        s/^ ([\*#])/$1/;
                }
 
                ########
                # convert hyperlinks: [target] -> [[target]] and
                # [caption|target] -> [[target|caption]] (the two parts swap places)
                s/\[([^\|\]]+)(\|?)([^\]]*)\]/[[$3$2$1]]/g;
                # http-links are automated, eg: [[http://blabla|bla]] -> [http://blabla bla]
                s/\[\[(http[^\]\|]*)\|?([^\]]*)\]\]/[$1 $2]/g;
                # in case there was no caption, the rule above leaves a trailing
                # blank, eg: [[http://blabla]] -> [http://blabla ]; remove it
                s/\[(http[^\]]*) \]/[$1]/g;
 
                ########
                # convert links of uploaded files (proper uploads have to be redone)
                # [[Upload:file]] -> [$phpwiki_uploads/file]
                s/\[\[Upload:([^\]]*)\]\]/[$phpwiki_uploads\/$1]/g;
 
                ########
                # '~' prevent hyperlinking
                # FIXME: does mediawiki have it?
                # (no /g: only the first '~' of the line is removed)
                s/~//;
 
                ########
                # convert PhpWiki categories
                # OneCategoryTwo -> [[Category:One]] [[Category:Two]]
                if (/(\w+)Category(\w+)/) {
                        s/(\w+)Category(\w+)/[[Category:$1]] [[Category:$2]]/g;
                }
                # CategoryOne -> [[Category:One]]
                if (/Category(\w+)/) {
                        s/Category(\w+)/[[Category:$1]]/g;
 
                }
                # OneCategory -> [[Category:One]]
                if (/(\w+)Category/) {
                        s/(\w+)Category/[[Category:$1]]/g;
                }
 
                ########
                # renamings done in bot.php (PHP-script that uploads pages into database afterwards)
                # TemplateFoo -> Template:Foo, when "Template" starts the line or
                # follows a non-word character; /i makes the match case-insensitive
                # and the replacement always writes "Template:"
                if (/([\W\s]+|^)Template\w+/) {
                        s/([\W\s]+|^)Template(\w+)/$1Template:$2/gi;
                }
#               if (/[\w]{0}Template\w+/) {
#                       s/[\WS]{0}Template(\w+)/Template:$1/i;
#               }
                # HomePage -> MainPage, only when it stands as a word of its own
                if (/([\W\s]+|^)HomePage([\W\s]+|$)/) {
                        s/([\W\s]+|^)HomePage([\W\s]+|$)/$1MainPage$2/g;
                }
        }
 
        return @textl;
}
 
#########
# Files #
 
# Extract page info from a PhpWiki dump header and append it to tree.txt.
#
# Arguments:
#   1. reference to an array with the header lines of one dump file
#   2. file handle (open for writing) of the tree file
#   3. name of the dump file
#
# Writes exactly one line to the handle:
#   file name;page name;author id;page summary
# The ';' after the file name is written here; the ones after the page name
# and the author id are the ';' that end those values in the header itself.
# The trailing ';' of the summary is stripped.
#
# A header line is only treated as a field when it contains "<key>=", and
# the value is everything after that "<key>=". A line that merely mentions
# one of the key words (e.g. a summary saying "fixed pagename typo") is no
# longer mistaken for that field.
sub parseheader
{
        # header
        my (@t) = @{ (shift) };
        # tree.txt file handle
        my $f = shift;
        # wiki file name
        my $fn = shift;
        my $s;

        print $f "$fn;";

        foreach (@t) {

                # convert special chars
                s/%2F/\//g;
                s/%E4/ä/g;
                s/%C4/Ä/g;
                s/%20/ /g;
                s/%F6/ö/g;
                s/%28/(/g;
                s/%29/)/g;
                s/%D6/Ö/g;
                s/%F5/õ/g;
                s/%FC/ü/g;
                s/%DC/Ü/g;
                s/%26/&/g;

                # /s lets (.*) run up to and including the line ending, which
                # is then removed explicitly below

                if (/\bpagename=(.*)/s) {
                        # the value's own trailing ';' is kept as field separator
                        $s = $1;
                        $s =~ s/\r//g;
                        chomp($s);
                        print $f $s;
                }

                if (/\bauthor_id=(.*)/s) {
                        # the value's own trailing ';' is kept as field separator
                        $s = $1;
                        $s =~ s/\r//g;
                        chomp($s);
                        print $f $s;
                }

                if (/\bsummary=(.*)/s) {
                        # last field of the line: drop the ';' and the line ending
                        $s = $1;
                        $s =~ s/(;)?[\r\n]$//g;
                        print $f $s;
                }

        }
        # force line break, don't rely on summary's line ending
        print $f "\n";

}
 
# Strip the header from every file of the PhpWiki dump and build the tree file.
#
# For each plain file in $input_dir:
#   * everything after the first empty line is written to a file of the
#     same name in $output_dir
#   * the lines up to and including that empty line are handed to
#     parseheader(), which appends one line to $file_tree
#
# Dies when the input directory, the tree file or an input file cannot be
# opened. An output file that cannot be written is reported and skipped,
# but its header is still recorded in the tree file.
sub header
{
        opendir(my $dir, $input_dir) or die("cannot open directory");

        my @files = readdir($dir);

        closedir($dir);

        print "Total files: " . @files . "\n" . "Parsing headers... \n";

        # status
        my $m = 0;

        # sort them
        my @fs = sort { $a cmp $b } @files;

        # open $tree file for the header info output
        open my $tree, '>', $file_tree or die ("error opening $file_tree: $!");

        foreach my $name (@fs) {

                # skip '.', '..' and anything else that is not a plain file
                next if $name eq "." or $name eq "..";
                next unless -f "$input_dir/$name";

                # three-argument open: the file name can never be taken for a mode
                open(my $in, '<', "$input_dir/$name") or die ("error opening $name: $!");
                my @lines = <$in>;
                close($in);

                # find the 1st empty line which will indicate the end of header
                my $i = 0;
                foreach my $line (@lines) {
                        # some (broken?) headers contain an empty line too, so sometimes it cannot be relied upon
                        last if ($line =~ m/^[\n\r]?$/);
#                       last if ($line =~ m/^Content-Transfer-Encoding:/);
                        $i++;
                }

                # copy everything except header to the new array
                # (empty when no empty line was found)
                my @new = @lines[$i+1 .. $#lines];

                # write data without the header
                if (open(my $out, '>', "$output_dir/$name")) {
                        print $out @new;
                        close($out) or print "could not write $name: $!\n";
                }
                else {
                        print "could not open $name\n";
                }

                # get the header; without an empty line $i points one past the
                # last line, so clamp it to stay inside the array
                my $last = $i < @lines ? $i : $#lines;
                my @header = @lines[0 .. $last];
                # parse it and write info to the tree
                parseheader(\@header, $tree, $name);

                # status
                print "$m\n" if ($m % 1000 == 0);
                $m++;

        }

        close $tree;
        print "Done!\n";

}
 
############
# Main     #
 
# remove headers and build tree
header();

# Convert every header-less page in the output directory in place:
# read it, run the syntax and plugin converters, write it back.
opendir(my $out_dh, $output_dir) or die("cannot open directory: $output_dir");
my @files = readdir($out_dh);
closedir($out_dh);

print "\nTotal files: " . @files . "\nConverting files... \n";

# status
my $m = 0;

foreach my $name (@files) {

        next if $name eq "." or $name eq "..";

        my $path = "$output_dir/$name";

        # A page that cannot be read is reported and skipped. It must not be
        # opened for writing afterwards: that would truncate it and lose it.
        my $in;
        unless (open($in, '<', $path)) {
                print "error: $!\n";
                next;
        }
        my @inf = <$in>;
        close($in);

        # convert first, so the file is only truncated once its new
        # content is ready
        my @w = syntaxconvert(\@inf);
        my @t = parseplugins(\@w);

        my $out;
        unless (open($out, '>', $path)) {
                print "error: $!\n";
                next;
        }

        # a sub-page (file name contains %2F, the encoded '/') is put into
        # the category named after its direct parent
        my @cat = split(/%2F/, $name);
        my $li = $#cat;

        if ($li > 0) {
                my $category = $cat[$li-1];
                print $out "[[Category:$category]]\n";
        }
        #else { print $out "[[Category:$name]]\n"; }

        print $out @t;
        close($out) or print "error: $!\n";

        # status
        print "$m\n" if ($m % 1000 == 0);
        $m++;
}

print "Done!\n";

exit 0;