Parsoid/Language conversion/Preprocessor fixups/munge

/* This is the script that post-processes the results of `dumpgrepper` into
 * wikitext for posting on mediawiki.org. It helps if you use the
 * `git-mediawiki` package (https://github.com/Git-Mediawiki/Git-Mediawiki/wiki)
 * to sync the results back onto mediawiki.org.
 *
 * #!/usr/bin/node
 *
 * Usage:
 * 1. Put the output of `dumpgrepper` into a subdirectory named `results/`.
 * 2. Check out the pages from mediawiki.org using:
 *      git clone -c remote.origin.categories='Parsoid' \
 *        -c remote.origin.mwLogin=[your mediawiki username] \
 *        -c remote.origin.mwPassword=[your mediawiki password] \
 *        mediawiki::https://www.mediawiki.org/w
 * 3. Run: ./munge.js
 * 4. Change to `w/` and run:
 *      git add . && git commit && git push
 */
var fs = require('fs');
var path = require('path');

// Database names of the wikis to process; each one is expected to have a
// matching `results/<name>-results.txt` input file.
var wikis = [
  "cebwiki",
  "dewiki",
  "enwiki",
  "eswiki",
  "frwiki",
  "itwiki",
  "jawiki",
  "mediawikiwiki",
  "nlwiki",
  "plwiki",
  "ptwiki",
  "ruwiki",
  "svwiki",
  "viwiki",
  "warwiki",
  "zhwiki"
];

// When true, output goes to `w/<PAGE_PREFIX><wiki>.mw` and this script is
// also copied into `w/`; when false, output goes to `out/<wiki>.wt`.
var GIT_WP = true;

// File-name prefix used for every page written under `w/` (the `%2F`
// sequences are URL-encoded slashes of the wiki page path).
var PAGE_PREFIX = "Parsoid%2FLanguage_conversion%2FPreprocessor_fixups%2F";

if (GIT_WP) {
  // Copy this script itself to the wiki, as the page `<PAGE_PREFIX>munge`.
  var outFile = path.join(__dirname, "w/" + PAGE_PREFIX + "munge.mw");
  var selfSource = fs.readFileSync(__filename, "utf8");
  // Escape `&` first and then `<`, so the `&` introduced by `&lt;` is not
  // escaped a second time; the result is wrapped in <pre><nowiki> so the
  // wiki shows the source verbatim.
  var escapedSource = selfSource.replace(/&/g, '&amp;').replace(/</g, '&lt;');
  var page = "<pre><nowiki>\n" + escapedSource + "\n</nowiki></pre>\n" + "\n";
  fs.writeFileSync(outFile, page, "utf8");
}

/*
 * For every wiki in `wikis`: read `results/<wiki>-results.txt`, turn each
 * result line that still contains an unmatched `-{` into a wikitext list
 * item, sort the items into categories, and write one report file per wiki.
 */
wikis.forEach(function(w) {
  var inFile = path.join(__dirname, "results/" + w + "-results.txt");
  var outFile = GIT_WP ?
    path.join(__dirname, "w/" + PAGE_PREFIX + w + ".mw") :
    path.join(__dirname, "out/" + w + ".wt");

  // Prefix put in front of every page title in the report, e.g. `:en:` for
  // `enwiki` (presumably an interwiki link prefix -- confirm). It is empty
  // for mediawikiwiki when writing the `w/` pages.
  var base = ':' + w.replace(/wiki$/, '') + ':';
  if (w === 'mediawikiwiki' && GIT_WP) { base = ''; }

  console.log("Reading", w);

  // Title captured from the most recent "== Match: [[...]] ==" line.
  var title = null;
  // Accumulated wikitext for each category of the report.
  var nonarticle = "", chem = "", urls = "", math = "", other = "";
  var countArticle = 0, countNonarticle = 0;

  fs.readFileSync(inFile, "utf8")
    .replace(/\n+$/, '')    // drop trailing newlines so no empty last line
    .split(/\r\n?|\n/g)
    .slice(1)               // the first line of the results file is skipped
    .forEach(function(line) {
      var m = /^== Match: \[\[(.*)\]\] ==$/.exec(line);
      if (m) { title = m[1]; return; }

      // Build the list item from the untouched line: neutralize any literal
      // <nowiki> / </nowiki> tags and highlight every `-{` in red.
      var item = "# " + base + title + "\n" +
        "#:<code><nowiki>" +
        line.replace(/<(\/?nowiki)/g, '&lt;$1')
          .split('-{')
          .join('</nowiki><b style="color:red">-<nowiki/>{</b><nowiki>') +
        "</nowiki></code>\n";

      // Remove matched -{ ... }- markup.
      line = line.replace(/-\{[^{}]*\}-/g, '');
      if (!/-\{/.test(line)) { /* no unmatched markup */ return; }

      // Titles that look like "Prefix:Name" are counted as non-article pages.
      m = /^\w+:\w/.exec(title);
      if (m) { nonarticle += item; countNonarticle++; return; }

      countArticle++;

      m = /IUPAC|OtherNames|Andere Namen/.exec(line);
      if (m) { chem += item; return; }

      // `-{` inside the target of an external link.
      m = /\[http[^\]\s]*-\{/.exec(line);
      if (m) { urls += item; return; }

      m = /<math/.exec(line);
      if (m) { math += item; return; }

      other += item;
      return;
    });

  // Write output file.
  var out = "==" + w + "==\n";
  out += countArticle + " articles, " + countNonarticle + " other pages.\n";
  if (chem) {
    out += "=== Chemical names ===\n" + chem;
  }
  if (urls) {
    out += "=== Urls ===\n" + urls;
  }
  if (math) {
    out += "=== Math markup ===\n" + math;
  }
  if (other) {
    out += "=== Other ===\n" + other;
  }
  if (nonarticle) {
    out += "=== Matches not in article namespace ===\n" + nonarticle;
  }
  if (GIT_WP) {
    out += '\n';
  }
  fs.writeFileSync(outFile, out, "utf8");
});