// Parsoid/Language conversion/Preprocessor fixups/munge

/*
 * This is the script that post-processes the results of `dumpgrepper` into
 * wikitext for posting on mediawiki.org. It helps if you use the
 * `git-mediawiki` package
 * (https://github.com/Git-Mediawiki/Git-Mediawiki/wiki) to sync the results
 * back onto mediawiki.org.
 *
 * The script is meant to be executed directly (see step 3 below), so when it
 * is saved as `munge.js` its very first line should be the shebang
 * `#!/usr/bin/node`.
 *
 * Usage:
 *  1. Put the output of `dumpgrepper` into a subdirectory named
 *     `results-$DUMPDATE/`.
 *  2. Check out the pages from mediawiki.org using:
 *       git clone -c remote.origin.categories='Parsoid' -c remote.origin.mwLogin=[your mediawiki username] -c remote.origin.mwPassword=[your mediawiki password] mediawiki::https://www.mediawiki.org/w
 *  3. Run:
 *       ./munge.js $DUMPDATE
 *  4. Change to `w/`:
 *       git add . && git commit && git push
 */
var fs = require('fs');
var path = require('path');
var https = require('https');

//var wikis = "cebwiki dewiki enwiki eswiki frwiki itwiki jawiki mediawikiwiki nlwiki plwiki ptwiki ruwiki svwiki viwiki warwiki zhwiki".split(/\s+/g);

// Write pages into the git-mediawiki checkout under `w/` (true) or into
// `out/` (false).
var GIT_WP = true;
// Accumulate results from all wikis into shared per-category pages instead
// of writing one page per wiki.
var UNIFIED_LIST = true;
// List only page titles; when false, the matching source line is quoted too.
var TITLES_ONLY = true;
// URL-encoded title prefix shared by every generated page file name.
var PAGE_PREFIX = "Parsoid%2FLanguage_conversion%2FPreprocessor_fixups%2F";
// Dump date: first command-line argument, then $DUMPDATE, then a default.
var DUMPDATE = process.argv[2] || process.env.DUMPDATE || '20170620';
// Accumulated wikitext for the unified pages, split into Wikipedias (`wp`)
// and everything else (`sister`).  Every field starts as an empty string
// because the rest of the script appends to it with `+=`.
var unifiedOutput = {
    wp: { counts: '', chem: '', urls: '', nonarticle: '', other: '' },
    sister: { counts: '', chem: '', urls: '', nonarticle: '', other: '' }
};

if (GIT_WP) {
    // Copy this script itself to the wiki
    var outFile = path.join(__dirname, "w/" + PAGE_PREFIX + "munge.mw");
    var self = fs.readFileSync(__filename, "utf8");
    // Escape `&` first, then `<`, so the source renders verbatim inside
    // <pre><nowiki> and the entities produced here are not double-escaped.
    // NOTE(review): the final "\n" literal may originally have carried a wiki
    // link (e.g. a category tag) that was lost when this text was extracted
    // from the rendered page -- confirm against the original script.
    self = "<pre><nowiki>\n" +
        self.replace(/&/g, '&amp;').replace(/</g, '&lt;') +
        "\n</nowiki></pre>\n" +
        "\n";
    fs.writeFileSync(outFile, self, "utf8");
}

/**
 * Perform an HTTPS GET request and parse the response body as JSON.
 *
 * @param {string} url Address to fetch.
 * @return {Promise} Resolves with the parsed JSON value.  Rejects with an
 *   Error when the status code is not 200, when the content type does not
 *   start with `application/json`, when the body is not valid JSON, or when
 *   the request itself emits `error`.
 */
var jsonRequest = function(url) {
    return new Promise(function(resolve, reject) {
        https.get(url, function(res) {
            var statusCode = res.statusCode;
            var contentType = res.headers['content-type'];
            var error;
            if (statusCode !== 200) {
                error = new Error("Request Failed "+statusCode+": "+url);
            } else if (!/^application\/json/.test(contentType)) {
                error = new Error("Invalid content type: "+contentType);
            }
            if (error) {
                reject(error);
                // Consume the unread response data to free up memory.  This
                // has to be a call; a bare `res.resume` is a no-op.
                res.resume();
                return;
            }
            res.setEncoding('utf8');
            var rawData = '';
            res.on('data', function(d) { rawData += d; });
            res.on('end', function() {
                var parsedData;
                try {
                    parsedData = JSON.parse(rawData);
                } catch (e) {
                    reject(e);
                    return;
                }
                resolve(parsedData);
            });
        }).on('error', function(e) { reject(e); });
    });
};

// Fetch list of wikis from siteinfo
var siteMatrixP = jsonRequest('https://www.mediawiki.org/w/api.php?action=sitematrix&format=json');
// Get the interwiki map from mediawiki so we know how to link titles.
// The same request also returns the language variants, which are used to
// skip languages where LanguageConverter is in use.
var interWikiP = jsonRequest('https://www.mediawiki.org/w/api.php?action=query&format=json&meta=siteinfo&siprop=interwikimap%7Clanguagevariants');

// Lazily-built cache mapping a site base URL to its shortest interwiki
// prefix.  It is built from the first `interWikiMap` passed to
// `prefixForSite` and reused on every later call.
var reverseMap;

/**
 * Compute the wikilink prefix that points from mediawiki.org at `site`.
 *
 * @param {Array} interWikiMap Interwiki entries; each entry's `prefix` and
 *   `url` are read.  Only URLs of the form `<base>/wiki/$1` are indexed.
 * @param {Object} site Site description; `code`, `lang` and `url` are read.
 * @return {string|null} `<project>:<lang>:` for the known project codes when
 *   the site has a language, `:<prefix>:` when the site's URL has a direct
 *   interwiki prefix, or null when no prefix is known.
 */
var prefixForSite = function(interWikiMap, site) {
    if (!reverseMap) {
        reverseMap = new Map();
        interWikiMap.forEach(function(iw) {
            var m = /^(.+)\/wiki\/\$1$/.exec(iw.url);
            if (m) {
                var prev = reverseMap.get(m[1]);
                // Store shortest prefix.  Both sides must be lengths:
                // comparing a length against the prefix string itself is
                // never true, which made the last prefix win instead.
                if (prev && prev.length <= iw.prefix.length) { return; }
                reverseMap.set(m[1], iw.prefix);
            }
        });
    }
    // Indirect prefixes ("portable" prefixes)
    var indirect = {
        wiki: 'w',
        wiktionary: 'wikt',
        wikibooks: 'b',
        wikinews: 'n',
        wikiquote: 'q',
        wikisource: 's',
        wikiversity: 'v',
        wikivoyage: 'voy'
    };
    var p = indirect[site.code];
    if (p && site.lang) {
        return p + ':' + site.lang + ':';
    }
    // Direct prefixes on mw:
    var prefix = reverseMap.get(site.url);
    if (prefix) {
        return ':' + prefix + ':';
    }
    // Unknown :(
    return null;
};

/**
 * Convert the `dumpgrepper` results of one wiki into wikitext.
 *
 * Reads `results-<DUMPDATE>/<dbname>-results.txt`, drops its first line, and
 * sorts every title that has an unmatched `-{` on some line into one of the
 * buckets chem / urls / math / other / nonarticle.  In unified mode the
 * buckets are appended to `unifiedOutput`; otherwise a per-wiki page is
 * written (unless the results file was missing).
 *
 * @param {Object} site Site description; `dbname`, `code` and `wikiprefix`
 *   are read.
 */
var doOneSite = function(site) {
    var w = site.dbname;
    var inFile = path.join(__dirname, "results-" + DUMPDATE, w + "-results.txt");
    var outFile = GIT_WP ?
        path.join(__dirname, "w/" + PAGE_PREFIX + w + ".mw") :
        path.join(__dirname, "out/"+w+".wt");
    //console.log("Reading", w);
    var title = null;
    var nonarticle = "", chem = "", urls = "", math = "", other = "";
    // `counted` starts out true so that lines before the first title heading
    // are ignored in TITLES_ONLY mode.
    var countArticle = 0, countNonarticle = 0, counted = true;
    var raw = '', missing = false;
    try {
        raw = fs.readFileSync(inFile, "utf8");
    } catch (e) {
        missing = true;
        console.warn("Skipping missing results:", w);
    }
    raw.replace(/\n+$/, '').split(/\r\n?|\n/g).slice(1).forEach(function(line) {
        var m = /^== Match: \[\[(.*)\]\] ==$/.exec(line);
        if (m) { title = m[1]; counted = false; return; }
        // `site.wikiprefix` is an interwiki prefix (possibly with a leading
        // colon), so the item has to be emitted as a wikilink.
        var item = "# [[" + site.wikiprefix + title + "]]\n";
        if (TITLES_ONLY) {
            // Each title is listed at most once.
            if (counted) { return; }
        } else {
            // Quote the source line, neutralizing embedded <nowiki> tags and
            // highlighting every `-{` in red.
            item +=
                "#:<code><nowiki>" +
                line.replace(/<(\/?nowiki)/g, '&lt;$1').split('-{').join('</nowiki><b style="color:red">-<nowiki/>{</b><nowiki>') +
                "</nowiki></code>\n";
        }
        // Remove matched -{ ... }- markup.
        line = line.replace(/-\{[^{}]*\}-/g, '');
        if (!/-\{/.test(line)) { /* no unmatched markup */ return; }
        // Titles with a `Something:` prefix are treated as non-article pages.
        m = /^[^:]+:./.exec(title);
        if (m) {
            nonarticle += item;
            if (!counted) { counted = ++countNonarticle; }
            return;
        }
        // only count each title once
        if (!counted) { counted = ++countArticle; }
        m = /IUPAC|OtherNames|Andere Namen/.exec(line);
        if (m) { chem += item; return; }
        m = /\[http[^\]\s]*-\{/.exec(line);
        if (m) { urls += item; return; }
        m = /<math/.exec(line);
        if (m) { math += item; return; }
        other += item;
    });
    // Write output file.
    if (UNIFIED_LIST) {
        var key = (site.code === 'wiki') ? 'wp' : 'sister';
        var links = '';
        [['chem',chem],['urls',urls],['other',math+other],['nonarticle',nonarticle]].forEach(function(item) {
            var fld = item[0], content = item[1];
            if (!content) { return; }
            unifiedOutput[key][fld] += "==" + w + "==\n" + content;
            // NOTE(review): only the first letter of the bucket name is
            // emitted here.  This probably was a wikilink to the bucket page
            // whose target was lost when the script text was extracted from
            // the rendered page -- confirm against the original script.
            links += fld[0] + ' ';
        });
        if (missing) { countArticle = countNonarticle = "(missing)"; }
        unifiedOutput[key].counts += '|-\n| ' + w + ' || ' + countArticle + ' || ' + countNonarticle + ' || ' + links + '||\n';
        return;
    }
    var out = "==" + w + "==\n";
    out += countArticle + " articles, " + countNonarticle + " other pages.\n";
    if (chem) {
        out += "=== Chemical names ===\n" + chem;
    }
    if (urls) {
        out += "=== Urls ===\n" + urls;
    }
    if (math) {
        out += "=== Math markup ===\n" + math;
    }
    if (other) {
        out += "=== Other ===\n" + other;
    }
    if (nonarticle) {
        out += "=== Matches not in article namespace ===\n" + nonarticle;
    }
    if (GIT_WP) {
        // NOTE(review): possibly carried a wiki link (e.g. a category tag)
        // that was lost in extraction -- confirm against the original script.
        out += '\n';
    }
    if (!missing) {
        fs.writeFileSync(outFile, out, "utf8");
    }
};

// Main driver: once both API responses are in, pick the wikis to process,
// munge each one, and (in unified mode) write the summary and bucket pages.
Promise.all([interWikiP, siteMatrixP]).then(function(arr) {
    var interWikiMap = arr[0].query.interwikimap;
    var languageVariants = arr[0].query.languagevariants;
    var siteMatrix = arr[1].sitematrix;
    var sites = [];
    // Queue a site unless it is closed, fishbowl or private, or unless no
    // interwiki prefix for it is known.
    var maybeAddOne = function(site) {
        if (site.closed !== undefined || site.fishbowl !== undefined || site.private !== undefined) {
            return;
        }
        var prefix = prefixForSite(interWikiMap, site);
        if (!prefix) {
            console.warn("Skipping", site.url, "because interwiki prefix unknown.");
            return;
        }
        site.wikiprefix = prefix;
        sites.push(site);
    };
    // Language groups are stored under consecutive numeric keys; stop at the
    // first missing index.
    var i;
    for (i=0; siteMatrix[i] !== undefined; i++) {
        var s = siteMatrix[i];
        var lang = s.code;
        if (languageVariants[lang] !== undefined) {
            console.warn('Skipping', s.localname, 'because LanguageConverter is in use.');
        } else {
            // forEach runs synchronously, so `s` still refers to this group.
            siteMatrix[i].site.forEach(function(ss) {
                ss.lang = s.code;
                maybeAddOne(ss);
            });
        }
    }
    siteMatrix.specials.forEach(maybeAddOne);

    sites.forEach(doOneSite);
    if (UNIFIED_LIST) {
        var counts = 'Article counts from the ' + DUMPDATE + ' dump.\n';
        ['wp','sister'].forEach(function(key) {
            if (key === 'wp') {
                counts += '== Wikipedia ==\n';
            } else {
                counts += '== Sister projects ==\n';
            }
            counts +=
                '{| class="wikitable sortable" style="width:100%"\n' +
                '|-\n' +
                '! Wikiproject !! # of titles in main namespace !! # of titles in other namespaces !! Links !! Notes\n' +
                unifiedOutput[key].counts +
                '|}\n';
        });
        // NOTE(review): the bare '\n' appended in wiki mode (here and for the
        // bucket pages below) possibly carried a wiki link (e.g. a category
        // tag) that was lost in extraction -- confirm against the original.
        if (GIT_WP) { counts += '\n'; }
        // In wiki mode the summary is the page `<PAGE_PREFIX><DUMPDATE>` and
        // the buckets are its subpages (`%2F` is an encoded `/`).
        var basename = GIT_WP ? ('w/' + PAGE_PREFIX + DUMPDATE) : 'out/';
        var countFile = GIT_WP ? '.mw' : 'counts.wt';
        countFile = path.join(__dirname, basename + countFile);
        fs.writeFileSync(countFile, counts, 'utf8');
        ['wp','sister'].forEach(function(key) {
            ['chem','urls','other','nonarticle'].forEach(function(ty) {
                var outFile = GIT_WP ?
                    ('%2F' + key + '-' + ty + '.mw') :
                    (key + '-' + ty + '.wt');
                outFile = path.join(__dirname, basename + outFile);
                var data = unifiedOutput[key][ty];
                if (GIT_WP) { data += '\n'; }
                fs.writeFileSync(outFile, data, 'utf8');
            });
        });
    }
}).catch(function(e) {
    // Report failed API requests or processing errors explicitly and make
    // the process exit with a failure status.
    console.error(e);
    process.exitCode = 1;
});