Parsoid/Language conversion/Preprocessor fixups/munge

#!/usr/bin/env node
/*
This is the script that post-processes the results of `dumpgrepper` into
wikitext for posting on mediawiki.org.  It helps if you use the
`git-mediawiki` package (https://github.com/Git-Mediawiki/Git-Mediawiki/wiki)
to sync the results back onto mediawiki.org.

Usage:
1. Put the output of `dumpgrepper` into a subdirectory named `results-$DUMPDATE/`.
2. Check out the pages from mediawiki.org using:
   git clone -c remote.origin.categories='Parsoid' -c remote.origin.mwLogin=[your mediawiki username] -c remote.origin.mwPassword=[your mediawiki password] mediawiki::https://www.mediawiki.org/w
3. Run:
   ./munge.js $DUMPDATE
4. Change into `w/` and push the results back to mediawiki.org:
   cd w/ && git add . && git commit && git push
*/
var fs = require('fs');
var path = require('path');
var https = require('https');

//var wikis = "cebwiki dewiki enwiki eswiki frwiki itwiki jawiki mediawikiwiki nlwiki plwiki ptwiki ruwiki svwiki viwiki warwiki zhwiki".split(/\s+/g);

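// Output configuration:
//   GIT_WP       - write .mw files into the git-mediawiki checkout in w/
//                  (otherwise plain wikitext files go into out/)
//   UNIFIED_LIST - merge all wikis into shared per-category subpages plus a
//                  single counts table, instead of one section per wiki
//   TITLES_ONLY  - list only page titles, omitting the matched source lines
//   PAGE_PREFIX  - URL-encoded page name prefix used for the .mw filenames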
var GIT_WP = true;
var UNIFIED_LIST = true;
var TITLES_ONLY = true;
var PAGE_PREFIX = "Parsoid%2FLanguage_conversion%2FPreprocessor_fixups%2F";
var DUMPDATE = process.argv[2] || process.env.DUMPDATE || '20170620';
var unifiedOutput = {
    wp: { counts: '', chem: '', urls: '', nonarticle: '', other: '' },
    sister: { counts: '', chem: '', urls: '', nonarticle: '', other: '' }
};

if (GIT_WP) { // Copy this script itself to the wiki
    var outFile = path.join(__dirname, "w/" + PAGE_PREFIX + "munge.mw");
    var self = fs.readFileSync(__filename, "utf8");
    self = "<pre><nowiki>\n" +
        self.replace(/&/g, '&amp;').replace(/</g, '&lt;') +
        "\n</nowiki></pre>\n" +
        "[[Category:Parsoid]]\n";
    fs.writeFileSync(outFile, self, "utf8");
}

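// Fetch a URL over HTTPS and resolve with the parsed JSON response body;
// reject on a non-200 status, a non-JSON content type, or a parse error.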
var jsonRequest = function(url) {
    return new Promise(function(resolve, reject) {
        https.get(url, function(res) {
            var statusCode = res.statusCode;
            var contentType = res.headers['content-type'];
            var error;
            if (statusCode !== 200) {
                error = new Error("Request Failed "+statusCode+": "+url);
            } else if (!/^application\/json/.test(contentType)) {
                error = new Error("Invalid content type: "+contentType);
            }
            if (error) { reject(error); res.resume(); return; }
            res.setEncoding('utf8');
            var rawData = '';
            res.on('data', function(d) { rawData += d; });
            res.on('end', function() {
                var parsedData;
                try {
                    parsedData = JSON.parse(rawData);
                } catch (e) { reject(e); return; }
                resolve(parsedData);
            });
        }).on('error', function(e) { reject(e); });
    });
};


// Fetch list of wikis from siteinfo
var siteMatrixP = jsonRequest('https://www.mediawiki.org/w/api.php?action=sitematrix&format=json');
// Get the interwiki map from mediawiki so we know how to link titles.
var interWikiP = jsonRequest('https://www.mediawiki.org/w/api.php?action=query&format=json&meta=siteinfo&siprop=interwikimap%7Clanguagevariants');

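// Compute the interwiki prefix to use when linking to `site` from
// mediawiki.org.  Prefer the "portable" project prefixes (w:, wikt:, etc.)
// plus a language code; otherwise fall back to a lazily-built reverse map
// from each wiki's base URL to its shortest interwiki prefix.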
var reverseMap;
var prefixForSite = function(interWikiMap, site) {
    if (!reverseMap) {
        reverseMap = new Map();
        interWikiMap.forEach(function(iw) {
            var m = /^(.+)\/wiki\/\$1$/.exec(iw.url);
            if (m) {
                var prev = reverseMap.get(m[1]);
                // Store shortest prefix
                if (prev && prev.length <= iw.prefix.length) { return; }
                reverseMap.set(m[1], iw.prefix);
            }
        });
    }
    // Indirect prefixes ("portable" prefixes)
    var indirect = { wiki: 'w', wiktionary: 'wikt', wikibooks: 'b', wikinews: 'n', wikiquote: 'q', wikisource: 's', wikiversity: 'v', wikivoyage: 'voy' };
    var p = indirect[site.code];
    if (p && site.lang) {
        return p + ':' + site.lang + ':';
    }
    // Direct prefixes on mw:
    var prefix = reverseMap.get(site.url);
    if (prefix) { return ':' + prefix + ':'; }
    // Unknown :(
    return null;
};

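// Process the dumpgrepper results for a single wiki: read
// results-$DUMPDATE/<dbname>-results.txt, bucket each matching title into
// chemical names, URLs, math markup, non-article pages, or "other", and
// either accumulate into the unified output buffers or write a per-wiki
// output file.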
var doOneSite = function(site) {
    var w = site.dbname;
    var inFile = path.join(__dirname, "results-" + DUMPDATE, w + "-results.txt");
    var outFile = GIT_WP ?
        path.join(__dirname, "w/" + PAGE_PREFIX + w + ".mw") :
        path.join(__dirname, "out/"+w+".wt");
    //console.log("Reading", w);
    var title = null;
    var nonarticle = "", chem = "", urls = "", math = "", other = "";
    var countArticle = 0, countNonarticle = 0, counted = true;
    var raw = '', missing = false;
    try {
        raw = fs.readFileSync(inFile, "utf8");
    } catch (e) { missing=true; console.warn("Skipping missing results:", w); }
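    // The first line of the results file is skipped; after that the file
    // alternates "== Match: [[Title]] ==" headings with matching source lines.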
    raw.replace(/\n+$/,'').split(/\r\n?|\n/g).slice(1).forEach(function(line) {
        var m = /^== Match: \[\[(.*)\]\] ==$/.exec(line);
        if (m) { title = m[1]; counted = false; return; }
        var item = "# [[" + site.wikiprefix + title + "]]\n";
        if (TITLES_ONLY) {
            if (counted) { return; }
        } else {
            item +=
                "#:<code><nowiki>" + line.replace(/<(\/?nowiki)/g, '&lt;$1').split('-{').join('</nowiki><b style="color:red">-<nowiki/>{</b><nowiki>') + "</nowiki></code>\n";
        }
        // Remove matched -{ ... }- markup.
        line = line.replace(/-\{[^{}]*\}-/g, '');
        if (!/-\{/.test(line)) { /* no unmatched markup */ return; }
        m = /^[^:]+:./.exec(title);
        if (m) {
            nonarticle += item;
            if (!counted) { counted = ++countNonarticle; }
            return;
        }
        // only count each title once
        if (!counted) { counted = ++countArticle; }
        m = /IUPAC|OtherNames|Andere Namen/.exec(line);
        if (m) {
            chem += item;
            return;
        }
        m = /\[http[^\]\s]*-\{/.exec(line);
        if (m) {
            urls += item;
            return;
        }
        m = /<math/.exec(line);
        if (m) {
            math += item;
            return;
        }
        other += item;
        return;
    });
    // Emit output: in unified mode, accumulate this wiki's sections and a
    // counts-table row into the shared buffers; otherwise write a
    // standalone per-wiki output file.
    if (UNIFIED_LIST) {
        var key = (site.code === 'wiki') ? 'wp' : 'sister';
        var links = '';
        [['chem',chem],['urls',urls],['other',math+other],['nonarticle',nonarticle]].forEach(function(item) {
            var fld = item[0], content = item[1];
            if (!content) { return; }
            unifiedOutput[key][fld] += "==" + w + "==\n" + content;
            links += '[[/' + key + '-' + fld + '#' + w + '|' + fld[0] + ']] ';
        });
        if (missing) { countArticle = countNonarticle = "(missing)"; }
        unifiedOutput[key].counts += '|-\n| ' + w + ' || ' + countArticle + ' || ' + countNonarticle + ' || ' + links + '||\n';
        return;
    }
    var out = "==" + w + "==\n";
    out += countArticle + " articles, " + countNonarticle + " other pages.\n";
    if (chem) {
        out += "=== Chemical names ===\n" + chem;
    }
    if (urls) {
        out += "=== Urls ===\n" + urls;
    }
    if (math) {
        out += "=== Math markup ===\n" + math;
    }
    if (other) {
        out += "=== Other ===\n" + other;
    }
    if (nonarticle) {
        out += "=== Matches not in article namespace ===\n" + nonarticle;
    }
    if (GIT_WP) {
        out += '[[Category:Parsoid]]\n';
    }
    if (!missing) {
        fs.writeFileSync(outFile, out, "utf8");
    }
};

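// Main driver: once the interwiki map and site matrix have been fetched,
// build the list of sites to process (skipping closed/fishbowl/private
// wikis and wikis whose language already uses LanguageConverter), process
// each one, and finally write the unified output pages.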
Promise.all([interWikiP, siteMatrixP]).then(function(arr) {
    var interWikiMap = arr[0].query.interwikimap;
    var languageVariants = arr[0].query.languagevariants;
    var siteMatrix = arr[1].sitematrix;
    var sites = [];
    var maybeAddOne = function(site) {
        if (site.closed !== undefined ||
            site.fishbowl !== undefined ||
            site.private !== undefined) {
            return;
        }
        var prefix = prefixForSite(interWikiMap, site);
        if (!prefix) {
            console.warn("Skipping", site.url, "because interwiki prefix unknown.");
            return;
        }
        site.wikiprefix = prefix;
        sites.push(site);
    };
    var i;
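    // The sitematrix result is keyed by numeric strings ("0", "1", ...)
    // for each language group, plus "count" and "specials".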
    for (i=0; siteMatrix[i] !== undefined; i++) {
        var s = siteMatrix[i];
        var lang = s.code;
        if (languageVariants[lang] !== undefined) {
            console.warn('Skipping', s.localname, 'because LanguageConverter is in use.');
        } else {
            s.site.forEach(function(ss) {
                ss.lang = s.code;
                maybeAddOne(ss);
            });
        }
    }
    siteMatrix.specials.forEach(maybeAddOne);

    sites.forEach(doOneSite);
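    // Write the unified pages: a counts table covering Wikipedias and
    // sister projects, plus one subpage per category (chem, urls, other,
    // nonarticle) under the $DUMPDATE page.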
    if (UNIFIED_LIST) {
        var counts = 'Article counts from the ' + DUMPDATE + ' dump.\n';
        ['wp','sister'].forEach(function(key) {
            if (key === 'wp') {
                counts += '== Wikipedia ==\n';
            } else {
                counts += '== Sister projects ==\n';
            }
            counts +=
                    '{| class="wikitable sortable" style="width:100%"\n' +
                    '|-\n' +
                    '! Wikiproject !! # of titles in main namespace !! # of titles in other namespaces !! Links !! Notes\n' +
                    unifiedOutput[key].counts +
                '|}\n';
        });
        if (GIT_WP) {
            counts += '[[Category:Parsoid]]\n';
        }
        var basename = GIT_WP ? ('w/' + PAGE_PREFIX + DUMPDATE) : 'out/';
        var countFile = GIT_WP ? '.mw' : 'counts.wt';
        countFile = path.join(__dirname, basename + countFile);
        fs.writeFileSync(countFile, counts, 'utf8');
        ['wp','sister'].forEach(function(key) {
            ['chem','urls','other','nonarticle'].forEach(function(ty) {
                var outFile = GIT_WP ?
                        ('%2F' + key + '-' + ty + '.mw') :
                        (key + '-' + ty + '.wt');
                outFile = path.join(__dirname, basename + outFile);
                var data = unifiedOutput[key][ty];
                if (GIT_WP) {
                    data += '[[Category:Parsoid]]\n';
                }
                fs.writeFileSync(outFile, data, 'utf8');
            });
        });
    }
});