User:Amgine/Dump processing/test xml.php

From mediawiki.org
<?php
/*
 * test_xml.php
 *
 * Copyright 2014 Amgine <amgine@Amgines-MacBook-Air.local>
 *
 * This program is free software. It comes without any warranty, to the extent
 * permitted by applicable law. You can redistribute it and/or modify it under
 * the terms of the Do What The Fuck You Want To Public License, Version 2, as
 * published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
 *
 */

/**
 * function: getL2
 **
 * Return an array of L2 headers from a wikitext string.
 *
 * A line counts as a level 2 header when it starts with exactly two equals
 * signs, followed by header text made only of word characters, whitespace
 * and hyphen/dash-like characters, followed by two equals signs. Deeper
 * headers (e.g. ===Noun===) are not matched because "=" is not an accepted
 * header character.
 **
 * @param string $text The wiki syntax string of the page.
 * @return array An array of language names (or other level 2 header texts)
 *               in document order; empty when the page has no L2 headers.
 **/
function getL2( $text ){
	$l2 = array();

	// Accepted dash-like characters, written as escapes so the invisible ones
	// survive editing: U+2012..U+2014 (figure/en/em dash), U+2011 (non-breaking
	// hyphen), U+00AF (macron), U+00AD (soft hyphen).
	$pattern = '/^==([\w\s\-\x{2012}-\x{2014}\x{2011}\x{00AF}\x{00AD}_]+)==/u';

	// Cast so that stringable objects (e.g. a SimpleXMLElement) are accepted too
	foreach( explode( "\n", (string)$text ) as $line ){
		$matches = array();
		// The pattern is anchored to the line start, so one match per line is the maximum
		if( preg_match( $pattern, $line, $matches ) ){
			$l2[] = trim( $matches[1] );
		}
	}

	// Always an array, as documented; callers can iterate without a null check
	return $l2;
}

/**
 * function: add2Dictionary
 **
 * Process found term (breaking up phrases), check for uniqueness, and
 * add to the dictionary.
 *
 * Once the buffer holds more than 500 entries it is appended to the file
 * "$lang.txt" (one entry per line) and emptied. Uniqueness is only checked
 * against the entries currently in the buffer, so an entry can reach the
 * file again after a flush. The script is terminated with die() when the
 * file cannot be written.
 **
 * @param string $term The found term to be added to the dictionary.
 * @param array $dictionary The dictionary which will be appended.
 * @param string $lang Name of the dictionary; "$lang.txt" is the output file.
 * @param bool $whitespace When true the trimmed term is stored as one entry;
 *                         when false (default) it is split on spaces and each
 *                         word is stored separately.
 **/
function add2Dictionary( $term, &$dictionary, $lang, $whitespace = false ){
	// Accept stringable objects (e.g. a SimpleXMLElement) as the term
	$term = (string)$term;

	// A not-yet-existing array slot passed by reference arrives as null
	if( !is_array( $dictionary ) ){
		$dictionary = array();
	}

	if( $whitespace ){
		$words = array( $term );
	}elseif( preg_match( '/\w/u', $term ) ){
		// "u" makes \w Unicode-aware, so terms without ASCII letters are kept
		$words = explode( ' ', $term );
	}else{
		$words = array();
	}

	foreach( $words as $word ){
		$word = trim( $word );
		// Strict comparison: loosely, numeric strings like "1e3" and "1000" are equal
		if( $word !== '' && !in_array( $word, $dictionary, true ) ){
			$dictionary[] = $word;
		}
	}

	// Buffer 500 entries in ram per language; a reasonable performance vs. memory tradeoff
	if( count( $dictionary ) > 500 ){
		// End the batch with a newline so the next appended batch starts on its own line
		$batch = implode( "\n", $dictionary ) . "\n";
		if( file_put_contents( $lang . '.txt', $batch, FILE_APPEND ) === false ){
			die( "Writing $lang dictionary -- FAILED!\n" );
		}
		$dictionary = array();
	}
}

$reader = new XMLReader();

// The dump to parse may be given as the first command line argument;
// without one, the original hard-coded dump location is used.
$dumpFile = isset( $argv[1] )
	? $argv[1]
	: '/Volumes/VERBATIM HD/enwiktionary-20140311-pages-meta-current.xml';
if( !$reader->open( $dumpFile ) ){
	die( "Failed to open file for xml reading.\n" );
}

// @FIXME: evil, vile, but it works
// NOTE(review): presumably the two reads step into the root element so that
// next() can walk its children up to the first <page> -- confirm against the
// layout of the dump.
$reader->read(); $reader->read();
$reader->next( 'page' );

// Per-language word buffers, keyed by L2 header text (plus 'blacklist')
$dictionary = array();

# Try Unicode magic to check for: MIXED_SCRIPT_CONFUSABLE, SINGLE_SCRIPT, INVISIBLE
# https://ssl.icu-project.org/apiref/icu4c/uspoof_8h.html#a0dbd60e53a571689baf65c63f4de8155
# @FIXME: Before this, remove words not in the main script of the language (requires name->code)
# @FIXME: Currently stripping all non-Latin!
$checker = new Spoofchecker();
// 50 = 2 | 16 | 32, the bit values of the three checks named above
$checker->setChecks( 50 );

// Pages whose wikitext carries one of these {{context}} labels go to the blacklist.
// "(?:ity)?" makes the suffix optional so that plain "vulgar" is caught as well.
$blacklistPattern = '/\{\{context\|[^}]*\b(vulgar(?:ity)?\b|obscen(?:e|ity)|offensive|pejorative|medicine|slang)[^}]*\}\}/iu';

$i = 0;
do{
	// Grab <page> node, make simple xml object of it.
	$node = $reader->readOuterXML();
	$element = ( $node === '' ) ? false : simplexml_load_string( $node );

	// Skip nodes that could not be parsed instead of dereferencing false
	if( $element !== false ){
		$title = (string)$element->title;
		$text = (string)$element->revision->text;

		if( (string)$element->ns === '0' && !$checker->isSuspicious( $title ) ){
			// If in main namespace, and not confusable, grab L2 headers
			// @FIXME: Create a smarter exclusionary system
			// @TODO: Figure out what the consensus is on proper names
			if( !preg_match( $blacklistPattern, $text ) ){
				$l2s = getL2( $text );
				if( is_array( $l2s ) ){
					foreach( $l2s as $lang ){
						if( !array_key_exists( $lang, $dictionary ) ){
							$dictionary[$lang] = array();
						}
						// NB: Optional $whitespace = true for function allows multi-word phrases,
						// default does not. '''Default may create broken CJK phrases'''
						add2Dictionary( $title, $dictionary[$lang], $lang );
					}
				}
			} else {
				// The buffer is passed by reference, so it has to exist as an array first
				if( !array_key_exists( 'blacklist', $dictionary ) ){
					$dictionary['blacklist'] = array();
				}
				// Fourth argument (whitespace) true: keep the whole title as one entry
				add2Dictionary( $title, $dictionary['blacklist'], 'blacklist', true );
			}
		}
	}
	++$i;
	// Progress counter, rewritten in place on one console line
	echo "\r$i";
}while( $reader->next( 'page' ) );

$reader->close();

// File is parsed, flush buffered dictionaries
echo "\n\n";
foreach( $dictionary as $lang => $dict ){
	if( count( $dict ) === 0 ){
		// Nothing buffered (already flushed inside add2Dictionary); not a failure
		continue;
	}
	echo "Writing $lang dictionary.";
	// Compare with false: 0 bytes written is not an error. The trailing
	// newline keeps the file line-terminated for any later append.
	if( file_put_contents( $lang . '.txt', implode( "\n", $dict ) . "\n", FILE_APPEND ) !== false ){
		echo "\rWrote $lang dictionary successfully.\n";
	}else{
		echo " -- FAILED!\n";
	}
}

// @FIXME: move langname dictionaries to langcode dictionaries.