User:Amgine/Dump processing

From mediawiki.org

test_xml[edit]

A quick hack to build some word lists for User:Nemo_bis. There are two quick PHP scripts. The first extracts a collection of files (more than 1,300 with the 20140311 dump), each named after an L2 header text and containing all words that had that L2 header (./[L2 Header].txt). The second maps these files to Wikipedia language codes (copying each list to ./wpdict/[wpCode].txt). A third script, testFiltr.php, is a variant of the first that reads the dump from standard input.

test_xml.php[edit]

<?php
/*
 * test_xml.php
 *
 * Copyright 2014 Amgine <amgine@Amgines-MacBook-Air.local>
 *
 * This program is free software. It comes without any warranty, to the extent
 * permitted by applicable law. You can redistribute it and/or modify it under
 * the terms of the Do What The Fuck You Want To Public License, Version 2, as
 * published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
 *
 */

/**
 * function: getL2
 **
 * Return the level 2 header texts found in a wikitext string.
 *
 * A line counts as an L2 header when it starts with exactly two equals
 * signs, followed by word characters, whitespace, hyphens/dashes or
 * underscores, and a closing "==". Deeper headers (===, ====, ...) are not
 * matched because "=" is not part of the accepted character class.
 **
 * @param string $text The wiki syntax string of the page.
 * @return array An array of language names (or other level 2 header texts);
 *               empty when the page contains no L2 header.
 **/
function getL2( $text ){
	$l2 = array();

	foreach( explode( "\n", (string)$text ) as $line ){
		$matches = array();
		// The pattern is anchored to the start of the line, so it can match at most once
		if( preg_match( '/^==([\w\s\-‒-—‑¯­_]+)==/u', $line, $matches ) ){
			$l2[] = trim( $matches[1] );
		}
	}

	// Always hand back an array, as documented (previously null when nothing was found)
	return $l2;
}

/**
 * function: add2Dictionary
 **
 * Process found term (breaking up phrases unless $whitespace is set), check
 * for uniqueness within the current buffer, and add to the dictionary.
 * Once the buffer holds more than 500 entries it is appended to
 * "./$lang.txt" and emptied.
 *
 * NB: uniqueness is only checked against the entries currently buffered, so
 * a word can be written again after the buffer has been flushed.
 **
 * @param string $term The found term to be added to the dictionary.
 * @param array $dictionary The dictionary (buffer) which will be appended; passed by reference.
 * @param string $lang The L2 header text; used as the base name of the output file.
 * @param bool $whitespace When true, keep $term whole (multi-word phrases allowed);
 *                         when false (default), split it on spaces into single words.
 **/
function add2Dictionary( $term, &$dictionary, $lang, $whitespace = false ){
	// Callers pass SimpleXMLElement nodes; work on a plain string from here on
	$term = (string)$term;

	// A by-reference argument that did not exist yet arrives as null
	if( !is_array( $dictionary ) ){
		$dictionary = array();
	}

	if( $whitespace ){
		$words = array( $term );
	}elseif( preg_match( '/\w/u', $term ) ){
		// /u makes \w Unicode-aware, so non-Latin titles are no longer rejected
		$words = explode( ' ', $term );
	}else{
		$words = array();
	}

	foreach( $words as $word ){
		$word = trim( $word );
		// Doubled spaces produce empty pieces; do not store them.
		// Strict comparison keeps numeric-looking titles such as '1e3' and '1000' distinct.
		if( $word !== '' && !in_array( $word, $dictionary, true ) ){
			$dictionary[] = $word;
		}
	}

	// Buffer 500 entries in ram per language; a reasonable performance vs. memory trade-off
	if( count( $dictionary ) > 500 ){
		// End the chunk with a newline so the next append starts on its own line
		if( file_put_contents( $lang . '.txt', implode( "\n", $dictionary ) . "\n", FILE_APPEND ) === false ){
			die( "Writing $lang dictionary -- FAILED!\n" );
		}
		$dictionary = array();
	}
}

$reader = new XMLReader();

// @FIXME: hard-coded dump filename
if( !$reader->open( '/Volumes/VERBATIM HD/enwiktionary-20140311-pages-meta-current.xml' ) ){
	die( "Failed to open file for xml reading.\n" );
}

// Walk forward to the first <page> element rather than relying on a fixed
// number of read() calls happening to match the dump layout.
$havePage = false;
while( $reader->read() ){
	if( $reader->nodeType === XMLReader::ELEMENT && $reader->name === 'page' ){
		$havePage = true;
		break;
	}
}

// Per-language word buffers keyed by L2 header text; 'blacklist' collects excluded titles.
$dictionary = array( 'blacklist' => array() );

# Try Unicode magic to check for: MIXED_SCRIPT_CONFUSABLE, SINGLE_SCRIPT, INVISIBLE
# https://ssl.icu-project.org/apiref/icu4c/uspoof_8h.html#a0dbd60e53a571689baf65c63f4de8155
# @FIXME: Before this, remove words not in the main script of the language (requires name->code)
# @FIXME: Currently stripping all non-Latin!
$checker = new Spoofchecker();
// Same bit mask as the former literal 50 (2 | 16 | 32)
$checker->setChecks( Spoofchecker::MIXED_SCRIPT_CONFUSABLE | Spoofchecker::SINGLE_SCRIPT | Spoofchecker::INVISIBLE );

$i = 0;
while( $havePage ){
	// Grab <page> node, make simple xml object of it.
	$element = simplexml_load_string( $reader->readOuterXML() );

	// A node that fails to parse is counted but otherwise skipped
	if( $element !== false ){
		$title = (string)$element->title;
		if( (string)$element->ns === '0' && !$checker->isSuspicious( $title ) ){
			// If in main namespace, and not confusable, grab L2 headers
			// @FIXME: Create a smarter exclusionary system
			// @TODO: Figure out what the consensus is on proper names
			$text = (string)$element->revision->text;
			// "(?:ity)?" so that plain "vulgar" is caught as well as "vulgarity"
			if( !preg_match( '/\{\{context\|[^}]*\b(vulgar(?:ity)?\b|obscen(?:e|ity)|offensive|pejorative|medicine|slang)[^}]*\}\}/iu', $text ) ){
				$l2s = getL2( $text );
				if( is_array( $l2s ) ){
					foreach( $l2s as $lang ){
						if( !array_key_exists( $lang, $dictionary ) ){
							$dictionary[$lang] = array();
						}
						// NB: Optional $whitespace = true for function allows multi-word phrases,
						// default does not. '''Default may create broken CJK phrases'''
						add2Dictionary( $title, $dictionary[$lang], $lang );
					}
				}
			} else {
				// Excluded titles are kept whole (multi-word phrases allowed)
				add2Dictionary( $title, $dictionary['blacklist'], 'blacklist', true );
			}
		}
	}
	++$i;
	echo "\r$i";
	$havePage = $reader->next( 'page' );
}
$reader->close();

// File is parsed, flush buffered dictionaries
echo "\n\n";
foreach( $dictionary as $lang => $dict ){
	// Nothing buffered: writing would only produce a misleading 0-byte "failure"
	if( count( $dict ) === 0 ){
		continue;
	}
	echo "Writing $lang dictionary.";
	// file_put_contents returns false on failure; compare explicitly
	if( file_put_contents( $lang . '.txt', implode( "\n", $dict ) . "\n", FILE_APPEND ) !== false ){
		echo "\rWrote $lang dictionary successfully.\n";
	}else{
		echo " -- FAILED!\n";
	}
}

// @FIXME: move langname dictionaries to langcode dictionaries.

test_dict2wpMapper.php[edit]

<?php
/*
 * test_dict2wpMapper.php
 * 
 * Copyright 2014 Amgine <amgine@Amgines-MacBook-Air.local>
 * 
 * This program is free software. It comes without any warranty, to the extent
 * permitted by applicable law. You can redistribute it and/or modify it under
 * the terms of the Do What The Fuck You Want To Public License, Version 2, as
 * published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
 * 
 */

// Map of Wiktionary L2 header text (language name) => Wikipedia language code(s).
// A string value is a single code. An array value is array( mode, code, code, ... );
// 'same' is the only mode used here (the same word list is used for every listed code).
$lang = array(
	// Germanics
	'English' => array( 'same', 'en', 'simple' ),
	'Dutch' => 'nl',
	'German' => 'de',
	'Swedish' => 'sv',
	'Norwegian Bokmål' => 'no',
	'Norwegian Nynorsk' => 'nn',
	'Danish' => 'da',
	'Luxembourgish' => 'lb',
	'Icelandic' => 'is',
	'Afrikaans' => 'af',
	'West Frisian' => 'fy',
	'Low German' => 'nds',
	'Scots' => 'sco',
	'Alemannic German' => 'als',
	'Yiddish' => 'yi',
	'Limburgish' => 'li',
	'Bavarian' => 'bar',
	'Faroese' => 'fo',
	'Dutch Low Saxon' => 'nds-nl',
	'West Flemish' => 'vls',
	'North Frisian' => 'frr',
	'Saterland Frisian' => 'stq',
	'Kölsch' => 'ksh',
	'Old English' => 'ang',
	'Pennsylvania German' => 'pdc',
	'Gothic' => 'got',
	// Italic
	'French' => 'fr',
	'Italian' => 'it',
	'Spanish' => 'es',
	'Portuguese' => 'pt',
	'Catalan' => 'ca',
	'Romanian' => 'ro',
	'Galician' => 'gl',
	'Latin' => 'la',
	'Occitan' => 'oc',
	'Piedmontese' => 'pms',
	'Haitian Creole' => 'ht',
	'Aragonese' => 'an',
	'Lombard' => 'lmo',
	'Sicilian' => 'scn',
	'Asturian' => 'ast',
	'Neapolitan' => 'nap',
	'Walloon' => 'wa',
	'Venetian' => 'vec',
	'Tarantino' => 'roa-tara',
	'Corsican' => 'co',
	'Romansch' => 'rm',
	'Ladino' => 'lad',
	'Friulian' => 'fur',
	'Ligurian' => 'lij',
	'Sardinian' => 'sc',
	'Franco-Provençal' => 'frp',
	'Extremaduran' => 'ext',
	'Picard' => 'pcd',
	// NB: Emilian => egl
	'Emilian' => 'eml',
	'Papiamentu' => 'pap',
	'Mirandese' => 'mwl',
	'Aromanian' => 'roa-rup',
	// Slavic
	'Russian' => 'ru',
	'Polish' => 'pl',
	'Ukrainian' => 'uk',
	'Czech' => 'cs',
	//'Serbo-Croatian' => array( 'same', 'sr', 'hr', 'sh', 'bs' ), avoid flames
	'Slovak' => 'sk',
	'Bulgarian' => 'bg',
	'Slovene' => 'sl',
	'Macedonian' => 'mk',
	'Belarusian' => array( 'same', 'be', 'be-x-old' ),
	'Upper Sorbian' => 'hsb',
	'Rusyn' => 'rue',
	'Kashubian' => 'csb',
	'Silesian' => 'szl',
	'Lower Sorbian' => 'dsb',
	'Old Church Slavonic' => 'cu',
	// Philippine
	'Waray-Waray' => 'war',
	'Cebuano' => 'ceb',
	'Tagalog' => 'tl',
	'Kapampangan' => 'pam',
	'Ilocano' => 'ilo',
	'Bikol Central' => 'bcl',
	'Pangasinan' => 'pag',
	// Japanic
	'Japanese' => 'ja',
	// Austroasiatic
	'Vietnamese' => 'vi',
	'Khmer' => 'km',
	// Turkic
	'Turkish' => 'tr',
	'Kazakh' => 'kk',
	'Uzbek' => 'uz',
	'Azeri' => 'az',
	'Tatar' => 'tt',
	'Bashkir' => 'ba',
	'Kyrgyz' => 'ky',
	'Chuvash' => 'cv',
	'Yakut' => 'sah',
	'Turkmen' => 'tk',
	'Uyghur' => 'ug',
	'Gagauz' => 'gag',
	'Karachay-Balkar' => 'krc',
	'Crimean Tatar' => 'crh',
	'Karakalpak' => 'kaa',
	'Tuvan' => 'tyv',
	// Sinitic
	'Cantonese' => 'zh-yue',
	'Min Nan' => 'zh-min-nan',
	'Gan' => 'gan',
	'Wu' => 'wuu',
	'Hakka' => 'hak',
	'Min Dong' => 'cdo',
	// NB: These may require additional parser logic, not processing now.
	//	zh • 758,009 – Chinese (中文)
	//	zh-classical • 3,245 – Classical Chinese (文言)
	// Sunda–Sulawesi
	'Malay' => 'ms',
	'Indonesian' => 'id',
	'Javanese' => 'jv',
	'Sundanese' => 'su',
	'Buginese' => 'bug',
	// Was 'map=bms'; the Wikipedia code is hyphenated
	'Banyumasan' => 'map-bms',
	'Minangkabau' => 'min',
	'Acehnese' => 'ace',
	'Banjarese' => 'bjn',
	'Chamorro' => 'ch',
	// Finno-Permic
	'Finnish' => 'fi',
	'Estonian' => 'et',
	'Northern Sami' => 'se',
	'Western Mari' => 'mrj',
	'Võro' => 'fiu-vro',
	'Komi-Zyrian' => 'kv',
	'Komi-Permyak' => 'koi',
	'Udmurt' => 'udm',
	// NB: Eastern Mari => chm
	'Eastern Mari' => 'mhr',
	'Veps' => 'vep',
	'Erzya' => 'myv',
	'Moksha' => 'mdf',
	// Semitic
	'Arabic' => 'ar',
	'Hebrew' => 'he',
	'Amharic' => 'am',
	'Egyptian Arabic' => 'arz',
	'Maltese' => 'mt',
	'Aramaic' => 'arc',
	'Tigrinya' => 'ti',
	// Iranian
	'Persian' => 'fa',
	'Tajik' => 'tg',
	'Kurdish' => 'ku',
	// Was 'ku', which made this list overwrite the Kurdish one
	'Central Kurdish' => 'ckb',
	'Mazanderani' => 'mzn',
	'Ossetian' => 'os',
	'Gilaki' => 'glk',
	'Pashto' => 'ps',
	'Zazaki' => 'diq',
	// Indo-Aryan
	'Hindi' => 'hi',
	'Marathi' => 'mr',
	'Western Panjabi' => 'pnb',
	'Bengali' => 'bn',
	'Bishnupriya Manipuri' => 'bpy',
	'Urdu' => 'ur',
	'Nepali' => 'ne',
	'Gujarati' => 'gu',
	'Fiji Hindi' => 'hif',
	'Sanskrit' => 'sa',
	'Sinhalese' => 'si',
	'Punjabi' => 'pa',
	'Oriya' => 'or',
	'Dhivehi' => 'dv',
	'Pali' => 'pi',
	'Bihari' => 'bh',
	'Assamese' => 'as',
	'Sindhi' => 'sd',
	'Kashmiri' => 'ks',
	// Constructed
	'Esperanto' => 'eo',
	'Volapük' => 'vo',
	'Ido' => 'io',
	'Interlingua' => 'ia',
	'Interlingue' => 'ie',
	'Novial' => 'nov',
	// Ugric
	'Hungarian' => 'hu',
	// Korean
	'Korean' => 'ko',
	// Baltic
	'Lithuanian' => 'lt',
	'Latvian' => 'lv',
	//NB: Samogitian => sgs
	'Samogitian' => 'bat-smg',
	'Latgalian' => 'ltg',
	// Basque
	'Basque' => 'eu',
	// Dravidian
	'Tamil' => 'ta',
	'Telugu' => 'te',
	'Malayalam' => 'ml',
	'Kannada' => 'kn',
	// Celtic
	'Breton' => 'br',
	'Welsh' => 'cy',
	'Irish' => 'ga',
	'Scottish Gaelic' => 'gd',
	'Manx' => 'gv',
	'Cornish' => 'kw',
	// Tibeto-Burman
	'Newari' => 'new',
	'Burmese' => 'my',
	'Tibetan' => 'bo',
	'Dzongkha' => 'dz',
	// Tai
	'Thai' => 'th',
	'Lao' => 'lo',
	'Zhuang' => 'za',
	// Hellenic
	'Greek' => 'el',
	'Pontic Greek' => 'pnt',
	// Kartvelian
	'Georgian' => 'ka',
	'Mingrelian' => 'xmf',
	// Albanian
	'Albanian' => 'sq',
	// Bornean
	'Malagasy' => 'mg',
	// Bantoid
	// The Swahili entry had been swallowed into the comment above
	// ("sw • 26,073 – Swahili (Kiswahili)"); restored here.
	'Swahili' => 'sw',
	'Lingala' => 'ln',
	'Kinyarwanda' => 'rw',
	'Shona' => 'sn',
	'Kongo' => 'kg',
	'Northern Sotho' => 'nso',
	'Zulu' => 'zu',
	'Tswana' => 'tn',
	'Swazi' => 'ss',
	'Tsonga' => 'ts',
	'Kikuyu' => 'ki',
	'Venda' => 've',
	'Kirundi' => 'rn',
	'Luganda' => 'lg',
	'Tumbuka' => 'tum',
	'Sotho' => 'st',
	'Xhosa' => 'xh',
	'Chichewa' => 'ny',
	// Yoruboid
	'Yoruba' => 'yo',
	// Armenian
	'Armenian' => 'hy',
	// Quechuan
	'Quechua' => 'qu',
	// Polynesian
	'Maori' => 'mi',
	'Hawaiian' => 'haw',
	'Tongan' => 'to',
	'Tahitian' => 'ty',
	'Samoan' => 'sm',
	// Mongolic
	'Mongolian' => 'mn',
	'Kalmyk' => 'xal',
	// Uto-Aztecan
	'Nahuatl' => 'nah',
	// Northeast Caucasian
	'Chechen' => 'ce',
	'Lezgi' => 'lez',
	'Lak' => 'lbe',
	'Avar' => 'av',
	// Germanic Pidgins & Creoles
	'Tok Pisin' => 'tpi',
	'Sranan Tongo' => 'srn',
	'Bislama' => 'bi',
	'Pitcairn-Norfolk' => 'pih',
	// Cushitic
	'Somali' => 'so',
	'Oromo' => 'om',
	'Afar' => 'aa',
	// Athabaskan
	'Navajo' => 'nv',
	// Eskimo-Aleut
	'Greenlandic' => 'kl',
	'Inuktitut' => 'iu',
	'Inupiak' => 'ik',
	// Aymaran
	'Aymara' => 'ay',
	// Tupian
	'Guaraní' => 'gn',
	// Northwest Caucasian
	'Kabardian' => 'kbd',
	'Abkhaz' => 'ab',
	// Atlantic
	'Wolof' => 'wo',
	'Fula' => 'ff',
	// A priori Constructed
	'Lojban' => 'jbo',
	// Berber
	'Kabyle' => 'kab',
	// Micronesian
	'Nauruan' => 'na',
	'Marshallese' => 'mh',
	// Igboid
	'Igbo' => 'ig',
	// Timor-Babar
	'Tetum' => 'tet',
	// Algonquian
	'Cheyenne' => 'chy',
	'Cree' => 'cr',
	// Kwa
	'Ewe' => 'ee',
	'Akan' => 'ak',
	'Twi' => 'tw',
	// Iroquoian
	'Cherokee' => 'chr',
	// Mande
	'Bambara' => 'bm',
	// Chadic
	'Hausa' => 'ha',
	// Ubangian
	'Sango' => 'sg',
	// East Fijian
	'Fijian' => 'fj',
);

echo count( $lang );

// Make sure the destination directory exists before the first copy is attempted
if( !is_dir( 'wpdict' ) && !mkdir( 'wpdict' ) ){
	die( "FAILED -- creating directory wpdict\n" );
}

// Copy each "[language name].txt" word list to "wpdict/[code].txt" for every
// Wikipedia code mapped to that language name.
foreach( $lang as $langname => $val ){
	echo "\n$langname";

	if( is_array( $val ) ){
		// Array form is array( mode, code, code, ... ); only 'same' is implemented
		if( $val[0] !== 'same' ){
			// elseif( $val[0] == 'merge' ){ do the merge here }
			continue;
		}
		$langcodes = array_slice( $val, 1 );
	}else{
		// Just copy the file over.
		$langcodes = array( $val );
	}

	if( !file_exists( "$langname.txt" ) ){
		echo "\n$langname.txt DOES NOT EXIST!\n";
		continue;
	}

	foreach( $langcodes as $langcode ){
		if( !copy( "$langname.txt", "wpdict/$langcode.txt" ) ){
			// Report the code actually being written (the array branch used to print $val here)
			die( "FAILED -- copying $langname.txt to wpdict/$langcode.txt\n" );
		}
	}
}

echo "\n\nDone.";

testFiltr.php[edit]

#!/usr/bin/php
<?php
/*
 * testFiltr.php
 * 
 * Copyright 2014 Amgine <amgine@Amgines-MacBook-Air.local>
 * 
 * This program is free software. It comes without any warranty, to the extent
 * permitted by applicable law. You can redistribute it and/or modify it under
 * the terms of the Do What The Fuck You Want To Public License, Version 2, as
 * published by Sam Hocevar. See http://www.wtfpl.net/ for more details.
 * 
 */
 
/**
 * REQUIREMENTS: It is expected this will be run in a *nix environment, plus
 * * cURL
 * * bzip2
 * 
 * If your php is not located at /usr/bin/php, better fix the shebang.
 **
 * USE NOTE: This file must be marked as executable, chmod +x testFiltr.php
 *
 * example:
 * curl http://dumps.wikimedia.org/enwiktionary/20140328/enwiktionary-20140328-pages-articles.xml.bz2 | bzcat | ./testFiltr.php
 **/

/**
 * function: getL2
 **
 * Collect the level 2 header texts of a wikitext string, in page order.
 **
 * @param string $text The wiki syntax string of the page.
 * @return array The trimmed header texts (language names or other level 2
 *               headers); an empty array when the page has none.
 **/
function getL2( $text ){
	$headers = array();
	$lines = explode( "\n", $text );

	for( $n = 0, $total = count( $lines ); $n < $total; ++$n ){
		$found = array();
		// Anchored at the start of the line, so there is at most one match per line
		if( !preg_match( '/^==([\w\s\-‒-—‑¯­_]+)==/u', $lines[$n], $found ) ){
			continue;
		}
		$headers[] = trim( $found[1] );
	}

	return $headers;
}

/**
 * function: add2Dictionary
 **
 * Process found term (breaking up phrases unless $whitespace is set), check
 * for uniqueness, and add to the dictionary.
 **
 * @param string $term The found term to be added to the dictionary.
 * @param array $dictionary The dictionary which will be appended; passed by reference.
 * @param bool $whitespace When true, keep $term whole (multi-word phrases allowed);
 *                         when false (default), split it on spaces into single words.
 **/
function add2Dictionary( $term, &$dictionary, $whitespace = false ){
	// Callers pass SimpleXMLElement nodes; work on a plain string from here on
	$term = (string)$term;

	// A by-reference argument that did not exist yet arrives as null
	if( !is_array( $dictionary ) ){
		$dictionary = array();
	}

	if( $whitespace ){
		// Keep the phrase whole (this branch used to read the undefined $word)
		$words = array( $term );
	}elseif( preg_match( '/\w/u', $term ) ){
		// /u makes \w Unicode-aware, so non-Latin titles are no longer rejected
		$words = explode( ' ', $term );
	}else{
		$words = array();
	}

	foreach( $words as $word ){
		$word = trim( $word );
		// Doubled spaces produce empty pieces; do not store them.
		// Strict comparison keeps numeric-looking titles such as '1e3' and '1000' distinct.
		if( $word !== '' && !in_array( $word, $dictionary, true ) ){
			$dictionary[] = $word;
		}
	}
}

// http://dumps.wikimedia.org/mhwiktionary/20140401/mhwiktionary-20140401-pages-articles.xml.bz2
//$fh = fopen( '/Users/amgine/Downloads/mhwiktionary-20140401-pages-articles.xml', 'r' );

// $i counts pages found, $j counts lines read from STDIN
$i = $j = 0;
// Per-language word buffers keyed by L2 header text
$dictionary = array();
// This is the directory the dictionaries will be stored in. It must exist or script will fail.
$destPath = 'enWT/';

// fgets() returns whole lines regardless of length; a fixed-size read would split
// long lines and inject newlines into the page text.
while( ( $buffer = fgets( STDIN ) ) !== false ){
	++$j;
	if( !preg_match( '/^<page>$/i', trim( $buffer ) ) ){
		continue;
	}

	// Accumulate everything up to and including the closing </page> line
	$node = $buffer;
	while( ( $buffer = fgets( STDIN ) ) !== false ){
		++$j;
		$node .= $buffer;
		if( preg_match( '/^<\/page>$/i', trim( $buffer ) ) ){
			break;
		}
	}
	++$i;

	$element = simplexml_load_string( $node );
	// Truncated or malformed pages cannot be parsed; skip them
	if( $element === false || (string)$element->ns !== '0' ){
		continue;
	}

	// If in main namespace, grab L2 headers
	$text = (string)$element->revision->text;
	// "(?:ity)?" so that plain "vulgar" is caught as well as "vulgarity"
	if( preg_match( '/\{\{context\|[^}]*(\bvulgar(?:ity)?\b|\bobscen(?:e|ity))[^}]*\}\}/iu', $text ) ){
		continue;
	}

	$l2s = getL2( $text );
	if( !is_array( $l2s ) ){
		continue;
	}
	foreach( $l2s as $lang ){
		if( !array_key_exists( $lang, $dictionary ) ){
			$dictionary[$lang] = array();
		}
		// NB: Optional $whitespace = true for function allows multi-word phrases,
		// default does not. '''Default may create broken CJK phrases'''
		add2Dictionary( $element->title, $dictionary[$lang] );
		// Buffer 500 entries in ram per language; a reasonable performance vs. memory trade-off
		if( count( $dictionary[$lang] ) > 500 ){
			// Write into $destPath (was the undefined $destPat) and end the chunk
			// with a newline so the next append starts on its own line
			if( file_put_contents( $destPath . $lang . '.txt', implode( "\n", $dictionary[$lang] ) . "\n", FILE_APPEND ) === false ){
				die( "Writing $lang dictionary -- FAILED!\n" );
			}
			$dictionary[$lang] = array();
		}
	}
}

echo "\r$j lines processed.\n$i pages found.\n";

// File is parsed, flush buffered dictionaries
echo "\n\n";
foreach( $dictionary as $lang => $dict ){
	// Nothing buffered: writing would only produce a misleading 0-byte "failure"
	if( count( $dict ) === 0 ){
		continue;
	}
	echo "Writing $lang dictionary.";
	// file_put_contents returns false on failure; compare explicitly
	if( file_put_contents( $destPath . $lang . '.txt', implode( "\n", $dict ) . "\n", FILE_APPEND ) !== false ){
		echo "\rWrote $lang dictionary successfully.\n";
	}else{
		echo " -- FAILED!\n";
	}
}