User:Matma Rex/generateCollationTailoringData.rb

From mediawiki.org
# coding: utf-8

# Data scraper for https://gerrit.wikimedia.org/r/#/c/49776/

require 'restclient'
require 'nokogiri'
require 'unicode_utils'
require 'pp'

# Download the tailorings chart from developer.mimer.com and parse the response as HTML.
chart_markup = RestClient.get('http://developer.mimer.com/charts/tailorings.htm')
n = Nokogiri::HTML(chart_markup)

# Build one [langcode, rules, tailored_first_letters] triple per table row of the parsed page.
# The first three rows are dropped (presumably header rows -- confirm against the page).
data = n.css('table tr').drop(3).map do |e|
	# The language code is the run of lowercase letters/hyphens that follows "(" in the
	# text of the last child node of the first cell (or of its .language element).
	langcode = e.at('td:first-child .language, td:first-child').children.last.text[/\(([a-z-]+)/, 1]
	$stderr.puts langcode # progress trace

	# The rules are the text of the first <b> inside the row's last cell; rows without
	# one get an empty rule string. Leftover '&lt' sequences become '<' and no-break
	# spaces (U+00A0) become plain spaces so the whitespace split below works.
	rules_container = e.at('td:last-child').at('b')
	rules = rules_container ? rules_container.text.gsub('&lt', '<').gsub("\u00A0", ' ') : ''

	tailored_first_letters = []
	# Every '&'-separated chunk is split on whitespace and scanned with a sliding
	# window of three tokens: (left operand, operator, right operand).
	rules.split('&').map(&:strip).reject(&:empty?).each do |chunk|
		chunk.split(/\s+/).each_cons(3) do |_a, mode, b|
			# Only a single '<' operator contributes a letter; '<<', '<<<' and any
			# window whose middle token is not an operator are skipped.
			next unless mode == '<'

			# Fix presence of dotted/dotless i for Turkish and Azerbaijani:
			# U+0131 (dotless small i) is replaced by U+0130 (capital I with dot above).
			# Written as escapes so the comparison cannot be broken by the file being
			# re-encoded (the literals had previously been corrupted into mojibake,
			# which made this branch unreachable).
			b = "\u0130" if b == "\u0131"
			next if b.ascii_only? && b.bytesize == 1 # skip trivial cases

			tailored_first_letters << UnicodeUtils.upcase(b, langcode.to_sym)
		end
	end

	[langcode, rules, tailored_first_letters]
end

# Order the entries by their first element, the language code.
data = data.sort_by(&:first)

# Print one PHP-style array entry per language on stdout, e.g. 'xx' => array( "A", "B" ),
# Letters are wrapped in double quotes with no escaping (happily assuming none contain quotes).
php_entries = data.map do |langcode, _rules, letters|
	quoted = letters.map { |letter| "\"#{letter}\"" }
	# With no letters the template yields "array(  )"; collapse that to "array()".
	"'#{langcode}' => array( #{quoted.join(', ')} ),".sub('(  )', '()')
end
puts php_entries

# Write a compact "langcode: letters" summary to stderr, one line per language.
summary_lines = data.map do |langcode, _rules, letters|
	"#{langcode}: #{letters.join(' ')}"
end
$stderr.puts summary_lines