1
0
Fork 0
mirror of https://github.com/Oreolek/Togataltu.git synced 2024-05-04 10:08:28 +03:00
Togataltu/phpmorphy/bin/extract_ancodes_map.php
2011-04-22 18:32:32 +07:00

126 lines
3.8 KiB
PHP

#!/usr/bin/php
<?php
// The script relies on byte-oriented string functions, so refuse to run when
// mbstring overloads them (bit 2 of mbstring.func_overload).
if(2 == (ini_get('mbstring.func_overload') & 2)) {
    die("don`t overload string functions in mbstring extension, see mbstring.func_overload option");
}

// Three positional arguments are required; $argv[0] is the script name itself,
// so $argc must be at least 4 before $argv[3] (OUT_DIR) can be read.
if($argc < 4) {
    echo "Usage " . $argv[0] . " MORPH_DATA_FILE LANGUAGE OUT_DIR" . PHP_EOL;
    exit(1);
}
require_once(dirname(__FILE__) . '/../src/common.php');
require_once(dirname(__FILE__) . '/../utils/dict_stuff/mrd/gramtab.php');
require_once(dirname(__FILE__) . '/../utils/dict_stuff/mrd/rml.php');
require_once(dirname(__FILE__) . '/../utils/dict_stuff/mrd/mwz.php');
// Positional command-line arguments (see the usage message):
// morph data file, language, output directory.
$graminfo_file = $argv[1];
$language = $argv[2];
$out_dir = $argv[3];

try {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo::create($factory->open(PHPMORPHY_STORAGE_FILE, $graminfo_file, false), false);
    $out_file = $out_dir . '/morph_data_ancodes_map.' . strtolower($graminfo->getLocale()) . '.bin';

    // Upper-cased "POS GRAMMEM,..." description => original ancode from the gramtab.
    $gramtab_map = get_gramtab_map($language);
    // Set of ancodes that are already taken; generated ancodes must avoid these.
    $valid_ancodes = array_flip(array_values($gramtab_map));

    // Ancode id (as stored in the morph data) => textual ancode.
    $ancodes_map = array();
    foreach(get_all_ancodes($graminfo) as $id => $value) {
        if(isset($gramtab_map[$value])) {
            $ancodes_map[$id] = $gramtab_map[$value];
        } else {
            // TODO: typically ancodes don`t contain digits, so we can generate mapping to char + digit ancodes
            do {
                $new_ancode = chr(mt_rand(ord('a'), ord('z'))) . chr(mt_rand(ord('a'), ord('z')));
            } while(isset($valid_ancodes[$new_ancode]));

            // Reserve the generated ancode so that a later iteration cannot
            // hand out the same code to a different id.
            $valid_ancodes[$new_ancode] = true;

            echo "'$value' not found in gramtab, assume $new_ancode" . PHP_EOL;
            $ancodes_map[$id] = $new_ancode;
        }
    }

    // Convert every ancode from UTF-8 to the encoding used by the morph data.
    $target_encoding = $graminfo->getEncoding();
    foreach($ancodes_map as $id => $ancode) {
        $converted = iconv('utf-8', $target_encoding, $ancode);
        if(false === $converted) {
            throw new Exception("Can`t convert ancode '$ancode' from utf-8 to $target_encoding");
        }
        $ancodes_map[$id] = $converted;
    }

    // file_put_contents() returns false on failure; do not report success silently.
    if(false === file_put_contents($out_file, serialize($ancodes_map))) {
        throw new Exception("Can`t write ancodes map to '$out_file'");
    }
} catch (Exception $e) {
    echo $e;
    exit(1);
}
/**
 * Builds a textual description for every ancode stored in the morph data.
 *
 * Each description has the form "POS GRAMMEM,GRAMMEM,..." with grammems
 * sorted, converted from the graminfo encoding to UTF-8 and upper-cased
 * as UTF-8 text, so it can be compared with the keys of get_gramtab_map().
 *
 * @param object $graminfo object providing getEncoding(), readAllPartOfSpeech(),
 *                         readAllGrammems() and readAllAncodes()
 * @return array ancode id => upper-cased description string
 * @throws Exception when an ancode references an unknown part of speech or
 *                   grammem id, or when a name cannot be converted to UTF-8
 */
function get_all_ancodes($graminfo) {
    $encoding = $graminfo->getEncoding();

    // id => name lookup tables, still in the graminfo encoding.
    $poses = array();
    foreach($graminfo->readAllPartOfSpeech() as $id => $pos) {
        $poses[$id] = $pos['name'];
    }

    $grammems = array();
    foreach($graminfo->readAllGrammems() as $id => $grammem) {
        $grammems[$id] = $grammem['name'];
    }

    $result = array();
    foreach($graminfo->readAllAncodes() as $id => $ancode) {
        if(!isset($poses[$ancode['pos_id']])) {
            throw new Exception("Unknown pos id '" . $ancode['pos_id'] . "'");
        }

        $pos = iconv($encoding, 'utf-8', $poses[$ancode['pos_id']]);
        if(false === $pos) {
            throw new Exception("Can`t convert pos name for id '" . $ancode['pos_id'] . "' to utf-8");
        }

        $gram = array();
        foreach($ancode['grammem_ids'] as $grammem) {
            if(!isset($grammems[$grammem])) {
                throw new Exception("Unknown grammem id '$grammem'");
            }

            $name = iconv($encoding, 'utf-8', $grammems[$grammem]);
            if(false === $name) {
                throw new Exception("Can`t convert grammem name for id '$grammem' to utf-8");
            }
            $gram[] = $name;
        }

        // Sort so the description does not depend on the stored grammem order.
        sort($gram);

        // Pass the encoding explicitly: the text is UTF-8 at this point regardless
        // of mb_internal_encoding(), and get_gramtab_map() upper-cases as UTF-8 too.
        $result[$id] = mb_strtoupper($pos . ' ' . implode(',', $gram), 'utf-8');
    }

    return $result;
}
/**
 * Builds a lookup from ancode description to the original gramtab ancode.
 *
 * The gramtab file for $language is located through the RML ini file and
 * iterated; for every entry the key is "POS GRAMMEM,GRAMMEM,..." (grammems
 * sorted) upper-cased as UTF-8, and the value is the ancode.
 *
 * @param string $language language identifier passed to the RML/gramtab helpers
 * @return array upper-cased description => ancode
 * @throws Exception when two ancodes produce the same normalised description
 */
function get_gramtab_map($language) {
    $rml = new phpMorphy_Rml_IniFile();
    $gramtab_file = $rml->getGramTabPath($language);

    $gramtab = new phpMorphy_GramTab_File(
        $gramtab_file,
        phpMorphy_Mwz_File::getEncodingForLang($language),
        new phpMorphy_GramTab_GramInfoFactory($language)
    );

    $gramtab_map = array();
    foreach($gramtab as $ancode => $obj) {
        $grammems = $obj->getGrammems();
        sort($grammems);

        $contents = $obj->getPartOfSpeech() . ' ' . implode(',', $grammems);

        // Normalise BEFORE the duplicate check: the map is keyed by the
        // upper-cased form, so testing the raw string would miss collisions.
        $key = mb_strtoupper($contents, 'utf-8');

        if(isset($gramtab_map[$key])) {
            throw new Exception("Duplicate ancode contents for $ancode => $contents");
        }

        $gramtab_map[$key] = $ancode;
    }

    return $gramtab_map;
}