1
0
Fork 0
mirror of https://github.com/Oreolek/Togataltu.git synced 2024-05-04 10:08:28 +03:00
Togataltu/phpmorphy/bin/extract_gramtab.php
2011-04-22 18:32:32 +07:00

239 lines
6.9 KiB
PHP

#!/usr/bin/php
<?php
// Refuse to run when mbstring.func_overload has bit 2 set. The ini value is
// cast to int so that an empty or missing setting is treated as 0.
if (2 == ((int) ini_get('mbstring.func_overload') & 2)) {
    echo "don`t overload string functions in mbstring extension, see mbstring.func_overload option" . PHP_EOL;
    // Non-zero status, consistent with the exception path of this script.
    exit(1);
}

// Two positional arguments are mandatory; the third one is optional.
if ($argc < 3) {
    echo "Usage " . $argv[0] . " MORPH_DATA_FILE OUT_DIR [case - UPPER or LOWER]" . PHP_EOL;
    // A missing argument is an error, so report it through the exit status.
    exit(1);
}
require_once(dirname(__FILE__) . '/../src/common.php');
require_once(dirname(__FILE__) . '/../src/gramtab_consts.php');
// Positional arguments: the morphology data file and the output directory.
$file = $argv[1];
$out_dir = $argv[2];

// Optional third argument selects the case for grammeme names. Anything other
// than "upper" (compared case-insensitively) selects lower case; when the
// argument is absent the names are left as they are.
$new_case = null;
if (isset($argv[3])) {
    $new_case = 'lower';
    if (strtolower($argv[3]) == 'upper') {
        $new_case = 'upper';
    }
}

try {
    // First the textual variant, then the variant that keeps numeric ids.
    extract_gramtab($file, $out_dir, true, $new_case);
    extract_gramtab($file, $out_dir, false, $new_case);
} catch (Exception $e) {
    // Print the exception (message and trace) and signal failure.
    echo $e;
    exit(1);
}
/**
 * Re-indexes a list of records by each record's 'name' entry.
 *
 * @param array $map records, each one an array with a 'name' key
 * @return array the same records keyed by their 'name'
 * @throws Exception when two records share the same name
 */
function replace_keys_with_name($map) {
    $byName = array();

    foreach ($map as $entry) {
        $name = $entry['name'];
        $byName[$name] = $entry;
    }

    // A duplicate name overwrites an earlier entry, which shrinks the result.
    if (count($byName) == count($map)) {
        return $byName;
    }

    throw new Exception("Map contains non unique names");
}
/**
 * Base class for locale-specific post-processing of an ancode's grammeme list.
 */
abstract class GrammemsProcessor {
    /**
     * Returns the (possibly adjusted) grammeme list for one ancode.
     *
     * @param mixed $partOfSpeech part-of-speech id of the ancode
     * @param array $grammems     grammeme ids of the ancode
     * @return array
     */
    abstract function process($partOfSpeech, $grammems);

    /**
     * Instantiates "GrammemsProcessor_<locale>" when such a class exists and
     * falls back to GrammemsProcessor_Common otherwise.
     *
     * @param string $locale
     * @return GrammemsProcessor
     */
    static function create($locale) {
        $class = 'GrammemsProcessor_' . self::getNormalizedLocale($locale);

        if (class_exists($class)) {
            return new $class();
        }

        return new GrammemsProcessor_Common();
    }

    /**
     * Normalization hook for locale names; currently the identity.
     *
     * @param string $locale
     * @return string
     */
    static protected function getNormalizedLocale($locale) {
        return $locale;
    }
}
/**
 * Fallback processor used when no locale-specific class exists:
 * the grammeme list is passed through untouched.
 */
class GrammemsProcessor_Common extends GrammemsProcessor {
    /**
     * @param mixed $partOfSpeech ignored
     * @param array $grammems
     * @return array the list exactly as received
     */
    public function process($partOfSpeech, $grammems) {
        return $grammems;
    }
}
/**
 * Grammeme post-processing for the ru_RU locale.
 */
class GrammemsProcessor_ru_RU extends GrammemsProcessor {
    /**
     * Extends the grammeme list of an ancode with the implied grammemes.
     *
     * @param mixed $partOfSpeech part-of-speech id of the ancode
     * @param array $grammems     grammeme ids of the ancode
     * @return array de-duplicated grammeme ids (array_unique keeps the first
     *               occurrence of each value together with its key)
     */
    public function process($partOfSpeech, $grammems) {
        $indeclinable = in_array(PMY_RG_INDECLINABLE, $grammems);

        // Indeclinable words behave as if they belonged to every case.
        if ($indeclinable && $partOfSpeech !== PMY_RP_PREDK) {
            $grammems = array_merge($grammems, $this->getAllCases());

            // A word such as 'palto' (coat) does not change by number, so it
            // can be used in both numbers.
            if (!in_array(PMY_RG_SINGULAR, $grammems)) {
                array_push($grammems, PMY_RG_PLURAL, PMY_RG_SINGULAR);
            }
        }

        if ($indeclinable && $partOfSpeech === PMY_RP_PRONOUN_P) {
            $grammems = array_merge(
                $grammems,
                $this->getAllGenders(),
                $this->getAllNumbers()
            );
        }

        // Common-gender words ('sirota', orphan) can be used both as
        // masculine and as feminine words.
        if (in_array(PMY_RG_MASC_FEM, $grammems)) {
            array_push($grammems, PMY_RG_MASCULINUM, PMY_RG_FEMINUM);
        }

        return array_unique($grammems);
    }

    /**
     * @return array every case grammeme id
     */
    protected function getAllCases() {
        $cases = array();
        $cases[] = PMY_RG_NOMINATIV;
        $cases[] = PMY_RG_GENITIV;
        $cases[] = PMY_RG_DATIV;
        $cases[] = PMY_RG_ACCUSATIV;
        $cases[] = PMY_RG_INSTRUMENTALIS;
        $cases[] = PMY_RG_LOCATIV;
        $cases[] = PMY_RG_VOCATIV;

        return $cases;
    }

    /**
     * @return array every gender grammeme id
     */
    protected function getAllGenders() {
        $genders = array();
        $genders[] = PMY_RG_MASCULINUM;
        $genders[] = PMY_RG_FEMINUM;
        $genders[] = PMY_RG_NEUTRUM;

        return $genders;
    }

    /**
     * @return array every number grammeme id
     */
    protected function getAllNumbers() {
        $numbers = array();
        $numbers[] = PMY_RG_PLURAL;
        $numbers[] = PMY_RG_SINGULAR;

        return $numbers;
    }
}
/**
 * Base class for the string case converters used on gramtab names.
 * Instances are obtained through CaseConverter::create().
 */
abstract class CaseConverter {
    /** @var string encoding name handed to the mb_* functions */
    protected $encoding;

    /**
     * Stores the encoding after checking that mbstring accepts it.
     *
     * @param string $encoding
     * @throws Exception when mbstring does not know the encoding
     */
    protected function __construct($encoding) {
        $this->encoding = $encoding;

        // Before PHP 8 an unknown encoding makes mb_strtolower() emit a
        // warning (suppressed here) and return false; since PHP 8 it throws
        // ValueError. Both outcomes are reported as the same Exception.
        try {
            $probe = @mb_strtolower('a', $encoding);
        } catch (ValueError $e) {
            $probe = false;
        }

        if (false === $probe) {
            throw new Exception("Invalid encoding '$encoding'");
        }
    }

    /**
     * Builds the converter for the requested target case.
     *
     * @param string      $encoding encoding of the strings to be converted
     * @param string|null $to       null keeps strings as they are, 'lower'
     *                              lower-cases, any other value upper-cases
     * @return CaseConverter
     * @throws Exception when the encoding is not accepted by mbstring
     */
    static function create($encoding, $to) {
        if (!isset($to)) {
            $class = 'CaseConverter_AsIs';
        } else {
            $class = $to == 'lower' ? 'CaseConverter_Lower' : 'CaseConverter_Upper';
        }

        return new $class($encoding);
    }

    /**
     * Converts a string to the case this converter stands for.
     *
     * @param string $str
     * @return string
     */
    abstract function convert($str);
}
/**
 * Converter that leaves the case of its input alone.
 */
class CaseConverter_AsIs extends CaseConverter {
    /**
     * @param string $str
     * @return string the input, unchanged
     */
    public function convert($str) {
        return $str;
    }
}
/**
 * Converter that upper-cases its input in the configured encoding.
 */
class CaseConverter_Upper extends CaseConverter {
    /**
     * @param string $str
     * @return string result of mb_strtoupper() in the configured encoding
     */
    public function convert($str) {
        $upper = mb_strtoupper($str, $this->encoding);

        return $upper;
    }
}
/**
 * Converter that lower-cases its input in the configured encoding.
 */
class CaseConverter_Lower extends CaseConverter {
    /**
     * @param string $str
     * @return string result of mb_strtolower() in the configured encoding
     */
    public function convert($str) {
        $lower = mb_strtolower($str, $this->encoding);

        return $lower;
    }
}
/**
 * Reads the part-of-speech, grammeme and ancode tables from a morphology data
 * file and writes them, serialized, to
 * "<outDir>/gramtab[_txt].<lower-cased locale>.bin".
 *
 * @param string      $graminfoFile path of the morphology data file
 * @param string      $outDir       directory that receives the output file
 * @param bool        $asText       when true, each ancode's 'pos_id' and
 *                                  'grammem_ids' are replaced by the matching
 *                                  names and the file gets the "_txt" suffix
 * @param string|null $case         target case for grammeme names, as accepted
 *                                  by CaseConverter::create()
 * @throws Exception on an unknown pos_id/grammem_id or when the output file
 *                   cannot be written
 */
function extract_gramtab($graminfoFile, $outDir, $asText, $case) {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo::create($factory->open(PHPMORPHY_STORAGE_FILE, $graminfoFile, false), false);

    $grammems_processor = GrammemsProcessor::create($graminfo->getLocale());
    // Part-of-speech names are always upper-cased; grammeme names follow $case.
    $pos_case_converter = CaseConverter::create($graminfo->getEncoding(), 'upper');
    $grammems_case_converter = CaseConverter::create($graminfo->getEncoding(), $case);

    $poses = $graminfo->readAllPartOfSpeech();
    $grammems = $graminfo->readAllGrammems();
    $ancodes = $graminfo->readAllAncodes();

    foreach ($poses as &$pos) {
        $pos['name'] = $pos_case_converter->convert($pos['name']);
    }
    unset($pos);

    foreach ($grammems as &$grammem) {
        $grammem['name'] = $grammems_case_converter->convert($grammem['name']);
    }
    unset($grammem);

    // Let the locale-specific processor adjust each ancode's grammeme list.
    foreach ($ancodes as &$ancode) {
        $ancode['grammem_ids'] = $grammems_processor->process($ancode['pos_id'], $ancode['grammem_ids']);
    }
    unset($ancode);

    if ($asText) {
        // Replace the numeric ids inside every ancode with the matching names.
        foreach ($ancodes as &$ancode) {
            $pos_id = $ancode['pos_id'];

            if (!isset($poses[$pos_id])) {
                throw new Exception("Unknown pos_id '$pos_id' found");
            }

            $ancode['pos_id'] = $pos_case_converter->convert($poses[$pos_id]['name']);

            foreach ($ancode['grammem_ids'] as &$grammem_id) {
                if (!isset($grammems[$grammem_id])) {
                    throw new Exception("Unknown grammem_id '$grammem_id' found");
                }

                $grammem_id = $grammems_case_converter->convert($grammems[$grammem_id]['name']);
            }
            // Break the reference so that $grammem_id no longer aliases the
            // last element of this ancode's list.
            unset($grammem_id);
        }
        unset($ancode);
    }

    $result = array(
        'poses' => $poses,
        'grammems' => $grammems,
        'ancodes' => $ancodes
    );

    $type = $asText ? '_txt' : '';
    $out_file = 'gramtab' . $type . '.' . strtolower($graminfo->getLocale()) . '.bin';
    $out_file = $outDir . '/' . $out_file;

    if (false === file_put_contents($out_file, serialize($result))) {
        throw new Exception("Can`t write '$out_file'");
    }
}