1
0
Fork 0
mirror of https://github.com/Oreolek/Togataltu.git synced 2024-05-04 01:59:35 +03:00
Togataltu/phpmorphy/bin/extract_ancodes.php
2011-04-22 18:32:32 +07:00

105 lines
3 KiB
PHP

#!/usr/bin/php
<?php
/*
 * Command-line entry point.
 *
 * Usage: extract_ancodes.php MORPH_DATA_FILE OUT_DIR
 *
 * Opens MORPH_DATA_FILE through phpMorphy, walks every record returned by
 * readAllFlexia(), groups identical 'ancodes' sections (see class Map) and
 * writes the offset-indexed result, serialized, to
 * OUT_DIR/morph_data_ancodes_cache.<lowercased locale>.bin
 *
 * Exit status is non-zero on bad usage, on any exception and on a failed write.
 */

// Bit 2 of mbstring.func_overload replaces the plain string functions with
// their mb_* counterparts; the script refuses to run in that configuration.
if(2 == (ini_get('mbstring.func_overload') & 2)) {
    die("don`t overload string functions in mbstring extension, see mbstring.func_overload option");
}

if($argc < 3) {
    // Missing arguments is an error: terminate the line and signal failure
    // so that calling shells/scripts can detect it.
    echo "Usage " . $argv[0] . " MORPH_DATA_FILE OUT_DIR" . PHP_EOL;
    exit(1);
}

require_once(dirname(__FILE__) . '/../src/common.php');

$file = $argv[1];
$out_dir = $argv[2];
// When true, duplicate entries in the composed cache are PHP references to a
// single value instead of independent copies (see Map::compose()).
$use_references = true;

try {
    $factory = new phpMorphy_Storage_Factory();
    $graminfo = phpMorphy_GramInfo::create($factory->open(PHPMORPHY_STORAGE_FILE, $file, false), false);

    $ancodes_map = new Map('ancodes');
    // Kept so the summary line below still has something to count; it stays
    // empty while the flexias cache generation is disabled.
    $flexias_map = new Map('affixes');

    $i = 0;
    foreach($graminfo->readAllFlexia() as $id => $flexia) {
        $offset = $flexia['header']['offset'];// + $graminfo->getGramInfoHeaderSize();

        $ancodes_map->update($flexia, $offset);
        //$flexias_map->update($flexia, $offset);

        $i++;
    }

    echo "Total flexias = $i, unique ancodes = " . count($ancodes_map->getMap()) . ', unique flexias = ' . count($flexias_map->getMap()) . PHP_EOL;

    $out_file_format = $out_dir . '/%s.' . strtolower($graminfo->getLocale()) . '.bin';

    // file_put_contents() returns false on failure; turn that into an
    // exception so the script does not report success without an output file.
    $ancodes_file = sprintf($out_file_format, 'morph_data_ancodes_cache');
    if(false === file_put_contents($ancodes_file, serialize($ancodes_map->compose($use_references)))) {
        throw new Exception("Can`t write ancodes cache to $ancodes_file");
    }
    //file_put_contents(sprintf($out_file_format, 'morph_data_flexias_cache'), serialize($flexias_map->compose($use_references)));
} catch (Exception $e) {
    echo $e;
    exit(1);
}
/**
 * Groups identical sections of flexia records and remembers every offset at
 * which each distinct section was seen.
 *
 * A section is the value stored under a fixed key of the record passed to
 * update(). Sections are bucketed by md5(serialize($section)); if two
 * different sections ever produce the same digest, the newcomer is stored
 * under an integer key instead.
 */
class Map {
    protected
        // Index into each record that selects the section to collect.
        $key,
        // Bucket key => list of offsets, in the order they were added.
        $offsets = array(),
        // Bucket key => section value.
        $map = array();

    /**
     * @param mixed $key Index of the section inside the records given to update()
     */
    function __construct($key) {
        $this->key = $key;
    }

    /**
     * Registers one record.
     *
     * @param array $flexia Record; only $flexia[$this->key] is used
     * @param mixed $offset Offset to associate with the record's section
     */
    function update($flexia, $offset) {
        $flexia = $flexia[$this->key];
        $md5 = md5(serialize($flexia));

        // array_key_exists() rather than isset(): isset() reports false for a
        // stored null section, which would make every repeated null section
        // look new and overwrite the offsets collected so far.
        if(array_key_exists($md5, $this->map)) {
            if($this->map[$md5] != $flexia) {
                // Digest collision: same md5, different content. Store the
                // section under a fresh integer key (the current entry count).
                // NOTE: such entries are not de-duplicated on later repeats.
                $new_idx = count($this->map);

                $this->map[$new_idx] = $flexia;
                $this->offsets[$new_idx] = array($offset);
            } else {
                // Same section seen again: just remember the extra offset.
                $this->offsets[$md5][] = $offset;
            }
        } else {
            // First occurrence of this section.
            $this->map[$md5] = $flexia;
            $this->offsets[$md5] = array($offset);
        }
    }

    /**
     * @return array Bucket key => section value
     */
    function getMap() {
        return $this->map;
    }

    /**
     * @return array Bucket key => list of offsets
     */
    function getOffsets() {
        return $this->offsets;
    }

    /**
     * Builds an array indexed by offset whose values are the sections.
     *
     * @param bool $useReferences When true, every additional offset of a
     *        section is bound by reference (=&) to the entry of its first
     *        offset instead of receiving its own copy
     * @return array Offset => section
     */
    function compose($useReferences) {
        $result = array();

        foreach($this->map as $md5 => $flexia) {
            $offset = $this->offsets[$md5];
            $first_offset = $offset[0];

            // The first offset always owns the actual value.
            $result[$first_offset] = $flexia;

            for($i = 1, $c = count($offset); $i < $c; $i++) {
                if($useReferences) {
                    $result[$offset[$i]] =& $result[$first_offset];
                } else {
                    $result[$offset[$i]] = $flexia;
                }
            }
        }

        return $result;
    }
}