--luacheck: no self

-- Morphological dictionary ("mrd") helper: loads a grammar table and a
-- paradigm/word dictionary, then inflects words according to grammar hints.
-- Relies on the globals `std`, `game`, `instead`, `DEBUG` and `dprint`
-- being provided by the host runtime.

-- Directory of this source file (with trailing separator), used to locate
-- the bundled data files.
local curdir = std.getinfo(1).source:gsub("^(.+[\\/])[^\\/]+$", "%1"):gsub("^@", "");

local mrd = {
	lang = false;
	words = {};
	dirs = {''};
	dict_file = 'dict.mrd';
}

local msg = dprint or print

-- Debug trace; prints only when the global DEBUG is set.
-- Deliberately not named `debug`, so the standard library stays reachable,
-- and falls back to `print` when `dprint` is not defined.
local function dbg(...)
	if DEBUG then
		local out = dprint or print
		out(...)
	end
end

-- Store `val` under `key` in a bounded cache { list = {...}, hash = {...}, len = N }.
-- `list` holds keys, newest first; when it grows past `len` (default 128)
-- the oldest key is evicted from both `list` and `hash`.
local function cache_add(cache, key, val)
	if cache.hash[key] ~= nil then
		-- Already cached: refresh the value only. Pushing the key again would
		-- leave a duplicate in `list`, and evicting that duplicate later would
		-- drop the live entry from `hash`.
		cache.hash[key] = val
		return
	end
	table.insert(cache.list, 1, key)
	local len = #cache.list
	if len > (cache.len or 128) then
		local okey = cache.list[len]
		table.remove(cache.list, len)
		cache.hash[okey] = nil
	end
	cache.hash[key] = val
end

-- Split `str` into a list of matches of pattern `sep`
-- (default: runs of non-blank characters). A nil `str` yields {}.
local function split(str, sep)
	local words = {}
	if not str then
		return words
	end
	for w in str:gmatch(sep or "[^ \t]+") do
		table.insert(words, w)
	end
	return words
end

-- True when the line contains nothing but spaces and tabs.
local function empty(l)
	l = l:gsub("[ \t]+", "")
	return l == ""
end

-- Load the grammar table from `path` (default 'rgramtab.tab') into self.gram.
-- Each non-comment line is split on blanks: field 1 is the ancode, field 3
-- the type, field 4 a comma-separated list of grammemes.
-- Returns true on success, or false and an error message if the file
-- cannot be opened.
function mrd:gramtab(path)
	local f, e = io.open(path or 'rgramtab.tab', 'rb')
	if not f then
		return false, e
	end
	self.gram = {
		an = {}; -- by ancodes
		t = {}; -- by types
	}
	for l in f:lines() do
		if not l:find("^[ \t]*//") and not empty(l) then -- not comments
			local w = split(l)
			if #w < 3 then
				msg("Skipping gram: "..l)
			else
				local a = split(w[4], '[^,]+')
				local an = {}
				for _, v in ipairs(a) do
					an[v] = true
				end
				an.t = w[3] -- type
				self.gram.an[w[1]] = an;
				self.gram.t[w[3]] = an;
			end
		end
	end
	f:close()
	return true
end

-- Read one counted section from file `f`: the first line is the number of
-- entries, followed by that many lines. Each line is passed to `fn(l, ...)`
-- when `fn` is given; without `fn` the section is simply skipped.
-- Returns false when the count line is missing or not a number.
local function section(f, fn, ...)
	local n = f:read("*line")
	n = n and tonumber(n)
	if not n then
		return false
	end
	if n == 0 then
		return true
	end
	for l in f:lines() do -- skip accents
		if fn then
			fn(l, ...)
		end
		n = n - 1
		if n == 0 then
			break
		end
	end
	return true
end

-- Set by mrd:load() from the language module before paradigms are parsed.
local flex_filter

-- Parse one paradigm line and append the resulting list of flexions to `flex`.
-- Entries are separated by '%'; inside an entry the fields are separated
-- by '*': either "ancode" alone, or "post*ancode[*pre]".
-- `an` maps ancodes to grammeme tables.
local function flex_fn(l, flex, an)
	l = l:gsub("//.*$", "")
	local fl = {}
	for w in l:gmatch("[^%%]+") do
		local ww = split(w, "[^%*]+")
		if #ww > 3 or #ww < 1 then
			msg("Skip lex: ", w, l);
		else
			local f = { }
			if #ww == 1 then
				f.an = ww[1]
				f.post = ''
			else
				f.post = ww[1]
				f.an = ww[2]
			end
			f.pre = ww[3] or ''
			local a = an[f.an]
			if not a then
				msg("Gram not found. Skip lex: "..f.an)
			else
				f.an_name = f.an
				f.an = a
				if flex_filter(f) then
					f.filter = true
				end
				table.insert(fl, f)
			end
		end
	end
	table.insert(flex, fl)
end

-- Parse one prefix-set line (comma-separated) and append it to `pref`.
local function pref_fn(l, pref)
	local p = split(l, "[^,]+")
	table.insert(pref, p)
end

--[[
local function dump(vv)
	local s = ''
	if type(vv) ~= 'table' then
		return string.format("%s", tostring(vv))
	end
	for k, v in pairs(vv) do
		s = s .. string.format("%s = %s ", k, v)
	end
	return s
end

local function gram_dump(v)
	for _, f in ipairs(v.flex) do
		local tt = v.pref .. f.pre .. v.t .. f.post
		print("=== ", tt)
		for _, v in pairs(f.an) do
			print(_, v)
		end
	end
end
]]--

-- Counts processed word lines so the busy indicator is poked periodically.
local busy_cnt = 0

-- Parse one word line (six blank-separated fields) and register every
-- filtered form it produces in self.words. Field 1 is the stem ('#' means
-- empty), field 2 the paradigm number, field 5 the ancode ('-' for none),
-- field 6 the prefix-set number. When `dict` is given, only forms present
-- in it are kept. Stems that produced at least one form are recorded in
-- self.words_list.
local function word_fn(l, self, dict)
	local norm = mrd.lang.norm
	local words = self.words
	local words_list = self.words_list
	local w = split(l)
	if #w ~= 6 then
		msg("Skipping word: "..l)
		return
	end
	if w[1] == '#' then
		w[1] = ''
	end
	local nflex = tonumber(w[2]) or false
	local an = w[5]
	if an == '-' then
		an = false
	end
	local an_name = an
	local npref = tonumber(w[6]) or false
	if not nflex then
		msg("Skipping word:"..l)
		return
	end
	nflex = self.flex[nflex + 1] -- numbers in the file are 0-based
	if not nflex then
		msg("Wrong paradigm number for word: "..l)
		return
	end
	if an then
		an = self.gram.an[an]
		if not an then
			msg("Wrong ancode for word: "..l)
			return
		end
	end
	if npref then
		npref = self.pref[npref + 1]
		if not npref then
			msg("Wrong prefix for word: "..l)
			return
		end
	end
	local t = w[1]
	local num = 0
	local used = false
	for _, v in ipairs(nflex) do
		if v.filter then
			for _, pref in ipairs(npref or { '' }) do
				local tt = norm(pref .. v.pre .. t .. v.post)
--				if tt == 'ЗАКРЕПЛЕН' then
--					gram_dump { t = t, pref = pref, flex = nflex, an = v.an }
--				end
				if not dict or dict[tt] then
					-- merged grammemes: the word's own, overridden by the flexion's
					local a = {}
					for kk, _ in pairs(an or {}) do
						a[kk] = an[kk]
					end
					for kk, _ in pairs(v.an) do
						a[kk] = v.an[kk]
					end
					local wds = words[tt] or {}
					table.insert(wds, { t = t, pref = pref, flex = nflex, an = a })
					nflex.used = true
					used = true
					if npref then
						npref.used = true
					end
					num = num + 1
					if #wds == 1 then
						words[tt] = wds
					end
				end
			end
		end
	end
	if used then
		table.insert(words_list, { t = w[1], flex = nflex, pref = npref, an = an_name })
	end
	self.words_nr = self.words_nr + num
	busy_cnt = busy_cnt + 1
	if busy_cnt > 1000 then
		if std then
			std.busy(true)
		end
		busy_cnt = 0
	end
	return
end

-- Load a dictionary file (default 'morphs.mrd'): paradigms, two skipped
-- sections, prefix sets and words, optionally followed by a checksum line.
-- When `dict` is given, only word forms present in it are kept.
-- Requires self.gram (see mrd:gramtab) and mrd.lang to be set.
-- Returns true and the checksum (or nil), or false and an error message.
function mrd:load(path, dict)
	local f, e = io.open(path or 'morphs.mrd', 'rb')
	if not f then
		return false, e
	end
	-- Common failure exit: release the handle and the busy indicator.
	local function fail(err)
		f:close()
		if std then
			std.busy(false)
		end
		return false, err
	end
	local flex = {}
	flex_filter = mrd.lang.flex_filter
	if not section(f, flex_fn, flex, self.gram.an) then
		return fail("Error in section 1")
	end
	self.flex = flex
	if not section(f) then
		return fail("Error in section 2")
	end
	if not section(f) then
		return fail("Error in section 3")
	end
	local pref = {}
	if not section(f, pref_fn, pref) then
		return fail("Error in section 4")
	end
	self.pref = pref
	self.words_nr = 0
	self.words = {}
	self.words_list = {}
--	collectgarbage("stop")
	if not section(f, word_fn, self, dict) then
--		collectgarbage("restart")
		return fail("Error in section 5")
	end
--	collectgarbage("restart")
	msg("Generated: "..tostring(self.words_nr).." word(s)");
	local crc = f:read("*line")
	if crc then
		crc = tonumber(crc)
	end
	f:close()
	if std then
		std.busy(false)
	end
	return true, crc
end

-- Write the paradigms, prefix sets and words that were marked as used by
-- the last mrd:load() to `path` (default 'dict.mrd'), in the layout that
-- mrd:load() reads, followed by `crc` when given.
-- Returns true on success, or false and an error message if the file
-- cannot be opened.
function mrd:dump(path, crc)
	local f, e = io.open(path or 'dict.mrd', 'wb')
	if not f then
		return false, e
	end
	-- renumber the used paradigms (0-based, as in the file)
	local n = 0
	for _, v in ipairs(self.flex) do
		if v.used then
			v.norm_no = n
			n = n + 1
		end
	end
	f:write(string.format("%d\n", n))
	for _, v in ipairs(self.flex) do
		if v.used then
			local parts = {}
			for _, vv in ipairs(v) do
				local item = '%'
				if vv.post == '' then
					item = item .. vv.an_name
				else
					item = item .. vv.post .. '*' .. vv.an_name
				end
				if vv.pre ~= '' then
					item = item .. '*' .. vv.pre
				end
				parts[#parts + 1] = item
			end
			f:write(table.concat(parts), "\n")
		end
	end
	-- sections 2 and 3 are written empty
	f:write("0\n")
	f:write("0\n")
	n = 0
	for _, v in ipairs(self.pref) do
		if v.used then
			v.norm_no = n
			n = n + 1
		end
	end
	f:write(string.format("%d\n", n))
	for _, v in ipairs(self.pref) do
		if v.used then
			f:write(table.concat(v, ','), "\n")
		end
	end
	f:write(string.format("%d\n", #self.words_list))
	for _, v in ipairs(self.words_list) do
		local s
		if v.t == '' then
			s = '#'
		else
			s = v.t
		end
		s = s .. ' ' .. tostring(v.flex.norm_no)
		s = s .. ' - -'
		if v.an then
			s = s .. ' ' .. v.an
		else
			s = s .. ' -'
		end
		if v.pref then
			s = s .. ' ' .. tostring(v.pref.norm_no)
		else
			s = s .. ' -'
		end
		f:write(s..'\n')
	end
	if crc then
		f:write(string.format("%d\n", crc))
	end
	f:close()
	return true
end

-- Convert a list of hints into a lookup table: "x" becomes a.x = true and
-- "~x" becomes a.x = false. The `t` key is always cleared.
-- A nil list is treated as empty.
local function gram2an(g)
	local a = {}
	for _, v in ipairs(g or {}) do
		if v:sub(1, 1) == '~' then
			a[v:sub(2)] = false
		else
			a[v] = true
		end
	end
	a.t = nil
	return a
end

local lookup_cache = {
	hash = {};
	list = {};
	len = 512;
}

-- Join two hint strings with ','; an empty or missing side is ignored.
local function hint_append(hint, h)
	if h == "" or not h then
		return hint
	end
	if hint == "" or not hint then
		return h
	end
	return hint .. ',' .. h
end

-- Cached front end for mrd:__lookup(); the cache key is the word plus its
-- hint list. Returns whatever mrd:__lookup() returned for that key.
function mrd:lookup(w, g)
	local key = ""
	for _, v in ipairs(g or {}) do
		key = hint_append(key, v)
	end
	key = w .. '/'..key
	local cc = lookup_cache.hash[key]
	if cc then
		return cc.w, cc.g
	end
	w, g = self:__lookup(w, g)
	cache_add(lookup_cache, key, { w = w, g = g })
	return w, g
end

-- Find the best form of word `w` for the hint list `g`.
-- Returns the inflected word (letter case restored from the input) and its
-- merged grammeme table; the original word and gram2an(g) when no form is
-- compatible; or false and a message when the word is not in the dictionary.
function mrd:__lookup(w, g)
	local ow = w
	local cap, upper = self.lang.is_cap(w)
	local tt = self.lang.upper(self.lang.norm(w))
	w = self.words[tt]
	if not w then
		return false, "No word in dictionary"
	end
	local res = {}
	local gram_compat = self.lang.gram_compat
	local gram_score = self.lang.gram_score
	for _, v in ipairs(w) do
		local flex = v.flex
		local score = gram_score(v.an, g)
		local t = v.an.t
		for _, f in ipairs(flex) do
			if gram_compat(v.an, f.an, gram2an(g)) then
				local sc = gram_score(f.an, g)
				if sc >= 0 then
					if t ~= f.an.t then sc = sc - 1 end -- todo
--[[
					local tt = v.pref .. f.pre .. v.t .. f.post
					if tt == 'ЛЕВЫЙ' or tt == 'ЛЕВОГО' or tt == 'ШЛЕМОМ' then
						print ("======looking for:", g.noun)
						for _, v in pairs(g) do
							print(_, v)
						end
						print ("======looking got:", score + sc, sc)
						print(tt, v.t, score + sc)
						for _, v in pairs(f.an) do
							print(_, v)
						end
					end
]]--
					table.insert(res, { score = score + sc, pos = #res, word = v, flex = f })
				end
			end
		end
	end
	if #res == 0 then
		return ow, gram2an(g) -- false, "No gram"
	end
	-- highest score first; `pos` keeps insertion order among equal scores
	table.sort(res, function(a, b)
		if a.score == b.score then
			return a.pos < b.pos
		end
		return a.score > b.score
	end)
--[[
	for i = 1, #res do
		local w = res[i]
		local tt = self.lang.lower(w.word.pref .. w.flex.pre .. w.word.t .. w.flex.post)
		print(i, "res: ", tt, w.score)
		if tt == 'красный' or tt == 'красного' then
			for _, v in pairs(w.flex.an) do
				print(_, v)
			end
		end
--		print(tt, w.score)
	end
]]--
	w = res[1]
	local gram = {}
	for k, v in pairs(w.flex.an) do
		gram[k] = v
	end
	for k, v in pairs(w.word.an) do
		gram[k] = v
	end
	w = self.lang.lower(w.word.pref .. w.flex.pre .. w.word.t .. w.flex.post)
	if upper then
		w = self.lang.upper(w)
	elseif cap then
		w = self.lang.cap(w)
	end
	return w, gram
end

local word_match = "[^ \t,%-!/:%+&]+"
local missed_words = {}
local word_cache = { list = {}, hash = {} }

-- Inflect the phrase `w`, written as "words/hint,hint,...". Every token
-- matching `word_match` is replaced using, in order, the dictionary of
-- `ob`, of `game`, of this module, and finally the morphological lookup.
-- Once a noun has been processed the remaining tokens are left untouched
-- (tokens flagged as proper names or surnames do not stop the processing).
-- Results are cached per object when `ob` is given, otherwise globally.
-- Returns the resulting phrase and the list of grammeme tables of the
-- tokens that were resolved.
function mrd:word(w, ob)
	local cache = word_cache
	if ob then
		if not ob.__word_cache then
			std.rawset(ob, '__word_cache', {
				list = {},
				hash = {},
				len = 32,
			})
		end
		cache = ob.__word_cache
	end
	local key = w
	local c = cache.hash[key]
	if c then
		return std.clone(c[1]), std.clone(c[2])
	end
	local ow = w
	local s = w:find("/[^/]*$")
	local g = {}
	local grams = {}
	local hints = ''
	if s then
		hints = w:sub(s + 1)
		w = w:sub(1, s - 1)
		g = split(hints, "[^, ]+")
	end
	local found = true
	local noun = false
	local lang = self.lang
	w = w:gsub(word_match, function(t)
		if noun then
			return t
		end
		local ww, gg
		if ob then
			ww, gg = self:dict(ob.__dict, t..'/'..hints)
		end
		if not ww then
			ww, gg = self:dict(game.__dict, t..'/'..hints)
		end
		if not ww then
			ww, gg = self:dict(self.__dict, t..'/'..hints)
		end
		noun = gg and gg[lang.gram_t.noun]
		if not ww then
			ww, gg = self:lookup(t, g)
			noun = gg and gg.t == lang.gram_t.noun
		end
		if gg and (gg[lang.gram_t.proper] or gg[lang.gram_t.surname]) then
			noun = false
		end
		if not ww then
			found = false
		else
			table.insert(grams, gg)
		end
		return ww or t
	end)
	if not found then
		if DEBUG and not tonumber(w) and not missed_words[w] then
			missed_words[w] = true
			dbg("Can not find word: '"..ow.."'")
		end
	end
	cache_add(cache, key, { w, grams })
	return w, grams
end

-- Scan the text file `f` for literals of the form -"..." and add every
-- word found in them (normalised and upper-cased) to `dict` as a key.
-- Alternatives are separated by '|', a trailing "/hints" part is dropped,
-- and words ending in '*' are ignored.
-- Returns `dict` (a new table when none was given), or false and an error
-- message if the file cannot be opened.
function mrd:file(f, dict)
	dict = dict or {}
	local ff, e = io.open(f, "rb")
	if not ff then
		return false, e
	end
	dbg("Added file: ", f)
	for l in ff:lines() do
		for w in l:gmatch('%-"[^"]+"') do
			w = w:gsub('^%-"', ""):gsub('"$', "")
			local words = split(w, '[^|]+')
			for _, word in ipairs(words) do
				word = word:gsub("/[^/]*$", "")
				for ww in word:gmatch(word_match) do
					local t = self.lang.upper(self.lang.norm(ww))
					if not dict[t] and not t:find("%*$") then
						dict[t] = true;
						dbg("mrd: Added word: ", ww)
					end
				end
			end
		end
	end
	ff:close()
	return dict
end

-- Split "text/hints" at the last '/'. Returns the text and the hints;
-- the hints are "" when there is no '/'.
local function str_hint(str)
--	str = str:gsub("^%+", "")
	local s = str:find("/[^/]*$")
	if not s then
		return str, ""
	end
	if s == 1 then
		return "", str:sub(2)
	end
	return str:sub(1, s - 1), str:sub(s + 1)
end

local function str_strip(str)
	return std.strip(str)
end

-- std.split() followed by std.strip() on every item.
local function str_split(str, delim)
	local a = std.split(str, delim)
	for k, _ in ipairs(a) do
		a[k] = str_strip(a[k])
	end
	return a
end

-- Look `word` ("text/hints") up in the user dictionary `dict`.
-- Every candidate stored under the text is scored: +1 for each requested
-- hint it carries, -1 for each "~hint" it carries, +0.5 when it carries
-- lang.gram_t.nom. Returns the best candidate and its hints as a table,
-- or nothing when there is no candidate with a positive score.
function mrd:dict(dict, word)
	if not dict then
		return
	end
	local tab = {}
	local w, hints = str_hint(word)
	hints = str_split(hints, ",")
	local tt = dict[w]
	if not tt then
		return
	end
	for _, v in ipairs(tt) do
		local whints = {}
		local dw, h = str_hint(v)
		local hh = str_split(h, ",")
		for _, vv in ipairs(hh) do
			whints[vv] = true
		end
		local t = { dw, score = 0, pos = #tab, w = dw }
		for _, hv in ipairs(hints) do
			if hv:sub(1, 1) ~= '~' then
				if whints[hv] then
					t.score = t.score + 1
				end
			else
				if whints[str_strip(hv:sub(2))] then
					t.score = t.score - 1
				end
			end
		end
		t.hints = str_split(hint_append(tt.hints, h), ",")
		if mrd.lang.gram_t.nom and whints[mrd.lang.gram_t.nom] then
			t.score = t.score + 0.5
		end
		table.insert(tab, t)
	end
	if #tab == 0 then
		return
	end
	-- highest score first; `pos` keeps insertion order among equal scores
	table.sort(tab, function(a, b)
		if a.score == b.score then
			return a.pos < b.pos
		end
		return a.score > b.score
	end)
	if tab[1].score > 0 then
		return tab[1].w, gram2an(tab[1].hints)
	end
end

-- Display string of object `w`: its `raw_word` (second result is then
-- true), else its `word`, else its title or name.
function mrd.dispof(w)
	if w.raw_word ~= nil then
		local d = std.call(w, 'raw_word')
		return d, true
	end
	if w.word ~= nil then
		local d = std.call(w, 'word')
		return d
	end
	return std.titleof(w) or std.nameof(w)
end

local obj_cache = { hash = {}, list = {}, len = 128 }

-- Resolve an object or a "text/hints" string into a word and its hints.
-- The display string is split into aliases by '|', and every alias into
-- words by ',' (unless it came from `raw_word`).
-- `n` selects the alias (default: ob.__word_alias or 1); when `n` is a
-- string it is taken as extra hints and `nn` is the alias; when `n` is a
-- table, all variants are appended to it instead.
-- Returns ob, word, hints - or ob, n when `n` is a table.
function mrd:obj(w, n, nn)
	local hint = ''
	local hint2, disp, ob, raw
	if type(w) == 'string' then
		w, hint = str_hint(w)
	elseif type(n) == 'string' then
		hint = n
		n = nn
	end
	if type(w) ~= 'string' then
--		w = std.object(w)
		ob = w
		disp, raw = self.dispof(w)
	else
		disp = w
	end
	-- Raw and non-raw displays are normalised differently, so they must not
	-- share a cache slot even when their text is the same.
	local ckey = disp
	if raw and type(disp) == 'string' then
		ckey = '\1' .. disp
	end
	local d = obj_cache.hash[ckey]
	if not d then
		d = str_split(disp, '|')
		if #d == 0 then
			std.err("Wrong object display: ".. (disp or 'nil'), 2)
		end
		-- normalize
		local nd = {}
		for k, v in ipairs(d) do
			w, hint2 = str_hint(v)
			local dd = raw and { w } or str_split(w, ',')
			for idx, vv in ipairs(dd) do
				table.insert(nd, { word = vv, hint = hint2 or '', alias = k, idx = idx })
--				for w in vv:gmatch("[^ ]+") do
--					table.insert(nd, { word = w, hint = hint2 or '', alias = k, idx = _ })
--				end
			end
		end
		d = nd
		cache_add(obj_cache, ckey, d)
	end
	if type(n) == 'table' then
		local ret = n
		for _, v in ipairs(d) do
			table.insert(ret, { word = v.word, hint = hint_append(hint, v.hint), alias = v.alias, idx = v.idx });
		end
		return ob, ret
	end
	n = n or (ob and ob.__word_alias) or 1
	-- translate the alias number into the index of its first variant
	for k, v in ipairs(d) do
		if v.alias == n then
			n = k
			break
		end
	end
	if not d[n] then
		n = 1
	end
	w = d[n].word
	hint2 = d[n].hint
	return ob, w, hint_append(hint, hint2)
end

-- Collect `w`: appended to the list `tab` when given, otherwise joined
-- to the string `rc` with '|'. Returns the (possibly extended) `rc`.
local function noun_append(rc, tab, w)
--	w = mrd.lang.norm(w)
	if tab then
		table.insert(tab, w)
	else
		if rc ~= '' then
			rc = rc .. '|'
		end
		rc = rc .. w
	end
	return rc
end

-- Build the hint string describing the noun of object `ob` for alias `n`:
-- gender / plural / live flags taken from ob:gram('noun', n), the nonlive
-- flag when the noun is not live, plus "noun".
-- Returns '' when there is no object.
function mrd:noun_hint(ob, n)
	if not ob then
		return ''
	end
	if not ob.__hint_cache then
		std.rawset(ob, '__hint_cache', {
			list = {},
			hash = {},
			len = 16,
		})
	end
	local key = n or ob.__word_alias or 1
	local c
	if type(ob.word) == 'string' then -- do not use caching if function
		c = ob.__hint_cache.hash[key]
	end
	if c then
		return c
	end
	local g = ob:gram('noun', n) or {}
	local hint = ''
	local lang = self.lang
	for _, v in ipairs { lang.gram_t.male, lang.gram_t.female,
		lang.gram_t.neuter, lang.gram_t.plural, lang.gram_t.live } do
		if g[v] then
			hint = hint_append(hint, v)
		end
	end
	if not g[self.lang.gram_t.live] then
		hint = hint_append(hint, lang.gram_t.nonlive)
	end
	hint = hint_append(hint, "noun")
	cache_add(ob.__hint_cache, key, hint)
	return hint
end

-- Inflect the name of an object (or a "text/hints" string).
-- Arguments are those of mrd:obj(). Returns the inflected string, or a
-- list of inflected variants when `n` is a table.
function mrd:noun(w, n, nn)
	local hint, ob
	local rc = ''
	local tab = false
	ob, w, hint = self:obj(w, n, nn)
	if type(w) ~= 'table' then
		local alias = nn
		if type(alias) ~= 'number' then
			alias = n
		end
		if type(alias) ~= 'number' then
			alias = nil
		end
		w = {{ word = w, hint = hint, alias = alias }}
	else
		tab = {}
	end
	for _, v in ipairs(w) do
		local hint2 = self:noun_hint(ob, v.alias)
		local m = self:word(v.word .. '/'.. hint_append(v.hint, hint2), ob)
		rc = noun_append(rc, tab, m)
	end
	return tab or rc
end

-- Sum of the byte values of `str`.
local function str_hash(str)
	local sum = 0
	for i = 1, str:len() do
		sum = sum + string.byte(str, i)
	end
	return sum
end

-- Initialise the module with language table `l`: register its dictionary,
-- load the grammar table, then load the generated dictionary and rebuild
-- it when it is missing or out of date (see mrd:create).
function mrd:init(l)
	self.lang = l
	if type(l.dict) == 'table' then
		std.obj.dict(self, l.dict)
	end
	if self:gramtab(curdir .. "rgramtab.tab") == false then
		msg("Error while opening gramtab.")
		return
	end
	local _, crc = self:load(mrd.dict_file)
	self:create(mrd.dict_file, crc) -- create or update
end

-- Collect the words used in the *.lua files of self.dirs ('' stands for
-- the game directory), compute their checksum and, when it differs from
-- `crc`, regenerate `fname` (default 'dict.mrd') from the full dictionary.
-- Does nothing when std.readdir is not available.
function mrd:create(fname, crc)
	local dict = {}
	if not std.readdir then
		return
	end
	for _, d in ipairs(self.dirs) do
		if d == '' then
			d = instead.gamepath()
		end
		local list = {}
		for f in std.readdir(d) do
			if f:find("%.lua$") or f:find("%.LUA$") then
				table.insert(list, f)
			end
		end
		table.sort(list)
		for _, f in ipairs(list) do
			local path = d .. "/" .. f
			mrd:file(path, dict)
		end
	end
	local sum = 0
	for w, _ in pairs(dict) do
		sum = sum + str_hash(w)
		sum = sum % 4294967291;
	end
	if crc ~= sum then
		msg("Generating dict.mrd with sum: ", sum)
		if mrd:load(curdir .. "morphs.mrd", dict) then
			mrd:dump(fname or 'dict.mrd', sum)
		else
			msg("Can not find morph/morphs.mrd")
		end
	else
		msg("Using dict.mrd")
	end
end

if std then
	-- ob:noun(...) - inflected name of the object, see mrd:noun()
	std.obj.noun = function(self, ...)
		return mrd:noun(self, ...)
	end

	-- ob:Noun(...) - same as ob:noun(), passed through lang.cap()
	std.obj.Noun = function(self, ...)
		return mrd.lang.cap(mrd:noun(self, ...))
	end

	-- ob:gram(...) - grammeme table of the object's name. The table of the
	-- first noun among the resolved words is used (or of the first word),
	-- the object's own hints are added, and the `hint` field holds all
	-- truthy keys joined with ','.
	std.obj.gram = function(self, ...)
		local hint, w, gram, _
		_, w, hint = mrd:obj(self, ...)
		_, gram = mrd:word(w .. '/'..hint)
		local thint = ''
		local t = mrd.lang.gram_t.noun
		hint = str_split(hint, ",")
		local g = gram and gram[1] or {}
		for _, v in ipairs(gram or {}) do
			if v.t == t or v[t] then
				g = v
				break
			end
		end
		local gg = std.clone(g)
		for _, v in ipairs(hint) do
			gg[v] = true
		end
		for k, v in pairs(gg) do
			if v then
				thint = hint_append(thint, k)
			end
		end
		gg.hint = thint
		return gg
	end

	-- ob:dict(t) - merge the entries of `t` into the object's `__dict`.
	-- Keys are "word/hints"; a table value replaces the entry for the word,
	-- a string value is appended to it as "value/hints".
	std.obj.dict = function(self, t)
		local idx = std.rawget(self, '__dict') or {}
		for word, v in pairs(t) do
			local w, hints = str_hint(word)
			if type(v) == 'table' then
				idx[w] = v
				v.hints = hints or ""
			else
				if not idx[w] then
					idx[w] = {
						hints = "",
					}
				end
				table.insert(idx[w], v .. '/' .. hints)
			end
		end
		std.rawset(self, '__dict', idx)
		return self
	end

	-- Object constructor hook: a leading string or function in the
	-- constructor table becomes the object's `word`.
	local onew = std.obj.new
	std.obj.new = function(self, v)
		if type(v[1]) == 'string' or type(v[1]) == 'function' then
			v.word = v[1]
			table.remove(v, 1)
		end
		return onew(self, v)
	end
end

-- Unary minus on a string returns the string itself, so that word literals
-- can be written as -"..." (the form mrd:file() scans for).
local mt = getmetatable("")
function mt.__unm(v)
	return v
end

return mrd

-- Manual smoke test:
--mrd:gramtab()
--mrd.lang = require "lang-ru"
--mrd:load(false, { [mrd.lang.upper "подосиновики"] = true, [mrd.lang.upper "красные"] = true })
--local w = mrd:word(-"красные подосиновики/рд")
--print(w)
--mrd:file("mrd.lua")