mirror of
https://github.com/Oreolek/Togataltu.git
synced 2024-05-30 14:58:07 +03:00
389 lines
13 KiB
Ruby
389 lines
13 KiB
Ruby
|
#!/usr/bin/env ruby
|
|||
|
#encoding: utf-8
|
|||
|
require "translator.rb"
|
|||
|
require "morphology.rb"
|
|||
|
# Holds a stress-marked source text and turns it into a translation whose
# words are chosen to follow the stress pattern of the original.
#
# Typical call order (as far as this file shows): +replace+, +translate+,
# +arrange+, then +print+; +find_rhymes+ only writes diagnostics to the log.
# Translator and Morphology are defined in other files; only the calls made
# here (+new(log)+ and +process(word)+) are relied upon.
class Source
  # Vowel + standalone acute mark -> precomposed stressed vowel.
  ACCENT_DIGRAPHS = {
    "o´" => "ó",
    "a´" => "á",
    "e´" => "é",
    "i´" => "í",
    "u´" => "ú",
    "y´" => "ý"
  }.freeze

  # Every substitution applied by #replace, in application order.
  REPLACEMENTS = { "l'" => "la" }.merge(ACCENT_DIGRAPHS).freeze

  # Anything counted as a syllable nucleus in the source text
  # (a stray "´" still counts, as it did before).
  SOURCE_SYLLABLE = /[aeiouyáéíóúý´]/
  # Syllable nuclei that carry a stress mark in the source text.
  SOURCE_STRESS = /[áéíóúý´]/
  # Vowels considered when comparing line endings for rhyme.
  RHYME_VOWEL = /[aeiouyáéíóúý]/
  # Precomposed stressed vowels only (used to locate the rhyme start).
  ACCENTED_VOWEL = /[áéíóúý]/
  # Last one or two words of a line, ignoring trailing non-letters.
  LINE_ENDING = /((?:[a-záéíóúý]+\s)?[a-záéíóúý]+)[^a-záéíóúý]*$/
  # Vowels of the translated (Cyrillic) words.
  CYRILLIC_VOWEL = /[аеиоуыэюяё]/

  # Weights of syllables reported as "П" by #check_piece.
  PROBABILITY_ACUTE_SEC = {
    "все" => 0.14,
    "че" => 0.0675,
    "ква" => 0.0375,
    "ме" => 0.0275,
    "ми" => 0.025,
    "ви" => 0.0175,
    "ак" => 0.015,
    "сле" => 0.01,
    "на" => 0.01,
    "ки" => 0.01,
    "не" => 0.01,
    "ра" => 0.01
  }.freeze

  # Weights of syllables reported as "У" by #check_piece.
  PROBABILITY_ACUTE = {
    "ве" => 0.0255724374063771,
    "по" => 0.0209715386261502,
    "го" => 0.0206505456879949,
    "вы" => 0.017547613952493,
    "до" => 0.0117697410656966,
    "са" => 0.0114487481275412,
    "во" => 0.0111277551893858,
    "сто" => 0.0106997646051787,
    "ма" => 0.00802482345388401,
    "ко" => 0.00727583993152151,
    "ду" => 0.00706184463941793,
    "те" => 0.00674085170126257,
    "ме" => 0.006526856409159,
    "бо" => 0.006526856409159,
    "на" => 0.00641985876310721,
    "де" => 0.00588487053284828,
    "то" => 0.00588487053284828,
    "ра" => 0.00534988230258934,
    "за" => 0.00513588701048577,
    "мо" => 0.00513588701048577,
    "ка" => 0.00513588701048577,
    "це" => 0.00502888936443398,
    "па" => 0.00502888936443398,
    "ли" => 0.00481489407233041,
    "сте" => 0.00449390113417505,
    "ви" => 0.00449390113417505,
    "пра" => 0.00449390113417505,
    "пе" => 0.00449390113417505,
    "сло" => 0.00438690348812326,
    "про" => 0.00427990584207147,
    "но" => 0.0040659105499679,
    "ре" => 0.00395891290391611,
    "гла" => 0.00374491761181254,
    "ла" => 0.00363791996576075,
    "су" => 0.00363791996576075,
    "ле" => 0.00363791996576075,
    "зе" => 0.00363791996576075,
    "стра" => 0.00363791996576075,
    "со" => 0.00353092231970897,
    "зна" => 0.00353092231970897,
    "пи" => 0.00353092231970897,
    "се" => 0.00342392467365718,
    "тре" => 0.00342392467365718,
    "хо" => 0.00331692702760539,
    "при" => 0.00320992938155361,
    "ска" => 0.00320992938155361,
    "не" => 0.00320992938155361,
    "ро" => 0.00320992938155361,
    "чи" => 0.00299593408945003,
    "бе" => 0.00299593408945003,
    "ча" => 0.00299593408945003,
    "ва" => 0.00288893644339825,
    "кру" => 0.00267494115129467,
    "ми" => 0.00256794350524288,
    "пу" => 0.0024609458591911,
    "ты" => 0.00235394821313931,
    "ру" => 0.00213995292103574,
    "же" => 0.00213995292103574,
    "да" => 0.00203295527498395,
    "че" => 0.00203295527498395,
    "зме" => 0.00203295527498395,
    "ста" => 0.00192595762893216,
    "жа" => 0.00192595762893216,
    "ну" => 0.00192595762893216,
    "ку" => 0.00181895998288038,
    "ге" => 0.00181895998288038,
    "кра" => 0.00181895998288038,
    "си" => 0.00181895998288038,
    "тра" => 0.00181895998288038,
    "ба" => 0.00181895998288038,
    "ти" => 0.00181895998288038,
    "ха" => 0.00181895998288038,
    "гра" => 0.00171196233682859,
    "тру" => 0.00171196233682859,
    "та" => 0.0016049646907768,
    "бу" => 0.0016049646907768,
    "га" => 0.0016049646907768,
    "тро" => 0.0016049646907768,
    "чу" => 0.00149796704472502,
    "тю" => 0.00149796704472502,
    "хло" => 0.00149796704472502,
    "ни" => 0.00139096939867323,
    "му" => 0.00139096939867323,
    "ту" => 0.00139096939867323,
    "цве" => 0.00128397175262144,
    "ло" => 0.00128397175262144,
    "кла" => 0.00128397175262144,
    "зо" => 0.00128397175262144,
    "ке" => 0.00117697410656966,
    "фо" => 0.00106997646051787,
    "сме" => 0.00106997646051787,
    "мэ" => 0.00106997646051787,
    "ша" => 0.00106997646051787,
    "пла" => 0.00106997646051787,
    "све" => 0.00106997646051787,
    "ки" => 0.00106997646051787
  }.freeze

  # Weights that count against a syllable being stressed.
  PROBABILITY_NO = {
    "от" => 0.0443044406056295,
    "дев" => 0.0436947464688548,
    "ятс" => 0.0409307997154761,
    "ов" => 0.0121938827354944,
    "ом" => 0.0120516207702469,
    "ат" => 0.0112386952545473,
    "цат" => 0.00945025912000813,
    "од" => 0.0088608881211259,
    "ал" => 0.00780408495071639,
    "ог" => 0.00766182298546896,
    "мин" => 0.00709277512447922,
    "ит" => 0.00646275784981201,
    "ан" => 0.00640178843613454,
    "ут" => 0.00597500254039224,
    "он" => 0.00534498526572503,
    "ор" => 0.00526369271415507,
    "ет" => 0.00481658368052027,
    "ен" => 0.00481658368052027,
    "ид" => 0.00449141347424042,
    "ок" => 0.00438979778477797,
    "ят" => 0.00434915150899299,
    "ер" => 0.00430850523320801,
    "ес" => 0.00414592013006808,
    "ин" => 0.00414592013006808,
    "ка" => 0.0041052738542831,
    "ни" => 0.00408495071639061,
    "ол" => 0.00400365816482065,
    "ил" => 0.00400365816482065,
    "ла" => 0.00363784168275582,
    "ый" => 0.00347525657961589,
    "ел" => 0.00339396402804593,
    "сто" => 0.00317040951122853,
    "ой" => 0.0030281475459811,
    "ос" => 0.00300782440808861,
    "ты" => 0.00292653185651865,
    "им" => 0.00284523930494868,
    "ев" => 0.00276394675337872,
    "сам" => 0.00272330047759374,
    "ик" => 0.00268265420180876,
    "пер" => 0.00268265420180876,
    "нач" => 0.00256071537445382,
    "ем" => 0.00254039223656133,
    "дес" => 0.00249974596077634,
    "ар" => 0.00235748399552891,
    "оп" => 0.00231683771974393,
    "ав" => 0.00229651458185144,
    "ам" => 0.00225586830606646,
    "ир" => 0.00225586830606646,
    "том" => 0.00213392947871151,
    "ак" => 0.00209328320292653,
    "ив" => 0.00207296006503404,
    "пол" => 0.00199166751346408,
    "об" => 0.00197134437557159,
    "чет" => 0.0019510212376791,
    "ед" => 0.00193069809978661,
    "тых" => 0.00186972868610914,
    "ис" => 0.00184940554821664,
    "ва" => 0.00178843613453917,
    "ад" => 0.00178843613453917,
    "ятн" => 0.00176811299664668,
    "ать" => 0.00174778985875419,
    "пят" => 0.00174778985875419,
    "ны" => 0.00164617416929174,
    "дор" => 0.00164617416929174,
    "сор" => 0.00162585103139925,
    "век" => 0.00160552789350676,
    "ли" => 0.00152423534193679,
    "ур" => 0.00152423534193679,
    "ах" => 0.0015039122040443,
    "ей" => 0.00144294279036683,
    "ич" => 0.00142261965247434,
    "ек" => 0.00134132710090438,
    "те" => 0.00134132710090438,
    "дом" => 0.00132100396301189,
    "ул" => 0.00132100396301189,
    "гор" => 0.0013006808251194,
    "етр" => 0.0013006808251194,
    "ян" => 0.00128035768722691,
    "та" => 0.00128035768722691,
    "ду" => 0.00128035768722691,
    "аб" => 0.00121938827354944,
    "ас" => 0.00121938827354944,
    "душ" => 0.00119906513565695,
    "на" => 0.00119906513565695,
    "ант" => 0.00119906513565695,
    "ма" => 0.00117874199776445,
    "сем" => 0.00115841885987196,
    "оз" => 0.00115841885987196,
    "две" => 0.00115841885987196,
    "ост" => 0.00115841885987196,
    "ры" => 0.00111777258408698,
    "за" => 0.00111777258408698,
    "ров" => 0.00111777258408698,
    "нац" => 0.00111777258408698,
    "ых" => 0.00109744944619449,
    "из" => 0.00109744944619449,
    "ком" => 0.00109744944619449,
    "ент" => 0.001077126308302,
    "ци" => 0.00105680317040951,
    "стран" => 0.00105680317040951,
    "тся" => 0.00103648003251702,
    "восм" => 0.00103648003251702,
    "гол" => 0.00103648003251702,
    "пар" => 0.00101615689462453
  }.freeze

  # Stores the text and derives its stress pattern.
  #
  # The pattern holds one token per word ("!" for a stressed syllable, "-" for
  # an unstressed one), each token followed by a space, one pattern line per
  # text line. A word with exactly one syllable is always "!".
  #
  # @param text [String] source text; it is copied and re-encoded as UTF-8
  # @param log [#<<] sink for diagnostic messages
  def initialize(text, log)
    @text = text.encode("UTF-8")
    @log = log
    @translation = "".dup
    @rhymed = []
    # Per-instance state: creating a second Source must not wipe the
    # translations collected by the first one.
    @translated_words = {}
    @forms = {}
    # Kept per line / per word so #arrange can look a word's pattern up by position.
    @word_patterns = @text.each_line.map do |line|
      line.downcase.split.map { |word| stress_pattern(word) }
    end
    @pattern = @word_patterns.map do |row|
      row.map { |marks| marks + " " }.join + "\n"
    end.join
  end

  # Logs which lines end in the same vowel sequence.
  #
  # For every line the last one or two words are taken; the rhyme is the part
  # from the last stressed (accented) vowel to the end. Two lines are reported
  # when the vowels of their rhymes are identical. Each report is the string
  # "<line> <other line> <number of rhyme vowels> <letters after the stressed
  # vowel>" with zero-based line numbers, and every pair is listed once.
  # Lines without an accented vowel in their ending are ignored, so the
  # "´" notation should be resolved with #replace first.
  #
  # @return [Object] the log, after the list of reports was appended to it
  def find_rhymes
    @rhymed.clear
    rhymes = @text.each_line.map { |line| rhyme_key(line) }
    rhymes.each_with_index do |rhyme, line_no|
      next if rhyme.nil?
      vowels, tail = rhyme
      rhymes.each_with_index do |other, other_no|
        next unless other_no > line_no && other && other.first == vowels
        @rhymed.push("#{line_no} #{other_no} #{vowels.length} #{tail.length}")
      end
    end
    @log << @rhymed.to_s
  end

  # Expands "l'" to "la" and turns vowel + "´" into the accented vowel,
  # modifying the stored text in place.
  def replace
    REPLACEMENTS.each { |from, to| @text.gsub!(from, to) }
  end

  # Translates every whitespace-separated word that has no translation yet.
  #
  # Translator#process may answer +true+ (the word is skipped), +false+
  # (failure) or the translation itself, hence the explicit comparisons.
  #
  # @return [false, Array<String>] +false+ as soon as one word fails,
  #   otherwise the list of words that was walked
  def translate
    translator = Translator.new(@log)
    @text.split.each do |word|
      next unless @translated_words[word].nil?
      outcome = translator.process(word)
      next if outcome == true
      if outcome == false
        @log << "Перевод не удался. Прекращение работы."
        return false
      end
      @translated_words[word] = outcome
    end
  end

  # Builds the translation line by line.
  #
  # Every translated word is compared with the stress pattern of the source
  # word at the same position. If the base translation does not fit, the forms
  # returned by Morphology#process are tried; when none fits, a random form is
  # used so that the word is not lost. Words without a translation are left
  # out. Each emitted word is followed by a space, each line by a newline.
  def arrange
    morphology = Morphology.new(@log)
    @log << "Паттерн:\n"
    @log << @pattern
    @text.each_line.with_index do |line, row|
      targets = @word_patterns[row] || []
      line.split.each_with_index do |word, column|
        translated = @translated_words[word]
        next if translated.nil?
        chosen = fit_to_pattern(word, translated, targets[column].to_s, morphology)
        @translation << chosen.to_s << " "
      end
      @translation << "\n"
    end
  end

  # Checks a Russian syllable for stress using the weight tables.
  #
  # @param piece [String] one syllable
  # @return [String] "-" when the combined stress weights do not exceed the
  #   no-stress weight; otherwise "П" when the PROBABILITY_ACUTE_SEC weight is
  #   the larger one and "У" when it is not
  def check_piece(piece)
    secondary = PROBABILITY_ACUTE_SEC.fetch(piece, 0)
    primary = PROBABILITY_ACUTE.fetch(piece, 0)
    unstressed = PROBABILITY_NO.fetch(piece, 0)
    return '-' unless secondary + primary - unstressed > 0
    secondary > primary ? 'П' : 'У'
  end

  # Returns the translation built so far.
  # NOTE: intentionally shadows Kernel#print for instances of this class.
  def print
    @translation
  end

  private

  # Stress pattern of one lower-cased source word.
  def stress_pattern(word)
    # Fold "a´" into "á" first so that the mark is not counted as a second syllable.
    normalized = ACCENT_DIGRAPHS.inject(word) do |current, (plain, accented)|
      current.gsub(plain, accented)
    end
    nuclei = normalized.scan(SOURCE_SYLLABLE)
    return "!" if nuclei.size == 1 # a monosyllable is stressed by definition
    nuclei.map { |nucleus| nucleus =~ SOURCE_STRESS ? "!" : "-" }.join
  end

  # Rhyme data of a line: [vowels from the last stressed vowel on, letters
  # after that vowel], or nil when the line ending has no stressed vowel.
  def rhyme_key(line)
    ending = line.downcase[LINE_ENDING, 1]
    return nil if ending.nil?
    stressed_at = ending.rindex(ACCENTED_VOWEL)
    return nil if stressed_at.nil?
    rhyme = ending[stressed_at..-1]
    [rhyme.scan(RHYME_VOWEL), rhyme[1..-1]]
  end

  # Stress pattern of a translated word in the "!"/"-" alphabet of @pattern.
  # A new syllable starts at every vowel that follows an earlier vowel; both
  # stressed marks of #check_piece count as "!".
  def cyrillic_stress_pattern(word)
    pieces = ["".dup]
    word.each_char do |char|
      pieces << "".dup if char =~ CYRILLIC_VOWEL && pieces.last =~ CYRILLIC_VOWEL
      pieces.last << char
    end
    syllables = pieces.select { |piece| piece =~ CYRILLIC_VOWEL }
    syllables.map { |piece| check_piece(piece) == '-' ? "-" : "!" }.join
  end

  # Picks the translation or one of its forms that matches +target+.
  def fit_to_pattern(word, translated, target, morphology)
    return translated if cyrillic_stress_pattern(translated.to_s) == target
    forms = morphology.process(translated)
    @forms[word] = forms
    return translated unless forms # no forms obtained, nothing to choose from
    fitting = nil
    # The last fitting form wins, as before.
    forms.each { |form| fitting = form if cyrillic_stress_pattern(form) == target }
    return fitting unless fitting.nil?
    return translated if forms.empty?
    forms[rand(forms.size)] # nothing fits: pick at random rather than drop the word
  end
end
|