1
0
Fork 0
mirror of https://github.com/Oreolek/Togataltu.git synced 2024-04-27 06:39:45 +03:00
Togataltu/source.rb
2011-04-22 18:32:32 +07:00

402 lines
14 KiB
Ruby
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env ruby
#encoding: utf-8
require "translator.rb"
require "morphology.rb"
require "stack.rb"
class Source
def initialize (text,log)
@text = text.encode("UTF-8")
@log = log
@translation = ""
@pattern = ""
@rhymed = Hash.new
@translated_words = Stack.new
@text.each_line do |line|
if line.empty? then next end
line.downcase.split.each do |word|
vowels = word.count("aeioyuáéíóúý´")
if (vowels==1) then #в односложных словах всё тривиально
@pattern << "!"
unless word.match(/[áéíóúý´]/) then #знак ударения не проставлен, но гласная единственная — проставляем
{"a" => "á","e" => "é","i" => "í","u" => "ú","y" => "ý"}.each do |key, value| word.gsub!(key,value) end
end #end unless
next
end #if vowels == 1
word.each_char do |char|
if (char.match(/[áéíóúý´]/)) then @pattern = @pattern+"!"
elsif (char.match(/[aeioyu]/)) then @pattern = @pattern+"-"
end
end #each char in word
#@pattern = @pattern + " "
end #each word in line
@pattern = @pattern + "\n"
end #@text.each_line
end
def find_rhymes()
this_line_number = 0
@text.each_line do |line|
this_line_number = this_line_number + 1
this_last_vowel = line.split.at(-1).scan(/[áéíóúý]|\w´/)
other_line_number = 0
@text.each_line do |line_other|
other_line_number = other_line_number + 1
if other_line_number == this_line_number then next end
other_last_vowel = line_other.split.at(-1).scan(/[áéíóúý]|\w´/)
if other_last_vowel = this_last_vowel then
matched_vowels = 1
if line.split.at(-2).scan(/[áéíóúý]|\w´/) == line_other.split.at(-2).scan(/[áéíóúý]|\w´/) then matched_vowels = 2 end #можно сделать и так далее, но не стоит, наверное
if @rhymed[this_line_number].nil? then @rhymed[this_line_number] = Hash.new end
@rhymed[this_line_number][other_line_number] = matched_vowels
end
end
end
# @log << @rhymed.to_s
end
def replace()
phrases = {
"l'" => "la",
"o´" => "ó",
"a´" => "á",
"e´" => "é",
"i´" => "í",
"u´" => "ú",
"y´" => "ý"
}
phrases.each do |key, value|
@text.gsub!(/#{key}/,value)
end
end
def translate()
translator = Translator.new(@log);
@text.split.each do |word|
word_translation = translator.process(word)
if word_translation == true then next end
if word_translation != false then
@translated_words.push(word_translation)
else
@log << "Перевод не удался. Прекращение работы."
return false
end
end
end
def arrange()
morphology = Morphology.new(@log);
# @text.downcase.split.each do |word|
# vowels = word.count("аеиоуыэюяё")
# met_vowels = 0
# piece = "" #текущий слог
# index = 0
# while (met_vowels <= vowels and word[index]) do
# if word[index] =~ /[аеиоуыэюяё]/ then
# met_vowels = met_vowels+1
# if piece.match("аеиоуыэюяё") then #на один слог может быть только одна гласная
# check_piece(piece)
# piece = word[index]
# end
# end
# piece = piece + word[index].to_s
# index = index + 1
# end
# end
@log << "Паттерн:\n"
@log << @pattern
line_number = 0
@pattern.each_line do |pattern_line|
if (@translated_words.count() == 0) then break end
pattern_index = 0
line_number = line_number + 1
while (pattern_index < pattern_line.size and @translated_words.count() > 0) do
word = @translated_words.pop()
forms = morphology.process(word)
if (forms == false) then
#@translation << word .. " "
#pieces = 0
#word.scan(/[аеиоуыэюяё]/) { pieces += 1}
#pattern_index = pattern_index + pieces
next
end
forms_result = Array.new()
# TODO: если слово — последнее в строке или предпоследнее, то отсеиваем все не рифмующиеся (плохо рифмующиеся) словоформы. Если ничего не осталось — не отсеиваем.
forms.each do |form|
forms_result.push(check_pattern(form, pattern_line,pattern_index))
#TODO: forms_result надо сделать хэшем массивов и сортировать по ключам. Выбирать любой вариант из полученного массива.
end
form_index = 0
forms_result.each do |result|
if result <= 12 then
@translation << forms[form_index] + " "
pieces = 0
forms[form_index].scan(/[аеиоуыэюяё]/) { pieces += 1}
pattern_index = pattern_index + pieces
break
end
form_index += 1
end
end
@translation << "\n"
end
end
private
def check_pattern(word, pattern, index) #проверяет соответствие слова паттерну; возвращает 0, если слово вписывается и число, если не вписывается
word_pattern = ""
piece = ""
word.to_s.each_char do |char|
if char =~ /[аеиоуыэюяё]/ then
if piece =~ /[аеиоуыэюяё]/ then
word_pattern << check_piece(piece)
piece = char
end
end
piece = piece + char
end
pattern_index = index #или index+1?
result = 0
word_pattern.each_char do |char|
if char == '-' and pattern[pattern_index]=='!' then
result = result + 2 #безударные слоги не могут стать ударными - см. главу 1
end
if char == '!' and pattern[pattern_index]=='-' then
result = result + 1
end
pattern_index = pattern_index + 1
end
return result
end
#проверка русских слогов на ударение
def check_piece(piece)
probability_acute_sec={
"все"=>0.14,
"че"=>0.0675,
"ква"=>0.0375,
"ме"=>0.0275,
"ми"=>0.025,
"ви"=>0.0175,
"ак"=>0.015,
"сле"=>0.01,
"на"=>0.01,
"ки"=>0.01,
"не"=>0.01,
"ра"=>0.01,
}
probability_acute={
"ве"=>0.0255724374063771,
"по"=>0.0209715386261502,
"го"=>0.0206505456879949,
"вы"=>0.017547613952493,
"до"=>0.0117697410656966,
"са"=>0.0114487481275412,
"во"=>0.0111277551893858,
"сто"=>0.0106997646051787,
"ма"=>0.00802482345388401,
"ко"=>0.00727583993152151,
"ду"=>0.00706184463941793,
"те"=>0.00674085170126257,
"ме"=>0.006526856409159,
"бо"=>0.006526856409159,
"на"=>0.00641985876310721,
"де"=>0.00588487053284828,
"то"=>0.00588487053284828,
"ра"=>0.00534988230258934,
"за"=>0.00513588701048577,
"мо"=>0.00513588701048577,
"ка"=>0.00513588701048577,
"це"=>0.00502888936443398,
"па"=>0.00502888936443398,
"ли"=>0.00481489407233041,
"сте"=>0.00449390113417505,
"ви"=>0.00449390113417505,
"пра"=>0.00449390113417505,
"пе"=>0.00449390113417505,
"сло"=>0.00438690348812326,
"про"=>0.00427990584207147,
"но"=>0.0040659105499679,
"ре"=>0.00395891290391611,
"гла"=>0.00374491761181254,
"ла"=>0.00363791996576075,
"су"=>0.00363791996576075,
"ле"=>0.00363791996576075,
"зе"=>0.00363791996576075,
"стра"=>0.00363791996576075,
"со"=>0.00353092231970897,
"зна"=>0.00353092231970897,
"пи"=>0.00353092231970897,
"се"=>0.00342392467365718,
"тре"=>0.00342392467365718,
"хо"=>0.00331692702760539,
"при"=>0.00320992938155361,
"ска"=>0.00320992938155361,
"не"=>0.00320992938155361,
"ро"=>0.00320992938155361,
"чи"=>0.00299593408945003,
"бе"=>0.00299593408945003,
"ча"=>0.00299593408945003,
"ва"=>0.00288893644339825,
"кру"=>0.00267494115129467,
"ми"=>0.00256794350524288,
"пу"=>0.0024609458591911,
"ты"=>0.00235394821313931,
"ру"=>0.00213995292103574,
"же"=>0.00213995292103574,
"да"=>0.00203295527498395,
"че"=>0.00203295527498395,
"зме"=>0.00203295527498395,
"ста"=>0.00192595762893216,
"жа"=>0.00192595762893216,
"ну"=>0.00192595762893216,
"ку"=>0.00181895998288038,
"ге"=>0.00181895998288038,
"кра"=>0.00181895998288038,
"си"=>0.00181895998288038,
"тра"=>0.00181895998288038,
"ба"=>0.00181895998288038,
"ти"=>0.00181895998288038,
"ха"=>0.00181895998288038,
"гра"=>0.00171196233682859,
"тру"=>0.00171196233682859,
"та"=>0.0016049646907768,
"бу"=>0.0016049646907768,
"га"=>0.0016049646907768,
"тро"=>0.0016049646907768,
"чу"=>0.00149796704472502,
"тю"=>0.00149796704472502,
"хло"=>0.00149796704472502,
"ни"=>0.00139096939867323,
"му"=>0.00139096939867323,
"ту"=>0.00139096939867323,
"цве"=>0.00128397175262144,
"ло"=>0.00128397175262144,
"кла"=>0.00128397175262144,
"зо"=>0.00128397175262144,
"ке"=>0.00117697410656966,
"фо"=>0.00106997646051787,
"сме"=>0.00106997646051787,
"мэ"=>0.00106997646051787,
"ша"=>0.00106997646051787,
"пла"=>0.00106997646051787,
"све"=>0.00106997646051787,
"ки"=>0.00106997646051787,
}
probability_no = {
"от"=>0.0443044406056295,
"дев"=>0.0436947464688548,
"ятс"=>0.0409307997154761,
"ов"=>0.0121938827354944,
"ом"=>0.0120516207702469,
"ат"=>0.0112386952545473,
"цат"=>0.00945025912000813,
"од"=>0.0088608881211259,
"ал"=>0.00780408495071639,
"ог"=>0.00766182298546896,
"мин"=>0.00709277512447922,
"ит"=>0.00646275784981201,
"ан"=>0.00640178843613454,
"ут"=>0.00597500254039224,
"он"=>0.00534498526572503,
"ор"=>0.00526369271415507,
"ет"=>0.00481658368052027,
"ен"=>0.00481658368052027,
"ид"=>0.00449141347424042,
"ок"=>0.00438979778477797,
"ят"=>0.00434915150899299,
"ер"=>0.00430850523320801,
"ес"=>0.00414592013006808,
"ин"=>0.00414592013006808,
"ка"=>0.0041052738542831,
"ни"=>0.00408495071639061,
"ол"=>0.00400365816482065,
"ил"=>0.00400365816482065,
"ла"=>0.00363784168275582,
"ый"=>0.00347525657961589,
"ел"=>0.00339396402804593,
"сто"=>0.00317040951122853,
"ой"=>0.0030281475459811,
"ос"=>0.00300782440808861,
"ты"=>0.00292653185651865,
"им"=>0.00284523930494868,
"ев"=>0.00276394675337872,
"сам"=>0.00272330047759374,
"ик"=>0.00268265420180876,
"пер"=>0.00268265420180876,
"нач"=>0.00256071537445382,
"ем"=>0.00254039223656133,
"дес"=>0.00249974596077634,
"ар"=>0.00235748399552891,
"оп"=>0.00231683771974393,
"ав"=>0.00229651458185144,
"ам"=>0.00225586830606646,
"ир"=>0.00225586830606646,
"том"=>0.00213392947871151,
"ак"=>0.00209328320292653,
"ив"=>0.00207296006503404,
"пол"=>0.00199166751346408,
"об"=>0.00197134437557159,
"чет"=>0.0019510212376791,
"ед"=>0.00193069809978661,
"тых"=>0.00186972868610914,
"ис"=>0.00184940554821664,
"ва"=>0.00178843613453917,
"ад"=>0.00178843613453917,
"ятн"=>0.00176811299664668,
"ать"=>0.00174778985875419,
"пят"=>0.00174778985875419,
"ны"=>0.00164617416929174,
"дор"=>0.00164617416929174,
"сор"=>0.00162585103139925,
"век"=>0.00160552789350676,
"ли"=>0.00152423534193679,
"ур"=>0.00152423534193679,
"ах"=>0.0015039122040443,
"ей"=>0.00144294279036683,
"ич"=>0.00142261965247434,
"ек"=>0.00134132710090438,
"те"=>0.00134132710090438,
"дом"=>0.00132100396301189,
"ул"=>0.00132100396301189,
"гор"=>0.0013006808251194,
"етр"=>0.0013006808251194,
"ян"=>0.00128035768722691,
"та"=>0.00128035768722691,
"ду"=>0.00128035768722691,
"аб"=>0.00121938827354944,
"ас"=>0.00121938827354944,
"душ"=>0.00119906513565695,
"на"=>0.00119906513565695,
"ант"=>0.00119906513565695,
"ма"=>0.00117874199776445,
"сем"=>0.00115841885987196,
"оз"=>0.00115841885987196,
"две"=>0.00115841885987196,
"ост"=>0.00115841885987196,
"ры"=>0.00111777258408698,
"за"=>0.00111777258408698,
"ров"=>0.00111777258408698,
"нац"=>0.00111777258408698,
"ых"=>0.00109744944619449,
"из"=>0.00109744944619449,
"ком"=>0.00109744944619449,
"ент"=>0.001077126308302,
"ци"=>0.00105680317040951,
"стран"=>0.00105680317040951,
"тся"=>0.00103648003251702,
"восм"=>0.00103648003251702,
"гол"=>0.00103648003251702,
"пар"=>0.00101615689462453,
}
if probability_acute_sec[piece].nil? then probability_acute_sec[piece] = 0 end
if probability_acute[piece].nil? then probability_acute[piece] = 0 end
if probability_no[piece].nil? then probability_no[piece] = 0 end
if (probability_acute_sec[piece] + probability_acute[piece] - probability_no[piece]>0) then
if (probability_acute_sec[piece] > probability_acute[piece]) then
return '!'
# return 'П'
else
return '!'
#return 'У'
end
end
return '-'
end
def print() return @translation end
end