2011-02-27 07:56:29 +02:00
#!/usr/bin/env ruby
#encoding: utf-8
require " translator.rb "
require " morphology.rb "
2011-04-22 14:32:32 +03:00
require " stack.rb "
2011-02-27 07:56:29 +02:00
# Verse translation pipeline for one source text.
#
# The class derives a stress pattern from the source text ('!' for a stressed
# syllable, '-' for an unstressed one, one pattern line per non-blank text
# line), records which lines rhyme, translates the text word by word through
# Translator, and lays the translated word forms (from Morphology) out against
# the stress pattern.
#
# Collaborators defined elsewhere in the project: Stack (push/pop/count),
# Translator (#process) and Morphology (#process).
class Source
  # Latin vowels, their acute-accented variants and the stand-alone stress mark.
  SOURCE_VOWELS = "aeioyuáéíóúý´"
  # A character that marks a stressed syllable in the source text.
  STRESSED_VOWEL = /[áéíóúý´]/
  # An unaccented source vowel.
  UNSTRESSED_VOWEL = /[aeioyu]/
  # Stress markers compared by find_rhymes.
  RHYME_VOWEL = /[áéíóúý]|\w´/
  # Lower-case Russian vowels; each one counts as one syllable of a translated word.
  RUSSIAN_VOWEL = /[аеиоуыэюяё]/
  # A word form is accepted when check_pattern scores it at or below this value.
  MAX_FORM_PENALTY = 12

  # Text normalisations applied by #replace.
  # NOTE(review): the extraction this file was recovered from padded every
  # literal with spaces, so "la" may originally have been "la " - confirm.
  REPLACEMENTS = {
    "l'" => "la",
    "o´" => "ó",
    "a´" => "á",
    "e´" => "é",
    "i´" => "í",
    "u´" => "ú",
    "y´" => "ý"
  }.freeze

  # Per-syllable weights used by check_piece. The first two tables count in
  # favour of a stressed syllable, the third against it.
  # NOTE(review): presumably relative frequencies of secondary stress, primary
  # stress and no stress taken from a corpus - confirm the origin of the data.
  PROBABILITY_ACUTE_SEC = {
    "все" => 0.14,
    "че" => 0.0675,
    "ква" => 0.0375,
    "ме" => 0.0275,
    "ми" => 0.025,
    "ви" => 0.0175,
    "ак" => 0.015,
    "сле" => 0.01,
    "на" => 0.01,
    "ки" => 0.01,
    "не" => 0.01,
    "ра" => 0.01
  }.freeze

  PROBABILITY_ACUTE = {
    "ве" => 0.0255724374063771,
    "по" => 0.0209715386261502,
    "го" => 0.0206505456879949,
    "вы" => 0.017547613952493,
    "до" => 0.0117697410656966,
    "са" => 0.0114487481275412,
    "во" => 0.0111277551893858,
    "сто" => 0.0106997646051787,
    "ма" => 0.00802482345388401,
    "ко" => 0.00727583993152151,
    "ду" => 0.00706184463941793,
    "те" => 0.00674085170126257,
    "ме" => 0.006526856409159,
    "бо" => 0.006526856409159,
    "на" => 0.00641985876310721,
    "де" => 0.00588487053284828,
    "то" => 0.00588487053284828,
    "ра" => 0.00534988230258934,
    "за" => 0.00513588701048577,
    "мо" => 0.00513588701048577,
    "ка" => 0.00513588701048577,
    "це" => 0.00502888936443398,
    "па" => 0.00502888936443398,
    "ли" => 0.00481489407233041,
    "сте" => 0.00449390113417505,
    "ви" => 0.00449390113417505,
    "пра" => 0.00449390113417505,
    "пе" => 0.00449390113417505,
    "сло" => 0.00438690348812326,
    "про" => 0.00427990584207147,
    "но" => 0.0040659105499679,
    "ре" => 0.00395891290391611,
    "гла" => 0.00374491761181254,
    "ла" => 0.00363791996576075,
    "су" => 0.00363791996576075,
    "ле" => 0.00363791996576075,
    "зе" => 0.00363791996576075,
    "стра" => 0.00363791996576075,
    "со" => 0.00353092231970897,
    "зна" => 0.00353092231970897,
    "пи" => 0.00353092231970897,
    "се" => 0.00342392467365718,
    "тре" => 0.00342392467365718,
    "хо" => 0.00331692702760539,
    "при" => 0.00320992938155361,
    "ска" => 0.00320992938155361,
    "не" => 0.00320992938155361,
    "ро" => 0.00320992938155361,
    "чи" => 0.00299593408945003,
    "бе" => 0.00299593408945003,
    "ча" => 0.00299593408945003,
    "ва" => 0.00288893644339825,
    "кру" => 0.00267494115129467,
    "ми" => 0.00256794350524288,
    "пу" => 0.0024609458591911,
    "ты" => 0.00235394821313931,
    "ру" => 0.00213995292103574,
    "же" => 0.00213995292103574,
    "да" => 0.00203295527498395,
    "че" => 0.00203295527498395,
    "зме" => 0.00203295527498395,
    "ста" => 0.00192595762893216,
    "жа" => 0.00192595762893216,
    "ну" => 0.00192595762893216,
    "ку" => 0.00181895998288038,
    "ге" => 0.00181895998288038,
    "кра" => 0.00181895998288038,
    "си" => 0.00181895998288038,
    "тра" => 0.00181895998288038,
    "ба" => 0.00181895998288038,
    "ти" => 0.00181895998288038,
    "ха" => 0.00181895998288038,
    "гра" => 0.00171196233682859,
    "тру" => 0.00171196233682859,
    "та" => 0.0016049646907768,
    "бу" => 0.0016049646907768,
    "га" => 0.0016049646907768,
    "тро" => 0.0016049646907768,
    "чу" => 0.00149796704472502,
    "тю" => 0.00149796704472502,
    "хло" => 0.00149796704472502,
    "ни" => 0.00139096939867323,
    "му" => 0.00139096939867323,
    "ту" => 0.00139096939867323,
    "цве" => 0.00128397175262144,
    "ло" => 0.00128397175262144,
    "кла" => 0.00128397175262144,
    "зо" => 0.00128397175262144,
    "ке" => 0.00117697410656966,
    "фо" => 0.00106997646051787,
    "сме" => 0.00106997646051787,
    "мэ" => 0.00106997646051787,
    "ша" => 0.00106997646051787,
    "пла" => 0.00106997646051787,
    "све" => 0.00106997646051787,
    "ки" => 0.00106997646051787
  }.freeze

  PROBABILITY_NO = {
    "от" => 0.0443044406056295,
    "дев" => 0.0436947464688548,
    "ятс" => 0.0409307997154761,
    "ов" => 0.0121938827354944,
    "ом" => 0.0120516207702469,
    "ат" => 0.0112386952545473,
    "цат" => 0.00945025912000813,
    "од" => 0.0088608881211259,
    "ал" => 0.00780408495071639,
    "ог" => 0.00766182298546896,
    "мин" => 0.00709277512447922,
    "ит" => 0.00646275784981201,
    "ан" => 0.00640178843613454,
    "ут" => 0.00597500254039224,
    "он" => 0.00534498526572503,
    "ор" => 0.00526369271415507,
    "ет" => 0.00481658368052027,
    "ен" => 0.00481658368052027,
    "ид" => 0.00449141347424042,
    "ок" => 0.00438979778477797,
    "ят" => 0.00434915150899299,
    "ер" => 0.00430850523320801,
    "ес" => 0.00414592013006808,
    "ин" => 0.00414592013006808,
    "ка" => 0.0041052738542831,
    "ни" => 0.00408495071639061,
    "ол" => 0.00400365816482065,
    "ил" => 0.00400365816482065,
    "ла" => 0.00363784168275582,
    "ый" => 0.00347525657961589,
    "ел" => 0.00339396402804593,
    "сто" => 0.00317040951122853,
    "ой" => 0.0030281475459811,
    "ос" => 0.00300782440808861,
    "ты" => 0.00292653185651865,
    "им" => 0.00284523930494868,
    "ев" => 0.00276394675337872,
    "сам" => 0.00272330047759374,
    "ик" => 0.00268265420180876,
    "пер" => 0.00268265420180876,
    "нач" => 0.00256071537445382,
    "ем" => 0.00254039223656133,
    "дес" => 0.00249974596077634,
    "ар" => 0.00235748399552891,
    "оп" => 0.00231683771974393,
    "ав" => 0.00229651458185144,
    "ам" => 0.00225586830606646,
    "ир" => 0.00225586830606646,
    "том" => 0.00213392947871151,
    "ак" => 0.00209328320292653,
    "ив" => 0.00207296006503404,
    "пол" => 0.00199166751346408,
    "об" => 0.00197134437557159,
    "чет" => 0.0019510212376791,
    "ед" => 0.00193069809978661,
    "тых" => 0.00186972868610914,
    "ис" => 0.00184940554821664,
    "ва" => 0.00178843613453917,
    "ад" => 0.00178843613453917,
    "ятн" => 0.00176811299664668,
    "ать" => 0.00174778985875419,
    "пят" => 0.00174778985875419,
    "ны" => 0.00164617416929174,
    "дор" => 0.00164617416929174,
    "сор" => 0.00162585103139925,
    "век" => 0.00160552789350676,
    "ли" => 0.00152423534193679,
    "ур" => 0.00152423534193679,
    "ах" => 0.0015039122040443,
    "ей" => 0.00144294279036683,
    "ич" => 0.00142261965247434,
    "ек" => 0.00134132710090438,
    "те" => 0.00134132710090438,
    "дом" => 0.00132100396301189,
    "ул" => 0.00132100396301189,
    "гор" => 0.0013006808251194,
    "етр" => 0.0013006808251194,
    "ян" => 0.00128035768722691,
    "та" => 0.00128035768722691,
    "ду" => 0.00128035768722691,
    "аб" => 0.00121938827354944,
    "ас" => 0.00121938827354944,
    "душ" => 0.00119906513565695,
    "на" => 0.00119906513565695,
    "ант" => 0.00119906513565695,
    "ма" => 0.00117874199776445,
    "сем" => 0.00115841885987196,
    "оз" => 0.00115841885987196,
    "две" => 0.00115841885987196,
    "ост" => 0.00115841885987196,
    "ры" => 0.00111777258408698,
    "за" => 0.00111777258408698,
    "ров" => 0.00111777258408698,
    "нац" => 0.00111777258408698,
    "ых" => 0.00109744944619449,
    "из" => 0.00109744944619449,
    "ком" => 0.00109744944619449,
    "ент" => 0.001077126308302,
    "ци" => 0.00105680317040951,
    "стран" => 0.00105680317040951,
    "тся" => 0.00103648003251702,
    "восм" => 0.00103648003251702,
    "гол" => 0.00103648003251702,
    "пар" => 0.00101615689462453
  }.freeze

  # Stores the text (re-encoded to UTF-8) and builds the stress pattern.
  #
  # text - String with the source verse; a stressed vowel is written either
  #        with an acute accent or followed by a "´" mark.
  # log  - any object that accepts << with a String.
  #
  # Blank lines (including whitespace-only ones) contribute nothing to the
  # pattern. The earlier `line.empty?` check never fired because each_line
  # keeps the trailing newline.
  def initialize(text, log)
    @text = text.encode("UTF-8")
    @log = log
    @translation = "".dup
    @pattern = "".dup
    @rhymed = {}
    @translated_words = Stack.new
    @text.each_line do |line|
      next if line.strip.empty?
      line.downcase.split.each { |word| @pattern << stress_pattern(word) }
      @pattern << "\n"
    end
  end

  # Records which lines of the text rhyme with each other.
  #
  # Two lines rhyme when the stress markers found in their last words are
  # equal. The stored value is 2 when both lines also have a second-to-last
  # word and those words' stress markers are equal too, otherwise 1. (Going
  # further back is possible but probably not worth it.)
  #
  # Lines are numbered from 1 and blank lines keep their number; blank lines
  # are never reported as rhyming.
  #
  # Returns the Hash { line_number => { other_line_number => 1 or 2 } }.
  def find_rhymes
    endings = @text.each_line.map do |line|
      words = line.split
      if words.empty?
        nil
      else
        penultimate = words.size > 1 ? words[-2].scan(RHYME_VOWEL) : nil
        [words[-1].scan(RHYME_VOWEL), penultimate]
      end
    end

    endings.each_with_index do |this_ending, this_index|
      next if this_ending.nil?
      endings.each_with_index do |other_ending, other_index|
        next if other_index == this_index || other_ending.nil?
        # This must be a comparison; an assignment here marks every pair as rhyming.
        next unless other_ending[0] == this_ending[0]
        matched_vowels = 1
        matched_vowels = 2 if this_ending[1] && this_ending[1] == other_ending[1]
        (@rhymed[this_index + 1] ||= {})[other_index + 1] = matched_vowels
      end
    end
    @rhymed
  end

  # Normalises the stored text in place: applies every REPLACEMENTS pair,
  # turning "vowel + ´" into the accented vowel and expanding "l'".
  def replace
    REPLACEMENTS.each { |from, to| @text.gsub!(from, to) }
  end

  # Translates the text word by word and pushes the results onto the
  # translated-words stack.
  #
  # Translator#process may return true (the word is skipped), false (failure)
  # or anything else, which is pushed as the translation.
  #
  # Returns false after logging a message when any word fails, true otherwise.
  def translate
    translator = Translator.new(@log)
    @text.split.each do |word|
      word_translation = translator.process(word)
      next if word_translation == true
      if word_translation == false
        @log << "Перевод не удался. Прекращение работы."
        return false
      end
      @translated_words.push(word_translation)
    end
    true
  end

  # Lays the translated words out against the stress pattern, line by line,
  # and appends the result to the translation returned by #print.
  #
  # For every pattern line, words are popped from the stack until the line's
  # syllable slots are used up. Each word is replaced by the first of its
  # forms (from Morphology#process) whose check_pattern score does not exceed
  # MAX_FORM_PENALTY. A word is dropped when Morphology returns false or no
  # form is acceptable.
  #
  # The trailing newline of a pattern line is not a syllable slot, so it is
  # removed before counting; counting it let one extra word spill onto every
  # line.
  def arrange
    morphology = Morphology.new(@log)
    @log << "Паттерн:\n"
    @log << @pattern
    @pattern.each_line do |pattern_line|
      break if @translated_words.count() == 0
      slots = pattern_line.chomp
      pattern_index = 0
      while pattern_index < slots.size && @translated_words.count() > 0
        word = @translated_words.pop()
        forms = morphology.process(word)
        next if forms == false
        # TODO: for the last or second-to-last word of a line, filter out the
        # forms that do not rhyme (or rhyme badly); keep all of them if the
        # filter leaves nothing.
        # TODO: collect the scores into a hash of arrays keyed by score, sort
        # by key and pick any form from the best bucket.
        chosen = forms.find do |form|
          check_pattern(form, slots, pattern_index) <= MAX_FORM_PENALTY
        end
        next if chosen.nil?
        # NOTE(review): a single space is assumed as the word separator.
        @translation << chosen + " "
        pattern_index += chosen.scan(RUSSIAN_VOWEL).size
      end
      @translation << "\n"
    end
  end

  # Returns the translation accumulated by #arrange.
  #
  # This method used to sit below `private`, which made the result
  # unreachable for callers; it is public now.
  def print
    @translation
  end

  private

  # Stress pattern of one lower-cased source word.
  #
  # A word with exactly one vowel is always stressed. Otherwise every
  # stressed character yields '!' and every plain vowel '-'.
  def stress_pattern(word)
    return "!" if word.count(SOURCE_VOWELS) == 1
    marks = word.each_char.map do |char|
      if char =~ STRESSED_VOWEL
        "!"
      elsif char =~ UNSTRESSED_VOWEL
        "-"
      else
        ""
      end
    end
    marks.join
  end

  # Splits a Russian word into pieces holding one vowel each: a piece runs
  # from the end of the previous one up to (not including) the next vowel.
  # A word without vowels yields no pieces.
  def split_into_pieces(word)
    pieces = []
    piece = ""
    word.to_s.each_char do |char|
      if char =~ RUSSIAN_VOWEL && piece =~ RUSSIAN_VOWEL
        # A piece may hold only one vowel: close it and start the next one.
        pieces << piece
        piece = char
      else
        piece += char
      end
    end
    # The final piece has no following vowel to close it, so close it here.
    pieces << piece if piece =~ RUSSIAN_VOWEL
    pieces
  end

  # Scores how well a word fits the pattern starting at slot `index`.
  #
  # Returns 0 for a perfect fit. Each syllable guessed unstressed that lands
  # on a stressed slot costs 2 (an unstressed syllable cannot become
  # stressed); each syllable guessed stressed that lands on an unstressed
  # slot costs 1. Slots past the end of the pattern cost nothing.
  def check_pattern(word, pattern, index)
    result = 0
    split_into_pieces(word).each_with_index do |piece, offset|
      guessed = check_piece(piece)
      expected = pattern[index + offset]
      result += 2 if guessed == '-' && expected == '!'
      result += 1 if guessed == '!' && expected == '-'
    end
    result
  end

  # Guesses whether a Russian syllable is stressed.
  #
  # Returns '!' when the weights in favour of stress outweigh the weight
  # against it, '-' otherwise. Unknown syllables weigh 0 in every table and
  # are therefore reported as unstressed.
  def check_piece(piece)
    in_favour = PROBABILITY_ACUTE_SEC.fetch(piece, 0) + PROBABILITY_ACUTE.fetch(piece, 0)
    in_favour - PROBABILITY_NO.fetch(piece, 0) > 0 ? '!' : '-'
  end
end