2011-02-27 07:56:29 +02:00
#!/usr/bin/env ruby
#encoding: utf-8
require " translator.rb "
require " morphology.rb "
# Working set for translating a poem while trying to keep its rhythm.
#
# A Source holds the original text, a stress pattern derived from it
# ("!" = stressed syllable, "-" = unstressed), a map of rhyming lines, the
# per-word translations and the translation assembled so far.
#
# Typical call order (inferred from what each method reads; confirm against
# the caller): new -> replace -> find_rhymes -> translate -> arrange -> print.
class Source
  # Characters counted as syllable nuclei in the source text: plain vowels,
  # accented vowels and the stand-alone acute accent.
  SOURCE_VOWELS = "aeioyuáéíóúý´"
  SOURCE_STRESSED = /[áéíóúý´]/
  SOURCE_UNSTRESSED = /[aeioyu]/
  # A stress mark inside a source word: an accented vowel, or any word
  # character followed by a stand-alone acute accent.
  SOURCE_STRESS_MARK = /[áéíóúý]|\w´/
  RUSSIAN_VOWEL = /[аеиоуыэюяё]/

  # Textual substitutions applied by #replace.
  REPLACEMENTS = {
    # NOTE(review): the trailing space keeps the article a separate word
    # ("l'amour" -> "la amour"); confirm it matches the intended literal.
    "l'" => "la ",
    "o´" => "ó",
    "a´" => "á",
    "e´" => "é",
    "i´" => "í",
    "u´" => "ú",
    "y´" => "ý"
  }.freeze

  # Weights used by #check_piece. Presumably the relative frequency with
  # which a syllable carries secondary stress (the disabled 'П' return value
  # of the original suggests "побочное ударение") - TODO confirm.
  SECONDARY_STRESS = {
    "все" => 0.14,
    "че" => 0.0675,
    "ква" => 0.0375,
    "ме" => 0.0275,
    "ми" => 0.025,
    "ви" => 0.0175,
    "ак" => 0.015,
    "сле" => 0.01,
    "на" => 0.01,
    "ки" => 0.01,
    "не" => 0.01,
    "ра" => 0.01
  }.freeze

  # Weights in favour of a syllable being stressed.
  PRIMARY_STRESS = {
    "ве" => 0.0255724374063771,
    "по" => 0.0209715386261502,
    "го" => 0.0206505456879949,
    "вы" => 0.017547613952493,
    "до" => 0.0117697410656966,
    "са" => 0.0114487481275412,
    "во" => 0.0111277551893858,
    "сто" => 0.0106997646051787,
    "ма" => 0.00802482345388401,
    "ко" => 0.00727583993152151,
    "ду" => 0.00706184463941793,
    "те" => 0.00674085170126257,
    "ме" => 0.006526856409159,
    "бо" => 0.006526856409159,
    "на" => 0.00641985876310721,
    "де" => 0.00588487053284828,
    "то" => 0.00588487053284828,
    "ра" => 0.00534988230258934,
    "за" => 0.00513588701048577,
    "мо" => 0.00513588701048577,
    "ка" => 0.00513588701048577,
    "це" => 0.00502888936443398,
    "па" => 0.00502888936443398,
    "ли" => 0.00481489407233041,
    "сте" => 0.00449390113417505,
    "ви" => 0.00449390113417505,
    "пра" => 0.00449390113417505,
    "пе" => 0.00449390113417505,
    "сло" => 0.00438690348812326,
    "про" => 0.00427990584207147,
    "но" => 0.0040659105499679,
    "ре" => 0.00395891290391611,
    "гла" => 0.00374491761181254,
    "ла" => 0.00363791996576075,
    "су" => 0.00363791996576075,
    "ле" => 0.00363791996576075,
    "зе" => 0.00363791996576075,
    "стра" => 0.00363791996576075,
    "со" => 0.00353092231970897,
    "зна" => 0.00353092231970897,
    "пи" => 0.00353092231970897,
    "се" => 0.00342392467365718,
    "тре" => 0.00342392467365718,
    "хо" => 0.00331692702760539,
    "при" => 0.00320992938155361,
    "ска" => 0.00320992938155361,
    "не" => 0.00320992938155361,
    "ро" => 0.00320992938155361,
    "чи" => 0.00299593408945003,
    "бе" => 0.00299593408945003,
    "ча" => 0.00299593408945003,
    "ва" => 0.00288893644339825,
    "кру" => 0.00267494115129467,
    "ми" => 0.00256794350524288,
    "пу" => 0.0024609458591911,
    "ты" => 0.00235394821313931,
    "ру" => 0.00213995292103574,
    "же" => 0.00213995292103574,
    "да" => 0.00203295527498395,
    "че" => 0.00203295527498395,
    "зме" => 0.00203295527498395,
    "ста" => 0.00192595762893216,
    "жа" => 0.00192595762893216,
    "ну" => 0.00192595762893216,
    "ку" => 0.00181895998288038,
    "ге" => 0.00181895998288038,
    "кра" => 0.00181895998288038,
    "си" => 0.00181895998288038,
    "тра" => 0.00181895998288038,
    "ба" => 0.00181895998288038,
    "ти" => 0.00181895998288038,
    "ха" => 0.00181895998288038,
    "гра" => 0.00171196233682859,
    "тру" => 0.00171196233682859,
    "та" => 0.0016049646907768,
    "бу" => 0.0016049646907768,
    "га" => 0.0016049646907768,
    "тро" => 0.0016049646907768,
    "чу" => 0.00149796704472502,
    "тю" => 0.00149796704472502,
    "хло" => 0.00149796704472502,
    "ни" => 0.00139096939867323,
    "му" => 0.00139096939867323,
    "ту" => 0.00139096939867323,
    "цве" => 0.00128397175262144,
    "ло" => 0.00128397175262144,
    "кла" => 0.00128397175262144,
    "зо" => 0.00128397175262144,
    "ке" => 0.00117697410656966,
    "фо" => 0.00106997646051787,
    "сме" => 0.00106997646051787,
    "мэ" => 0.00106997646051787,
    "ша" => 0.00106997646051787,
    "пла" => 0.00106997646051787,
    "све" => 0.00106997646051787,
    "ки" => 0.00106997646051787
  }.freeze

  # Weights against a syllable being stressed.
  UNSTRESSED = {
    "от" => 0.0443044406056295,
    "дев" => 0.0436947464688548,
    "ятс" => 0.0409307997154761,
    "ов" => 0.0121938827354944,
    "ом" => 0.0120516207702469,
    "ат" => 0.0112386952545473,
    "цат" => 0.00945025912000813,
    "од" => 0.0088608881211259,
    "ал" => 0.00780408495071639,
    "ог" => 0.00766182298546896,
    "мин" => 0.00709277512447922,
    "ит" => 0.00646275784981201,
    "ан" => 0.00640178843613454,
    "ут" => 0.00597500254039224,
    "он" => 0.00534498526572503,
    "ор" => 0.00526369271415507,
    "ет" => 0.00481658368052027,
    "ен" => 0.00481658368052027,
    "ид" => 0.00449141347424042,
    "ок" => 0.00438979778477797,
    "ят" => 0.00434915150899299,
    "ер" => 0.00430850523320801,
    "ес" => 0.00414592013006808,
    "ин" => 0.00414592013006808,
    "ка" => 0.0041052738542831,
    "ни" => 0.00408495071639061,
    "ол" => 0.00400365816482065,
    "ил" => 0.00400365816482065,
    "ла" => 0.00363784168275582,
    "ый" => 0.00347525657961589,
    "ел" => 0.00339396402804593,
    "сто" => 0.00317040951122853,
    "ой" => 0.0030281475459811,
    "ос" => 0.00300782440808861,
    "ты" => 0.00292653185651865,
    "им" => 0.00284523930494868,
    "ев" => 0.00276394675337872,
    "сам" => 0.00272330047759374,
    "ик" => 0.00268265420180876,
    "пер" => 0.00268265420180876,
    "нач" => 0.00256071537445382,
    "ем" => 0.00254039223656133,
    "дес" => 0.00249974596077634,
    "ар" => 0.00235748399552891,
    "оп" => 0.00231683771974393,
    "ав" => 0.00229651458185144,
    "ам" => 0.00225586830606646,
    "ир" => 0.00225586830606646,
    "том" => 0.00213392947871151,
    "ак" => 0.00209328320292653,
    "ив" => 0.00207296006503404,
    "пол" => 0.00199166751346408,
    "об" => 0.00197134437557159,
    "чет" => 0.0019510212376791,
    "ед" => 0.00193069809978661,
    "тых" => 0.00186972868610914,
    "ис" => 0.00184940554821664,
    "ва" => 0.00178843613453917,
    "ад" => 0.00178843613453917,
    "ятн" => 0.00176811299664668,
    "ать" => 0.00174778985875419,
    "пят" => 0.00174778985875419,
    "ны" => 0.00164617416929174,
    "дор" => 0.00164617416929174,
    "сор" => 0.00162585103139925,
    "век" => 0.00160552789350676,
    "ли" => 0.00152423534193679,
    "ур" => 0.00152423534193679,
    "ах" => 0.0015039122040443,
    "ей" => 0.00144294279036683,
    "ич" => 0.00142261965247434,
    "ек" => 0.00134132710090438,
    "те" => 0.00134132710090438,
    "дом" => 0.00132100396301189,
    "ул" => 0.00132100396301189,
    "гор" => 0.0013006808251194,
    "етр" => 0.0013006808251194,
    "ян" => 0.00128035768722691,
    "та" => 0.00128035768722691,
    "ду" => 0.00128035768722691,
    "аб" => 0.00121938827354944,
    "ас" => 0.00121938827354944,
    "душ" => 0.00119906513565695,
    "на" => 0.00119906513565695,
    "ант" => 0.00119906513565695,
    "ма" => 0.00117874199776445,
    "сем" => 0.00115841885987196,
    "оз" => 0.00115841885987196,
    "две" => 0.00115841885987196,
    "ост" => 0.00115841885987196,
    "ры" => 0.00111777258408698,
    "за" => 0.00111777258408698,
    "ров" => 0.00111777258408698,
    "нац" => 0.00111777258408698,
    "ых" => 0.00109744944619449,
    "из" => 0.00109744944619449,
    "ком" => 0.00109744944619449,
    "ент" => 0.001077126308302,
    "ци" => 0.00105680317040951,
    "стран" => 0.00105680317040951,
    "тся" => 0.00103648003251702,
    "восм" => 0.00103648003251702,
    "гол" => 0.00103648003251702,
    "пар" => 0.00101615689462453
  }.freeze

  # Stores the text (re-encoded to UTF-8) and derives its stress pattern.
  #
  # text - the poem; must respond to #encode.
  # log  - sink for diagnostics; only #<< is called on it.
  #
  # @pattern receives one token per word followed by a space, and a newline
  # after every line of text. In a multi-syllable word each accented vowel
  # (or stand-alone acute) contributes "!" and each plain vowel "-"; a word
  # with exactly one vowel character is always "!".
  def initialize(text, log)
    @text = text.encode("UTF-8")
    @log = log
    @translation = ""
    @pattern = ""
    @rhymed = {}
    # Per-instance caches, so that creating a second Source does not wipe
    # the translations of the first one.
    @translated_words = {}
    @forms = {}

    @text.each_line do |line|
      line.downcase.split.each do |word|
        # The separator is appended for every word, monosyllables included,
        # otherwise neighbouring tokens fuse ("!" + "--" => "!--").
        @pattern << source_stress_pattern(word) << " "
      end
      @pattern << "\n"
    end
  end

  # Fills @rhymed with { line_number => { other_line_number => matched } }
  # (1-based line numbers, blank lines counted). Two lines rhyme when the
  # stress marks found in their last words are equal; matched is 2 when the
  # second-to-last words agree as well, otherwise 1. Words without any
  # stress mark never count as a match.
  #
  # Returns @rhymed.
  def find_rhymes
    # Last two words of every line ([] for blank lines).
    tails = @text.each_line.map { |line| line.split.last(2) }

    tails.each_with_index do |tail, index|
      stress = marked_stresses(tail.last)
      next if stress.empty?

      tails.each_with_index do |other_tail, other_index|
        next if other_index == index
        next unless marked_stresses(other_tail.last) == stress

        matched = 1
        if tail.size == 2 && other_tail.size == 2
          before = marked_stresses(tail.first)
          matched = 2 if !before.empty? && before == marked_stresses(other_tail.first)
        end
        (@rhymed[index + 1] ||= {})[other_index + 1] = matched
      end
    end
    # @log << @rhymed.to_s
    @rhymed
  end

  # Normalises @text in place: expands the elided article and turns
  # "vowel + stand-alone acute" into the precomposed accented vowel.
  #
  # Returns @text.
  def replace
    REPLACEMENTS.each { |from, to| @text.gsub!(from, to) }
    @text
  end

  # Translates every not yet translated word of @text through
  # Translator#process and caches the result.
  #
  # Translator#process is treated as returning true ("skip this word"),
  # false ("failed") or the translation itself - inferred from how the
  # result is used here; confirm against translator.rb.
  #
  # Returns false as soon as one word fails (after logging), otherwise a
  # truthy value.
  def translate
    translator = Translator.new(@log)
    @text.split.each do |word|
      next unless @translated_words[word].nil?

      outcome = translator.process(word)
      next if outcome == true

      if outcome == false
        @log << "Перевод не удался. Прекращение работы."
        return false
      end
      @translated_words[word] = outcome
      # @log << "Полученный перевод: " + @translated_words[word]
    end
  end

  # Builds @translation line by line. Words without a cached translation
  # are skipped; every emitted word is followed by a space and every source
  # line by a newline.
  #
  # For each word the translation is used as is when its estimated stress
  # pattern equals the pattern of the source word. Otherwise the word forms
  # from Morphology#process are tried (see #choose_form).
  #
  # NOTE(review): the original compared every word against every token of
  # the whole-poem @pattern and overwrote its own lookup key while doing
  # so. Matching a word against its own pattern is the reading adopted
  # here; confirm this is the intended algorithm.
  def arrange
    morphology = Morphology.new(@log)
    @log << "Паттерн:\n"
    @log << @pattern

    line_number = 0
    @text.each_line do |line|
      line_number += 1
      words = line.split
      words.each_with_index do |word, index|
        translated = @translated_words[word]
        next if translated.nil?

        rhyme_position = words.size - index # 1 = last word of the line
        chosen = choose_form(morphology, word, translated, line_number, rhyme_position)
        @translation << chosen.to_s
        @translation << " "
      end
      @translation << "\n"
    end
  end

  # Estimates whether a Russian syllable is stressed.
  #
  # piece - the syllable (String).
  #
  # Returns "!" when the stress weights outweigh the "unstressed" weight
  # for this syllable, "-" otherwise (also for unknown syllables).
  # Primary and secondary stress are not distinguished in the result.
  def check_piece(piece)
    secondary = SECONDARY_STRESS.fetch(piece, 0)
    primary = PRIMARY_STRESS.fetch(piece, 0)
    unstressed = UNSTRESSED.fetch(piece, 0)
    secondary + primary - unstressed > 0 ? "!" : "-"
  end

  # Returns the translation assembled so far. Note that this shadows
  # Kernel#print for instances of this class.
  def print
    @translation
  end

  private

  # Stress pattern of one (downcased) source word, as stored in @pattern.
  def source_stress_pattern(word)
    return "!" if word.count(SOURCE_VOWELS) == 1 # a single vowel is trivially stressed

    marks = ""
    word.each_char do |char|
      if char =~ SOURCE_STRESSED
        marks << "!"
      elsif char =~ SOURCE_UNSTRESSED
        marks << "-"
      end
    end
    marks
  end

  # Stress marks present in a source word; [] for nil or unmarked words.
  def marked_stresses(word)
    word.to_s.scan(SOURCE_STRESS_MARK)
  end

  # Splits a Russian word into pieces holding exactly one vowel each: a new
  # piece starts at every vowel after the first, consonants stay with the
  # preceding vowel. A word without vowels yields [].
  def syllables(word)
    pieces = []
    piece = ""
    word.each_char do |char|
      if char =~ RUSSIAN_VOWEL && piece =~ RUSSIAN_VOWEL
        pieces << piece
        piece = ""
      end
      piece << char
    end
    pieces << piece if piece =~ RUSSIAN_VOWEL # do not drop the last syllable
    pieces
  end

  # "!"/"-" string with one mark per syllable of a Russian word.
  def stress_marks(word)
    syllables(word).map { |piece| check_piece(piece) }.join
  end

  # Vowel of the first syllable rated as stressed, or nil.
  def stressed_vowel(word)
    syllables(word).each do |piece|
      return piece[RUSSIAN_VOWEL] if check_piece(piece) == "!"
    end
    nil
  end

  # Picks the text to emit for one source word.
  #
  # Morphology#process is treated as returning false/nil when no forms are
  # available, otherwise a list of strings - inferred from its use; confirm
  # against morphology.rb.
  def choose_form(morphology, word, translated, line_number, rhyme_position)
    wanted = source_stress_pattern(word.downcase)
    return translated if stress_marks(translated.to_s) == wanted

    forms = morphology.process(translated)
    @forms[word] = forms
    return translated unless forms # nothing to choose from

    fitting = forms.select { |form| stress_marks(form) == wanted }
    chosen =
      if line_number > 1 && rhyme_position <= 2
        # Line endings: prefer a form that rhymes with an earlier line.
        rhyming_form(fitting, line_number, rhyme_position)
      elsif !fitting.empty?
        fitting[rand(fitting.size)]
      end
    return chosen if chosen

    # No form selected: take any form rather than lose the word.
    forms.empty? ? translated : forms[rand(forms.size)]
  end

  # First candidate whose stressed vowel equals the stressed vowel of the
  # word at the same position (counted from the line end) in an earlier,
  # already translated line that rhymes with line_number; nil when there
  # is none.
  #
  # NOTE(review): the original indexed the per-line Hash with a Range, which
  # cannot work, so there was no previous behaviour to preserve here.
  def rhyming_form(candidates, line_number, rhyme_position)
    partners = @rhymed[line_number] || {}
    finished_lines = @translation.split("\n")

    partners.each do |other_line, matched|
      next unless other_line < line_number && matched >= rhyme_position

      rhyme_word = finished_lines[other_line - 1].to_s.split.at(-rhyme_position)
      target = rhyme_word && stressed_vowel(rhyme_word)
      next if target.nil?

      candidates.each do |form|
        return form if stressed_vowel(form) == target
      end
    end
    nil
  end
end