# # file: translit2koi8.pm # purpose: perl module for converting russian translit encoding to koi8-r # created: pasha nov 3 2000 # modified: pasha nov 18 2014 # modification: one more pattern for e_oborotnoe # synopsis: use translit2koi8; # $koi8_string = translit2koi8 ($translit_string); # pending: 1. exceptions from few-letters-patterns ('shvatil') ? # 2. maybe this should be optimized for speed - for now # we didn't care about it much... # package translit2koi8; use Exporter(); @ISA = qw(Exporter); @EXPORT = qw(translit2koi8 koi8_cmp); use strict; use warnings; ######################### data for convertion ####################### # koi8 use constant CONV_KOI8 => { 'e_oborotnoe' => 'DC', # well, that's ugly, but sometimes we have no other # choice than to specify it explicitly, # when both words, with "e" and "e_oborotnoe", are valid # (like "ser") 'shch' => 'DD', 'i\'' => 'CA', # "i kratkoe" 'kh' => 'C8', # "ha" (first variant) 'zh' => 'D6', # "zhe" (first variant) 'ch' => 'DE', 'sh' => 'DB', 'yu' => 'C0', 'ya' => 'D1', 'yo' => 'A3', # "yo" (e with two dots) 'a' => 'C1', 'b' => 'C2', 'v' => 'D7', 'g' => 'C7', 'd' => 'C4', 'e' => 'C5', 'j' => 'D6', # "zhe" (second variant) 'z' => 'DA', # "ze" 'i' => 'C9', 'k' => 'CB', 'l' => 'CC', 'm' => 'CD', 'n' => 'CE', 'o' => 'CF', 'p' => 'D0', 'r' => 'D2', 's' => 'D3', 't' => 'D4', 'u' => 'D5', 'f' => 'C6', 'h' => 'C8', # "ha" (second variant) 'c' => 'C3', # "tse" 'y' => 'D9', '\'' => 'D8', # "myagkii znak" '"' => 'DF' # "tverdyi znak" }; # replace first occurence of "e" to "e oborotnoe" # in words beginning with the following strings use constant E_OBOROTNOE_PATTERNS => [qw(aero Daniel diedr ebonit edak edgar Edinburg edip Eduard effekt efir egoi eho ekho ei'lat ei'ler ei'nshtei'n ekipirov ekolog ekonom ekran ekscentri ekscess eksgumirovan ekshn eksklyuziv ekskurs ekspedici eksperiment ekspert ekspluat ekspo ekspress ekspromt ekstra ekvator ekvivalent ekzamen ekzemplyar ekzoti elasti eleg elek element elit ellip el'f el'vira elya emanaci emblem embrion emigr emmiter emoci emul enciklopedi endo energ engel entomolog entropi entuzia epataj epatazh epichesk epidem epigraf epikur epilog epitet epizod epoh epokh epolet epopey epos epsilon ergodichesk eridana ermit ernest eroti erudi eshelon eskhil eskimo eskiz espan'olk espinoza esse estet eston estrad etaj etak eta eti etnichesk etnograf etnos eto etu etyud evaku EVM evolyuc ezoter kanoe Keler Kerrol kinoekspedic kinoeskiz koefficient kvintessenciya mnogoetazh odnoetazh poeksperimentiro poeta poetessa poeti poetomu poetu poezi poeznost proeks Rafael sekonom siluet Uel)]; ########################## end of data for convertion ##################### # shift between small and capital letters: use constant HEXCAPSHIFT => 0x20; ########################## "e oborotnoe"-related constants ################ use constant E_OBOROTNOE_INT => hex(CONV_KOI8->{'e_oborotnoe'}); use constant E_OBOROTNOE => chr(E_OBOROTNOE_INT); use constant E_OBOROTNOE_CAPITAL => chr(E_OBOROTNOE_INT + HEXCAPSHIFT); use constant MYAGKII_ZNAK_CAPITAL => chr(hex(CONV_KOI8->{'\''}) + HEXCAPSHIFT); # split "e oborotnoe" patterns around the first occurence of "e" my @E = (); my ($before, $after); for (@{(E_OBOROTNOE_PATTERNS)}) { ($before, $after) = split (/[Ee]/, $_, 2); push (@E, {'before' => $before, 'after' => $after}); } ########################## end of "e oborotnoe"-related constants ######### ################### russian (ordered) alphabet in translit ################ # currently used only in koi8_cmp() use constant ALPHABET => [qw(a b v g d e j z i i' k l m n o p r s t u f h c ch sh shch " y ' e_oborotnoe yu ya)]; use constant NUM_IN_ALPHABET => scalar @{ALPHABET()}; my %ALPHABET_KOI8 = (); # key - koi8 letter (both capital and smalls) # value - it's order number in alphabet my $counter = 0; my $hex; for (@{ALPHABET()}) { ++$counter; $hex = hex (CONV_KOI8->{$_}); $ALPHABET_KOI8{chr($hex)} = $counter + NUM_IN_ALPHABET; # small letters $ALPHABET_KOI8{chr($hex + HEXCAPSHIFT)} = $counter; # capital letters } ################### end of composing russian alphabet ##################### # main function which does the job: # transform given translit string (argument) to koi8 string (return) sub translit2koi8 ($) { my $input = $_[0]; return ('') if (! defined ($input)); # special case: turning "e" to "e oborotnoe" my ($before, $Before, $BEFORE, $after, $AFTER); for my $e (@E) { $before = $e->{'before'}; $Before = ucfirst ($before); $BEFORE = uc ($before); $after = $e->{'after'}; $AFTER = uc ($after); for ($input) { s/\b($before|$Before)e$after/$1 . E_OBOROTNOE . $after/eg; s/\b${BEFORE}E($after|$AFTER)/$BEFORE . E_OBOROTNOE_CAPITAL . $1/eg; } } # 1. mark quotes and double quotes around words # (in order not to replace them by corresponding koi-8 characters) # 2. if the myagkii znak is among upper-case characters (more precisiely, # follows an upper-case character and not followed by a lower-case # character), convert it to upper-case too (excluding combination I', which # should be converted to upper i-kratkoe) for ($input) { s/([\s\(])"([a-zA-Z0-9_\-',\.:!?\ ]*?)"([\s,\.:!\)]|$)/$1__^^__$2__^^__$3/g; s/([\s\(])'([a-zA-Z0-9_\-",\.:!?\ ]*?)'([\s,\.:!\)]|$)/$1__^__$2__^__$3/g; s/([A-HJ-Z])\'($|[^a-z])/$1 . MYAGKII_ZNAK_CAPITAL . $2/eg; } # process "conversion hash" in descending order by key length my ($uc, $ucfirst, $hex); for my $from (sort { length($b) <=> length($a) } keys %{CONV_KOI8()}) { $uc = uc ($from); $ucfirst = ucfirst ($from); $hex = hex (CONV_KOI8->{$from}); for ($input) { s/$from/chr($hex)/eg; s/($uc|$ucfirst)/chr($hex + HEXCAPSHIFT)/eg; } } # restore quotes and double quotes around words for ($input) { s/__\^__/'/g; s/__\^\^__/"/g; } # remove separation sign '^' # we use it to separate between letters # (like sh in 'shvatil', where it should be two separated # letters "s" and "h", and not one "sh") $input =~ s/\^//g; return ($input); } # end of translit2koi8() #TODO: probably not to process one-element hash keys in the loop above, # but use much faster 'tr' instead: # #my $singles_from = ''; #my $singles_to = ''; #for (keys (%SINGLES)) #{ # $hex = hex($SINGLES{$_}); # $singles_from .= $_ . uc($_); # $singles_to .= chr($hex) . chr($hex + HEXCAPSHIFT_KOI8); #} #eval ("\$STRING =~ tr/$singles_from/$singles_to/"); # compare two koi8 strings # return: 0, -1, 1 sub koi8_cmp ($$) { my @zero = split (/ */, $_[0]); my @first = split (/ */, $_[1]); my $r; for (my $i=0;; $i++) { if ((! defined $zero[$i]) && (! defined $first[$i])) { return (0); } elsif ((! defined $zero[$i]) && (defined $first[$i])) { return (-1); } elsif ((defined $zero[$i]) && (! defined $first[$i])) { return (1); } if (! exists $ALPHABET_KOI8{$zero[$i]}) { print (STDERR "$zero[$i] in $_[0] is not in alphabet\n"); exit (-1); } if (! exists $ALPHABET_KOI8{$first[$i]}) { print (STDERR "$first[$i] in $_[1] is not in alphabet\n"); exit (-1); } $r = ($ALPHABET_KOI8{$zero[$i]} <=> $ALPHABET_KOI8{$first[$i]}); return ($r) if ($r != 0); } } # end of koi8_cmp() 1; __END__