UNPKG

cldr

Version:

Library for extracting data from CLDR (the Unicode Common Locale Data Repository)

162 lines (161 loc) 6.83 kB
<?xml version="1.0" encoding="UTF-8" ?> <!DOCTYPE supplementalData SYSTEM "../../common/dtd/ldmlSupplemental.dtd"> <!-- Copyright © 1991-2013 Unicode, Inc. CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/) For terms of use, see http://www.unicode.org/copyright.html --> <supplementalData> <version number="$Revision$"/> <transforms> <transform source="ThaiLogical" target="Latin" direction="both" visibility="internal"> <tRule><![CDATA[ # Thai-Latin # This set of rules follows ISO 11940 # see http://homepage.mac.com/sirbinks/pdf/Thai.r2.pdf # except that that does not mention an implicit vowel, so we use ọ # # The transcription is fairly ugly, so we ought to also do the UNGEGN version # see: http://www.eki.ee/wgrs/rom1_th.pdf # and probably make that the main variant. # # Note: this is an internal file. The NFD/NFC is handled externally, in the index # The insertion of spaces between words, the reversal of the vowels # and the conversion of space to semicolon are done *outside* of these rules. # So as far as these rules are concerned, the vowels are in logical order! # insert implicit vowel (and remove it going the other way) # COMMENTED out: the implicit vowel positions cannot be predicted algorithmically #$consonant = [ก-ฮ]; #$vowel = [ะ-ฺเ-ไ็]; #{ ( $consonant ) } [^$vowel \uE000] → | $1 \uE000 ; #\uE000 → ọ ; # ← ọ ; $notAbove = [^\p{ccc=0}\p{ccc=above}] ; $notBelow = [^\p{ccc=0}\p{ccc=below}] ; # Consonants # Warning: the 'h's need to be handled carefully! # What we really want to say is the following, but we can't # $notHAccent = !($notAbove* ̄ | $notBelow* ̣) ; # Since the only accents we care about that could cause problems are free-standing accents below, we use instead: $freeStandingBelow = [̥ ]; $hAccent = [ ̄ ̣]; $notHAccent0 = [^$freeStandingBelow$hAccent]; $notHAccent1 = $freeStandingBelow [^$hAccent]; ; # THAI CHARACTER HO HIP | $1 h ($notAbove*) ̄; # backward case, account for reordering ; # THAI CHARACTER HO NOKHUK k̄h ; # THAI CHARACTER KHO KHAI ḳ̄h ; # THAI CHARACTER KHO KHUAT kʹh ; # THAI CHARACTER KHO KHON ḳh ; # THAI CHARACTER KHO RAKHANG kh } $notHAccent1 ; # THAI CHARACTER KHO KHWAI kh } $notHAccent0 ; # THAI CHARACTER KHO KHWAI k ; # THAI CHARACTER KO KAI p̣h ; # THAI CHARACTER PHO SAMPHAO p̄h ; # THAI CHARACTER PHO PHUNG ph } $notHAccent1 ; # THAI CHARACTER PHO PHAN ph } $notHAccent0 ; # THAI CHARACTER PHO PHAN p ; # THAI CHARACTER PO PLA c̄h ; # THAI CHARACTER CHO CHING c̣h ; # THAI CHARACTER CHO CHOE ch } $notHAccent1 ; # THAI CHARACTER CHO CHANG ch } $notHAccent0 ; # THAI CHARACTER CHO CHANG c ; # THAI CHARACTER CHO CHAN ṭ̄h ; # THAI CHARACTER THO THAN ṯh ; # THAI CHARACTER THO NANGMONTHO tʹh ; # THAI CHARACTER THO PHUTHAO t̄h ; # THAI CHARACTER THO THUNG ṭh ; # THAI CHARACTER THO THONG th } $notHAccent1 ; # THAI CHARACTER THO THAHAN th } $notHAccent0 ; # THAI CHARACTER THO THAHAN #Note: TO PATAK deviates from ISO since t-dotunder + h would be ambigous. So it uses vertical tick. ; # THAI CHARACTER TO PATAK t ; # THAI CHARACTER TO TAO # since there is no singleton g (generated), don't worry about that. ng ; # THAI CHARACTER NGO NGU ; # THAI CHARACTER NO NEN n ; # THAI CHARACTER NO NU ; # THAI CHARACTER YO YING ; # THAI CHARACTER DO CHADA d ; # THAI CHARACTER DO DEK b ; # THAI CHARACTER BO BAIMAI ; # THAI CHARACTER FO FA | $1 f ($notAbove*) ̄; # backward case, account for reordering m ; # THAI CHARACTER MO MA y ; # THAI CHARACTER YO YAK r ; # THAI CHARACTER RO RUA v ; # THAI CHARACTER RU ł ; # THAI CHARACTER LU w ; # THAI CHARACTER WO WAEN ṣ̄ ; # THAI CHARACTER SO SALA*** | $1 s ̣ ($notAbove*) ̄; # backward case, account for reordering s̄ʹ ; # THAI CHARACTER SO RUSI ; # THAI CHARACTER SO SUA*** | $1 s ($notAbove*) ̄; # backward case, account for reordering ; # THAI CHARACTER LO CHULA l ; # THAI CHARACTER LO LING f ; # THAI CHARACTER FO FAN x ; # THAI CHARACTER O ANG s ; # THAI CHARACTER SO SO # vowels ; # THAI CHARACTER MAI HAN-AKAT ; # THAI CHARACTER SARA AA | $1 a ($notAbove*) ̄; # backward case, account for reordering # We deviate from ISO for SARA AM for disambiguation a ̉; # THAI CHARACTER SARA AM | $1 a ($notAbove*) ̉ ; # backward case, account for reordering a ; # THAI CHARACTER SARA A ; # THAI CHARACTER SARA II | $1 i ($notAbove*) ̄ ; # backward case, account for reordering ụ̄ ; # THAI CHARACTER SARA UEE | $1 u ̣ ($notAbove*) ̄ ; # backward case, account for reordering ; # THAI CHARACTER SARA UE ; # THAI CHARACTER SARA UU | $1 u ($notAbove*) ̄ ; # backward case, account for reordering u ; # THAI CHARACTER SARA U ; # THAI CHARACTER PAIYANNOI # ฿ ↔ XXX ; # THAI CURRENCY SYMBOL BAHT e ; # THAI CHARACTER SARA E æ ; # THAI CHARACTER SARA AE o ; # THAI CHARACTER SARA O ı ; # THAI CHARACTER SARA AI MAIMUAN ; # THAI CHARACTER SARA AI MAIMALAI ɨ ; # THAI CHARACTER LAKKHANGYAO ̆ ; # THAI CHARACTER MAITAIKHU ̀ ; # THAI CHARACTER MAI EK ̂ ; # THAI CHARACTER MAI THO ́ ; # THAI CHARACTER MAI TRI ̌ ; # THAI CHARACTER MAI CHATTAWA ̒ ; # THAI CHARACTER THANTHAKHAT '~' ; # THAI CHARACTER YAMAKKAN # We deviate from ISO for disambiguation ̊ ; # THAI CHARACTER NIKHAHIT '§' ; # THAI CHARACTER FONGMAN 0 ; # THAI DIGIT ZERO 1 ; # THAI DIGIT ONE 2 ; # THAI DIGIT TWO 3 ; # THAI DIGIT THREE 4 ; # THAI DIGIT FOUR 5 ; # THAI DIGIT FIVE 6 ; # THAI DIGIT SIX 7 ; # THAI DIGIT SEVEN 8 ; # THAI DIGIT EIGHT 9 ; # THAI DIGIT NINE '||' ; # THAI CHARACTER ANGKHANKHU » ; # THAI CHARACTER KHOMUT « ; # THAI CHARACTER MAIYAMOK # moved down to make shorter first #Note: PHINTHU deviates from ISO since underring causes canonical problems. So it uses spacing tick below. ˌ ; # THAI CHARACTER PHINTHU i ; # THAI CHARACTER SARA I # fallbacks | k g ; | k h ; | c j ; | k q ; | s z ; :: (lower); ]]></tRule> </transform> </transforms> </supplementalData>