UNPKG

cldr

Version:

Library for extracting data from CLDR (the Unicode Common Locale Data Repository)

476 lines (475 loc) 26.5 kB
<?xml version="1.0" encoding="UTF-8" ?>
<!DOCTYPE ldml SYSTEM "../../common/dtd/ldml.dtd">
<!-- Copyright © 1991-2024 Unicode, Inc.
For terms of use, see http://www.unicode.org/copyright.html
SPDX-License-Identifier: Unicode-3.0
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
-->
<!--
  Root segmentation rules. Each <segmentation> defines a set of variables and an ordered
  list of rules, where "×" marks a position with no break and "÷" marks a break.
  Several variable ids are defined more than once within one <variables> list (for example
  $AL in LineBreak); the later definitions refer to the earlier ones, so the order of the
  <variable> elements is significant.
-->
<ldml>
  <identity>
    <version number="$Revision$"/>
    <language type="root"/>
  </identity>
  <segmentations>
    <segmentation type="GraphemeClusterBreak">
      <variables>
        <!-- VARIABLES -->
        <variable id="$CR">\p{Grapheme_Cluster_Break=CR}</variable>
        <variable id="$LF">\p{Grapheme_Cluster_Break=LF}</variable>
        <variable id="$Control">\p{Grapheme_Cluster_Break=Control}</variable>
        <variable id="$Extend">\p{Grapheme_Cluster_Break=Extend}</variable>
        <variable id="$ZWJ">\p{Grapheme_Cluster_Break=ZWJ}</variable>
        <variable id="$RI">\p{Grapheme_Cluster_Break=Regional_Indicator}</variable>
        <variable id="$Prepend">\p{Grapheme_Cluster_Break=Prepend}</variable>
        <variable id="$SpacingMark">\p{Grapheme_Cluster_Break=SpacingMark}</variable>
        <variable id="$L">\p{Grapheme_Cluster_Break=L}</variable>
        <variable id="$V">\p{Grapheme_Cluster_Break=V}</variable>
        <variable id="$T">\p{Grapheme_Cluster_Break=T}</variable>
        <variable id="$LV">\p{Grapheme_Cluster_Break=LV}</variable>
        <variable id="$LVT">\p{Grapheme_Cluster_Break=LVT}</variable>
        <!-- Note: The following may overlap with the above.
             Note: ConjunctLinkingScripts is not used anymore, instead that list exists in the
             derivation of Indic_Conjunct_Break. It is kept here so that the diff of the generated
             test cases compared to the Unicode 15.1 β is minimal.
             TODO(egg): Consider removing in Unicode 16.0.
        -->
        <variable id="$ConjunctLinkingScripts">[\p{Gujr}\p{sc=Telu}\p{sc=Mlym}\p{sc=Orya}\p{sc=Beng}\p{sc=Deva}]</variable>
        <variable id="$ConjunctLinker">\p{Indic_Conjunct_Break=Linker}</variable>
        <variable id="$LinkingConsonant">\p{Indic_Conjunct_Break=Consonant}</variable>
        <variable id="$ExtPict">\p{Extended_Pictographic}</variable>
        <variable id="$ExtCccZwj">[\p{Indic_Conjunct_Break=Linker}\p{Indic_Conjunct_Break=Extend}]</variable>
      </variables>
      <segmentRules>
        <!-- RULES
             Break at the start and end of text, unless the text is empty.
             Do not break between a CR and LF. Otherwise, break before and after controls.
        -->
        <rule id="3"> $CR × $LF </rule>
        <rule id="4"> ( $Control | $CR | $LF ) ÷ </rule>
        <rule id="5"> ÷ ( $Control | $CR | $LF ) </rule>
        <!-- Do not break Hangul syllable sequences. -->
        <rule id="6"> $L × ( $L | $V | $LV | $LVT ) </rule>
        <rule id="7"> ( $LV | $V ) × ( $V | $T ) </rule>
        <rule id="8"> ( $LVT | $T) × $T </rule>
        <!-- Do not break before extending characters or ZWJ. -->
        <rule id="9"> × ($Extend | $ZWJ) </rule>
        <!-- Only for extended grapheme clusters: Do not break before SpacingMarks, or after Prepend characters. -->
        <rule id="9.1"> × $SpacingMark </rule>
        <rule id="9.2"> $Prepend × </rule>
        <!-- Do not break between a linking consonant and the next linking consonant when an
             Indic_Conjunct_Break=Linker occurs between them (other Linker/Extend characters may intervene). -->
        <rule id="9.3"> $LinkingConsonant $ExtCccZwj* $ConjunctLinker $ExtCccZwj* × $LinkingConsonant </rule>
        <!-- Do not break within emoji ZWJ sequences. -->
        <rule id="11"> $ExtPict $Extend* $ZWJ × $ExtPict </rule>
        <!-- Do not break within emoji flag sequences. That is, do not break between regional indicator (RI)
             symbols if there is an odd number of RI characters before the break point. -->
        <rule id="12"> ^ ($RI $RI)* $RI × $RI </rule>
        <rule id="13"> [^$RI] ($RI $RI)* $RI × $RI </rule>
        <!-- Otherwise, break everywhere. -->
      </segmentRules>
    </segmentation>
    <segmentation type="LineBreak">
      <variables>
        <!-- VARIABLES -->
        <variable id="$AI">\p{Line_Break=Ambiguous}</variable>
        <variable id="$AK">\p{Line_Break=Aksara}</variable>
        <variable id="$AL">\p{Line_Break=Alphabetic}</variable>
        <variable id="$AP">\p{Line_Break=Aksara_Prebase}</variable>
        <variable id="$AS">\p{Line_Break=Aksara_Start}</variable>
        <variable id="$B2">\p{Line_Break=Break_Both}</variable>
        <variable id="$BA">\p{Line_Break=Break_After}</variable>
        <variable id="$BB">\p{Line_Break=Break_Before}</variable>
        <variable id="$BK">\p{Line_Break=Mandatory_Break}</variable>
        <variable id="$CB">\p{Line_Break=Contingent_Break}</variable>
        <variable id="$CL">\p{Line_Break=Close_Punctuation}</variable>
        <variable id="$CP">\p{Line_Break=CP}</variable>
        <variable id="$CM1">\p{Line_Break=Combining_Mark}</variable>
        <variable id="$CR">\p{Line_Break=Carriage_Return}</variable>
        <variable id="$EX">\p{Line_Break=Exclamation}</variable>
        <variable id="$GL">\p{Line_Break=Glue}</variable>
        <variable id="$H2">\p{Line_Break=H2}</variable>
        <variable id="$H3">\p{Line_Break=H3}</variable>
        <variable id="$HL">\p{Line_Break=HL}</variable>
        <variable id="$HY">\p{Line_Break=Hyphen}</variable>
        <variable id="$ID">\p{Line_Break=Ideographic}</variable>
        <variable id="$IN">\p{Line_Break=Inseparable}</variable>
        <variable id="$IS">\p{Line_Break=Infix_Numeric}</variable>
        <variable id="$JL">\p{Line_Break=JL}</variable>
        <variable id="$JT">\p{Line_Break=JT}</variable>
        <variable id="$JV">\p{Line_Break=JV}</variable>
        <variable id="$LF">\p{Line_Break=Line_Feed}</variable>
        <variable id="$NL">\p{Line_Break=Next_Line}</variable>
        <variable id="$NS">\p{Line_Break=Nonstarter}</variable>
        <variable id="$NU">\p{Line_Break=Numeric}</variable>
        <variable id="$OP">\p{Line_Break=Open_Punctuation}</variable>
        <variable id="$PO">\p{Line_Break=Postfix_Numeric}</variable>
        <variable id="$PR">\p{Line_Break=Prefix_Numeric}</variable>
        <variable id="$QU">\p{Line_Break=Quotation}</variable>
        <variable id="$SA">\p{Line_Break=Complex_Context}</variable>
        <variable id="$SG">\p{Line_Break=Surrogate}</variable>
        <variable id="$SP">\p{Line_Break=Space}</variable>
        <variable id="$SY">\p{Line_Break=Break_Symbols}</variable>
        <variable id="$VF">\p{Line_Break=Virama_Final}</variable>
        <variable id="$VI">\p{Line_Break=Virama}</variable>
        <variable id="$WJ">\p{Line_Break=Word_Joiner}</variable>
        <variable id="$XX">\p{Line_Break=Unknown}</variable>
        <variable id="$ZW">\p{Line_Break=ZWSpace}</variable>
        <variable id="$CJ">\p{Line_Break=Conditional_Japanese_Starter}</variable>
        <variable id="$RI">\p{Line_Break=Regional_Indicator}</variable>
        <variable id="$EB">\p{Line_Break=E_Base}</variable>
        <variable id="$EM">\p{Line_Break=E_Modifier}</variable>
        <!-- $ZWJ_O keeps the plain ZWJ class; $ZWJ is redefined further down with trailing $X,
             and rule 8.1 uses $ZWJ_O. -->
        <variable id="$ZWJ_O">\p{Line_Break=ZWJ}</variable>
        <variable id="$ZWJ">\p{Line_Break=ZWJ}</variable>
        <variable id="$QU_Pi">[$QU &amp; \p{gc=Pi}]</variable>
        <variable id="$QU_Pf">[$QU &amp; \p{gc=Pf}]</variable>
        <!-- U+25CC DOTTED CIRCLE, used by the LB 28a rules (28.11 to 28.14). This definition must not be
             empty: the macro ($DottedCircle $X) defined below would then be able to match nothing. -->
        <variable id="$DottedCircle">◌</variable>
        <variable id="$CP30">[$CP-[\p{ea=F}\p{ea=W}\p{ea=H}]]</variable>
        <variable id="$OP30">[$OP-[\p{ea=F}\p{ea=W}\p{ea=H}]]</variable>
        <variable id="$ExtPictUnassigned">[\p{Extended_Pictographic}&amp;\p{gc=Cn}]</variable>
        <!-- Some rules refer to the start and end of text. We could just use a literal ^ for sot,
             but naming it as in the spec makes it easier to compare. The parser will eat (and choke on) $,
             so we play a stupid trick instead. -->
        <variable id="$sot">^</variable>
        <variable id="$eot">(?!.)</variable>
        <!-- SPECIAL EXTENSIONS -->
        <variable id="$CM">[$CM1 $ZWJ]</variable>
        <!-- LB 1 Assign a line breaking class to each code point of the input. Resolve AI, CB, SA, SG,
             and XX into other line breaking classes depending on criteria outside the scope of this algorithm.
             NOTE: CB is ok to fall through, but must handle others here. -->
        <variable id="$AL">[$AI $AL $SG $XX $SA]</variable>
        <variable id="$NS">[$NS $CJ]</variable>
        <!-- WARNING: Fixes for Rule 9
             Treat X (CM | ZWJ)* as if it were X.
             Where X is any line break class except SP, BK, CR, LF, NL or ZW. -->
        <variable id="$X">$CM*</variable>
        <!-- MACROS -->
        <variable id="$Spec1_">[$SP $BK $CR $LF $NL $ZW]</variable>
        <variable id="$Spec2_">[^ $SP $BK $CR $LF $NL $ZW]</variable>
        <variable id="$Spec3a_">[^ $SP $BA $HY $CM]</variable>
        <variable id="$Spec3b_">[^ $BA $HY $CM]</variable>
        <variable id="$Spec4_">[^ $NU $CM]</variable>
        <variable id="$AI">($AI $X)</variable>
        <variable id="$AK">($AK $X)</variable>
        <variable id="$AL">($AL $X)</variable>
        <variable id="$AP">($AP $X)</variable>
        <variable id="$AS">($AS $X)</variable>
        <variable id="$B2">($B2 $X)</variable>
        <variable id="$BA">($BA $X)</variable>
        <variable id="$BB">($BB $X)</variable>
        <variable id="$CB">($CB $X)</variable>
        <variable id="$CL">($CL $X)</variable>
        <variable id="$CP">($CP $X)</variable>
        <variable id="$CM">($CM $X)</variable>
        <variable id="$EX">($EX $X)</variable>
        <variable id="$GL">($GL $X)</variable>
        <variable id="$H2">($H2 $X)</variable>
        <variable id="$H3">($H3 $X)</variable>
        <variable id="$HL">($HL $X)</variable>
        <variable id="$HY">($HY $X)</variable>
        <variable id="$ID">($ID $X)</variable>
        <variable id="$IN">($IN $X)</variable>
        <variable id="$IS">($IS $X)</variable>
        <variable id="$JL">($JL $X)</variable>
        <variable id="$JT">($JT $X)</variable>
        <variable id="$JV">($JV $X)</variable>
        <variable id="$NS">($NS $X)</variable>
        <variable id="$NU">($NU $X)</variable>
        <variable id="$OP">($OP $X)</variable>
        <variable id="$PO">($PO $X)</variable>
        <variable id="$PR">($PR $X)</variable>
        <variable id="$QU">($QU $X)</variable>
        <variable id="$SA">($SA $X)</variable>
        <variable id="$SG">($SG $X)</variable>
        <variable id="$SY">($SY $X)</variable>
        <variable id="$VF">($VF $X)</variable>
        <variable id="$VI">($VI $X)</variable>
        <variable id="$WJ">($WJ $X)</variable>
        <variable id="$XX">($XX $X)</variable>
        <variable id="$RI">($RI $X)</variable>
        <variable id="$EB">($EB $X)</variable>
        <variable id="$EM">($EM $X)</variable>
        <variable id="$ZWJ">($ZWJ $X)</variable>
        <variable id="$QU_Pi">($QU_Pi $X)</variable>
        <variable id="$QU_Pf">($QU_Pf $X)</variable>
        <variable id="$DottedCircle">($DottedCircle $X)</variable>
        <variable id="$CP30">($CP30 $X)</variable>
        <variable id="$OP30">($OP30 $X)</variable>
        <!-- OUT OF ORDER ON PURPOSE
             LB 10 Treat any remaining combining mark as AL. -->
        <variable id="$AL">($AL | ^ $CM | (?&lt;=$Spec1_) $CM)</variable>
      </variables>
      <segmentRules>
        <!-- RULES
             LB 4 Always break after hard line breaks (but never between CR and LF). -->
        <rule id="4"> $BK ÷ </rule>
        <!-- LB 5 Treat CR followed by LF, as well as CR, LF and NL as hard line breaks. -->
        <rule id="5.01"> $CR × $LF </rule>
        <rule id="5.02"> $CR ÷ </rule>
        <rule id="5.03"> $LF ÷ </rule>
        <rule id="5.04"> $NL ÷ </rule>
        <!-- LB 6 Do not break before hard line breaks. -->
        <rule id="6"> × ( $BK | $CR | $LF | $NL ) </rule>
        <!-- LB 7 Do not break before spaces or zero-width space. -->
        <rule id="7.01"> × $SP </rule>
        <rule id="7.02"> × $ZW </rule>
        <!-- LB 8 Break before any character following a zero-width space, even if one or more spaces intervene. -->
        <rule id="8"> $ZW $SP* ÷ </rule>
        <!-- LB 8a Don't break between ZWJ and IDs (for use in Emoji ZWJ sequences) -->
        <rule id="8.1"> $ZWJ_O × </rule>
        <!-- LB 9 Do not break a combining character sequence; treat it as if it has the LB class of the
             base character in all of the following rules.
             (Where X is any line break class except SP, BK, CR, LF, NL or ZW.) -->
        <rule id="9"> $Spec2_ × $CM </rule>
        <!-- LB 11 Do not break before or after Word joiner and related characters. -->
        <rule id="11.01"> × $WJ </rule>
        <rule id="11.02"> $WJ × </rule>
        <!-- LB 12 Do not break after NBSP and related characters. -->
        <rule id="12"> $GL × </rule>
        <rule id="12.1"> $Spec3a_ × $GL </rule>
        <rule id="12.2"> $Spec3b_ $CM+ × $GL </rule>
        <rule id="12.3"> ^ $CM+ × $GL </rule>
        <!-- LB 13 Do not break before ‘]’ or ‘!’ or ‘;’ or ‘/’, even after spaces.
             Using customization 7. -->
        <rule id="13.01"> × $EX </rule>
        <rule id="13.02"> $Spec4_ × ($CL | $CP | $IS | $SY) </rule>
        <rule id="13.03"> $Spec4_ $CM+ × ($CL | $CP | $IS | $SY) </rule>
        <rule id="13.04"> ^ $CM+ × ($CL | $CP | $IS | $SY) </rule>
        <!-- LB 14 Do not break after ‘[’, even after spaces. -->
        <rule id="14"> $OP $SP* × </rule>
        <!-- LB 15a Do not break after an unresolved initial punctuation that lies at the start of the line,
             after a space, after opening punctuation, or after an unresolved quotation mark, even after spaces. -->
        <rule id="15.11"> ( $sot | $BK | $CR | $LF | $NL | $OP | $QU | $GL | $SP | $ZW ) $QU_Pi $SP* × </rule>
        <!-- LB 15b Do not break before an unresolved final punctuation that lies at the end of the line,
             before a space, before a prohibited break, or before an unresolved quotation mark, even before spaces. -->
        <rule id="15.21"> × $QU_Pf ( $SP | $GL | $WJ | $CL | $QU | $CP | $EX | $IS | $SY | $BK | $CR | $LF | $NL | $ZW | $eot ) </rule>
        <!-- LB 16 Do not break between closing punctuation and a nonstarter (lb=NS), even with intervening spaces. -->
        <rule id="16"> ($CL | $CP) $SP* × $NS </rule>
        <!-- LB 17 Do not break within ‘——’, even with intervening spaces. -->
        <rule id="17"> $B2 $SP* × $B2 </rule>
        <!-- LB 18 Break after spaces. -->
        <rule id="18"> $SP ÷ </rule>
        <!-- LB 19 Do not break before or after ‘"’. -->
        <rule id="19.01"> × $QU </rule>
        <rule id="19.02"> $QU × </rule>
        <!-- LB 20 Break before and after unresolved CB. -->
        <rule id="20.01"> ÷ $CB </rule>
        <rule id="20.02"> $CB ÷ </rule>
        <!-- LB 21 Do not break before hyphen-minus, other hyphens, fixed-width spaces, small kana and
             other non-starters, or after acute accents. -->
        <rule id="21.01"> × $BA </rule>
        <rule id="21.02"> × $HY </rule>
        <rule id="21.03"> × $NS </rule>
        <rule id="21.04"> $BB × </rule>
        <!-- LB 21a Don't break after Hebrew + Hyphen. -->
        <rule id="21.1"> $HL ($HY | $BA) × </rule>
        <!-- LB 21b Don’t break between Solidus and Hebrew letters. -->
        <rule id="21.2"> $SY × $HL </rule>
        <!-- LB 22 Do not break before ellipses. -->
        <rule id="22"> × $IN </rule>
        <!-- LB 23 Do not break between digits and letters. -->
        <rule id="23.02"> ($AL | $HL) × $NU </rule>
        <rule id="23.03"> $NU × ($AL | $HL) </rule>
        <!-- LB 23a Do not break between numeric prefixes and ideographs, or between ideographs and numeric postfixes. -->
        <rule id="23.12"> $PR × ($ID | $EB | $EM) </rule>
        <rule id="23.13"> ($ID | $EB | $EM) × $PO </rule>
        <!-- LB 24 Do not break between numeric prefix/postfix and letters, or between letters and prefix/postfix. -->
        <rule id="24.02"> ($PR | $PO) × ($AL | $HL) </rule>
        <rule id="24.03"> ($AL | $HL) × ($PR | $PO) </rule>
        <!-- Using customization 7
             LB Alternative: ( PR | PO) ? ( OP | HY ) ? NU (NU | SY | IS) * (CL | CP) ? ( PR | PO) ?
             Insert × every place it could go. However, make sure that at least one thing is concrete,
             otherwise this would cause $NU to not break before or after. -->
        <rule id="25.01"> ($PR | $PO) × ( $OP | $HY )? $NU </rule>
        <rule id="25.02"> ( $OP | $HY ) × $NU </rule>
        <rule id="25.03"> $NU × ($NU | $SY | $IS) </rule>
        <rule id="25.04"> $NU ($NU | $SY | $IS)* × ($NU | $SY | $IS | $CL | $CP) </rule>
        <rule id="25.05"> $NU ($NU | $SY | $IS)* ($CL | $CP)? × ($PO | $PR) </rule>
        <!-- LB 26 Do not break a Korean syllable.
             NOTE(review): the alternations in rules 26.x and 27.x are not parenthesized, unlike the other
             rules in this file. Presumably each side of × is parsed on its own, which would make this
             equivalent to the parenthesized form; confirm against the rule consumer before relying on it. -->
        <rule id="26.01"> $JL × $JL | $JV | $H2 | $H3 </rule>
        <rule id="26.02"> $JV | $H2 × $JV | $JT </rule>
        <rule id="26.03"> $JT | $H3 × $JT </rule>
        <!-- LB 27 Treat a Korean Syllable Block the same as ID. -->
        <rule id="27.01"> $JL | $JV | $JT | $H2 | $H3 × $PO </rule>
        <rule id="27.02"> $PR × $JL | $JV | $JT | $H2 | $H3 </rule>
        <!-- LB 28 Do not break between alphabetics ("at"). -->
        <rule id="28"> ($AL | $HL) × ($AL | $HL) </rule>
        <!-- LB 28a Do not break inside the orthographic syllables of Brahmic scripts. -->
        <rule id="28.11"> $AP × ($AK | $DottedCircle | $AS) </rule>
        <rule id="28.12"> ($AK | $DottedCircle | $AS) × ($VF | $VI) </rule>
        <rule id="28.13"> ($AK | $DottedCircle | $AS) $VI × ($AK | $DottedCircle) </rule>
        <rule id="28.14"> ($AK | $DottedCircle | $AS) × ($AK | $DottedCircle | $AS) $VF </rule>
        <!-- LB 29 Do not break between numeric punctuation and alphabetics ("e.g."). -->
        <rule id="29"> $IS × ($AL | $HL) </rule>
        <!-- LB 30 Do not break between letters, numbers or ordinary symbols and opening or closing punctuation. -->
        <rule id="30.01"> ($AL | $HL | $NU) × $OP30 </rule>
        <rule id="30.02"> $CP30 × ($AL | $HL | $NU) </rule>
        <!-- LB 30a Break between two Regional Indicators if and only if there is an even number of them
             before the point being considered. -->
        <rule id="30.11"> $sot ($RI $RI)* $RI × $RI </rule>
        <rule id="30.12"> [^$RI] ($RI $RI)* $RI × $RI </rule>
        <rule id="30.13"> $RI ÷ $RI </rule>
        <!-- LB 30b Do not break between an emoji base (or potential emoji) and an emoji modifier. -->
        <rule id="30.21"> $EB × $EM </rule>
        <rule id="30.22"> $ExtPictUnassigned × $EM </rule>
      </segmentRules>
    </segmentation>
    <segmentation type="SentenceBreak">
      <variables>
        <!-- VARIABLES -->
        <variable id="$CR">\p{Sentence_Break=CR}</variable>
        <variable id="$LF">\p{Sentence_Break=LF}</variable>
        <variable id="$Extend">\p{Sentence_Break=Extend}</variable>
        <variable id="$Format">\p{Sentence_Break=Format}</variable>
        <variable id="$Sep">\p{Sentence_Break=Sep}</variable>
        <variable id="$Sp">\p{Sentence_Break=Sp}</variable>
        <variable id="$Lower">\p{Sentence_Break=Lower}</variable>
        <variable id="$Upper">\p{Sentence_Break=Upper}</variable>
        <variable id="$OLetter">\p{Sentence_Break=OLetter}</variable>
        <variable id="$Numeric">\p{Sentence_Break=Numeric}</variable>
        <variable id="$ATerm">\p{Sentence_Break=ATerm}</variable>
        <variable id="$STerm">\p{Sentence_Break=STerm}</variable>
        <variable id="$Close">\p{Sentence_Break=Close}</variable>
        <variable id="$SContinue">\p{Sentence_Break=SContinue}</variable>
        <variable id="$Any">.</variable>
        <!-- SPECIAL EXTENSIONS
             WARNING: For Rule 5, now add format and extend to everything but Sep, Format, and Extend -->
        <variable id="$FE">[$Format $Extend]</variable>
        <variable id="$NotPreLower_">[^ $OLetter $Upper $Lower $Sep $CR $LF $STerm $ATerm]</variable>
        <variable id="$Sp">($Sp $FE*)</variable>
        <variable id="$Lower">($Lower $FE*)</variable>
        <variable id="$Upper">($Upper $FE*)</variable>
        <variable id="$OLetter">($OLetter $FE*)</variable>
        <variable id="$Numeric">($Numeric $FE*)</variable>
        <variable id="$ATerm">($ATerm $FE*)</variable>
        <variable id="$STerm">($STerm $FE*)</variable>
        <variable id="$Close">($Close $FE*)</variable>
        <variable id="$SContinue">($SContinue $FE*)</variable>
        <!-- MACROS -->
        <variable id="$ParaSep">($Sep | $CR | $LF)</variable>
        <variable id="$SATerm">($STerm | $ATerm)</variable>
      </variables>
      <segmentRules>
        <!-- RULES
             Break at the start and end of text, unless the text is empty.
             Do not break within CRLF. -->
        <rule id="3"> $CR × $LF </rule>
        <!-- Break after paragraph separators. -->
        <rule id="4"> $ParaSep ÷ </rule>
        <!-- Ignore Format and Extend characters, except after sot, ParaSep, and within CRLF.
             (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
             WARNING: Implemented as don't break before format (except after linebreaks), AND add format
             and extend in all variables definitions that appear after this point! -->
        <rule id="5"> × [$Format $Extend] </rule>
        <!-- Do not break after full stop in certain contexts. [See note below.]
             Do not break after ambiguous terminators like period, if immediately followed by a number or
             lowercase letter, is between uppercase letters, or if the first following letter (optionally
             after certain punctuation) is lowercase. For example, a period may be an abbreviation or
             numeric period, and not mark the end of a sentence. -->
        <rule id="6"> $ATerm × $Numeric </rule>
        <rule id="7"> ($Upper | $Lower) $ATerm × $Upper </rule>
        <rule id="8"> $ATerm $Close* $Sp* × $NotPreLower_* $Lower </rule>
        <rule id="8.1"> $SATerm $Close* $Sp* × ($SContinue | $SATerm) </rule>
        <!-- Break after sentence terminators, but include closing punctuation, trailing spaces, and any
             paragraph separator. [See note below.] Include closing punctuation, trailing spaces, and
             (optionally) a paragraph separator. -->
        <rule id="9"> $SATerm $Close* × ( $Close | $Sp | $ParaSep ) </rule>
        <!-- Note the fix to $Sp*, $Sep? -->
        <rule id="10"> $SATerm $Close* $Sp* × ( $Sp | $ParaSep ) </rule>
        <rule id="11"> $SATerm $Close* $Sp* $ParaSep? ÷ </rule>
        <!-- Otherwise, do not break -->
        <rule id="998"> × $Any </rule>
      </segmentRules>
    </segmentation>
    <segmentation type="WordBreak">
      <variables>
        <!-- VARIABLES -->
        <variable id="$CR">\p{Word_Break=CR}</variable>
        <variable id="$LF">\p{Word_Break=LF}</variable>
        <variable id="$Newline">\p{Word_Break=Newline}</variable>
        <variable id="$Extend">\p{Word_Break=Extend}</variable>
        <!-- Now normal variables -->
        <variable id="$Format">[\p{Word_Break=Format}]</variable>
        <variable id="$Katakana">\p{Word_Break=Katakana}</variable>
        <variable id="$ALetter">\p{Word_Break=ALetter}</variable>
        <variable id="$MidLetter">[\p{Word_Break = MidLetter} - [\: \uFE55 \uFF1A]]</variable>
        <variable id="$MidNum">\p{Word_Break=MidNum}</variable>
        <variable id="$MidNumLet">\p{Word_Break=MidNumLet}</variable>
        <variable id="$Numeric">\p{Word_Break=Numeric}</variable>
        <variable id="$ExtendNumLet">\p{Word_Break=ExtendNumLet}</variable>
        <variable id="$RI">\p{Word_Break=Regional_Indicator}</variable>
        <variable id="$Hebrew_Letter">\p{Word_Break=Hebrew_Letter}</variable>
        <variable id="$Double_Quote">\p{Word_Break=Double_Quote}</variable>
        <variable id="$Single_Quote">\p{Word_Break=Single_Quote}</variable>
        <variable id="$ZWJ">\p{Word_Break=ZWJ}</variable>
        <!-- Note: The following may overlap with the above -->
        <variable id="$ExtPict">\p{Extended_Pictographic}</variable>
        <variable id="$WSegSpace">\p{Word_Break=WSegSpace}</variable>
        <!-- MACROS -->
        <variable id="$AHLetter">($ALetter | $Hebrew_Letter)</variable>
        <variable id="$MidNumLetQ">($MidNumLet | $Single_Quote)</variable>
        <!-- SPECIAL EXTENSIONS
             Add format and extend to everything -->
        <variable id="$FE">[$Format $Extend $ZWJ]</variable>
        <variable id="$NotBreak_">[^ $Newline $CR $LF ]</variable>
        <variable id="$Katakana">($Katakana $FE*)</variable>
        <variable id="$ALetter">($ALetter $FE*)</variable>
        <variable id="$MidLetter">($MidLetter $FE*)</variable>
        <variable id="$MidNum">($MidNum $FE*)</variable>
        <variable id="$MidNumLet">($MidNumLet $FE*)</variable>
        <variable id="$Numeric">($Numeric $FE*)</variable>
        <variable id="$ExtendNumLet">($ExtendNumLet $FE*)</variable>
        <variable id="$RI">($RI $FE*)</variable>
        <variable id="$Hebrew_Letter">($Hebrew_Letter $FE*)</variable>
        <variable id="$Double_Quote">($Double_Quote $FE*)</variable>
        <variable id="$Single_Quote">($Single_Quote $FE*)</variable>
        <variable id="$AHLetter">($AHLetter $FE*)</variable>
        <variable id="$MidNumLetQ">($MidNumLetQ $FE*)</variable>
      </variables>
      <segmentRules>
        <!-- RULES
             Break at the start and end of text, unless the text is empty.
             Do not break within CRLF. -->
        <rule id="3"> $CR × $LF </rule>
        <!-- Otherwise break before and after Newlines (including CR and LF) -->
        <rule id="3.1"> ($Newline | $CR | $LF) ÷ </rule>
        <rule id="3.2"> ÷ ($Newline | $CR | $LF) </rule>
        <!-- Do not break within emoji zwj sequences. -->
        <rule id="3.3"> $ZWJ × $ExtPict </rule>
        <!-- Keep horizontal whitespace together. -->
        <rule id="3.4"> $WSegSpace × $WSegSpace </rule>
        <!-- Ignore Format and Extend characters, except after sot, CR, LF, and Newline.
             (See Section 6.2, Replacing Ignore Rules.) This also has the effect of: Any × (Format | Extend)
             WARNING: Implemented as don't break before format (except after linebreaks), AND add format
             and extend in all variables definitions that appear after this point! -->
        <rule id="4"> $NotBreak_ × [$Format $Extend $ZWJ] </rule>
        <!-- VANILLA RULES
             Do not break between most letters. -->
        <rule id="5"> $AHLetter × $AHLetter </rule>
        <!-- Do not break letters across certain punctuation. -->
        <rule id="6"> $AHLetter × ($MidLetter | $MidNumLetQ) $AHLetter </rule>
        <rule id="7"> $AHLetter ($MidLetter | $MidNumLetQ) × $AHLetter </rule>
        <rule id="7.1"> $Hebrew_Letter × $Single_Quote </rule>
        <rule id="7.2"> $Hebrew_Letter × $Double_Quote $Hebrew_Letter </rule>
        <rule id="7.3"> $Hebrew_Letter $Double_Quote × $Hebrew_Letter </rule>
        <!-- Do not break within sequences of digits, or digits adjacent to letters (“3a”, or “A3”). -->
        <rule id="8"> $Numeric × $Numeric </rule>
        <rule id="9"> $AHLetter × $Numeric </rule>
        <rule id="10"> $Numeric × $AHLetter </rule>
        <!-- Do not break within sequences, such as “3.2” or “3,456.789”. -->
        <rule id="11"> $Numeric ($MidNum | $MidNumLetQ) × $Numeric </rule>
        <rule id="12"> $Numeric × ($MidNum | $MidNumLetQ) $Numeric </rule>
        <!-- Do not break between Katakana. -->
        <rule id="13"> $Katakana × $Katakana </rule>
        <!-- Do not break from extenders. -->
        <rule id="13.1"> ($AHLetter | $Numeric | $Katakana | $ExtendNumLet) × $ExtendNumLet </rule>
        <rule id="13.2"> $ExtendNumLet × ($AHLetter | $Numeric | $Katakana) </rule>
        <!-- Do not break within emoji flag sequences. That is, do not break between regional indicator (RI)
             symbols if there is an odd number of RI characters before the break point. -->
        <rule id="15"> ^ ($RI $RI)* $RI × $RI </rule>
        <rule id="16"> [^$RI] ($RI $RI)* $RI × $RI </rule>
        <!-- Otherwise, break everywhere (including around ideographs). -->
      </segmentRules>
    </segmentation>
  </segmentations>
</ldml>