<!--
larvitgeodata
Version:
Geo data, primarily ISO territories, languages etc. Data fetched mostly from CLDR.
32 lines (30 loc) • 1.89 kB
text/xml
-->
<!--
Copyright © 1991-2013 Unicode, Inc.
CLDR data files are interpreted according to the LDML specification (http://unicode.org/reports/tr35/)
For terms of use, see http://www.unicode.org/copyright.html
-->
<supplementalData>
  <!-- Supplemental data file that defines one transliteration transform. -->
  <version number="$Revision: 11914 $"/>
  <transforms>
    <!--
      Transform between "Han" and "Spacedhan", declared for both directions
      and marked as internal. The <comment> children are part of the rule
      data and are kept verbatim; the text of every <tRule> is significant
      and must not be re-wrapped or re-indented.
    -->
    <transform
        source="Han"
        target="Spacedhan"
        direction="both"
        visibility="internal">
      <comment># Only intended for internal use</comment>
      <comment># Make sure Han are normalized, including characters that contain them. </comment>
      <comment># The first set in the filter is computed with http://unicode.org/cldr/utility/list-unicodeset.jsp?a=[:tonfkd:/XXX/:]-[:ideographic:]-[:sc=han:]</comment>
      <comment># Where XXX is the resolved [:ideographic:][:sc=han:]. It needs updating with each Unicode release!</comment>

      <!-- Normalization steps: filtered nfkc, then fullwidth-halfwidth. -->
      <tRule>:: [[㆒-㆟㈠-㉇㊀-㊰㋀-㋋㍘-㍰㍻-㍿㏠-㏾ 🈐-🈒🈔-🈺🉀-🉈🉐🉑][:ideographic:][:sc=han:]] nfkc; </tRule>
      <tRule>:: fullwidth-halfwidth;</tRule>
      <tRule>。 → '.';</tRule>

      <!--
        Variables used by the spacing rules below.
        NOTE(review): $terminalPunct lists . , : ? ! both escaped and
        unescaped and contains a bare ';'. Possibly fullwidth/halfwidth
        forms were width-normalized somewhere before this copy was made;
        confirm against the upstream CLDR file before relying on it.
      -->
      <tRule>$terminalPunct = [\.\,\:\;\?\!.,:?!。、;[:Pe:][:Pf:]];</tRule>
      <tRule>$initialPunct = [:Ps:][:Pi:];</tRule>

      <!-- Rules using the right-pointing arrow: insert a space at the {} position. -->
      <comment># add space between any Han or terminal punctuation and letters, and</comment>
      <comment># between letters and Han or initial punct</comment>
      <tRule>[[:Ideographic:] $terminalPunct] {} [:Letter:] → ' ' ;</tRule>
      <tRule>[:Letter:] [:Mark:]* {} [[:Ideographic:] $initialPunct] → ' ' ;</tRule>

      <!-- Rules using the left-pointing arrow: match the space between { and }. -->
      <comment># remove spacing between ideographs and other letters</comment>
      <tRule>← [:Ideographic:] { ' ' } [:Letter:] ;</tRule>
      <tRule>← [:Letter:] [:Mark:]* { ' ' } [:Ideographic:] ;</tRule>
    </transform>
  </transforms>
</supplementalData>