sentencex
Version:
Sentence segmentation library
165 lines (163 loc) • 12.2 kB
JavaScript
// List of sentence terminator characters; every entry is exactly one code point.
//
// The generated section covers the code points whose Unicode Sentence_Break
// property is STerm or ATerm. It was generated with the Unicode::Tussle perl script:
// unichars -aBbs '[\p{Sentence_Break=STerm}\p{Sentence_Break=ATerm}]' | awk '$2="\""$2"\", //"'
// Refer: https://www.unicode.org/Public/UCD/latest/ucd/auxiliary/SentenceBreakProperty.txt
//
// NOTE: the script emits literal glyphs. Non-ASCII entries are stored here as
// escape sequences (taken from the U+ value in each trailing comment) so that
// width folding / normalization by editors or tooling cannot silently rewrite
// them, e.g. turning U+FF01 FULLWIDTH EXCLAMATION MARK into a second ASCII '!'.
// Convert glyphs to escapes when regenerating.
const GLOBAL_SENTENCE_TERMINATORS = [
  '!', // U+00021 BC=ON BLK=Basic_Latin SC=Common EXCLAMATION MARK
  '.', // U+0002E BC=CS BLK=Basic_Latin SC=Common FULL STOP
  '?', // U+0003F BC=ON BLK=Basic_Latin SC=Common QUESTION MARK
  '\u0589', // U+00589 BC=L BLK=Armenian SC=Armenian ARMENIAN FULL STOP
  '\u061D', // U+0061D BC=AL BLK=Arabic SC=Arabic ARABIC END OF TEXT MARK
  '\u061E', // U+0061E BC=AL BLK=Arabic SC=Arabic ARABIC TRIPLE DOT PUNCTUATION MARK
  '\u061F', // U+0061F BC=AL BLK=Arabic SC=Common ARABIC QUESTION MARK
  '\u06D4', // U+006D4 BC=AL BLK=Arabic SC=Arabic ARABIC FULL STOP
  '\u0700', // U+00700 BC=AL BLK=Syriac SC=Syriac SYRIAC END OF PARAGRAPH
  '\u0701', // U+00701 BC=AL BLK=Syriac SC=Syriac SYRIAC SUPRALINEAR FULL STOP
  '\u0702', // U+00702 BC=AL BLK=Syriac SC=Syriac SYRIAC SUBLINEAR FULL STOP
  '\u07F9', // U+007F9 BC=ON BLK=NKo SC=Nko NKO EXCLAMATION MARK
  '\u0837', // U+00837 BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION MELODIC QITSA
  '\u0839', // U+00839 BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION QITSA
  '\u083D', // U+0083D BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION SOF MASHFAAT
  '\u083E', // U+0083E BC=R BLK=Samaritan SC=Samaritan SAMARITAN PUNCTUATION ANNAAU
  '\u0964', // U+00964 BC=L BLK=Devanagari SC=Common DEVANAGARI DANDA
  '\u0965', // U+00965 BC=L BLK=Devanagari SC=Common DEVANAGARI DOUBLE DANDA
  '\u104A', // U+0104A BC=L BLK=Myanmar SC=Myanmar MYANMAR SIGN LITTLE SECTION
  '\u104B', // U+0104B BC=L BLK=Myanmar SC=Myanmar MYANMAR SIGN SECTION
  '\u1362', // U+01362 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC FULL STOP
  '\u1367', // U+01367 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC QUESTION MARK
  '\u1368', // U+01368 BC=L BLK=Ethiopic SC=Ethiopic ETHIOPIC PARAGRAPH SEPARATOR
  '\u166E', // U+0166E BC=L BLK=Unified_Canadian_Aboriginal_Syllabics SC=Canadian_Aboriginal CANADIAN SYLLABICS FULL STOP
  '\u1735', // U+01735 BC=L BLK=Hanunoo SC=Common PHILIPPINE SINGLE PUNCTUATION
  '\u1736', // U+01736 BC=L BLK=Hanunoo SC=Common PHILIPPINE DOUBLE PUNCTUATION
  '\u1803', // U+01803 BC=ON BLK=Mongolian SC=Common MONGOLIAN FULL STOP
  '\u1809', // U+01809 BC=ON BLK=Mongolian SC=Mongolian MONGOLIAN MANCHU FULL STOP
  '\u1944', // U+01944 BC=ON BLK=Limbu SC=Limbu LIMBU EXCLAMATION MARK
  '\u1945', // U+01945 BC=ON BLK=Limbu SC=Limbu LIMBU QUESTION MARK
  '\u1AA8', // U+01AA8 BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN KAAN
  '\u1AA9', // U+01AA9 BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN KAANKUU
  '\u1AAA', // U+01AAA BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN SATKAAN
  '\u1AAB', // U+01AAB BC=L BLK=Tai_Tham SC=Tai_Tham TAI THAM SIGN SATKAANKUU
  '\u1B5A', // U+01B5A BC=L BLK=Balinese SC=Balinese BALINESE PANTI
  '\u1B5B', // U+01B5B BC=L BLK=Balinese SC=Balinese BALINESE PAMADA
  '\u1B5E', // U+01B5E BC=L BLK=Balinese SC=Balinese BALINESE CARIK SIKI
  '\u1B5F', // U+01B5F BC=L BLK=Balinese SC=Balinese BALINESE CARIK PAREREN
  '\u1B7D', // U+01B7D BC=L BLK=Balinese SC=Balinese BALINESE PANTI LANTANG
  '\u1B7E', // U+01B7E BC=L BLK=Balinese SC=Balinese BALINESE PAMADA LANTANG
  '\u1C3B', // U+01C3B BC=L BLK=Lepcha SC=Lepcha LEPCHA PUNCTUATION TA-ROL
  '\u1C3C', // U+01C3C BC=L BLK=Lepcha SC=Lepcha LEPCHA PUNCTUATION NYET THYOOM TA-ROL
  '\u1C7E', // U+01C7E BC=L BLK=Ol_Chiki SC=Ol_Chiki OL CHIKI PUNCTUATION MUCAAD
  '\u1C7F', // U+01C7F BC=L BLK=Ol_Chiki SC=Ol_Chiki OL CHIKI PUNCTUATION DOUBLE MUCAAD
  '\u2024', // U+02024 BC=ON BLK=General_Punctuation SC=Common ONE DOT LEADER
  '\u203C', // U+0203C BC=ON BLK=General_Punctuation SC=Common DOUBLE EXCLAMATION MARK
  '\u203D', // U+0203D BC=ON BLK=General_Punctuation SC=Common INTERROBANG
  '\u2047', // U+02047 BC=ON BLK=General_Punctuation SC=Common DOUBLE QUESTION MARK
  '\u2048', // U+02048 BC=ON BLK=General_Punctuation SC=Common QUESTION EXCLAMATION MARK
  '\u2049', // U+02049 BC=ON BLK=General_Punctuation SC=Common EXCLAMATION QUESTION MARK
  '\u2E2E', // U+02E2E BC=ON BLK=Supplemental_Punctuation SC=Common REVERSED QUESTION MARK
  '\u2E3C', // U+02E3C BC=ON BLK=Supplemental_Punctuation SC=Common STENOGRAPHIC FULL STOP
  '\u2E53', // U+02E53 BC=ON BLK=Supplemental_Punctuation SC=Common MEDIEVAL EXCLAMATION MARK
  '\u2E54', // U+02E54 BC=ON BLK=Supplemental_Punctuation SC=Common MEDIEVAL QUESTION MARK
  '\uA4FF', // U+0A4FF BC=L BLK=Lisu SC=Lisu LISU PUNCTUATION FULL STOP
  '\uA60E', // U+0A60E BC=ON BLK=Vai SC=Vai VAI FULL STOP
  '\uA60F', // U+0A60F BC=ON BLK=Vai SC=Vai VAI QUESTION MARK
  '\uA6F3', // U+0A6F3 BC=L BLK=Bamum SC=Bamum BAMUM FULL STOP
  '\uA6F7', // U+0A6F7 BC=L BLK=Bamum SC=Bamum BAMUM QUESTION MARK
  '\uA876', // U+0A876 BC=ON BLK=Phags-pa SC=Phags_Pa PHAGS-PA MARK SHAD
  '\uA877', // U+0A877 BC=ON BLK=Phags-pa SC=Phags_Pa PHAGS-PA MARK DOUBLE SHAD
  '\uA8CE', // U+0A8CE BC=L BLK=Saurashtra SC=Saurashtra SAURASHTRA DANDA
  '\uA8CF', // U+0A8CF BC=L BLK=Saurashtra SC=Saurashtra SAURASHTRA DOUBLE DANDA
  '\uA92F', // U+0A92F BC=L BLK=Kayah_Li SC=Kayah_Li KAYAH LI SIGN SHYA
  '\uA9C8', // U+0A9C8 BC=L BLK=Javanese SC=Javanese JAVANESE PADA LINGSA
  '\uA9C9', // U+0A9C9 BC=L BLK=Javanese SC=Javanese JAVANESE PADA LUNGSI
  '\uAA5D', // U+0AA5D BC=L BLK=Cham SC=Cham CHAM PUNCTUATION DANDA
  '\uAA5E', // U+0AA5E BC=L BLK=Cham SC=Cham CHAM PUNCTUATION DOUBLE DANDA
  '\uAA5F', // U+0AA5F BC=L BLK=Cham SC=Cham CHAM PUNCTUATION TRIPLE DANDA
  '\uAAF0', // U+0AAF0 BC=L BLK=Meetei_Mayek_Extensions SC=Meetei_Mayek MEETEI MAYEK CHEIKHAN
  '\uAAF1', // U+0AAF1 BC=L BLK=Meetei_Mayek_Extensions SC=Meetei_Mayek MEETEI MAYEK AHANG KHUDAM
  '\uABEB', // U+0ABEB BC=L BLK=Meetei_Mayek SC=Meetei_Mayek MEETEI MAYEK CHEIKHEI
  '\uFE52', // U+0FE52 BC=CS BLK=Small_Form_Variants SC=Common SMALL FULL STOP
  '\uFE56', // U+0FE56 BC=ON BLK=Small_Form_Variants SC=Common SMALL QUESTION MARK
  '\uFE57', // U+0FE57 BC=ON BLK=Small_Form_Variants SC=Common SMALL EXCLAMATION MARK
  '\uFF01', // U+0FF01 BC=ON BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH EXCLAMATION MARK
  '\uFF0E', // U+0FF0E BC=CS BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH FULL STOP
  '\uFF1F', // U+0FF1F BC=ON BLK=Halfwidth_and_Fullwidth_Forms SC=Common FULLWIDTH QUESTION MARK
  '\u{10A56}', // U+10A56 BC=R BLK=Kharoshthi SC=Kharoshthi KHAROSHTHI PUNCTUATION DANDA
  '\u{10A57}', // U+10A57 BC=R BLK=Kharoshthi SC=Kharoshthi KHAROSHTHI PUNCTUATION DOUBLE DANDA
  '\u{10F55}', // U+10F55 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO VERTICAL BARS
  '\u{10F56}', // U+10F56 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO VERTICAL BARS WITH DOTS
  '\u{10F57}', // U+10F57 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION CIRCLE WITH DOT
  '\u{10F58}', // U+10F58 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION TWO CIRCLES WITH DOTS
  '\u{10F59}', // U+10F59 BC=AL BLK=Sogdian SC=Sogdian SOGDIAN PUNCTUATION HALF CIRCLE WITH DOT
  '\u{10F86}', // U+10F86 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION BAR
  '\u{10F87}', // U+10F87 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION TWO BARS
  '\u{10F88}', // U+10F88 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION TWO DOTS
  '\u{10F89}', // U+10F89 BC=R BLK=Old_Uyghur SC=Old_Uyghur OLD UYGHUR PUNCTUATION FOUR DOTS
  '\u{11047}', // U+11047 BC=L BLK=Brahmi SC=Brahmi BRAHMI DANDA
  '\u{11048}', // U+11048 BC=L BLK=Brahmi SC=Brahmi BRAHMI DOUBLE DANDA
  '\u{110BE}', // U+110BE BC=L BLK=Kaithi SC=Kaithi KAITHI SECTION MARK
  '\u{110BF}', // U+110BF BC=L BLK=Kaithi SC=Kaithi KAITHI DOUBLE SECTION MARK
  '\u{110C0}', // U+110C0 BC=L BLK=Kaithi SC=Kaithi KAITHI DANDA
  '\u{110C1}', // U+110C1 BC=L BLK=Kaithi SC=Kaithi KAITHI DOUBLE DANDA
  '\u{11141}', // U+11141 BC=L BLK=Chakma SC=Chakma CHAKMA DANDA
  '\u{11142}', // U+11142 BC=L BLK=Chakma SC=Chakma CHAKMA DOUBLE DANDA
  '\u{11143}', // U+11143 BC=L BLK=Chakma SC=Chakma CHAKMA QUESTION MARK
  '\u{111C5}', // U+111C5 BC=L BLK=Sharada SC=Sharada SHARADA DANDA
  '\u{111C6}', // U+111C6 BC=L BLK=Sharada SC=Sharada SHARADA DOUBLE DANDA
  '\u{111CD}', // U+111CD BC=L BLK=Sharada SC=Sharada SHARADA SUTRA MARK
  '\u{111DE}', // U+111DE BC=L BLK=Sharada SC=Sharada SHARADA SECTION MARK-1
  '\u{111DF}', // U+111DF BC=L BLK=Sharada SC=Sharada SHARADA SECTION MARK-2
  '\u{11238}', // U+11238 BC=L BLK=Khojki SC=Khojki KHOJKI DANDA
  '\u{11239}', // U+11239 BC=L BLK=Khojki SC=Khojki KHOJKI DOUBLE DANDA
  '\u{1123B}', // U+1123B BC=L BLK=Khojki SC=Khojki KHOJKI SECTION MARK
  '\u{1123C}', // U+1123C BC=L BLK=Khojki SC=Khojki KHOJKI DOUBLE SECTION MARK
  '\u{112A9}', // U+112A9 BC=L BLK=Multani SC=Multani MULTANI SECTION MARK
  '\u{1144B}', // U+1144B BC=L BLK=Newa SC=Newa NEWA DANDA
  '\u{1144C}', // U+1144C BC=L BLK=Newa SC=Newa NEWA DOUBLE DANDA
  '\u{115C2}', // U+115C2 BC=L BLK=Siddham SC=Siddham SIDDHAM DANDA
  '\u{115C3}', // U+115C3 BC=L BLK=Siddham SC=Siddham SIDDHAM DOUBLE DANDA
  '\u{115C9}', // U+115C9 BC=L BLK=Siddham SC=Siddham SIDDHAM END OF TEXT MARK
  '\u{115CA}', // U+115CA BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIDENT AND U-SHAPED ORNAMENTS
  '\u{115CB}', // U+115CB BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIDENT AND DOTTED CRESCENTS
  '\u{115CC}', // U+115CC BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED CRESCENTS
  '\u{115CD}', // U+115CD BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED DOUBLE CRESCENTS
  '\u{115CE}', // U+115CE BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH RAYS AND DOTTED TRIPLE CRESCENTS
  '\u{115CF}', // U+115CF BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK DOUBLE RING
  '\u{115D0}', // U+115D0 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK DOUBLE RING WITH RAYS
  '\u{115D1}', // U+115D1 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH DOUBLE CRESCENTS
  '\u{115D2}', // U+115D2 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH TRIPLE CRESCENTS
  '\u{115D3}', // U+115D3 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH QUADRUPLE CRESCENTS
  '\u{115D4}', // U+115D4 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH SEPTUPLE CRESCENTS
  '\u{115D5}', // U+115D5 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND RAYS
  '\u{115D6}', // U+115D6 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND TWO ENCLOSURES
  '\u{115D7}', // U+115D7 BC=L BLK=Siddham SC=Siddham SIDDHAM SECTION MARK WITH CIRCLES AND FOUR ENCLOSURES
  '\u{11641}', // U+11641 BC=L BLK=Modi SC=Modi MODI DANDA
  '\u{11642}', // U+11642 BC=L BLK=Modi SC=Modi MODI DOUBLE DANDA
  '\u{1173C}', // U+1173C BC=L BLK=Ahom SC=Ahom AHOM SIGN SMALL SECTION
  '\u{1173D}', // U+1173D BC=L BLK=Ahom SC=Ahom AHOM SIGN SECTION
  '\u{1173E}', // U+1173E BC=L BLK=Ahom SC=Ahom AHOM SIGN RULAI
  '\u{11944}', // U+11944 BC=L BLK=Dives_Akuru SC=Dives_Akuru DIVES AKURU DOUBLE DANDA
  '\u{11946}', // U+11946 BC=L BLK=Dives_Akuru SC=Dives_Akuru DIVES AKURU END OF TEXT MARK
  '\u{11A42}', // U+11A42 BC=L BLK=Zanabazar_Square SC=Zanabazar_Square ZANABAZAR SQUARE MARK SHAD
  '\u{11A43}', // U+11A43 BC=L BLK=Zanabazar_Square SC=Zanabazar_Square ZANABAZAR SQUARE MARK DOUBLE SHAD
  '\u{11A9B}', // U+11A9B BC=L BLK=Soyombo SC=Soyombo SOYOMBO MARK SHAD
  '\u{11A9C}', // U+11A9C BC=L BLK=Soyombo SC=Soyombo SOYOMBO MARK DOUBLE SHAD
  '\u{11C41}', // U+11C41 BC=L BLK=Bhaiksuki SC=Bhaiksuki BHAIKSUKI DANDA
  '\u{11C42}', // U+11C42 BC=L BLK=Bhaiksuki SC=Bhaiksuki BHAIKSUKI DOUBLE DANDA
  '\u{11EF7}', // U+11EF7 BC=L BLK=Makasar SC=Makasar MAKASAR PASSIMBANG
  '\u{11EF8}', // U+11EF8 BC=L BLK=Makasar SC=Makasar MAKASAR END OF SECTION
  '\u{11F43}', // U+11F43 BC=L BLK=Kawi SC=Kawi KAWI DANDA
  '\u{11F44}', // U+11F44 BC=L BLK=Kawi SC=Kawi KAWI DOUBLE DANDA
  '\u{16A6E}', // U+16A6E BC=L BLK=Mro SC=Mro MRO DANDA
  '\u{16A6F}', // U+16A6F BC=L BLK=Mro SC=Mro MRO DOUBLE DANDA
  '\u{16AF5}', // U+16AF5 BC=L BLK=Bassa_Vah SC=Bassa_Vah BASSA VAH FULL STOP
  '\u{16B37}', // U+16B37 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN VOS THOM
  '\u{16B38}', // U+16B38 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN VOS TSHAB CEEB
  '\u{16B44}', // U+16B44 BC=L BLK=Pahawh_Hmong SC=Pahawh_Hmong PAHAWH HMONG SIGN XAUS
  '\u{16E98}', // U+16E98 BC=L BLK=Medefaidrin SC=Medefaidrin MEDEFAIDRIN FULL STOP
  '\u{1BC9F}', // U+1BC9F BC=L BLK=Duployan SC=Duployan DUPLOYAN PUNCTUATION CHINOOK FULL STOP
  '\u{1DA88}', // U+1DA88 BC=L BLK=Sutton_SignWriting SC=SignWriting SIGNWRITING FULL STOP

  // Additional manual entries (not produced by the script above).
  '\u3002', // U+3002 IDEOGRAPHIC FULL STOP
  '\uFF61' // U+FF61 HALFWIDTH IDEOGRAPHIC FULL STOP
]
// Default export of this module: the GLOBAL_SENTENCE_TERMINATORS array.
export default GLOBAL_SENTENCE_TERMINATORS