UNPKG

typopo

Version:

Fix frequent microtypography errors in multiple languages. Write neat texts without bothering about typography rules. Typopo works for the English, German, Slovak, Czech, and Rusyn languages.

github.com/surfinzap/typopo

surfinzap/typopo

189 lines (166 loc) • 9.31 kB

JavaScript

import {
  fixAbbreviations,
  fixInitials,
  fixMultipleWordAbbreviations,
  fixSingleWordAbbreviations,
} from "../../src/modules/words/abbreviations.js";
import { createTestSuite, transformTestSet } from "../test-utils.js";
import Locale, { supportedLocales } from "../../src/locale/locale.js";

/*
 * Test fixtures and suite registration for the abbreviation fixers.
 *
 * Every fixture object maps an input string (key) to its expected output
 * (value). The `${abbrSpace}` token sits inside ordinary double-quoted
 * strings, so JavaScript does NOT interpolate it; presumably
 * `transformTestSet` replaces it per locale (confirm in test-utils.js).
 *
 * NOTE(review): some keys/values below look identical in plain text. They
 * presumably differ by non-breaking vs. regular spaces (see the "nbsp"
 * remarks on the cases); verify the whitespace bytes when editing.
 */

/**
 * Wraps a fixer so it can be invoked with the text alone.
 * A new `Locale` is constructed on every invocation of the returned function.
 *
 * @param {Function} fixer - function called as `fixer(text, localeInstance)`
 * @param {string} localeCode - value handed to the `Locale` constructor
 * @returns {Function} `(text) => fixer(text, new Locale(localeCode))`
 */
const bindLocale = (fixer, localeCode) => (text) => fixer(text, new Locale(localeCode));

/* ------------------------------------------------------------------ */
/* Initials                                                           */
/* ------------------------------------------------------------------ */

const initialsSet = {
  "J. Novak": "J. Novak", // essential case, nbsp missing
  "J.Novak": "J. Novak", // space missing
  "J. Novak": "J. Novak", // double-check NBSP in the middle
  "Ch. Lambert": "Ch. Lambert", // double-letter as a first name initial
  "CH. Lambert": "CH. Lambert", // double-letter initialized as a first name initial
  "Philip K. Dick": "Philip K. Dick", // one middle initial
  "Philip K.Dick": "Philip K. Dick", // one middle initial, space missing

  // two-letter initials
  "F. X. Šalda": "F.${abbrSpace}X. Šalda", // nbsp after 1st letter, normal space after 2nd one
  "F.X. Šalda": "F.${abbrSpace}X. Šalda",
  "Ch. Ch. Šalda": "Ch.${abbrSpace}Ch. Šalda",
  "CH. CH. Šalda": "CH.${abbrSpace}CH. Šalda",
  // Disabled: ch.ch gets exempted as a URL. Hopefully, no one has such a name.
  // "Ch.Ch. Šalda": "Ch.${abbrSpace}Ch. Šalda",
  // "CH.CH. Šalda": "CH.${abbrSpace}CH. Šalda",

  // three-letter initials
  // "Ch. G. D. Lambert": "Ch.${abbrSpace}G.${abbrSpace}D. Lambert", // nbsp after 2 letter, normal space after third one
  "Ch. Ch. Ch. Lambert": "Ch.${abbrSpace}Ch.${abbrSpace}Ch. Lambert",
  "CH. CH. CH. Lambert": "CH.${abbrSpace}CH.${abbrSpace}CH. Lambert",

  // false positives: the fixer is expected to leave these untouched
  "F. X.": "F. X.",
  "F.X.": "F.X.",
  "F. X. R.": "F. X. R.",
};

/**
 * Registers the "Fix Initials" suite for one locale.
 * The fourth argument is an empty object and the fifth is `fixAbbreviations`;
 * presumably these are the module-level test set and function expected by
 * `createTestSuite` (confirm in test-utils.js).
 */
function registerInitialsSuite(locale) {
  createTestSuite(
    "Fix Initials",
    transformTestSet(initialsSet, locale),
    bindLocale(fixInitials, locale),
    {},
    bindLocale(fixAbbreviations, locale),
    locale
  );
}

supportedLocales.forEach((locale) => registerInitialsSuite(locale));

/* ------------------------------------------------------------------ */
/* Multiple-word abbreviations                                        */
/* ------------------------------------------------------------------ */

const multiWordAbbrSet = {
  // double-word abbreviations
  "hl. m. Praha": "hl.${abbrSpace}m. Praha", // set proper nbsp
  "hl.m.Praha": "hl.${abbrSpace}m. Praha", // include proper spaces
  "Hl.m.Praha": "Hl.${abbrSpace}m. Praha", // catch capitalized exception
  "Je to hl. m. Praha.": "Je to hl.${abbrSpace}m. Praha.", // in a sentence
  "Praha, hl. m.": "Praha, hl.${abbrSpace}m.", // abbr at the end of a statement
  "(hl. m. Praha)": "(hl.${abbrSpace}m. Praha)", // bracket & quotes variations
  "(Praha, hl. m.)": "(Praha, hl.${abbrSpace}m.)", // bracket & quotes variations
  "(hl. m.)": "(hl.${abbrSpace}m.)", // bracket & quotes variations
  "hl. m.": "hl.${abbrSpace}m.", // plain abbreviation
  "č., s., hl. m., str.,": "č., s., hl.${abbrSpace}m., str.,", // in a list of abbreviations
  "Dave Grohl. m. Praha": "Dave Grohl. m. Praha", // false positive: abbr. must not be caught inside a word
  "Sliačhl. m. Praha": "Sliačhl. m. Praha", // false positive: abbr. must not be caught inside a non-latin word

  // triple-word abbreviations
  "im Jahr 200 v. u. Z. als der Hunger": "im Jahr 200 v.${abbrSpace}u.${abbrSpace}Z. als der Hunger",
  "im Jahr 200 v.u.Z. als der Hunger": "im Jahr 200 v.${abbrSpace}u.${abbrSpace}Z. als der Hunger",
  "im Jahr 200 v. u. Z.": "im Jahr 200 v.${abbrSpace}u.${abbrSpace}Z.",
  "im Jahr 200 v.u.Z.": "im Jahr 200 v.${abbrSpace}u.${abbrSpace}Z.",
  "v. u. Z.": "v.${abbrSpace}u.${abbrSpace}Z.",
  "v.u.Z.": "v.${abbrSpace}u.${abbrSpace}Z.",

  // assorted abbreviations spot-checking various localizations
  "1000 pr. n. l.": "1000 pr.${abbrSpace}n.${abbrSpace}l.",
  "im Jahr 200 v. Chr.": "im Jahr 200 v.${abbrSpace}Chr.",
  "Das Tier, d. h. der Fisch, lebte noch lange.": "Das Tier, d.${abbrSpace}h. der Fisch, lebte noch lange.",
  "Das Tier (d. h. der Fisch) lebte noch lange.": "Das Tier (d.${abbrSpace}h. der Fisch) lebte noch lange.",
  "т. зн. незвыкле": "т.${abbrSpace}зн. незвыкле",
  "the U.S.": "the U.${abbrSpace}S.",
  "the U. S.": "the U.${abbrSpace}S.",
  ", e.g. something": ", e.${abbrSpace}g. something",
  "(e.g. something": "(e.${abbrSpace}g. something",
  "a e.g. something": "a e.${abbrSpace}g. something",
  "abc\ne.g. something": "abc\ne.${abbrSpace}g. something",
  "e.g. 100 km": "e.${abbrSpace}g. 100 km",
  "(e.g.)": "(e.${abbrSpace}g.)",
  "(e.g. )": "(e.${abbrSpace}g.)",
  "e. g.": "e.${abbrSpace}g.",
  "e.g. 🥳": "e.${abbrSpace}g. 🥳",
  "i. e. 🥳": "i.${abbrSpace}e. 🥳",
  "a i.e. something": "a i.${abbrSpace}e. something",
  "i.e. 100 km": "i.${abbrSpace}e. 100 km",
  "4.20 p.m.": "4.20 p.${abbrSpace}m.",
  "4.20 p.m. in the afternoon": "4.20 p.${abbrSpace}m. in the afternoon",

  // Throwing extra space
  "We will continue tomorrow at 8:00 a.m.!": "We will continue tomorrow at 8:00 a.${abbrSpace}m.!",
  "8 a.m. is the right time": "8 a.${abbrSpace}m. is the right time",

  // false positives
  "2 PMs": "2 PMs",
  "She is the PM of the UK.": "She is the PM of the UK.", // false positive
  "brie cheese": "brie cheese", // false positive
  "Pam Grier": "Pam Grier", // false positive
  "najkrajšie": "najkrajšie", // false positive for non-latin boundaries
  "nevieš": "nevieš", // false positive for non-latin boundaries
  "ieš": "ieš", // false positive for non-latin boundaries
  "či e-mail marketing": "či e-mail marketing", // false positive for non-latin boundaries
  "(i.e.)": "(i.${abbrSpace}e.)",
};

// Extra cases that are only merged into the first (unit) test set, and only
// for the "en-us" locale; they are never part of the second test set.
const multiWordAbbrUnitModuleSet = {
  "e.g. “something”": "e.${abbrSpace}g. “something”",
  "e.g. ‘something’": "e.${abbrSpace}g. ‘something’",
  "“We will continue tomorrow at 8:00 a.m.”": "“We will continue tomorrow at 8:00 a.${abbrSpace}m.”",
  "e.g. ```something```": "e.${abbrSpace}g. ```something```",
  "e.g. `something`": "e.${abbrSpace}g. `something`",
  "“e. g.”": "“e.${abbrSpace}g.”",
  "‘e. g.’": "‘e.${abbrSpace}g.’",
  "Das Tier – d. i. der Fisch – lebte noch lange.": "Das Tier – d.${abbrSpace}i. der Fisch – lebte noch lange.",
};

/**
 * Registers the "Fix multiple-word abbreviations" suite for one locale.
 * For "en-us" the first test set is the base set extended with
 * `multiWordAbbrUnitModuleSet`; every other locale uses the base set only.
 * The second test set (paired with `fixAbbreviations`) is always the base set.
 */
function registerMultiWordSuite(locale) {
  const unitCases =
    locale === "en-us"
      ? { ...multiWordAbbrSet, ...multiWordAbbrUnitModuleSet }
      : multiWordAbbrSet;

  createTestSuite(
    "Fix multiple-word abbreviations",
    transformTestSet(unitCases, locale),
    bindLocale(fixMultipleWordAbbreviations, locale),
    transformTestSet(multiWordAbbrSet, locale),
    bindLocale(fixAbbreviations, locale),
    locale
  );
}

supportedLocales.forEach((locale) => registerMultiWordSuite(locale));

/* ------------------------------------------------------------------ */
/* Single-word abbreviations                                          */
/* ------------------------------------------------------------------ */

const singleWordAbbrSet = {
  /* General pattern for these locales assumes nbsp after abbreviation */
  "č. 5 žije": "č. 5 žije", // set nbsp
  "č.5 žije": "č. 5 žije", // add nbsp
  "preč č. 5 žije": "preč č. 5 žije", // abbreviation after a word ending in a non-latin character
  "áno, č. 5 žije": "áno, č. 5 žije", // abbreviation after sentence punctuation
  "Prines kvetináč. 5 je číslo.": "Prines kvetináč. 5 je číslo.", // false positive where abbreviation is part of the previous sentence
  "(pp. 10–25)": "(pp. 10–25)", // abbr. in brackets
  "str. 38": "str. 38", // other abbreviation example
  "str. 7": "str. 7", // other abbreviation example
  "str. p": "str. p", // other abbreviation example
  "tzv. rýč": "tzv. rýč", // other abbreviation example
  "10 č.": "10 č.", // abbreviation at the end of the word
  "10 p.": "10 p.", // abbreviation at the end of the word
  "10 str.": "10 str.", // abbreviation at the end of the word
  "(10 p.)": "(10 p.)", // abbreviation at the end of the word & in brackets
};

// Cases checked only against fixSingleWordAbbreviations (first test set);
// they are not included in the set paired with fixAbbreviations.
const singleWordAbbrFalsePositiveSet = {
  "4.20 p.m.": "4.20 p.m.", // false positive
  "the U.S. and": "the U.S. and",
  "t. č. 555-729-458": "t. č. 555-729-458", // do not correct single-word abbr. that's part of the multiple-word abbr
  "t. č. dačo": "t. č. dačo", // do not correct single-word abbr. that's part of the multiple-word abbr (word variation)
};

/**
 * Registers the "Fix Single-word abbreviations" suite for one locale.
 * The first test set combines the regular and false-positive cases; the
 * second test set (paired with `fixAbbreviations`) holds the regular cases only.
 */
function registerSingleWordSuite(locale) {
  const unitCases = { ...singleWordAbbrSet, ...singleWordAbbrFalsePositiveSet };

  createTestSuite(
    "Fix Single-word abbreviations",
    transformTestSet(unitCases, locale),
    bindLocale(fixSingleWordAbbreviations, locale),
    transformTestSet(singleWordAbbrSet, locale),
    bindLocale(fixAbbreviations, locale),
    locale
  );
}

supportedLocales.forEach((locale) => registerSingleWordSuite(locale));

// Combined fixtures exported from this module. The en-us-only extras and the
// single-word false-positive cases are deliberately left out of this export.
export const abbreviationsSet = {
  ...initialsSet,
  ...multiWordAbbrSet,
  ...singleWordAbbrSet,
};