UNPKG

typopo

Version:

Fix frequent microtypography errors in multiple languages. Write neat texts without bothering about typography rules. Typopo works for English, German, Slovak, Czech and Rusyn language.

189 lines (166 loc) 9.31 kB
import { fixAbbreviations, fixInitials, fixMultipleWordAbbreviations, fixSingleWordAbbreviations, } from "../../src/modules/words/abbreviations.js"; import { createTestSuite, transformTestSet } from "../test-utils.js"; import Locale, { supportedLocales } from "../../src/locale/locale.js"; const initialsSet = { "J. Novak": "J. Novak", // essential case, nbsp missing "J.Novak": "J. Novak", // space missing "J. Novak": "J. Novak", // double-check NBSP in the middle "Ch. Lambert": "Ch. Lambert", //double-letter as a first name initial "CH. Lambert": "CH. Lambert", //double-letter initialized as a first name initial "Philip K. Dick": "Philip K. Dick", // one middle initial "Philip K.Dick": "Philip K. Dick", // one middle initials // test cases for two-letter initials "F. X. Šalda": "F.${abbrSpace}X. Šalda", //nbsp after 1st letter, normal space after 2nd one "F.X. Šalda": "F.${abbrSpace}X. Šalda", "Ch. Ch. Šalda": "Ch.${abbrSpace}Ch. Šalda", "CH. CH. Šalda": "CH.${abbrSpace}CH. Šalda", // ch.ch gets exempted as URL. hopefully, no ones has such a name // "Ch.Ch. Šalda": "Ch.${abbrSpace}Ch. Šalda", // "CH.CH. Šalda": "CH.${abbrSpace}CH. Šalda", // test cases for three-letter initials // "Ch. G. D. Lambert": "Ch.${abbrSpace}G.${abbrSpace}D. Lambert", // nbsp after 2 letter, normal space after third one "Ch. Ch. Ch. Lambert": "Ch.${abbrSpace}Ch.${abbrSpace}Ch. Lambert", "CH. CH. CH. Lambert": "CH.${abbrSpace}CH.${abbrSpace}CH. Lambert", // false positives, this function should leave them as they are "F. X.": "F. X.", "F.X.": "F.X.", "F. X. R.": "F. X. R.", }; supportedLocales.forEach((locale) => { createTestSuite( `Fix Initials`, transformTestSet(initialsSet, locale), (text) => fixInitials(text, new Locale(locale)), {}, (text) => fixAbbreviations(text, new Locale(locale)), locale ); }); const multiWordAbbrSet = { // double-word abbreviations "hl. m. Praha": "hl.${abbrSpace}m. Praha", // set proper nbsp "hl.m.Praha": "hl.${abbrSpace}m. Praha", // include proper spaces "Hl.m.Praha": "Hl.${abbrSpace}m. Praha", // catch capitalized exception "Je to hl. m. Praha.": "Je to hl.${abbrSpace}m. Praha.", // in a sentence "Praha, hl. m.": "Praha, hl.${abbrSpace}m.", // check for abbr at the end of statement "(hl. m. Praha)": "(hl.${abbrSpace}m. Praha)", // bracket & quotes variations "(Praha, hl. m.)": "(Praha, hl.${abbrSpace}m.)", // bracket & quotes variations "(hl. m.)": "(hl.${abbrSpace}m.)", // bracket & quotes variations "hl. m.": "hl.${abbrSpace}m.", // plain abbreviation "č., s., hl. m., str.,": "č., s., hl.${abbrSpace}m., str.,", // in a list of abbreviations "Dave Grohl. m. Praha": "Dave Grohl. m. Praha", // false positive for not catching abbr. in a word "Sliačhl. m. Praha": "Sliačhl. m. Praha", // false positive for not catching abbr. in a non-latin word // triple word abbreviations "im Jahr 200 v. u. Z. als der Hunger": "im Jahr 200 v.${abbrSpace}u.${abbrSpace}Z. als der Hunger", "im Jahr 200 v.u.Z. als der Hunger": "im Jahr 200 v.${abbrSpace}u.${abbrSpace}Z. als der Hunger", "im Jahr 200 v. u. Z.": "im Jahr 200 v.${abbrSpace}u.${abbrSpace}Z.", "im Jahr 200 v.u.Z.": "im Jahr 200 v.${abbrSpace}u.${abbrSpace}Z.", "v. u. Z.": "v.${abbrSpace}u.${abbrSpace}Z.", "v.u.Z.": "v.${abbrSpace}u.${abbrSpace}Z.", // random abbreviations to randomly check various localization "1000 pr. n. l.": "1000 pr.${abbrSpace}n.${abbrSpace}l.", "im Jahr 200 v. Chr.": "im Jahr 200 v.${abbrSpace}Chr.", "Das Tier, d. h. der Fisch, lebte noch lange.": "Das Tier, d.${abbrSpace}h. der Fisch, lebte noch lange.", "Das Tier (d. h. der Fisch) lebte noch lange.": "Das Tier (d.${abbrSpace}h. der Fisch) lebte noch lange.", "т. зн. незвыкле": "т.${abbrSpace}зн. незвыкле", "the U.S.": "the U.${abbrSpace}S.", "the U. S.": "the U.${abbrSpace}S.", ", e.g. something": ", e.${abbrSpace}g. something", "(e.g. something": "(e.${abbrSpace}g. something", "a e.g. something": "a e.${abbrSpace}g. something", "abc\ne.g. something": "abc\ne.${abbrSpace}g. something", "e.g. 100 km": "e.${abbrSpace}g. 100 km", "(e.g.)": "(e.${abbrSpace}g.)", "(e.g. )": "(e.${abbrSpace}g.)", "e. g.": "e.${abbrSpace}g.", "e.g. 🥳": "e.${abbrSpace}g. 🥳", "i. e. 🥳": "i.${abbrSpace}e. 🥳", "a i.e. something": "a i.${abbrSpace}e. something", "i.e. 100 km": "i.${abbrSpace}e. 100 km", "4.20 p.m.": "4.20 p.${abbrSpace}m.", "4.20 p.m. in the afternoon": "4.20 p.${abbrSpace}m. in the afternoon", // Throwing extra space "We will continue tomorrow at 8:00 a.m.!": "We will continue tomorrow at 8:00 a.${abbrSpace}m.!", "8 a.m. is the right time": "8 a.${abbrSpace}m. is the right time", // false positives "2 PMs": "2 PMs", "She is the PM of the UK.": "She is the PM of the UK.", // false positive "brie cheese": "brie cheese", // false positive "Pam Grier": "Pam Grier", // false positive "najkrajšie": "najkrajšie", // false positive for non-latin boundaries "nevieš": "nevieš", // false positive for non-latin boundaries "ieš": "ieš", // false positive for non-latin boundaries "či e-mail marketing": "či e-mail marketing", // false positive for non-latin boundaries "(i.e.)": "(i.${abbrSpace}e.)", }; const multiWordAbbrUnitModuleSet = { "e.g. “something”": "e.${abbrSpace}g. “something”", "e.g. ‘something’": "e.${abbrSpace}g. ‘something’", "“We will continue tomorrow at 8:00 a.m.”": "“We will continue tomorrow at 8:00 a.${abbrSpace}m.”", "e.g. ```something```": "e.${abbrSpace}g. ```something```", "e.g. `something`": "e.${abbrSpace}g. `something`", "“e. g.”": "“e.${abbrSpace}g.”", "‘e. g.’": "‘e.${abbrSpace}g.’", "Das Tier – d. i. der Fisch – lebte noch lange.": "Das Tier – d.${abbrSpace}i. der Fisch – lebte noch lange.", }; supportedLocales.forEach((locale) => { let unitTestSet = multiWordAbbrSet; if (locale === "en-us") { unitTestSet = { ...multiWordAbbrSet, ...multiWordAbbrUnitModuleSet, }; } createTestSuite( `Fix multiple-word abbreviations`, transformTestSet(unitTestSet, locale), (text) => fixMultipleWordAbbreviations(text, new Locale(locale)), transformTestSet(multiWordAbbrSet, locale), (text) => fixAbbreviations(text, new Locale(locale)), locale ); }); const singleWordAbbrSet = { /* General pattern for these locales assumes nbsp after abbreviation */ "č. 5 žije": "č. 5 žije", // set nbsp "č.5 žije": "č. 5 žije", // add nbsp "preč č. 5 žije": "preč č. 5 žije", // identify abbreviation word ending in non-latin character "áno, č. 5 žije": "áno, č. 5 žije", // identify abbreviation after sentence punctuation "Prines kvetináč. 5 je číslo.": "Prines kvetináč. 5 je číslo.", //false positive where abbreviation is part of the previous sentence "(pp. 10–25)": "(pp. 10–25)", // abbr. in brackets "str. 38": "str. 38", // other abbreviation example "str. 7": "str. 7", // other abbreviation example "str. p": "str. p", // other abbreviation example "tzv. rýč": "tzv. rýč", // other abbreviation example "10 č.": "10 č.", // abbreviation at the end of the word "10 p.": "10 p.", // abbreviation at the end of the word "10 str.": "10 str.", // abbreviation at the end of the word "(10 p.)": "(10 p.)", // abbreviation at the end of the word & in brackets }; const singleWordAbbrFalsePositiveSet = { "4.20 p.m.": "4.20 p.m.", // false positive "the U.S. and": "the U.S. and", "t. č. 555-729-458": "t. č. 555-729-458", // do not correct single-word abbr. that's part of the multiple-word abbr "t. č. dačo": "t. č. dačo", // do not correct single-word abbr. that's part of the multiple-word abbr (word variation) }; supportedLocales.forEach((locale) => { createTestSuite( `Fix Single-word abbreviations`, transformTestSet({ ...singleWordAbbrSet, ...singleWordAbbrFalsePositiveSet }, locale), (text) => fixSingleWordAbbreviations(text, new Locale(locale)), transformTestSet(singleWordAbbrSet, locale), (text) => fixAbbreviations(text, new Locale(locale)), locale ); }); export const abbreviationsSet = { ...initialsSet, ...multiWordAbbrSet, ...singleWordAbbrSet, };