yoastseo-dep
Version:
Yoast clientside page analysis
456 lines (442 loc) • 15.9 kB
JavaScript
import LanguageProcessor from "../../../src/parse/language/LanguageProcessor";
import Factory from "../../specHelpers/factory";
import memoizedSentenceTokenizer from "../../../src/languageProcessing/helpers/sentence/memoizedSentenceTokenizer";
import Sentence from "../../../src/parse/structure/Sentence";
import splitIntoTokensCustom from "../../../src/languageProcessing/languages/ja/helpers/splitIntoTokensCustom";
const researcher = Factory.buildMockResearcher( {}, true, false, false,
{ memoizedTokenizer: memoizedSentenceTokenizer } );
describe( "A test for the LanguageProcessor object", () => {
it( "should correctly create a simple LanguageProcessor object", function() {
expect( new LanguageProcessor( researcher ) ).toEqual( { researcher: researcher } );
} );
} );
describe( "A test for the splitIntoSentences method", () => {
it( "should return an array of sentence objects", function() {
const languageProcessor = new LanguageProcessor( researcher );
const sentences = languageProcessor.splitIntoSentences( "Hello, world! Hello, Yoast!" );
expect( sentences ).toEqual( [ { text: "Hello, world!", sourceCodeRange: {}, tokens: [] },
{ text: " Hello, Yoast!", sourceCodeRange: {}, tokens: [] } ] );
} );
it( "the last sentence should not consist of a whitespace if the text ends in a whitespace", function() {
const languageProcessor = new LanguageProcessor( researcher );
const sentences = languageProcessor.splitIntoSentences( "Hello, world! Hello, Yoast! " );
expect( sentences ).toEqual( [ { text: "Hello, world!", sourceCodeRange: {}, tokens: [] },
{ text: " Hello, Yoast!", sourceCodeRange: {}, tokens: [] } ] );
} );
} );
const splitIntoTokensTestCases = [
{
description: "should return an empty array if the sentence is empty",
sentence: "",
expectedTokens: [],
},
{
description: "should correctly tokenize a sentence with a single token",
sentence: "Hello",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a single token and a trailing whitespace",
sentence: "Hello ",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a single token and a leading whitespace",
sentence: " Hello",
expectedTokens: [
{ text: " ", sourceCodeRange: {} },
{ text: "Hello", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with multiple tokens ending with a full stop",
sentence: "Hello, world.",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: ".", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with multiple tokens ending with a full stop and a trailing whitespace",
sentence: "Hello, world. ",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: ".", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with multiple punctuation marks",
sentence: "Hello, world!!!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with multiple different punctuation marks",
sentence: "Hello, world!?!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
{ text: "?", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a word containing a dash",
sentence: "Hello, world-wide!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world-wide", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a word containing an underscore",
sentence: "Hello, world_wide!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world_wide", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a word containing a forward slash",
sentence: "Hello, world/worlds!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: "/", sourceCodeRange: {} },
{ text: "worlds", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
skip: true,
},
{
description: "should correctly tokenize a sentence with a word containing a backslash",
sentence: "Hello, world\\worlds!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: "\\", sourceCodeRange: {} },
{ text: "worlds", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
skip: true,
},
{
description: "should correctly tokenize a sentence with a word containing an apostrophe",
sentence: "Hello, world's!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world's", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a number with a decimal point",
sentence: "Hello, 3.14!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "3.14", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a number with a decimal comma",
sentence: "Hello, 3,14!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "3,14", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a token that starts with a punctuation mark",
sentence: "Hello, .world!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: ".", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a token between parentheses",
sentence: "Hello, (world)!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "(", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: ")", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a phrase between parentheses",
sentence: "Hello, (world of worlds)!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "(", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "of", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "worlds", sourceCodeRange: {} },
{ text: ")", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with nested parentheses",
sentence: "Hello, (world (of worlds))!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "(", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "(", sourceCodeRange: {} },
{ text: "of", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "worlds", sourceCodeRange: {} },
{ text: ")", sourceCodeRange: {} },
{ text: ")", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence containing an url",
sentence: "Hello, https://www.google.com!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "https://www.google.com", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
skip: true,
},
{
description: "should correctly tokenize a sentence containing an email address",
sentence: "Hello, hugo@yoast.com!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "hugo@yoast.com", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a nbsp",
sentence: "Hello,\u00A0world!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: "\u00A0", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence where a punctuation mark is between two spaces",
sentence: "Hello , world!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a german sentence that contains a word with an umlaut",
sentence: "Hallo, w\u00F6rld!",
expectedTokens: [
{ text: "Hallo", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "w\u00F6rld", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a token that contains a number",
sentence: "Hello, 123world!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "123world", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence with a token that contains an abbreviation",
sentence: "Hello, W.O.R.L.D.!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "W.O.R.L.D.", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
skip: true,
},
{
description: "should correctly tokenize a sentence with a token that contains an emoji",
sentence: "Hello, 🌍!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "🌍", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence in a language with non latin characters (cyrillic)",
sentence: "Привет, мир!",
expectedTokens: [
{ text: "Привет", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "мир", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence in a LTR language (arabic)",
sentence: "مرحبا بالعالم!",
expectedTokens: [
{ text: "مرحبا", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "بالعالم", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence in a RTL language (arabic) where there is a punctuation mark before and after a word",
sentence: "مرحبا، ?بالعالم!",
expectedTokens: [
{ text: "مرحبا", sourceCodeRange: {} },
{ text: "،", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "?", sourceCodeRange: {} },
{ text: "بالعالم", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
{
description: "should correctly tokenize a sentence containing right-to-left marks",
sentence: "Hello \u200Fright-to-left\u200E mark!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "\u200F", sourceCodeRange: {} },
{ text: "right-to-left", sourceCodeRange: {} },
{ text: "\u200E", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "mark", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
skip: true,
},
{
description: "should correctly tokenize a sentence containing a word that is right-to-left",
sentence: "Hello, \u200Fمرحبا\u200E!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "\u200F", sourceCodeRange: {} },
{ text: "مرحبا", sourceCodeRange: {} },
{ text: "\u200E", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
skip: true,
},
{
description: "should correctly tokenize a sentence that contains multiple consecutive spaces",
sentence: "Hello, world!",
expectedTokens: [
{ text: "Hello", sourceCodeRange: {} },
{ text: ",", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: " ", sourceCodeRange: {} },
{ text: "world", sourceCodeRange: {} },
{ text: "!", sourceCodeRange: {} },
],
},
];
describe.each( splitIntoTokensTestCases )( "A test for the tokenize method", ( { description, sentence, expectedTokens, skip } ) => {
const test = skip ? it.skip : it;
test( description, function() {
const languageProcessor = new LanguageProcessor( researcher );
const tokens = languageProcessor.splitIntoTokens( new Sentence( sentence ) );
expect( tokens ).toEqual( expectedTokens );
} );
} );
describe( "A test for the splitIntoTokens method in Japanese", () => {
it( "should return an array of tokens", function() {
const japaneseResearcher = Factory.buildMockResearcher( {}, true, false, false,
{ splitIntoTokensCustom: splitIntoTokensCustom } );
const languageProcessor = new LanguageProcessor( japaneseResearcher );
const tokens = languageProcessor.splitIntoTokens( new Sentence( "ウクライナは、東ヨーロッパに位置する国家。" ) );
expect( tokens ).toEqual( [
{ text: "ウクライナ", sourceCodeRange: {} },
{ text: "は", sourceCodeRange: {} },
{ text: "、", sourceCodeRange: {} },
{ text: "東ヨーロッパ", sourceCodeRange: {} },
{ text: "に", sourceCodeRange: {} },
{ text: "位置", sourceCodeRange: {} },
{ text: "する", sourceCodeRange: {} },
{ text: "国家", sourceCodeRange: {} },
{ text: "。", sourceCodeRange: {} },
] );
} );
} );