UNPKG

yoastseo-dep

Version:

Yoast clientside page analysis

467 lines (415 loc) 20.4 kB
import SentenceTokenizer from "../../../../src/languageProcessing/helpers/sentence/SentenceTokenizer"; const mockTokenizer = new SentenceTokenizer(); const doubleQuotePairs = [ { start: "\"", end: "\"" }, { start: "〝", end: "〞" }, { start: "“", end: "”" }, { start: "‟", end: "„" }, { start: "『", end: "』" }, { start: "«", end: "»" } ]; describe( "A test for tokenizing a (html) text into sentences", function() { it( "returns whether the sentenceBeginning beginning is a valid beginning.", function() { expect( mockTokenizer.isValidSentenceBeginning( "Text with duplicate spaces." ) ).toBe( true ); expect( mockTokenizer.isValidSentenceBeginning( "2 texts with duplicate spaces." ) ).toBe( true ); expect( mockTokenizer.isValidSentenceBeginning( "'" ) ).toBe( true ); expect( mockTokenizer.isValidSentenceBeginning( "¿" ) ).toBe( true ); expect( mockTokenizer.isValidSentenceBeginning( "<" ) ).toBe( true ); expect( mockTokenizer.isValidSentenceBeginning( "أ" ) ).toBe( true ); } ); it( "returns an array of sentences for a given array of tokens started and ended with block with the " + " start and end of the quote pair", function() { doubleQuotePairs.forEach( ( quotePair ) => { const tokens = [ { type: "block-start", src: "<form>" }, { type: "html-start", src: "<p>" }, { type: "sentence", src: "First sentence" }, { type: "sentence-delimiter", src: ":" }, { type: "sentence", src: " second sentence" }, { type: "full-stop", src: "." }, { type: "sentence", src: " Third sentence" }, { type: "full-stop", src: "." }, { type: "sentence-delimiter", src: quotePair.start }, { type: "sentence", src: " Fourth sentence" }, { type: "sentence-delimiter", src: quotePair.end }, { type: "html-end", src: "<br> element.</p>" }, { type: "block-end", src: "</form>" }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "<form><p>First sentence:", "second sentence.", "Third sentence." + quotePair.start, "Fourth sentence" + quotePair.end, "</form>", ] ); } ); } ); it( "returns an array of sentences for a given array of tokens that include quotation marks", function() { const tokens = [ { type: "sentence", src: "Hello", line: 1, col: 1 }, { type: "full-stop", src: ".", line: 1, col: 6 }, { type: "sentence", src: " ", line: 1, col: 7 }, { type: "sentence-delimiter", src: "\"", line: 1, col: 8 }, { type: "sentence", src: "How are you", line: 1, col: 9 }, { type: "sentence-delimiter", src: "\"", line: 1, col: 20 }, { type: "sentence", src: " Bye", line: 1, col: 21 }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "Hello.", "\"How are you\" Bye", ] ); } ); it( "returns an array of sentences for a given array of tokens that include quotation marks preceded by" + "period", function() { const tokens = [ { type: "sentence", src: "Hello", line: 1, col: 1 }, { type: "full-stop", src: ".", line: 1, col: 6 }, { type: "sentence", src: " ", line: 1, col: 7 }, { type: "sentence-delimiter", src: "\"", line: 1, col: 8 }, { type: "sentence", src: "How are you", line: 1, col: 9 }, { type: "full-stop", src: ".", line: 1, col: 20 }, { type: "sentence-delimiter", src: "\"", line: 1, col: 21 }, { type: "sentence", src: " Bye", line: 1, col: 22 }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "Hello.", "\"How are you.\"", "Bye", ] ); } ); it( "returns an array of sentences for a given array of tokens started and ended with html tags.", function() { const tokens = [ { type: "html-start", src: "<p>" }, { type: "sentence", src: "First sentence" }, { type: "sentence-delimiter", src: ":" }, { type: "sentence", src: " second sentence" }, { type: "full-stop", src: "." }, { type: "sentence", src: " 5 is " }, { type: "smaller-than-sign-content", src: "< than 7." }, { type: "html-end", src: "</p>" }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "First sentence:", "second sentence.", "5 is < than 7.", ] ); } ); it( "returns an array of sentences for a given array of tokens when there is no next token after sentence delimiter.", function() { const tokens = [ { type: "sentence", src: "First sentence" }, { type: "sentence-delimiter", src: ":" }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "First sentence:" ] ); } ); it( "returns an array of sentences for a given array of tokens when there is no next token after full stop.", function() { const tokens = [ { type: "sentence", src: "One cat is special" }, { type: "full-stop", src: "." }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "One cat is special." ] ); } ); it( "does not split sentences if 'block-end' is followed by a token started with a number or " + "if it is preceded by a valid sentence ending, but not followed by a valid sentence beginning.", function() { let tokens = [ { type: "block-start", src: "<form>" }, { type: "sentence", src: "One cat is special" }, { type: "full-stop", src: "." }, { type: "block-end", src: "</form>" }, { type: "sentence", src: "5 is perfect!" }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "<form>One cat is special.</form>5 is perfect!" ] ); tokens = [ { type: "block-start", src: "<form>" }, { type: "sentence", src: "One cat is special" }, { type: "full-stop", src: "." }, { type: "block-end", src: "</form>" }, { type: "sentence", src: "five is perfect!" }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "<form>One cat is special.</form>five is perfect!" ] ); } ); it( "does not split sentences if it contains an abbreviation", function() { const tokens = [ { type: "sentence", src: "Our feline teacher is called Prof" }, { type: "full-stop", src: "." }, { type: "sentence", src: " Felix the cat" }, { type: "full-stop", src: "." }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "Our feline teacher is called Prof. Felix the cat." ] ); } ); it( "does not split sentences if last word is No. and a number. ", function() { const tokens = [ { type: "sentence", src: "My favorite song is Mambo No" }, { type: "full-stop", src: "." }, { type: "sentence", src: " 5" }, { type: "full-stop", src: "." }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "My favorite song is Mambo No. 5." ] ); } ); it( "endsWithAbbreviation returns true if a string ends in an abbreviation", function() { const testString = "This string ends with an abbreviation e.g."; expect( mockTokenizer.endsWithAbbreviation( testString ) ).toBeTruthy(); } ); it( "endsWithAbbreviation returns false if a string does not end with an abbreviation,", function() { const testString = "This string does not end with an abbreviation."; expect( mockTokenizer.endsWithAbbreviation( testString ) ).toBeFalsy(); } ); it( "endsWithAbbreviation returns false if string does not end with an abbreviation but has an abbreviation in the middle.", function() { const testString = "This is e.g. a sentence."; expect( mockTokenizer.endsWithAbbreviation( testString ) ).toBeFalsy(); } ); it( "endsWithAbbreviation returns true if string contains multiple abbreviations and ends with one.", function() { const testString = "Prof. Barabas was helped by Dr."; expect( mockTokenizer.endsWithAbbreviation( testString ) ).toBeTruthy(); } ); it( "endsWithAbbreviation returns false if string contains multiple abbreviations and does not end with one.", function() { const testString = "Prof. Barabas was helped by Dr. Wargaren."; expect( mockTokenizer.endsWithAbbreviation( testString ) ).toBeFalsy(); } ); it( "splits sentences if 'block-end' is preceded by a sentence ending and followed by a valid sentence beginning.", function() { let tokens = [ { type: "block-start", src: "<form>" }, { type: "sentence", src: "One cat is special" }, { type: "full-stop", src: "." }, { type: "block-end", src: "</form>" }, { type: "sentence", src: "Five is perfect!" }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "<form>One cat is special.</form>", "Five is perfect!" ] ); tokens = [ { type: "block-start", src: "<form>" }, { type: "sentence", src: "One cat is special" }, { type: "full-stop", src: "." }, { type: "block-end", src: "</form>" }, { type: "block-start", src: "<footer>" }, { type: "block-end", src: "</footer>" }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "<form>One cat is special.</form>", "<footer></footer>" ] ); } ); it( "tokens that represent a '<', followed by content until it enters another '<' or '>' gets another pass by the tokenizer.", function() { let token = { type: "smaller-than-sign-content", src: "<h2>Positive feedback</h2>" }; let tokenSentences = [ "First sentence:", "second sentence.", "<h2>Positive feedback</h2>" ]; let currentSentence = "5 is < than 7."; expect( mockTokenizer.tokenizeSmallerThanContent( token, tokenSentences, currentSentence ) ).toEqual( { currentSentence: "<h2>Positive feedback</h2>", tokenSentences: [ "First sentence:", "second sentence.", "<h2>Positive feedback</h2>", "5 is < than 7." ], } ); token = { type: "smaller-than-sign-content", src: "positive feedback" }; tokenSentences = [ "First sentence:", "second sentence." ]; currentSentence = "Third sentence."; expect( mockTokenizer.tokenizeSmallerThanContent( token, tokenSentences, currentSentence ) ).toEqual( { currentSentence: "Third sentence.<ositive feedback", tokenSentences: [ "First sentence:", "second sentence." ], } ); token = { type: "smaller-than-sign-content", src: "<p>To force<br> line breaks<br> in a text,<br> use the br<br> element.</p>" }; expect( mockTokenizer.tokenizeSmallerThanContent( token, tokenSentences, currentSentence ) ).toEqual( { currentSentence: "</p>", tokenSentences: [ "First sentence:", "second sentence.", "Third sentence.", "<p>To force", " line breaks", " in a text,", " use the br", " element." ], } ); token = { type: "smaller-than-sign-content", src: "<p>Use<br> this</p>?" }; tokenSentences = [ "First sentence:", "second sentence." ]; currentSentence = "Another sentence."; expect( mockTokenizer.tokenizeSmallerThanContent( token, tokenSentences, currentSentence ) ).toEqual( { currentSentence: "", tokenSentences: [ "First sentence:", "second sentence.", "Another sentence.", "<p>Use", " this</p>?" ], } ); } ); it( "throws an error.", function() { const errorMessage = "Test end."; const tokenizer = mockTokenizer.createTokenizer(); tokenizer.tokenizer.end = () => { const error = new Error( errorMessage ); error.tokenizer2 = "test"; throw error; }; const consoleSpy = jest.spyOn( console, "error" ); console.error.mockImplementation( () => {} ); mockTokenizer.tokenize( tokenizer.tokenizer, "This is a string." ); expect( consoleSpy ).toHaveBeenCalledWith( "Tokenizer end error:", new Error( errorMessage ), "test" ); console.error.mockRestore(); } ); it( "does not break a sentence in two if there are initials within.", function() { // The reprint was favourably reviewed by "A. B." in The Musical Times in 1935, who commented "Praise is due to Mr Mercer. const tokens = [ { type: "sentence", src: 'The reprint was favourably reviewed by "A' }, { type: "full-stop", src: "." }, { type: "sentence", src: " B" }, { type: "full-stop", src: "." }, { type: "sentence", src: '" in The Musical Times in , who commented "Praise is due to Mr Mercer' }, { type: "full-stop", src: "." }, ]; // eslint-disable-next-line max-len expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "The reprint was favourably reviewed by \"A. B.\" in The Musical Times in , who commented \"Praise is due to Mr Mercer." ] ); } ); it( "recognizes sentence boundary when a sentence starts with initials", function() { // 'This is a sentence. E.F. is a good writer. G. H. is a very very great personality.' const tokens = [ { type: "sentence", src: " This is a sentence" }, { type: "full-stop", src: "." }, { type: "sentence", src: " E" }, { type: "full-stop", src: "." }, { type: "sentence", src: "F" }, { type: "full-stop", src: "." }, { type: "sentence", src: " is a good writer" }, { type: "full-stop", src: "." }, { type: "sentence", src: " G" }, { type: "full-stop", src: "." }, { type: "sentence", src: " H" }, { type: "full-stop", src: "." }, { type: "sentence", src: " is a very very great personality" }, { type: "full-stop", src: "." }, ]; // eslint-disable-next-line max-len expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "This is a sentence.", "E.F. is a good writer.", "G. H. is a very very great personality." ] ); } ); it( "recognizes sentence boundary when a sentence ends with initials", function() { // 'A cat is possessed by C. D. This is a sentence.' const tokens = [ { type: "sentence", src: " A cat is possessed by C" }, { type: "full-stop", src: "." }, { type: "sentence", src: " D" }, { type: "full-stop", src: "." }, { type: "sentence", src: " This is a sentence" }, { type: "full-stop", src: "." }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "A cat is possessed by C. D.", "This is a sentence." ] ); } ); it( "recognizes when a full stop is part of a person's initials", function() { const tokens = [ { type: "sentence", src: " A cat is possessed by C" }, { type: "full-stop", src: "." }, { type: "sentence", src: " D" }, { type: "full-stop", src: "." }, { type: "sentence", src: " This is a sentence" }, { type: "full-stop", src: "." }, ]; const token = tokens[ 1 ]; const previousToken = tokens[ 0 ]; const nextToken = tokens[ 2 ]; const secondToNextToken = tokens[ 3 ]; expect( mockTokenizer.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ).toBeTruthy(); } ); it( "recognizes when a full stop is not part of a person's initials", function() { const tokens = [ { type: "sentence", src: "This is a sentence" }, { type: "full-stop", src: "." }, { type: "sentence", src: " This is a sentence" }, { type: "full-stop", src: "." }, ]; const token = tokens[ 1 ]; const previousToken = tokens[ 0 ]; const nextToken = tokens[ 2 ]; const secondToNextToken = tokens[ 3 ]; expect( mockTokenizer.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ).toBeFalsy(); } ); xit( "recognizes when a full stop is part of initials with multiple letters", function() { const tokens = [ { type: "sentence", src: "This is a sentence" }, { type: "full-stop", src: "." }, { type: "sentence", src: " A" }, { type: "full-stop", src: "." }, { type: "sentence", src: " F" }, { type: "full-stop", src: "." }, { type: "sentence", src: " Th" }, { type: "full-stop", src: "." }, { type: "sentence", src: " van der Heijden is a well known Dutch writer." }, { type: "full-stop", src: "." }, { type: "sentence", src: "This is a sentence" }, { type: "full-stop", src: "." }, ]; const token = tokens[ 5 ]; const previousToken = tokens[ 4 ]; const nextToken = tokens[ 6 ]; const secondToNextToken = tokens[ 7 ]; expect( mockTokenizer.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ).toBeTruthy(); } ); it( "endsWithOrdinalDot should return false when the German tokenizer is not used", () => { expect( mockTokenizer.endsWithOrdinalDot( "Anything you want to put here, it shouldn't matter." ) ).toBe( false ); } ); it( "recognizes initials in the edge case where there are quotes around the initials", function() { doubleQuotePairs.forEach( ( quotePair ) => { const tokens = [ { type: "sentence", src: "The reprint was favourably reviewed by" }, { type: "sentence-delimiter", src: quotePair.start }, { type: "sentence", src: "A" }, { type: "full-stop", src: "." }, { type: "sentence", src: " B" }, { type: "full-stop", src: "." }, { type: "sentence-delimiter", src: quotePair.end }, { type: "sentence", src: " in The Musical Times in 1935, who commented" }, { type: "sentence-delimiter", src: quotePair.start }, { type: "sentence", src: "Praise is due to Mr Mercer" }, { type: "full-stop", src: "." }, { type: "sentence-delimiter", src: quotePair.end }, ]; const token = tokens[ 3 ]; const previousToken = tokens[ 2 ]; const nextToken = tokens[ 4 ]; const secondToNextToken = tokens[ 5 ]; expect( mockTokenizer.isPartOfPersonInitial( token, previousToken, nextToken, secondToNextToken ) ).toBe( true ); } ); } ); it( "correctly constructs the sentence in the edge case where there are quotes around the initials", function() { doubleQuotePairs.forEach( ( quotePair ) => { const tokens = [ { type: "sentence", src: "The reprint was favourably reviewed by" }, { type: "sentence-delimiter", src: " " + quotePair.start }, { type: "sentence", src: "A" }, { type: "full-stop", src: "." }, { type: "sentence", src: " B" }, { type: "full-stop", src: "." }, { type: "sentence-delimiter", src: quotePair.end }, { type: "sentence", src: " in The Musical Times in 1935, who commented" }, { type: "sentence-delimiter", src: quotePair.start }, { type: "sentence", src: "Praise is due to Mr Mercer" }, { type: "full-stop", src: "." }, { type: "sentence-delimiter", src: quotePair.end }, ]; expect( mockTokenizer.getSentencesFromTokens( tokens ) ).toEqual( [ "The reprint was favourably reviewed by " + quotePair.start + "A. B." + quotePair.end + " in The Musical Times in 1935, who commented" + quotePair.start + "Praise is due to Mr Mercer." + quotePair.end ] ); } ); } ); } ); describe( "Tests for isBreakTag", () => { it( "returns true for a break tag", () => { expect( mockTokenizer.isBreakTag( "<br" ) ).toBe( true ); } ); it( "returns false for a non-break tag", () => { expect( mockTokenizer.isBreakTag( "<p" ) ).toBe( false ); } ); it( "returns true for a self-closing break tag", () => { expect( mockTokenizer.isBreakTag( "<br/>" ) ).toBe( true ); } ); it( "returns true for a closing break tag", () => { expect( mockTokenizer.isBreakTag( "</br>" ) ).toBe( true ); } ); } ); describe( "testing the isValidTagPair helper method", function() { it( "returns true if the tags are of the same type and the correct type", function() { [ "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "span" ].forEach( function( tagType ) { const mockOpenTag = { src: `<${tagType}>` }; const mockCloseTag = { src: `</${tagType}>` }; expect( mockTokenizer.isValidTagPair( mockOpenTag, mockCloseTag ) ).toBe( true ); } ); } ); it( "returns true if the tags are of the same type and the correct type and the have attributes", function() { [ "p", "div", "h1", "h2", "h3", "h4", "h5", "h6", "span" ].forEach( function( tagType ) { const mockOpenTag = { src: `<${tagType} class="onzin" messy="1">` }; const mockCloseTag = { src: `</${tagType}>` }; expect( mockTokenizer.isValidTagPair( mockOpenTag, mockCloseTag ) ).toBe( true ); } ); } ); it( "returns false if the tags are of the same type but not of the correct type", function() { const mockOpenTag = { src: "<i>" }; const mockCloseTag = { src: "</i>" }; expect( mockTokenizer.isValidTagPair( mockOpenTag, mockCloseTag ) ).toBe( false ); } ); it( "returns false if the tags are of te correct type but not of the same type", function() { const mockOpenTag = { src: "<div>" }; const mockCloseTag = { src: "</span>" }; expect( mockTokenizer.isValidTagPair( mockOpenTag, mockCloseTag ) ).toBe( false ); } ); it( "returns false if the tags are of te wrong type and not of the same type", function() { const mockOpenTag = { src: "<i>" }; const mockCloseTag = { src: "</b>" }; expect( mockTokenizer.isValidTagPair( mockOpenTag, mockCloseTag ) ).toBe( false ); } ); } );