UNPKG

yzhanhtmlparser

Version:

A streaming HTML parser based on HTML Standard. 基于 HTML 标准的流式 HTML 解析器

165 lines (150 loc) 4.72 kB
const { Writable } = require('stream') const Token = require('./Token.class') module.exports = class HtmlParser extends Writable { constructor() { super() let currentToken = null let currentAttribueName = '' let currentAttribueValue = '' let currentTagName = '' const emit = token => { this.emit('data', token.getLog()) } const isWhiteSpace = char => { return char === ' ' || char === '\t' || char === '\n' } const dataState = char => { // if (char === '&') return dataState if (char === '<') { currentToken = new Token() currentToken.setType('openTag') return tagOpen } emit(new Token('char', char)) return dataState } const tagOpen = char => { currentTagName = '' if (char === ' ') return tagOpen if (char === '/') return tagEnd return tagName(char) // reconsume } const tagName = char => { if (isWhiteSpace(char)) return beforeAttribute if (char === '/') return selfClosingStartTag if (char === '>') { currentToken.setContent(currentTagName) emit(currentToken) return dataState } currentTagName += char return tagName } const beforeAttribute = char => { if (isWhiteSpace(char)) return beforeAttribute if (char === '/') return selfClosingStartTag if (char === '>') { currentToken.setContent(currentTagName) emit(currentToken) return dataState } currentAttribueName = '' return attributeName(char) } const attributeName = char => { if (isWhiteSpace(char)) return attributeName if (char === '/') { currentToken.setAttribute(currentAttribueName, currentAttribueName) return selfClosingStartTag } if (char === '=') return beforeAttributeValue currentAttribueName += char return attributeName } const beforeAttributeValue = char => { if (isWhiteSpace(char)) return beforeAttributeValue currentAttribueValue = '' if (char === '"') return attributeValueDouble if (char === `'`) return attributeValueSingle if (char === '>') { currentToken.setAttribute(currentAttribueName, currentAttribueValue) currentToken.setContent(currentTagName) emit(currentToken) return dataState } return attributeValueUnquoted(char) } const attributeValueDouble = char => { if (char === '"') { currentToken.setAttribute(currentAttribueName, currentAttribueValue) return afterAttributeValueQuoted } // if (char === '&') return attributeValueDouble currentAttribueValue += char return attributeValueDouble } const attributeValueSingle = char => { if (char === `'`) { currentToken.setAttribute(currentAttribueName, currentAttribueValue) return afterAttributeValueQuoted } // if (char === '&') return attributeValueSingle currentAttribueValue += char return attributeValueSingle } const attributeValueUnquoted = char => { if (isWhiteSpace(char)) { currentToken.setAttribute(currentAttribueName, currentAttribueValue) return beforeAttribute } if (char === '/') { currentToken.setAttribute(currentAttribueName, currentAttribueValue) return selfClosingStartTag } // if (char === '&') return attributeValueUnquoted if (char === '>') { currentToken.setAttribute(currentAttribueName, currentAttribueValue) currentToken.setContent(currentTagName) emit(currentToken) return dataState } currentAttribueValue += char return attributeValueUnquoted } const afterAttributeValueQuoted = char => { if (char === '/') return selfClosingStartTag if (char === '>') { currentToken.setContent(currentTagName) emit(currentToken) return dataState } return beforeAttribute(char) } const selfClosingStartTag = char => { if (char === '>') { currentToken.setType('selfClosingTag') currentToken.setContent(currentTagName) emit(currentToken) return dataState } return selfClosingStartTag } const tagEnd = char => { if (isWhiteSpace(char)) return tagEnd if (char === '>') { currentToken = new Token('closeTag') emit(currentToken) return dataState } currentToken.setType('closeTag') return tagName(char) } this.state = dataState } write(char) { this.state = this.state(char) } end() { this.emit('end') } }