// web-sniffer-js
// Version:
// Simple HTML Parser to JS Object
// 357 lines (277 loc) • 12.6 kB
// JavaScript
var http = require('http');
module.exports = function () {
this.htmlObject = {};
this.clean = function () {
this.htmlObject = {};
}
this.clean_line = function (line) {
if (!line) return line;
while (line.indexOf(" ") !== -1) {
line = line.replace(" ", " ");
}
return (line.replace("\t", "").replace("\r", "").trim());
}
this.parseWithLink = function (url = "", callback) {
let content = "";
http.get(url, (resp) => {
resp.setEncoding("utf8");
resp.on("data", function (chunk) {
content += chunk;
});
var t = this;
resp.on("end", function () {
content = content.replace("\n", "").replace("\r", "");
content = content.substring(content.indexOf("<html"));
t.htmlObject = t.parseHtml(t.clean_line(content));
callback(t.htmlObject);
});
});
}
this.parseWithFile = function (HTMLContent = "", callback) {
var html = this.clean_line(HTMLContent.replace("\n", "").replace("\r", ""));
html = html.substring(html.indexOf("<html"));
this.htmlObject = this.parseHtml(this.clean_line(html));
callback(this.htmlObject);
}
this.formatTable = function (obj) {
let array_head = [];
let obj_body = [];
if (obj[0] && obj[0].name == "thead" && obj[0].next && obj[0].next[0] && obj[0].next[0].next) {
for (let head_id in obj[0].next[0].next)
array_head.push(obj[0].next[0].next[head_id].value);
}
if (obj[1] && obj[1].name == "tbody" && obj[1].next) {
for (let body_id in obj[1].next) {
let body_part = obj[1].next[body_id];
let obj_part = {};
if (body_part.next) {
for (let elem_id in body_part.next) {
obj_part[array_head[elem_id]] = body_part.next[elem_id].value;
}
obj_body.push(obj_part);
}
}
}
if (obj[0] && obj[0].name == "div" && obj[0].content.indexOf("class=\"thead\"") !== -1) {
for (let head_id in obj[0].next)
array_head.push(obj[0].next[head_id].value);
for (let body_id = 1; body_id < obj.length; body_id++) {
let body_part = obj[body_id];
let obj_part = {};
if (body_part.next) {
for (let elem_id in body_part.next) {
obj_part[array_head[elem_id]] = body_part.next[elem_id].value;
}
obj_body.push(obj_part);
}
}
}
return obj_body;
}
this.readSearchLine = function (line) {
var search_object = [];
let tmp_search_obj = line.split(";");
for (let s in tmp_search_obj) {
let tmp_s = tmp_search_obj[s];
let tmp_s_obj = tmp_s.split("|");
if (tmp_s_obj.length > 0) {
let new_obj = { balise: tmp_s_obj[0] };
if (tmp_s_obj.length == 3 || (tmp_s_obj.length == 2 && tmp_s_obj[1].indexOf("[") !== -1 && tmp_s_obj[1].indexOf("]") !== -1)) {
let content = tmp_s_obj[1].slice(tmp_s_obj[1].indexOf("[") + 1, tmp_s_obj[1].indexOf("]"))
new_obj["content"] = content.split(",");
}
if (tmp_s_obj.length == 3 && tmp_s_obj[2].indexOf("{") !== -1 && tmp_s_obj[2].indexOf("}") !== -1) {
new_obj["position"] = parseInt(tmp_s_obj[2].slice(tmp_s_obj[2].indexOf("{") + 1, tmp_s_obj[2].indexOf("}")));
}
if (tmp_s_obj.length == 2 && tmp_s_obj[1].indexOf("{") !== -1 && tmp_s_obj[1].indexOf("}") !== -1) {
new_obj["position"] = parseInt(tmp_s_obj[1].slice(tmp_s_obj[1].indexOf("{") + 1, tmp_s_obj[1].indexOf("}")));
}
search_object.push(new_obj);
}
}
return search_object;
}
this.execSearchFromObj = function (obj = {}, searchArray = {}) {
browseObj = obj;
for (let id in searchArray) {
let s_obj_search = searchArray[id];
let final_pos = 0;
if (s_obj_search.position)
final_pos = s_obj_search.position;
let cpt_pos = -1;
for (let current_obj_id in browseObj) {
let current_browse_obj = browseObj[current_obj_id];
if (!current_browse_obj)
break;
if (s_obj_search.balise == current_browse_obj.name) {
if (s_obj_search.content) {
let cpt_match = 0;
for (let m_id in s_obj_search.content) {
for (let m_id2 in current_browse_obj.content) {
if (s_obj_search.content[m_id] == current_browse_obj.content[m_id2]) {
cpt_match++;
break;
}
}
}
if (cpt_match == s_obj_search.content.length) {
cpt_pos++;
}
}
else
cpt_pos++;
}
if (cpt_pos == final_pos) {
browseObj = browseObj[current_obj_id].next;
break;
}
}
if (cpt_pos != final_pos)
return false;
}
return browseObj;
}
this.recusrsiveBrowseObjectSearch = function (object = {}, searchArray = []) {
let good_obj;
for (let i_obj in object) {
let n_obj = object[i_obj];
let ret = this.execSearchFromObj([n_obj], searchArray);
if (ret !== false)
return ret;
if (n_obj.next) {
good_obj = this.recusrsiveBrowseObjectSearch(n_obj.next, searchArray);
if (good_obj !== false)
return good_obj;
}
}
return false;
}
this.search = function (searchString = "") {
var search_object = this.readSearchLine(searchString);
var result_search = this.recusrsiveBrowseObjectSearch(this.htmlObject, search_object);
return result_search;
}
this.parseHtml = function (html = "", loop = 0) {
let indexHtml = 0;
html = this.clean_line(html);
let content = [];
if (loop < 0 || html.length <= 0 || html.indexOf("<") === -1)
return null;
html = html.slice(html.indexOf("<"));
while (html.indexOf("<!--") === 0) {
html = html.slice(html.indexOf("-->") + 3);
}
html = html.slice(html.indexOf("<"));
while (html.indexOf("<script") === 0) {
html = html.slice(html.indexOf("</script>") + 9);
html = html.slice(html.indexOf("<"));
}
let firstBaliseInfo = this.getBaliseInfo(html);
if (firstBaliseInfo.index == -1)
return;
let start_pos = firstBaliseInfo.index;
let lastBaliseInfo = 0;
if (firstBaliseInfo.balise.name != "script") {
if (firstBaliseInfo.balise.is_end == false) {
lastBaliseInfo = this.getPosEndOfBalise(html, firstBaliseInfo);
start_pos = lastBaliseInfo + 3 + firstBaliseInfo.balise.name.length;
let tmp = html.slice(firstBaliseInfo.index + 1, lastBaliseInfo);
if (this.getBaliseInfo(tmp).index == -1)
firstBaliseInfo.balise["value"] = tmp;
firstBaliseInfo.balise["next"] = this.parseHtml(tmp, loop + 1);
content.push(firstBaliseInfo.balise);
let tmp_ret = this.parseHtml(html.slice(start_pos), loop + 1);
if (tmp_ret) {
for (let i = 0; i < tmp_ret.length; i++) {
content.push(tmp_ret[i]);
}
}
} else {
start_pos = firstBaliseInfo.balise.pos_end;
content.push(firstBaliseInfo.balise);
let tmp_ret = this.parseHtml(html.slice(firstBaliseInfo.balise.pos_end), loop + 1);
if (tmp_ret) {
for (let i = 0; i < tmp_ret.length; i++) {
content.push(tmp_ret[i]);
}
}
}
return content;
}
return null;
}
this.getPosEndOfBalise = function (html = "", baliseInfo) {
let pos = baliseInfo.balise.name.length + 1;
let tmp_html = html.slice(pos);
let match = 0;
while (tmp_html.length > 0) {
let match_end = tmp_html.indexOf("</" + baliseInfo.balise.name);
let match_start = tmp_html.indexOf("<" + baliseInfo.balise.name);
if (match_start < match_end && match_start !== -1) {
match--;
pos += match_start + 1 + baliseInfo.balise.name.length;
} else {
match++;
if (match == 1) {
return match_end + pos;
}
pos += 3 + baliseInfo.balise.name.length + match_end;
}
tmp_html = html.slice(pos);
}
}
this.getBaliseInfo = function (html = "") {
let start_balise = html.indexOf("<");
let end_balise = html.indexOf(" ");
let tmp_end2 = html.indexOf(">");
let tmp_end_balise_char = html.indexOf("/>");
let is_end = true;
if (tmp_end_balise_char > tmp_end2 || tmp_end_balise_char === -1) {
tmp_end_balise_char = tmp_end2;
is_end = false;
}
if (tmp_end_balise_char < end_balise || end_balise === -1)
end_balise = tmp_end_balise_char;
let balise_name = html.slice(start_balise + 1, end_balise);
if (['link', 'input'].indexOf(balise_name) !== -1)
is_end = true;
let content_line = html.slice(start_balise + 1 + balise_name.length, tmp_end_balise_char).trim();
let content = null;
if (content_line != "") {
content = content_line.split(" ");
let new_content = [];
let is_quote = false;
let tmp_content = "";
for (let i = 0; i < content.length; i++) {
if (content[i].indexOf("\"") === -1 && is_quote == false) {
new_content.push(content[i]);
}
else if (content[i].indexOf("\"") != content[i].lastIndexOf("\"") && is_quote == false)
new_content.push(content[i]);
else {
if (content[i].indexOf("\"") !== -1 && is_quote == false) {
is_quote = true;
}
tmp_content += " " + content[i];
if (content[i].lastIndexOf("\"") && is_quote == true && content[i].lastIndexOf("\"") == content[i].length - 1) {
is_quote = false;
new_content.push(tmp_content.trim());
tmp_content = "";
}
}
}
content = new_content;
}
let new_balise = { name: balise_name, content: content, is_end: is_end };
if (new_balise.is_end == true)
new_balise["pos_end"] = tmp_end_balise_char;
var str_content = html.substring(html.indexOf("</" + new_balise.name + ">") + 3 + new_balise.name.length);
var res_content = str_content.trim();
if (str_content.indexOf('<') !== -1)
res_content = str_content.substring(0, str_content.indexOf('<')).trim();
if (res_content.length > 0) {
new_balise["str"] = res_content;
}
return { index: tmp_end2, balise: new_balise };
}
}