cejs
Version:
A JavaScript module framework that is simple to use.
207 lines (174 loc) • 6.37 kB
JavaScript
/**
* @name WWW work crawler sub-functions
*
* @fileoverview WWW work crawler functions: part of search
*
* @since 2019/10/13 拆分自 CeL.application.net.work_crawler
*/
;
// --------------------------------------------------------------------------------------------
if (typeof CeL === 'function') {
// 忽略沒有 Windows Component Object Model 的錯誤。
CeL.env.ignore_COM_error = true;
CeL.run({
// module name
name : 'application.net.work_crawler.search',
require : 'application.net.work_crawler.',
// 設定不匯出的子函式。
no_extend : 'this,*',
// 為了方便格式化程式碼,因此將 module 函式主體另外抽出。
code : module_code
});
}
function module_code(library_namespace) {
// requiring
var Work_crawler = library_namespace.net.work_crawler, crawler_namespace = Work_crawler.crawler_namespace;
var gettext = library_namespace.locale.gettext;
// --------------------------------------------------------------------------------------------
// @see luoxia.js, dmzj.js
function parse_search_result_token(id_list, id_data, token_parser, token) {
var matched = token.match(/<a\s([^<>]+)>([\s\S]+?)<\/a>/i);
if (library_namespace.is_RegExp(token_parser)) {
matched = token.match(token_parser);
} else {
// matched: [ link, attributes, inner HTML ]
matched = token.match(/<a\s([^<>]+)>([\s\S]+?)<\/a>/i);
}
if (!matched)
return;
var id = matched[1]
// dmzj.js: title=""href="" 中間沒有空格。
.match(/href=["'][^"'<>]+?\/([a-z\d\-_]+)(?:\/|\.html)?["']/i);
if (!id)
return;
id = id[1];
if (false && !isNaN(id)) {
id = +id;
}
id_list.push(id);
var title = matched[1].match(/title=["']([^"'<>]+)["']/);
id_data.push(crawler_namespace.get_label(title && title[1]
|| matched[2]));
}
// only for .parse_search_result() !!
function extract_work_id_from_search_result_link(PATTERN_item_token, html,
token_parser) {
// console.log(html);
var matched,
// {Array}id_list = [ id, id, ... ]
id_list = [],
// {Array}id_data = [ title, title, ... ]
id_data = [],
//
search_result_parser = typeof token_parser === 'function'
//
&& function(token) {
// function parser(token, id_list, id_data){console.log(token);}
var result = token_parser.call(this, token, id_list, id_data);
if (Array.isArray(result) && result.length === 2) {
id_list.push(result[0]);
id_data.push(result[1]);
}
};
// PATTERN_item_token 會分離出每個作品的欄位。
if (library_namespace.is_RegExp(PATTERN_item_token)) {
// assert: PATTERN_item_token.global === true
// matched: [ , HTML token to check ]
while (matched = PATTERN_item_token.exec(html)) {
if (search_result_parser) {
search_result_parser(matched[1]);
} else {
parse_search_result_token(id_list, id_data, token_parser,
matched[1]);
}
}
} else if (Array.isArray(PATTERN_item_token)) {
html.each_between(PATTERN_item_token[0], PATTERN_item_token[1],
search_result_parser
|| parse_search_result_token.bind(null, id_list,
id_data, token_parser));
} else {
throw new TypeError('extract_work_id_from_search_result_link: '
// gettext_config:{"id":"invalid-token-pattern-{$1}-$2"}
+ gettext('Invalid token pattern: {%1} %2',
typeof PATTERN_item_token, JSON
.stringify(PATTERN_item_token)));
}
// console.log([ id_list, id_data ]);
// throw 'extract_work_id_from_search_result_link';
return [ id_list, id_data ];
}
// CeL.work_crawler.extract_work_id_from_search_result_link()
Work_crawler.extract_work_id_from_search_result_link = extract_work_id_from_search_result_link;
// --------------------------------------------------------------------------------------------
var PATTERN_url_for_baidu = /([\d_]+)(?:\.html|\/(?:index\.html)?)?$/;
if (library_namespace.is_debug()) {
[ 'http://www.host/0/123/', 'http://www.host/123/index.html',
'http://www.host/123.html' ].forEach(function(url) {
console.assert('123' === 'http://www.host/123/'
.match(PATTERN_url_for_baidu)[1]);
});
}
crawler_namespace.parse_search_result_set = {
// baidu cse
baidu : function(html, get_label) {
// console.log(html);
var id_data = [],
// {Array}id_list = [id,id,...]
id_list = [], get_next_between = html.find_between(
' cpos="title" href="', '</a>'), text;
while ((text = get_next_between()) !== undefined) {
// console.log(text);
// 從URL網址中解析出作品id。
var matched = text.between(null, '"').match(
PATTERN_url_for_baidu);
// console.log(matched);
if (!matched)
continue;
id_list.push(matched[1]);
// 從URL網址中解析出作品title。
matched = text.match(/ title="([^"]+)"/);
if (matched) {
matched = matched[1];
} else {
// e.g., omanhua.js: <em>择</em><em>天</em><em>记</em>
matched = text.between('<em>', {
tail : '</em>'
});
}
// console.log(matched);
if (matched && (matched = get_label(matched))
// 只取第一個符合的。
// 避免如 http://host/123/, http://host/123/456.htm
&& !id_data.includes(matched)) {
id_data.push(matched);
} else {
id_list.pop();
}
}
// console.log([ id_list, id_data ]);
return [ id_list, id_data ];
}
};
// --------------------------------------------------------------------------------------------
// export 導出.
// @instance
Object.assign(Work_crawler.prototype, {
search_result_file_name : 'search.json',
cache_title_to_id : true,
get_search_result_file : function() {
var search_result_file = this.main_directory
+ this.search_result_file_name;
return search_result_file;
},
get_search_result : function() {
var search_result_file = this.get_search_result_file(),
// search cache
// 檢查看看之前是否有獲取過。
search_result = library_namespace.get_JSON(search_result_file);
return search_result;
}
});
// 不設定(hook)本 module 之 namespace,僅執行 module code。
return library_namespace.env.not_to_extend_keyword;
}