// hexo-blogger-xml
// Version:
// Import Blogger XML exports into Hexo
// 463 lines (462 loc) • 21.5 kB
// JavaScript
;
/**
 * Downlevel class-inheritance helper: makes `derived` inherit both the static
 * side and the prototype chain of `base`. Reuses an existing `__extends` on
 * the enclosing `this` when one is present.
 */
var __extends = (this && this.__extends) || (function () {
    // Chooses how to link the static side on first use, then replaces itself
    // with the chosen strategy so later calls skip the feature detection.
    var linkStatics = function (derived, base) {
        if (Object.setPrototypeOf) {
            linkStatics = Object.setPrototypeOf;
        }
        else if ({ __proto__: [] } instanceof Array) {
            linkStatics = function (target, proto) { target.__proto__ = proto; };
        }
        else {
            // Last resort: copy own enumerable statics one by one.
            linkStatics = function (target, source) {
                for (var key in source) {
                    if (Object.prototype.hasOwnProperty.call(source, key)) {
                        target[key] = source[key];
                    }
                }
            };
        }
        return linkStatics(derived, base);
    };
    return function (derived, base) {
        var baseIsFunction = typeof base === "function";
        if (!baseIsFunction && base !== null) {
            throw new TypeError("Class extends value " + String(base) + " is not a constructor or null");
        }
        linkStatics(derived, base);
        // Intermediate constructor so the base constructor is not invoked
        // while building the derived prototype.
        function Surrogate() { this.constructor = derived; }
        if (base === null) {
            derived.prototype = Object.create(base);
        }
        else {
            Surrogate.prototype = base.prototype;
            derived.prototype = new Surrogate();
        }
    };
})();
/**
 * Re-exports property `key` of `source` on `target` (optionally under `alias`).
 * Where property descriptors are available the binding is kept live through a
 * getter unless the source property is already a suitable accessor or is
 * immutable; otherwise the value is copied.
 */
var __createBinding = (this && this.__createBinding) || (function () {
    if (!Object.create) {
        // No descriptor support: plain value copy.
        return function (target, source, key, alias) {
            target[alias === undefined ? key : alias] = source[key];
        };
    }
    return function (target, source, key, alias) {
        var exportName = alias === undefined ? key : alias;
        var descriptor = Object.getOwnPropertyDescriptor(source, key);
        var needsGetter;
        if (!descriptor) {
            needsGetter = true;
        }
        else if ("get" in descriptor) {
            // Accessors are reused as-is only for ES-module sources.
            needsGetter = !source.__esModule;
        }
        else {
            // A mutable data property must be read lazily to stay in sync.
            needsGetter = descriptor.writable || descriptor.configurable;
        }
        if (needsGetter) {
            descriptor = { enumerable: true, get: function () { return source[key]; } };
        }
        Object.defineProperty(target, exportName, descriptor);
    };
})();
/**
 * Attaches `value` as the `default` export of `target`: an enumerable
 * defined property when descriptors are supported, a plain assignment
 * otherwise.
 */
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            Object.defineProperty(target, "default", { enumerable: true, value: value });
        };
    }
    return function (target, value) {
        target["default"] = value;
    };
})();
/**
 * Emulates `import * as ns from "..."` for a CommonJS module. ES-module
 * shaped values are returned untouched; anything else is wrapped in a fresh
 * namespace object that re-exports every own key except "default" and
 * exposes the module itself as `default`.
 */
var __importStar = (this && this.__importStar) || function (mod) {
    var alreadyEsModule = mod && mod.__esModule;
    if (alreadyEsModule) {
        return mod;
    }
    var namespace = {};
    if (mod != null) {
        for (var exportName in mod) {
            if (exportName === "default") {
                continue;
            }
            if (!Object.prototype.hasOwnProperty.call(mod, exportName)) {
                continue;
            }
            __createBinding(namespace, mod, exportName);
        }
    }
    __setModuleDefault(namespace, mod);
    return namespace;
};
/**
 * Emulates `import x from "..."` for a CommonJS module: ES-module shaped
 * values pass through, everything else is wrapped as `{ default: mod }`.
 */
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.BloggerParser = void 0;
var events_1 = require("events");
var fs = __importStar(require("fs"));
var fs_extra_1 = require("fs-extra");
var he_1 = __importDefault(require("he"));
var jsdom_1 = require("jsdom");
var path = __importStar(require("path"));
var rimraf_1 = require("rimraf");
var sanitize_filename_1 = __importDefault(require("sanitize-filename"));
var sbg_utility_1 = __importDefault(require("sbg-utility"));
var upath_1 = require("upath");
var xml2js_1 = __importDefault(require("xml2js"));
var config_1 = __importDefault(require("../config"));
var excludeTitle_json_1 = __importDefault(require("./excludeTitle.json"));
var html_1 = require("./html");
require("./JSON");
var id_json_1 = __importDefault(require("./lang/id.json"));
var node_username_1 = __importDefault(require("./node-username"));
var remove_double_quotes_1 = __importDefault(require("./remove_double_quotes"));
var StringBuilder_1 = __importDefault(require("./StringBuilder"));
var trim_whitespaces_1 = __importDefault(require("./trim_whitespaces"));
var url_1 = __importDefault(require("./url"));
var util_1 = require("./util");
var yaml_1 = __importDefault(require("./yaml"));
// Project root: the base directory of the global `hexo` instance when one is
// defined, otherwise the current working directory.
var HexoBase = typeof hexo !== "undefined" ? hexo.base_dir : process.cwd();
// Scratch directory under the project root where intermediate files
// (entries, results, errors, XML dumps) are written.
var buildDir = path.join(HexoBase, "tmp/hexo-blogger-xml");
var BloggerParser = exports.BloggerParser = /** @class */ (function (_super) {
__extends(BloggerParser, _super);
/**
 * Load a Blogger XML export, reset the entries directory and parse the XML
 * into a DOM document kept on `this.document`.
 *
 * Side effects visible here: removes the entries directory, writes a
 * `.gitignore` next to it, and dumps `rss.xml`, `inner.xml` and (when at least
 * one entry exists) `entry.xml` into the build directory.
 *
 * @param xmlFile path to the Blogger XML export
 * @throws a string message (not an Error) when `xmlFile` does not exist
 */
function BloggerParser(xmlFile) {
var _this = _super.call(this) || this;
/**
* Random identifier for this parser instance (two base-36 fragments of
* Math.random output concatenated)
*/
_this.id = Math.random().toString(36).substring(2, 15) +
Math.random().toString(36).substring(2, 15);
// directory receiving one JSON file per parsed entry
_this.entriesDir = path.join(buildDir, "entries");
_this.parseXmlJsonResult = [];
// host fragments treated as internal by modifyHtml; links to other hosts get a rel attribute
_this.hostname = [
"webmanajemen.com",
"git.webmanajemen.com",
"web-manajemen.blogspot",
"dimaslanjaka.github.io",
];
if (!(0, fs_extra_1.existsSync)(xmlFile))
throw "".concat(xmlFile, " not found");
// reset result
_this.parseXmlJsonResult = [];
// clean build dir (removes the entries directory)
_this.clean();
// write a catch-all ignore file into the parent of the entries directory
sbg_utility_1.default.writefile(path.join((0, upath_1.dirname)(_this.entriesDir), ".gitignore"), "*");
// mkdirSync(this.entriesDir, { recursive: true });
// only for the OS user "dimaslanjaka": drop a marker file named after the instance id
if ((0, node_username_1.default)() == "dimaslanjaka") {
sbg_utility_1.default.writefile(path.join(_this.entriesDir, _this.id), new Date().toString());
}
// read xml
var xmlStr = (0, fs_extra_1.readFileSync)(xmlFile).toString();
// Create an empty DOM; the JSDOM constructor input is for HTML, not XML, and we don't want to parse HTML
var dom = new jsdom_1.JSDOM();
// Get DOMParser, same API as in browser
var DOMParser = dom.window.DOMParser;
var parser = new DOMParser();
// Create document by parsing XML
_this.document = parser.parseFromString(xmlStr, "text/xml");
// dump the serialized document (no modification is applied between parsing and this point)
var xmlString = _this.document.documentElement.outerHTML;
(0, util_1.writeFileSync)((0, upath_1.join)(buildDir, "rss.xml"), xmlString);
(0, util_1.writeFileSync)((0, upath_1.join)(buildDir, "inner.xml"), _this.document.documentElement.innerHTML);
var entries = _this.document.documentElement.getElementsByTagName("entry");
// dump the first entry's inner markup as a sample
if (entries.length) {
(0, util_1.writeFileSync)((0, upath_1.join)(buildDir, "entry.xml"), entries[0].innerHTML);
}
return _this;
}
/**
 * Register additional internal hosts.
 * @param host a single host string or an array of host strings; appended to
 * the current list (arrays are flattened one level by `concat`)
 */
BloggerParser.prototype.setHostname = function (host) {
    var knownHosts = this.hostname;
    this.hostname = knownHosts.concat(host);
};
// noinspection JSUnusedGlobalSymbols
/**
 * Override the directory that receives per-entry JSON files.
 * @param dir new directory; ignored when its length is not greater than zero
 */
BloggerParser.prototype.setEntriesDir = function (dir) {
    var hasValue = dir.length > 0;
    if (!hasValue) {
        return;
    }
    this.entriesDir = dir;
};
/**
 * Clean build dir: recursively remove the entries directory.
 *
 * The previous implementation walked the tree by hand and then called
 * `rimrafSync` on the same path anyway; the single `rimrafSync` call covers
 * both steps (recursive removal, no error when the path is missing).
 *
 * @returns this parser, for chaining
 */
BloggerParser.prototype.clean = function () {
    (0, rimraf_1.rimrafSync)(this.entriesDir);
    return this;
};
/**
 * Parse entries from feed.
 *
 * Every `<entry>` whose title is not in the exclusion list is converted to a
 * JavaScript object with xml2js, its content is replaced by the
 * entity-decoded markup, the `tag:blogger.com,1999:` prefix is stripped from
 * its id, and the result is written to `<entriesDir>/<sanitized title>.json`.
 *
 * @returns this parser, for chaining
 * @throws the xml2js error when an entry cannot be converted
 */
BloggerParser.prototype.parseEntry = function () {
    var entriesDir = this.entriesDir;
    var feeds = this.document.documentElement.getElementsByTagName("entry");
    // Normalise the exclusion list once instead of once per entry.
    var excludedTitles = excludeTitle_json_1.default.map(function (excluded) {
        return excluded.toLowerCase().trim();
    });
    // xml2js reports through a callback; this code relies on it being invoked
    // before parseString returns, and surfaces the error instead of letting a
    // failed parse show up later as a TypeError on `undefined`.
    var toObject = function (xml) {
        var parsed;
        var failure = null;
        xml2js_1.default.parseString(xml, function (err, result) {
            failure = err;
            parsed = result;
        });
        if (failure) {
            throw failure;
        }
        return parsed;
    };
    for (var index = 0; index < feeds.length; index++) {
        var element = feeds[index];
        var title = element.getElementsByTagName("title")[0].innerHTML;
        // skip if contains default title
        if (excludedTitles.includes(title.toLowerCase().trim())) {
            continue;
        }
        // decode HTML entities of the post body
        var content = he_1.default.decode(element.getElementsByTagName("content")[0].innerHTML);
        var obj = toObject(element.outerHTML);
        obj.entry.content = content;
        obj.entry.id[0] = obj.entry.id[0].replace("tag:blogger.com,1999:", "");
        (0, util_1.writeFileSync)(path.join(entriesDir, (0, sanitize_filename_1.default)(title) + ".json"), JSON.stringify(obj, null, 2));
    }
    return this;
};
/**
 * Turn the per-entry JSON files found in the entries directory into post
 * objects (`permalink`, `headers`, `content`), write each one below
 * `<buildDir>/results`, and store the list on `this.parseXmlJsonResult`.
 *
 * Only files with a `.json` extension are read, and only entries that have a
 * fifth `link` element and a non-empty permalink end up in the result.
 *
 * @returns this parser, for chaining
 * @throws a string message when the entries directory does not exist;
 * rethrows any error raised while building a post, after dumping the entry
 * JSON and the current content below `<buildDir>/errors`
 */
BloggerParser.prototype.getJsonResult = function () {
var _this = this;
if (!(0, fs_extra_1.existsSync)(this.entriesDir))
throw "Entries Dir Not Found, previous process failed";
// absolute-ish paths of everything inside the entries directory
var get = fs.readdirSync(this.entriesDir).map(function (file) {
return path.join(_this.entriesDir, file);
});
var self = this;
var results = [];
if (Array.isArray(get) && get.length > 0) {
get.forEach(function (file) {
// defaults for one post; overwritten below from the entry data
var buildPost = {
permalink: "",
headers: {
title: "",
webtitle: "",
subtitle: "",
lang: "en",
date: new Date().toISOString(),
type: "post",
tags: [],
author: {
nick: "",
link: "",
email: "",
},
modified: new Date().toISOString(),
category: [],
comments: true,
cover: "",
location: "",
},
content: "",
};
var extname = path.extname(file);
if (extname == ".json") {
var read = (0, fs_extra_1.readFileSync)(file).toString();
var json = JSON.parse(read);
// build hexo header post
// NOTE(review): `typeof null` is also "object"; a file containing `null` would fail on the next line, outside the try block
if (typeof json == "object") {
buildPost.content = json.entry.content;
try {
// post permalink
// the fifth <link> is taken as the post URL — presumably the rel="alternate" link; entries without it are skipped silently
if (typeof json.entry.link[4] != "undefined") {
buildPost.permalink = new URL(json.entry.link[4].$.href).pathname;
// modify html body (Content)
var mod = self.modifyHtml(json.entry.content);
// remove footer rss messages
//buildPost.content = t.stripFooterFeed(buildPost.content);
buildPost.content = mod.content;
// external link seo
//buildPost.content = t.externalLink(buildPost.content);
// post title
buildPost.headers.title = json.entry.title[0]._.trim();
// post language simple: "id" when the lower-cased title matches any word from the id.json list
var titleTest = buildPost.headers.title.toLocaleLowerCase();
// NOTE(review): the alternation is not grouped, so the "s?" parts bind only to the first and last word; list entries are not regex-escaped
if (new RegExp("s?" + id_json_1.default.join("|") + "s?", "gmu").test(titleTest)) {
buildPost.headers.lang = "id";
}
// post thumbnail/cover
//buildPost.headers.cover = t.getFirstImg(buildPost.content);
buildPost.headers.cover = mod.thumbnail;
// post author (uri and email are optional in the entry)
buildPost.headers.author = {
nick: json.entry.author[0].name[0],
link: typeof json.entry.author[0].uri != "undefined"
? json.entry.author[0].uri[0]
: "",
email: typeof json.entry.author[0].email != "undefined"
? json.entry.author[0].email[0]
: "",
};
// post categories: every category term that is not a valid URL becomes a tag
json.entry.category.forEach(function (category) {
var cat = category.$.term.trim();
if (!url_1.default.isValidURL(cat))
buildPost.headers.tags.push(cat);
});
// post published
buildPost.headers.date = json.entry.published[0];
buildPost.headers.modified = json.entry.updated[0];
// post description
//const parserhtml = fromString(buildPost.content);
//const contentStr = parserhtml.window.document.documentElement.querySelector("div,p,span");
//console.log(contentStr.textContent);
//buildPost.headers.subtitle = truncate(he.decode(contentStr.textContent), 140, "").trim();
// NOTE(review): without the "g" flag only the FIRST character outside [a-zA-Z., ] is removed; confirm whether stripping all of them was intended
buildPost.headers.subtitle = (0, trim_whitespaces_1.default)((0, remove_double_quotes_1.default)(mod.description)).replace(new RegExp("[^a-zA-Z., ]", "m"), "");
// site title
buildPost.headers.webtitle = config_1.default.webtitle;
if (buildPost.permalink.length > 0) {
// result file mirrors the permalink path, with .html replaced by .json
var saveFile = path.join(buildDir, "results", buildPost.permalink.replace(/\.html$/, ".json"));
results.push(buildPost);
(0, util_1.writeFileSync)(saveFile, JSON.stringify(buildPost, null, 2));
}
}
}
catch (e) {
// dump the offending entry and the content built so far for debugging, then propagate the error
//writeFileSync(path.join(buildDir, 'errors', "error.log"), JSON.safeStringify(e));
(0, util_1.writeFileSync)(path.join(buildDir, "errors", "error-" + (0, upath_1.basename)(file)), JSON.stringify(json, null, 2));
(0, util_1.writeFileSync)(path.join(buildDir, "errors", "error-body-" + (0, upath_1.basename)(file, ".json") + ".html"), buildPost.content);
//buildPost.content
//console.log(json.entry.content);
throw e;
}
}
}
});
}
// replaces (does not append to) any previous result list
this.parseXmlJsonResult = results;
return this;
};
/**
 * Modify body content such as
 * - footer feed messages (every `.blogger-post-footer` element is removed)
 * - external link (links whose host matches none of `this.hostname` get
 *   `rel="noopener noreferrer nofollow"`)
 * - first img (used as thumbnail, with a placeholder image as fallback)
 * - post description (first 140 characters of the first div/p/span text, or
 *   of the raw content when no such element exists)
 * @param content post body HTML
 * @returns object with `thumbnail`, `content` (serialized body after the
 * modifications) and `description`
 */
BloggerParser.prototype.modifyHtml = function (content) {
    var self = this;
    var parserhtml = (0, html_1.fromString)(content);
    var doc = parserhtml.window.document;
    // strip footer rss messages.
    // getElementsByClassName returns a live collection: removing items while
    // walking it by index skips every other match, so iterate a snapshot.
    var footers = Array.prototype.slice.call(doc.getElementsByClassName("blogger-post-footer"));
    footers.forEach(function (footer) {
        footer.remove();
    });
    // get first img with a non-blank src
    var firstImg = "https://upload.wikimedia.org/wikipedia/commons/thumb/a/ac/No_image_available.svg/2048px-No_image_available.svg.png";
    var images = doc.getElementsByTagName("img");
    for (var imgIndex = 0; imgIndex < images.length; imgIndex++) {
        var image = images.item(imgIndex);
        if (image.src.trim().length > 0) {
            firstImg = image.src;
            break;
        }
    }
    // external link seo
    var processLink = function (link) {
        var href = self.parse_url(link.href);
        // parse_url hands back the raw string when the href is not a valid URL
        if (!(href instanceof URL)) {
            return;
        }
        var isInternal = self.hostname.some(function (hostnameKey) {
            return href.host.includes(hostnameKey);
        });
        if (!isInternal) {
            // "noreferrer" (double r) is the valid link type; the previous
            // "noreferer" spelling is not a recognised token
            link.setAttribute("rel", "noopener noreferrer nofollow");
        }
    };
    // find all hyperlinks
    var links = doc.getElementsByTagName("a");
    for (var linkIndex = 0; linkIndex < links.length; linkIndex++) {
        processLink(links.item(linkIndex));
    }
    // post description
    var description;
    var contentStr = doc.documentElement.querySelector("div,p,span");
    if (contentStr) {
        description = (0, util_1.truncate)(he_1.default.decode(contentStr.textContent), 140, "").trim();
    }
    else {
        description = (0, util_1.truncate)(content, 140, "").trim();
    }
    return {
        thumbnail: firstImg,
        content: doc.body.innerHTML,
        description: description,
    };
};
/**
 * Access the list of posts produced by the last `getJsonResult` run.
 * @returns the stored array itself (not a copy)
 */
BloggerParser.prototype.getParsedXml = function () {
    var parsedPosts = this.parseXmlJsonResult;
    return parsedPosts;
};
/**
 * export parsed xml to folder (default source/_posts)
 *
 * Each parsed post is written as `<dir>/<permalink with .html replaced by .md>`
 * containing a YAML front matter block built from the post headers followed by
 * the post content. A "write-post" event (payload: the written path) is
 * emitted per post and a "lastExport" event (payload: `{ item, id, array }`)
 * after the final post; nothing is emitted for an empty list.
 *
 * @param dir folder posts
 * @param callback function called each post with `(content, headers)` (required return string content after modification)
 * @returns this parser, for chaining
 * @example
 * export("source/_posts", (content) => {
 * content = content.replace('http://', 'https://') // replace http to https for example
 * return content; // return back the modified content
 * })
 */
BloggerParser.prototype.export = function (dir, callback) {
    var self = this;
    if (dir === void 0) { dir = "source/_posts"; }
    var hasCallback = typeof callback == "function";
    var parsedList = this.getParsedXml();
    parsedList.forEach(function (post, idx) {
        // The dot is escaped so only a real ".html" suffix is rewritten,
        // matching the pattern used when the result JSON files are saved.
        var postPath = path.join(dir, post.permalink.replace(/\.html$/, ".md"));
        // headers are trimmed in place before being serialized
        var postHeader = yaml_1.default.fromObject(self.objTrim(post.headers));
        if (hasCallback) {
            post.content = callback(post.content, post.headers);
        }
        var postResult = new StringBuilder_1.default("---")
            .appendLine(postHeader)
            .appendLine("---")
            .append("\n\n")
            .append(post.content)
            .toString();
        (0, util_1.writeFileSync)(postPath, postResult);
        self.emit("write-post", postPath);
        if (idx === parsedList.length - 1) {
            self.emit("lastExport", { item: post, id: idx, array: parsedList });
        }
    });
    return this;
};
/**
 * Trim Object: strips leading/trailing whitespace from every own enumerable
 * string value, in place. Non-string values and nested objects are left as is.
 * @see {@link https://stackoverflow.com/a/51616282}
 * @param obj object to trim (mutated)
 * @returns the same object
 */
BloggerParser.prototype.objTrim = function (obj) {
    var keys = Object.keys(obj);
    for (var i = 0; i < keys.length; i++) {
        var key = keys[i];
        var value = obj[key];
        if (typeof value == "string") {
            obj[key] = value.trim();
        }
        else {
            obj[key] = value;
        }
    }
    return obj;
};
/**
 * Try to parse a string as an absolute URL.
 * @param url candidate URL
 * @returns a `URL` instance when parsing succeeds, otherwise the input
 * unchanged (callers distinguish the two with `instanceof URL`)
 */
BloggerParser.prototype.parse_url = function (url) {
    var outcome = url;
    try {
        outcome = new URL(url);
    }
    catch (e) {
        // not a valid absolute URL: fall through with the original value
    }
    return outcome;
};
/**
 * Automatic process xml and output into directory with custom callback each function.
 * Builds a fresh parser for `file`, cleans its entries directory, parses the
 * entries, builds the post list, logs the post count and exports the posts.
 * @param file path to the Blogger XML export
 * @param outputDir destination folder (default "source/_posts")
 * @param callback optional per-post content hook, forwarded to `export`
 */
BloggerParser.prototype.auto = function (file, outputDir, callback) {
    var targetDir = outputDir === void 0 ? "source/_posts" : outputDir;
    var instance = new BloggerParser(file);
    instance.clean();
    var withResults = instance.parseEntry().getJsonResult();
    var total = withResults.getParsedXml().length;
    console.log(file, total, "total posts");
    withResults.export(targetDir, callback);
};
/**
 * Serialize the parsed posts.
 * @returns the parsed post list as JSON, indented with four spaces
 */
BloggerParser.prototype.toString = function () {
    var parsedPosts = this.getParsedXml();
    return JSON.stringify(parsedPosts, null, 4);
};
BloggerParser.debug = false;
return BloggerParser;
}(events_1.EventEmitter));
exports.default = BloggerParser;