hexo-blogger-xml
Version:
Import a Blogger XML export into Hexo
517 lines (462 loc) • 14.8 kB
text/typescript
import { EventEmitter } from "events";
import * as fs from "fs";
import { existsSync, readFileSync } from "fs-extra";
import he from "he";
import { JSDOM } from "jsdom";
import * as path from "path";
import { rimrafSync } from "rimraf";
import sanitize from "sanitize-filename";
import utility from "sbg-utility";
import { basename, dirname, join } from "upath";
import xml2js from "xml2js";
import config from "../config";
import { Entry } from "../types/entry";
import { PostHeader } from "../types/post-header";
import excludeTitleArr from "./excludeTitle.json";
import { fromString } from "./html";
import "./JSON";
import langID from "./lang/id.json";
import getUsername from "./node-username";
import remove_double_quotes from "./remove_double_quotes";
import StringBuilder from "./StringBuilder";
import trim_whitespaces from "./trim_whitespaces";
import url from "./url";
import { truncate, writeFileSync } from "./util";
import ParserYaml from "./yaml";
/**
 * One converted post, as produced by `getJsonResult()` and consumed by
 * `export()`.
 */
interface objResult {
  /** Front-matter values that are serialised to YAML when the post is written. */
  headers: PostHeader;
  /** Post body (HTML) after `modifyHtml()` has processed it. */
  content: string;
  /** URL pathname of the post; also used to derive the output file path. */
  permalink: string;
}
/**
 * Typed `on()` overloads for the events emitted by the `BloggerParser` class
 * (merged with the class declaration of the same name).
 *
 * - `"lastExport"` is emitted once, after the last post has been exported.
 * - `"write-post"` is emitted after each post file is written and receives the
 *   path of the written file.
 *
 * Every overload returns `this`, matching `EventEmitter#on`, so registrations
 * can be chained regardless of the event name.
 */
export declare interface BloggerParser {
  on<U extends keyof BloggerParser>(event: U, listener: BloggerParser[U]): this;
  on(event: "lastExport", listener: (arg: Record<any, any>) => any): this;
  on(event: "write-post", listener: (arg: string) => any): this;
  //emit<U extends keyof BloggerParser>(event: U, ...args: Parameters<BloggerParser[U]>): boolean;
}
// Project root: the Hexo base directory when a global `hexo` instance exists,
// otherwise the current working directory of the process.
const HexoBase = typeof hexo === "undefined" ? process.cwd() : hexo.base_dir;
// Scratch directory that holds every intermediate file written by the parser.
const buildDir = path.join(HexoBase, "tmp", "hexo-blogger-xml");
/**
 * Converts a Blogger XML export into Hexo posts.
 *
 * Pipeline: the constructor parses the XML file, `parseEntry()` dumps every
 * `<entry>` as a JSON file into `entriesDir`, `getJsonResult()` turns those
 * JSON files into post objects, and `export()` writes them as markdown files
 * with YAML front matter.
 *
 * Emits `"write-post"` (path of the written file) for each exported post and
 * `"lastExport"` after the final one.
 */
export class BloggerParser extends EventEmitter {
  static debug = false;
  /**
   * ID Process
   */
  id =
    Math.random().toString(36).substring(2, 15) +
    Math.random().toString(36).substring(2, 15);
  /** Directory that receives one JSON file per feed entry. */
  entriesDir = path.join(buildDir, "entries");
  /** XML document parsed from the export file. */
  private document: Document;
  /** Posts built by the last `getJsonResult()` call. */
  parseXmlJsonResult: objResult[] = [];
  /** Host fragments treated as internal; links to them get no `rel` attribute. */
  hostname: string[] = [
    "webmanajemen.com",
    "git.webmanajemen.com",
    "web-manajemen.blogspot",
    "dimaslanjaka.github.io",
  ];

  /**
   * Reads and parses the Blogger XML export and writes debug copies of the
   * parsed document (`rss.xml`, `inner.xml`, `entry.xml`) into the build dir.
   * @param xmlFile path of the Blogger XML export
   * @throws a string message when `xmlFile` does not exist (a plain string is
   * thrown, not an `Error`; kept so existing callers see the same value)
   */
  constructor(xmlFile: string | fs.PathLike) {
    super();
    if (!existsSync(xmlFile)) throw `${xmlFile} not found`;
    // reset result
    this.parseXmlJsonResult = [];
    // clean build dir
    this.clean();
    // write ignore to buildDir
    utility.writefile(path.join(dirname(this.entriesDir), ".gitignore"), "*");
    if (getUsername() == "dimaslanjaka") {
      utility.writefile(
        path.join(this.entriesDir, this.id),
        new Date().toString()
      );
    }
    // read xml
    const xmlStr = readFileSync(xmlFile).toString();
    // Create an empty DOM: the JSDOM constructor input is HTML, not XML, and
    // we do not want the export to be parsed as HTML.
    const dom = new JSDOM();
    // Get DOMParser, same API as in browser
    const DOMParser = dom.window.DOMParser;
    const parser = new DOMParser();
    // Create document by parsing XML
    this.document = parser.parseFromString(xmlStr, "text/xml");
    // save the serialised document for inspection
    const root = this.document.documentElement;
    writeFileSync(join(buildDir, "rss.xml"), root.outerHTML);
    writeFileSync(join(buildDir, "inner.xml"), root.innerHTML);
    const entries = root.getElementsByTagName("entry");
    if (entries.length) {
      writeFileSync(join(buildDir, "entry.xml"), entries[0].innerHTML);
    }
  }

  /**
   * Adds host fragments to the list of internal hosts.
   * @param host entries appended to `hostname`
   */
  setHostname(host: string[]) {
    this.hostname = this.hostname.concat(host);
  }

  // noinspection JSUnusedGlobalSymbols
  /**
   * Overrides the directory used for per-entry JSON files.
   * @param dir new directory; an empty string is ignored
   */
  setEntriesDir(dir: string) {
    if (dir.length > 0) this.entriesDir = dir;
  }

  /**
   * Clean build dir
   *
   * Removes `entriesDir` recursively. `rimrafSync` already handles nested
   * content, so no manual directory walk is needed beforehand.
   * @returns this parser, for chaining
   */
  clean() {
    rimrafSync(this.entriesDir);
    return this;
  }

  /**
   * Parse entries from feed
   *
   * Writes one JSON file per `<entry>` into `entriesDir`. Entries whose title
   * is listed in `excludeTitle.json` are skipped. File names are derived from
   * the sanitised title; an empty name falls back to `untitled`, and a numeric
   * suffix is appended when a name was already used, so entries sharing a
   * title no longer overwrite each other.
   * @returns this parser, for chaining
   * @throws the xml2js error when an entry cannot be converted
   */
  parseEntry() {
    const feeds = this.document.documentElement.getElementsByTagName("entry");
    // Normalise the exclude list once instead of once per entry.
    const excludeTitle = new Set(
      excludeTitleArr.map((title) => title.toLowerCase().trim())
    );
    // Lower-cased so names that differ only by case do not collide on
    // case-insensitive file systems.
    const usedNames = new Set<string>();

    for (let index = 0; index < feeds.length; index++) {
      const element = feeds[index];
      const titleElement = element.getElementsByTagName("title")[0];
      const title = titleElement ? titleElement.innerHTML : "";
      // skip if contains default title
      if (excludeTitle.has(title.toLowerCase().trim())) continue;

      // The content element holds entity-encoded HTML; decode it once here.
      const contentElement = element.getElementsByTagName("content")[0];
      const content = he.decode(contentElement ? contentElement.innerHTML : "");

      // The result is read right after the call, which relies on parseString
      // invoking its callback synchronously.
      const parsed: {
        error: Error | null;
        result: { entry?: { content?: string; id?: string[] } } | undefined;
      } = { error: null, result: undefined };
      xml2js.parseString(element.outerHTML, (err, result) => {
        parsed.error = err;
        parsed.result = result;
      });
      if (parsed.error) throw parsed.error;
      const obj = parsed.result;
      if (!obj || !obj.entry) continue;

      obj.entry.content = content;
      if (Array.isArray(obj.entry.id) && typeof obj.entry.id[0] == "string") {
        obj.entry.id[0] = obj.entry.id[0].replace("tag:blogger.com,1999:", "");
      }

      const baseName = sanitize(title) || "untitled";
      let fileName = baseName;
      for (let n = 2; usedNames.has(fileName.toLowerCase()); n++) {
        fileName = `${baseName}-${n}`;
      }
      usedNames.add(fileName.toLowerCase());

      writeFileSync(
        path.join(this.entriesDir, fileName + ".json"),
        JSON.stringify(obj, null, 2)
      );
    }
    return this;
  }

  /**
   * Builds post objects from the JSON files in `entriesDir`.
   *
   * Only entries that have a fifth `link` element are treated as posts; its
   * `href` supplies the permalink. Each post is also written as JSON below
   * `<buildDir>/results`. The outcome is stored in `parseXmlJsonResult`.
   * @returns this parser, for chaining
   * @throws a string message when `entriesDir` does not exist; any error
   * raised while building a post is rethrown after the offending entry and
   * its body were dumped below `<buildDir>/errors`
   */
  getJsonResult() {
    if (!existsSync(this.entriesDir))
      throw "Entries Dir Not Found, previous process failed";

    // Built once for all files. No `g` flag, so `test()` keeps no state
    // between calls. An empty word list must match nothing rather than
    // every title.
    const langPattern =
      langID.length > 0
        ? new RegExp("(?:" + langID.join("|") + ")", "mu")
        : null;

    const results: objResult[] = [];
    const files = fs
      .readdirSync(this.entriesDir)
      .map((file) => path.join(this.entriesDir, file));

    for (const file of files) {
      if (path.extname(file) != ".json") continue;
      const json: Entry = JSON.parse(readFileSync(file).toString());
      if (!json || typeof json != "object" || !json.entry) continue;

      // build hexo header post
      const buildPost: objResult = {
        permalink: "",
        headers: {
          title: "",
          webtitle: "",
          subtitle: "",
          lang: "en",
          date: new Date().toISOString(),
          type: "post",
          tags: [],
          author: {
            nick: "",
            link: "",
            email: "",
          },
          modified: new Date().toISOString(),
          category: [],
          comments: true,
          cover: "",
          location: "",
        },
        content: json.entry.content,
      };

      try {
        // post permalink
        const links = json.entry.link;
        if (!Array.isArray(links) || typeof links[4] == "undefined") continue;
        buildPost.permalink = new URL(links[4].$.href).pathname;

        // modify html body (Content)
        const mod = this.modifyHtml(json.entry.content);
        buildPost.content = mod.content;

        // post title (untitled posts carry no text node)
        const titleNode = json.entry.title ? json.entry.title[0] : undefined;
        buildPost.headers.title = ((titleNode && titleNode._) || "").trim();

        // post language simple
        const titleTest = buildPost.headers.title.toLocaleLowerCase();
        if (langPattern && langPattern.test(titleTest)) {
          buildPost.headers.lang = "id";
        }

        // post thumbnail/cover
        buildPost.headers.cover = mod.thumbnail;

        // post author
        const author = json.entry.author[0];
        buildPost.headers.author = {
          nick: author.name[0],
          link: typeof author.uri != "undefined" ? author.uri[0] : "",
          email: typeof author.email != "undefined" ? author.email[0] : "",
        };

        // post categories: terms that are not URLs become tags
        (json.entry.category || []).forEach((category) => {
          const cat = category.$.term.trim();
          if (!url.isValidURL(cat)) buildPost.headers.tags.push(cat);
        });

        // post published
        buildPost.headers.date = json.entry.published[0];
        buildPost.headers.modified = json.entry.updated[0];

        // post description: strip every character outside the allowed set
        // (global flag, otherwise only the first offender is removed)
        buildPost.headers.subtitle = trim_whitespaces(
          remove_double_quotes(mod.description)
        ).replace(/[^a-zA-Z., ]/g, "");

        // site title
        buildPost.headers.webtitle = config.webtitle;

        if (buildPost.permalink.length > 0) {
          const saveFile = path.join(
            buildDir,
            "results",
            buildPost.permalink.replace(/\.html$/, ".json")
          );
          results.push(buildPost);
          writeFileSync(saveFile, JSON.stringify(buildPost, null, 2));
        }
      } catch (e) {
        // Keep the failing entry and its body for inspection, then fail loudly.
        writeFileSync(
          path.join(buildDir, "errors", "error-" + basename(file)),
          JSON.stringify(json, null, 2)
        );
        writeFileSync(
          path.join(
            buildDir,
            "errors",
            "error-body-" + basename(file, ".json") + ".html"
          ),
          buildPost.content
        );
        throw e;
      }
    }

    this.parseXmlJsonResult = results;
    return this;
  }

  /**
   * Modify body content such as
   * - external link
   * - first img
   * - post description
   * @param content post body HTML
   * @returns the processed body, the thumbnail URL (first image with a
   * non-empty `src`, or a placeholder image) and a description of at most
   * 140 characters
   */
  modifyHtml(content: string) {
    const doc = fromString(content).window.document;

    // Strip the footer that Blogger appends to feed items. A static
    // querySelectorAll list is used because removing elements while indexing
    // a live collection skips every second match.
    doc
      .querySelectorAll(".blogger-post-footer")
      .forEach((footer) => footer.remove());

    // get first img
    const placeholder =
      "https://upload.wikimedia.org/wikipedia/commons/thumb/a/ac/No_image_available.svg/2048px-No_image_available.svg.png";
    const firstImage = Array.from(doc.getElementsByTagName("img")).find(
      (img) => img.src.trim().length > 0
    );
    const thumbnail = firstImage ? firstImage.src : placeholder;

    // external link seo: links whose host matches none of the known hosts
    // get a rel attribute
    for (const link of Array.from(doc.getElementsByTagName("a"))) {
      const href = this.parse_url(link.href);
      if (!(href instanceof URL)) continue;
      const internal = this.hostname.some((hostnameKey) =>
        href.host.includes(hostnameKey)
      );
      if (!internal) {
        link.setAttribute("rel", "noopener noreferrer nofollow");
      }
    }

    // post description: text of the first div/p/span, else the raw content
    const contentStr = doc.documentElement.querySelector("div,p,span");
    const description = contentStr
      ? truncate(he.decode(contentStr.textContent), 140, "").trim()
      : truncate(content, 140, "").trim();

    return {
      thumbnail: thumbnail,
      content: doc.body.innerHTML,
      description: description,
    };
  }

  /**
   * @returns the posts built by the last `getJsonResult()` call
   */
  getParsedXml() {
    return this.parseXmlJsonResult;
  }

  /**
   * export parsed xml to folder (default source/_posts)
   *
   * The output path is the post permalink below `dir`, with a trailing
   * `.html` extension replaced by `.md`.
   * @param dir folder posts
   * @param callback function called each post (required return string content after modification)
   * @returns this parser, for chaining
   * @example
   * export("source/_posts", (content) => {
   *   content = content.replace('http://', 'https://') // replace http to https for example
   *   return content; // return back the modified content
   * })
   */
  export(
    dir = "source/_posts",
    callback?: (arg0: string, arg1: PostHeader) => string
  ) {
    const parsedList = this.getParsedXml();

    const processResult = (post: objResult) => {
      // The dot is escaped so only a real ".html" extension is replaced.
      const postPath = path.join(dir, post.permalink.replace(/\.html$/, ".md"));
      const postHeader = ParserYaml.fromObject(this.objTrim(post.headers));
      if (typeof callback == "function") {
        post.content = callback(post.content, post.headers);
      }
      const postResult = new StringBuilder("---")
        .appendLine(postHeader)
        .appendLine("---")
        .append("\n\n")
        .append(post.content)
        .toString();
      writeFileSync(postPath, postResult);
      this.emit("write-post", postPath);
    };

    parsedList.forEach((item, idx, array) => {
      processResult(item);
      if (idx === array.length - 1) {
        this.emit("lastExport", { item: item, id: idx, array: array });
      }
    });
    return this;
  }

  /**
   * Trim Object
   *
   * Trims every top-level string value of `obj` in place.
   * @see {@link https://stackoverflow.com/a/51616282}
   * @param obj object to trim (mutated)
   * @returns the same object
   */
  objTrim(obj: Record<any, any>) {
    for (const key of Object.keys(obj)) {
      if (typeof obj[key] == "string") obj[key] = obj[key].trim();
    }
    return obj;
  }

  /**
   * Parses a URL without throwing.
   * @param url candidate URL
   * @returns a `URL` instance, or the input string when it cannot be parsed
   */
  parse_url(url: string): URL | string {
    try {
      return new URL(url);
    } catch (e) {
      return url;
    }
  }

  /**
   * Automatic process xml and output into directory with custom callback each function
   *
   * Runs the whole pipeline on a new parser instance created for `file`; the
   * instance this method is called on is not used.
   * @param file path of the Blogger XML export
   * @param outputDir folder posts
   * @param callback called with each post content, see `export()`
   */
  auto(
    file: string,
    outputDir = "source/_posts",
    callback: (content: string) => any
  ) {
    const parser = new BloggerParser(file);
    parser.clean();
    const parsed = parser.parseEntry().getJsonResult();
    console.log(file, parsed.getParsedXml().length, "total posts");
    parsed.export(outputDir, callback);
  }

  /**
   * @returns the parsed posts serialised as JSON with 4-space indentation
   */
  toString() {
    return JSON.stringify(this.getParsedXml(), null, 4);
  }
}
export default BloggerParser;