domutils
Version:
Utilities for working with htmlparser2's dom
302 lines (265 loc) • 7.66 kB
text/typescript
import type { AnyNode, Element } from "domhandler";
import { getElementsByTagName } from "./legacy.js";
import { textContent } from "./stringify.js";
/**
 * The medium of a media item.
 *
 * Populated from the `medium` attribute of a `media:content` element. The
 * attribute value is cast to this type without being validated at runtime,
 * so values outside this union can still appear.
 *
 * @category Feeds
 */
export type FeedItemMediaMedium =
| "image"
| "audio"
| "video"
| "document"
| "executable";
/**
 * The expression of a media item.
 *
 * Populated from the `expression` attribute of a `media:content` element.
 * The attribute value is cast to this type without being validated at
 * runtime.
 *
 * @category Feeds
 */
export type FeedItemMediaExpression = "sample" | "full" | "nonstop";
/**
 * A media item of a feed entry.
 *
 * Each instance is built from the attributes of one `media:content` element.
 * Apart from `medium` and `isDefault`, a property is only set when the
 * attribute of the same name is present and non-empty.
 *
 * @category Feeds
 */
export interface FeedItemMedia {
/** Raw value of the `medium` attribute; `undefined` when it is absent. */
medium: FeedItemMediaMedium | undefined;
/** Derived from the `isDefault` attribute. */
isDefault: boolean;
/** The `url` attribute, copied as a string. */
url?: string;
/** The `fileSize` attribute, parsed as a base-10 integer. */
fileSize?: number;
/** The `type` attribute, copied as a string. */
type?: string;
/** The `expression` attribute, cast without validation. */
expression?: FeedItemMediaExpression;
/** The `bitrate` attribute, parsed as a base-10 integer. */
bitrate?: number;
/** The `framerate` attribute, parsed as a base-10 integer. */
framerate?: number;
/** The `samplingrate` attribute, parsed as a base-10 integer. */
samplingrate?: number;
/** The `channels` attribute, parsed as a base-10 integer. */
channels?: number;
/** The `duration` attribute, parsed as a base-10 integer. */
duration?: number;
/** The `height` attribute, parsed as a base-10 integer. */
height?: number;
/** The `width` attribute, parsed as a base-10 integer. */
width?: number;
/** The `lang` attribute, copied as a string. */
lang?: string;
}
/**
 * An entry of a feed.
 *
 * Optional properties are only set when the corresponding element (or
 * attribute) yields a non-empty value.
 *
 * @category Feeds
 */
export interface FeedItem {
/** Text of the `id` element (Atom) or the `guid` element (RSS). */
id?: string;
/** Text of the `title` element. */
title?: string;
/** `href` attribute of a `link` element (Atom) or text of `link` (RSS). */
link?: string;
/** Text of `summary`, falling back to `content` (Atom), or `description` (RSS). */
description?: string;
/**
 * Built with `new Date(...)` from the text of `updated` (Atom), or of
 * `pubDate` falling back to `dc:date` (RSS). The text is not validated
 * before it is passed to `new Date`.
 */
pubDate?: Date;
/** One item per `media:content` element found in the entry. */
media: FeedItemMedia[];
}
/**
 * The root of a feed.
 *
 * Optional properties are only set when the corresponding element (or
 * attribute) yields a non-empty value.
 *
 * @category Feeds
 */
export interface Feed {
/**
 * `"atom"` for Atom feeds; for every other feed root, the first three
 * characters of the root element's name.
 */
type: string;
/** Text of the `id` element (Atom). Always the empty string for RSS/RDF feeds. */
id?: string;
/** Text of the `title` element. */
title?: string;
/** `href` attribute of a `link` element (Atom) or text of `link` (RSS). */
link?: string;
/** Text of the `subtitle` element (Atom) or the `description` element (RSS). */
description?: string;
/**
 * Built with `new Date(...)` from the text of `updated` (Atom) or
 * `lastBuildDate` (RSS). The text is not validated before it is passed to
 * `new Date`.
 */
updated?: Date;
/** Text of an `email` element (Atom) or a `managingEditor` element (RSS). */
author?: string;
/** Parsed `entry` elements (Atom) or `item` elements (RSS). */
items: FeedItem[];
}
/**
 * Locate the feed root in a DOM tree and convert it into a feed object.
 *
 * The root is the first element whose name is accepted by `isValidFeed`.
 * A root named `feed` is parsed as Atom; any other root is parsed as RSS.
 *
 * @category Feeds
 * @param document The DOM to extract the feed from.
 * @returns The feed, or `null` if no feed root was found.
 */
export function getFeed(document: AnyNode[]): Feed | null {
  const root = getOneElement(isValidFeed, document);

  if (!root) {
    return null;
  }

  if (root.name === "feed") {
    return getAtomFeed(root);
  }

  return getRssFeed(root);
}
/**
 * Build a feed object from the root element of an Atom feed.
 *
 * @param feedRoot The root of the feed.
 * @returns The parsed feed.
 */
function getAtomFeed(feedRoot: Element) {
  const rootChildren = feedRoot.children;

  // Converts a single `entry` element into a feed item.
  const parseEntry = (entryElement: Element): FeedItem => {
    const entryChildren = entryElement.children;
    const entry: FeedItem = { media: getMediaElements(entryChildren) };

    addConditionally(entry, "id", "id", entryChildren);
    addConditionally(entry, "title", "title", entryChildren);

    const entryLink = getOneElement("link", entryChildren);
    const entryHref = entryLink?.attribs["href"];
    if (entryHref) entry.link = entryHref;

    // Prefer the summary; only look at the content when it is empty.
    const summary = fetch("summary", entryChildren);
    const description = summary || fetch("content", entryChildren);
    if (description) entry.description = description;

    const updatedText = fetch("updated", entryChildren);
    if (updatedText) entry.pubDate = new Date(updatedText);

    return entry;
  };

  const feed: Feed = {
    type: "atom",
    items: getElementsByTagName("entry", rootChildren).map((entryElement) =>
      parseEntry(entryElement),
    ),
  };

  addConditionally(feed, "id", "id", rootChildren);
  addConditionally(feed, "title", "title", rootChildren);

  const feedLink = getOneElement("link", rootChildren);
  const feedHref = feedLink?.attribs["href"];
  if (feedHref) feed.link = feedHref;

  addConditionally(feed, "description", "subtitle", rootChildren);

  const feedUpdated = fetch("updated", rootChildren);
  if (feedUpdated) feed.updated = new Date(feedUpdated);

  // The author's email may be nested, so this lookup recurses.
  addConditionally(feed, "author", "email", rootChildren, true);

  return feed;
}
/**
 * Parse a RSS feed.
 *
 * Feed-level metadata is read from the children of the `channel` element
 * (or from nothing, if there is no `channel`), while `item` elements are
 * searched for starting at the children of the feed root.
 *
 * @param feedRoot The root of the feed.
 * @returns The parsed feed.
 */
function getRssFeed(feedRoot: Element) {
  const childs = getOneElement("channel", feedRoot.children)?.children ?? [];

  const feed: Feed = {
    // First three characters of the root name, e.g. "rss", or "rdf" for
    // "rdf:RDF". `slice` replaces the deprecated `substr` with the same result.
    type: feedRoot.name.slice(0, 3),
    // This parser never reads an id for RSS feeds; the property is always
    // present and empty (kept as-is for backward compatibility).
    id: "",
    items: getElementsByTagName("item", feedRoot.children).map(
      (item: Element) => {
        const { children } = item;
        const entry: FeedItem = { media: getMediaElements(children) };

        addConditionally(entry, "id", "guid", children);
        addConditionally(entry, "title", "title", children);
        addConditionally(entry, "link", "link", children);
        addConditionally(entry, "description", "description", children);

        // Fall back to `dc:date` when `pubDate` is missing or empty.
        const pubDate =
          fetch("pubDate", children) || fetch("dc:date", children);
        if (pubDate) entry.pubDate = new Date(pubDate);

        return entry;
      },
    ),
  };

  addConditionally(feed, "title", "title", childs);
  addConditionally(feed, "link", "link", childs);
  addConditionally(feed, "description", "description", childs);

  const updated = fetch("lastBuildDate", childs);
  if (updated) {
    feed.updated = new Date(updated);
  }

  addConditionally(feed, "author", "managingEditor", childs, true);

  return feed;
}
/** `media:content` attributes that are copied to the media item as strings. */
const MEDIA_KEYS_STRING = ["url", "type", "lang"] as const;
/**
 * `media:content` attributes that are parsed as base-10 integers before
 * being stored on the media item.
 */
const MEDIA_KEYS_INT = [
"fileSize",
"bitrate",
"framerate",
"samplingrate",
"channels",
"duration",
"height",
"width",
] as const;
/**
 * Get all media elements of a feed item.
 *
 * Every `media:content` element found in `where` is converted into a
 * `FeedItemMedia` object built from the element's attributes. Attributes
 * that are absent or empty are skipped, except for `medium` and `isDefault`,
 * which are always present on the result.
 *
 * @param where Nodes to search in.
 * @returns Media elements.
 */
function getMediaElements(where: AnyNode[]): FeedItemMedia[] {
  return getElementsByTagName("media:content", where).map((element) => {
    const { attribs } = element;
    const isDefaultAttribute = attribs["isDefault"];

    const media: FeedItemMedia = {
      medium: attribs["medium"] as unknown as
        | FeedItemMediaMedium
        | undefined,
      // `isDefault` is a textual boolean. Any non-empty string is truthy in
      // JavaScript, so an explicit `isDefault="false"` must be checked for
      // by value; every other non-empty value keeps counting as `true`.
      isDefault:
        !!isDefaultAttribute &&
        isDefaultAttribute.trim().toLowerCase() !== "false",
    };

    for (const attrib of MEDIA_KEYS_STRING) {
      if (attribs[attrib]) {
        media[attrib] = attribs[attrib];
      }
    }

    for (const attrib of MEDIA_KEYS_INT) {
      if (attribs[attrib]) {
        media[attrib] = Number.parseInt(attribs[attrib], 10);
      }
    }

    if (attribs["expression"]) {
      // The value is trusted as-is; it is not validated against the union.
      media.expression = attribs[
        "expression"
      ] as unknown as FeedItemMediaExpression;
    }

    return media;
  });
}
/**
 * Get one element by tag name.
 *
 * Searches recursively and stops after the first match.
 *
 * @param tagName Tag name to look for, or a predicate that tests tag names.
 * @param node Nodes to search in.
 * @returns The first matching element, or `null` if there is none.
 */
function getOneElement(
  tagName: string | ((name: string) => boolean),
  node: AnyNode[],
): Element | null {
  // Indexing an empty result yields `undefined`; normalise it to `null` so
  // the return value matches the declared type.
  return getElementsByTagName(tagName, node, true, 1)[0] ?? null;
}
/**
 * Read the trimmed text content of the first element with a given tag name.
 *
 * @param tagName Tag name to look for.
 * @param where Node to search in.
 * @param recurse Whether to recurse into child nodes.
 * @returns The trimmed text content of the matched element.
 */
function fetch(
  tagName: string,
  where: AnyNode | AnyNode[],
  recurse = false,
): string {
  // At most one element is requested from the search.
  const matches = getElementsByTagName(tagName, where, recurse, 1);
  const text = textContent(matches);
  return text.trim();
}
/**
 * Copy the text of an element onto an object, but only if that text is
 * non-empty.
 *
 * @param object Object to be extended.
 * @param property Property name.
 * @param tagName Tag name that contains the conditionally added property.
 * @param where Element to search for the property.
 * @param recurse Whether to recurse into child nodes.
 */
function addConditionally<T>(
  object: T,
  property: keyof T,
  tagName: string,
  where: AnyNode[],
  recurse = false,
) {
  const text = fetch(tagName, where, recurse);

  // Leave the property untouched when nothing was found.
  if (!text) {
    return;
  }

  object[property] = text as unknown as T[keyof T];
}
/**
 * Decide whether an element name identifies a feed root node.
 *
 * @param value The name of the element to check.
 * @returns Whether an element is a feed root node.
 */
function isValidFeed(value: string) {
  switch (value) {
    case "rss":
    case "feed":
    case "rdf:RDF":
      return true;
    default:
      return false;
  }
}