@dimfu/recipe-scraper
Version:
Extract recipe data from the web effortlessly
261 lines (256 loc) • 8.09 kB
JavaScript
// src/index.ts
import * as cheerio from "cheerio";
import axios from "axios";
import { validate } from "jsonschema";
import microdata from "microdata-node";
// src/schema.json
// JSON Schema (draft-07) describing the minimum a scraped recipe must contain:
// an object with a non-empty string `name` and a `recipeIngredients` array
// holding at least one string. Passed to jsonschema's `validate` in getRecipeData.
var schema_default = {
$schema: "http://json-schema.org/draft-07/schema#",
description: "A recipe scraped from the web",
type: "object",
// Both fields must be present for the recipe to be accepted.
required: ["name", "recipeIngredients"],
properties: {
name: {
type: "string",
minLength: 1
},
recipeIngredients: {
type: "array",
minItems: 1,
items: { type: "string" }
}
}
};
// src/utils.ts
// Matches an HTML tag, including an unterminated trailing "<..." fragment.
const MATCH_HTML_TAGS = /<[^>]*>?/gm;
// Matches a single line break in any of its three spellings (CRLF, LF, CR).
const MATCH_LINE_BREAK = /(\r\n|\n|\r)/gm;
// Matches what should be squeezed into one plain space:
//   1. a run of two or more whitespace characters and/or "&nbsp;" entities,
//   2. a lone "&nbsp;" entity,
//   3. a lone non-breaking space (U+00A0).
// The run alternative comes first so that a sequence of spaces is collapsed as
// a whole instead of being consumed one character at a time. The non-breaking
// space is written as an escape because a literal one is indistinguishable
// from an ordinary space when reading the source.
const MATCH_MULTI_SPACE = /(?:&nbsp;|\s){2,}|&nbsp;|\u00a0/gm;
/**
 * Reports whether the input parses as an absolute URL using http or https.
 *
 * @param {*} string - Candidate URL (anything the URL constructor accepts).
 * @returns {boolean} true only for parseable http:/https: URLs.
 */
function isValidHttpUrl(string) {
  try {
    const { protocol } = new URL(string);
    return ["http:", "https:"].includes(protocol);
  } catch {
    // The URL constructor throws on anything it cannot parse.
    return false;
  }
}
// src/propertyTransforrmer.ts
import { parse } from "iso8601-duration";
/**
 * Reduces the various shapes an `image` value can take to a single URL.
 *
 * Handled shapes: a plain string, an object carrying a `url` property, and an
 * array of either (the first element is used). An array element is resolved
 * recursively so that an array of `{ url }` objects yields the URL string
 * rather than the raw object.
 *
 * @param {*} value - Raw image value.
 * @returns {*} The URL when one can be found; undefined for an empty array;
 *   otherwise the value unchanged.
 */
function transformImage(value) {
  if (typeof value === "string") {
    return value;
  }
  if (Array.isArray(value)) {
    return value.length > 0 ? transformImage(value[0]) : void 0;
  }
  // Optional chaining keeps null/undefined array elements from throwing.
  if (value?.url) {
    return value.url;
  }
  return value;
}
/**
 * Turns a string into a list: a comma-separated string is split and each piece
 * trimmed, a comma-free string is wrapped untouched in a one-element array.
 * Anything that is not a string (arrays included) is returned as-is.
 *
 * @param {*} value - Raw property value.
 * @returns {*} Array of strings for string input, otherwise the input itself.
 */
function transformToList(value) {
  if (typeof value !== "string") {
    return value;
  }
  const hasSeparator = value.includes(",");
  return hasSeparator ? value.split(",").map((piece) => piece.trim()) : [value];
}
/**
 * Unwraps an array to its first element; every other value passes through.
 *
 * @param {*} value - Raw property value.
 * @returns {*} `value[0]` for arrays, otherwise `value` itself.
 */
function transformToString(value) {
  return Array.isArray(value) ? value[0] : value;
}
/**
 * Renders the days/hours/minutes/seconds of a parsed duration as readable
 * text, e.g. `{ hours: 1, minutes: 30 }` -> "1 hour 30 minutes".
 * Units that are missing or zero are left out; the plural form is used when
 * the amount is greater than one.
 *
 * @param {object} dateObj - Object with optional `days`, `hours`, `minutes`
 *   and `seconds` fields.
 * @returns {string} Space-separated description, or "" when nothing is set.
 */
function transformISOToString(dateObj) {
  const pluralUnits = ["days", "hours", "minutes", "seconds"];
  const pieces = [];
  for (const unit of pluralUnits) {
    const amount = dateObj[unit];
    if (!amount) {
      continue;
    }
    // Dropping the trailing "s" gives the singular label.
    const label = amount > 1 ? unit : unit.slice(0, -1);
    pieces.push(`${amount} ${label}`);
  }
  return pieces.join(" ");
}
/**
 * Converts a duration value into readable text when it parses as a duration,
 * and otherwise hands back the unwrapped value untouched.
 *
 * @param {*} value - Raw duration value (string or array whose first item is used).
 * @returns {*} Readable duration text, or the unwrapped input when parsing fails.
 */
function transformToTime(value) {
  const rawTime = transformToString(value);
  try {
    const duration = parse(rawTime);
    return duration ? transformISOToString(duration) : rawTime;
  } catch {
    // Deliberate best effort: anything `parse` rejects is returned as-is.
    return rawTime;
  }
}
/**
 * Normalises scraped text: removes HTML tags, turns line breaks into spaces,
 * applies the multi-space replacement and trims both ends.
 *
 * @param {string} str - Text to clean.
 * @returns {string} Cleaned text.
 */
function cleanString(str) {
  const withoutTags = str.replace(MATCH_HTML_TAGS, "");
  const onOneLine = withoutTags.replace(MATCH_LINE_BREAK, " ");
  const squeezed = onOneLine.replace(MATCH_MULTI_SPACE, " ");
  return squeezed.trim();
}
/**
 * Unwraps the value with transformToString and then cleans it with cleanString.
 *
 * @param {*} value - Raw property value (string or array whose first item is used).
 * @returns {string} Cleaned text.
 */
function transformToCleanString(value) {
  const text = transformToString(value);
  return cleanString(text);
}
/**
 * Flattens a `recipeInstructions` value into an array of cleaned step strings.
 *
 * Accepted shapes:
 *  - a single string: cleaned, then split on ".," when that separator occurs;
 *  - an array whose entries are strings, objects with a `text` field, or
 *    objects with an `itemListElement` list of further entries (sections);
 *  - a single such object.
 * Entries that carry no text are skipped, so the result never contains
 * `undefined` holes, and arrays mixing strings with objects are accepted.
 *
 * @param {*} value - Raw instructions value.
 * @returns {string[]} Cleaned steps in document order (possibly empty).
 */
function transformInstructions(value) {
  if (typeof value === "string") {
    const cleanedValue = cleanString(value);
    return cleanedValue.includes(".,")
      ? cleanedValue.split(".,").map((item) => item.trim())
      : [cleanedValue];
  }

  const steps = [];
  // Walks strings, arrays and step/section objects depth-first.
  const collect = (entry) => {
    if (typeof entry === "string") {
      steps.push(cleanString(entry));
    } else if (Array.isArray(entry)) {
      entry.forEach(collect);
    } else if (entry && typeof entry === "object") {
      if (entry.text) {
        collect(entry.text);
      } else if (entry.itemListElement) {
        // A section groups its own steps under `itemListElement`.
        collect(entry.itemListElement);
      }
    }
  };
  collect(value);
  return steps;
}
// ASCII spelling for every single-character vulgar fraction
// (U+00BC-U+00BE and U+2150-U+215E).
const VULGAR_FRACTIONS = {
  "\u00bc": "1/4",
  "\u00bd": "1/2",
  "\u00be": "3/4",
  "\u2150": "1/7",
  "\u2151": "1/9",
  "\u2152": "1/10",
  "\u2153": "1/3",
  "\u2154": "2/3",
  "\u2155": "1/5",
  "\u2156": "2/5",
  "\u2157": "3/5",
  "\u2158": "4/5",
  "\u2159": "1/6",
  "\u215a": "5/6",
  "\u215b": "1/8",
  "\u215c": "3/8",
  "\u215d": "5/8",
  "\u215e": "7/8",
};

/**
 * Cleans one ingredient line: rewrites vulgar-fraction glyphs as ASCII
 * fractions, strips HTML tags, applies the multi-space replacement and trims.
 *
 * A glyph glued to a preceding digit is a mixed number, so a space is inserted
 * between them: "1\u00bd cups" becomes "1 1/2 cups", not "11/2 cups".
 *
 * @param {string} line - Raw ingredient text.
 * @returns {string} Cleaned ingredient text.
 */
function cleanIngredientAmounts(line) {
  return line
    .replace(/(\d)?([\u00bc-\u00be\u2150-\u215e])/g, (match, wholeDigit, glyph) => {
      const fraction = VULGAR_FRACTIONS[glyph];
      return wholeDigit ? `${wholeDigit} ${fraction}` : fraction;
    })
    .replace(MATCH_HTML_TAGS, "")
    .replace(MATCH_MULTI_SPACE, " ")
    .trim();
}
/**
 * Normalises a `recipeIngredient` value into an array of cleaned lines.
 *
 * Accepted shapes:
 *  - a single string (treated as a one-item list);
 *  - an array/object whose entries are strings;
 *  - an array/object whose entries carry `properties.name` and/or
 *    `properties.amount` lists; the first amount and first name are joined
 *    as "amount name".
 * Entries of both kinds may be mixed. An entry that has an amount but no name
 * yields just the amount; entries with neither are skipped.
 *
 * @param {*} value - Raw ingredients value.
 * @returns {string[]} Cleaned ingredient lines (possibly empty).
 */
function transformIngredients(value) {
  if (typeof value === "string") {
    return [cleanIngredientAmounts(value)];
  }

  const lines = [];
  for (const item of Object.values(value ?? {})) {
    if (typeof item === "string") {
      lines.push(cleanIngredientAmounts(item));
      continue;
    }
    const { name, amount } = item?.properties ?? {};
    // Keep only the parts that exist so a missing one never prints "undefined".
    const parts = [amount?.[0], name?.[0]].filter(Boolean);
    if (parts.length > 0) {
      lines.push(cleanIngredientAmounts(parts.join(" ")));
    }
  }
  return lines;
}
// Lookup table from a consolidated recipe property name to the function that
// normalises its raw scraped value. prettifyRecipe indexes this table by key.
// Note that there is no entry for `url`.
var propertyTransformerMap = {
name: transformToString,
image: transformImage,
description: transformToCleanString,
// The three duration fields share one transformer.
cookTime: transformToTime,
prepTime: transformToTime,
totalTime: transformToTime,
recipeYield: transformToString,
recipeIngredients: transformIngredients,
recipeInstructions: transformInstructions,
// List-like fields: comma-separated strings are split into arrays.
recipeCategories: transformToList,
recipeCuisines: transformToList,
keywords: transformToList
};
// Name under which the table is referenced by the src/index.ts section below.
var propertyTransforrmer_default = propertyTransformerMap;
// src/index.ts
// Defaults that getRecipeData merges underneath the caller's options; both
// values are forwarded to axios.get as `maxRedirects` and `timeout`.
var DEFAULT_OPTIONS = {
maxRedirects: 5,
// 1e4 === 10000
timeout: 1e4
};
/**
 * Projects a raw recipe node onto the property names used by the rest of the
 * pipeline. Singular source names are mapped to plural output names
 * (recipeIngredient -> recipeIngredients, recipeCategory -> recipeCategories,
 * recipeCuisine -> recipeCuisines) and `thumbnailUrl` stands in for a missing
 * `image`. Every output key is always present, even when its value is undefined.
 *
 * @param {object} recipe - Raw recipe node.
 * @returns {object} Object with the consolidated keys in a fixed order.
 */
function consolidateRecipeProperties(recipe) {
  const {
    url,
    name,
    image,
    thumbnailUrl,
    description,
    cookTime,
    prepTime,
    totalTime,
    recipeYield,
    recipeIngredient,
    recipeInstructions,
    recipeCategory,
    recipeCuisine,
    keywords,
  } = recipe;

  return {
    url,
    name,
    image: image || thumbnailUrl,
    description,
    cookTime,
    prepTime,
    totalTime,
    recipeYield,
    recipeIngredients: recipeIngredient,
    recipeInstructions,
    recipeCategories: recipeCategory,
    recipeCuisines: recipeCuisine,
    keywords,
  };
}
/**
 * Builds the public recipe object from a raw recipe node.
 *
 * `url` is taken from the recipe itself when present, otherwise from the `url`
 * argument if that is a valid http(s) URL, otherwise it is undefined. Every
 * other consolidated property with a truthy value is passed through its
 * transformer from the transformer map.
 *
 * Properties that have no transformer (such as `url`, which is resolved
 * separately above) are skipped instead of being called as a function, so a
 * recipe that carries its own `url` no longer raises a TypeError.
 *
 * @param {object} recipe - Raw recipe node.
 * @param {*} url - Address the recipe was requested from; used as a fallback.
 * @returns {object} Transformed recipe.
 */
function prettifyRecipe(recipe, url) {
  const transformedRecipe = {
    url: recipe.url?.toString() || (isValidHttpUrl(url) ? url : void 0),
  };
  const consolidatedRecipe = consolidateRecipeProperties(recipe);
  for (const [key, value] of Object.entries(consolidatedRecipe)) {
    const propertyTransformer = propertyTransforrmer_default[key];
    if (value && typeof propertyTransformer === "function") {
      transformedRecipe[key] = propertyTransformer(value);
    }
  }
  return transformedRecipe;
}
/** Tells whether a JSON-LD node declares the Recipe type (string or array form). */
function isRecipeNode(node) {
  if (!node || typeof node !== "object") {
    return false;
  }
  const declared = node["@type"];
  const types = Array.isArray(declared) ? declared : [declared];
  return types.some(
    (type) => typeof type === "string" && (type === "Recipe" || type.endsWith("/Recipe"))
  );
}

/** Returns the first Recipe node of one parsed JSON-LD document, or undefined. */
function pickRecipeNode(data) {
  const candidates = Array.isArray(data) ? data : [data];
  for (const candidate of candidates) {
    if (isRecipeNode(candidate)) {
      return candidate;
    }
    const graph = candidate?.["@graph"];
    if (Array.isArray(graph)) {
      // Return the matching graph member itself, not the wrapper document.
      const node = graph.find(isRecipeNode);
      if (node) {
        return node;
      }
    }
  }
  return void 0;
}

/** Scans every JSON-LD script tag of the page for a Recipe node. */
function findJsonLdRecipe(html) {
  const $ = cheerio.load(html);
  const tags = $('script[type="application/ld+json"]');
  let recipe;
  for (let i = 0; i < tags.length; i++) {
    const textContent = $(tags[i]).text();
    if (!textContent) {
      continue;
    }
    let data;
    try {
      data = JSON.parse(textContent);
    } catch {
      // One malformed block must not hide a valid recipe in another block.
      continue;
    }
    // A later Recipe replaces an earlier one; a block without a Recipe
    // never clears what was already found.
    recipe = pickRecipeNode(data) ?? recipe;
  }
  return recipe;
}

/** Extracts the properties of the first microdata item typed as a Recipe. */
function findMicrodataRecipe(html) {
  const data = microdata.toJson(html);
  if (!data || !data.items || !data.items[0])
    throw new Error("HTML tags provided has no valid recipe schema");
  const recipeData = Object.values(data.items).find((item) => item?.type?.[0]?.includes("Recipe"));
  if (!recipeData?.properties)
    throw new Error("Recipe not found on page");
  return recipeData.properties;
}

/**
 * Fetches a page (or uses caller-supplied HTML) and extracts its recipe.
 *
 * JSON-LD script tags are searched first; when they contain no Recipe the
 * page's microdata is tried. The result is transformed with prettifyRecipe and
 * must have a non-empty name and at least one ingredient.
 *
 * @param {string|URL|object} input - Page URL, or an options object.
 *   NOTE(review): for an options object the address is read from `input.url`;
 *   the previous code used the object itself as the URL, which could never be
 *   valid. Confirm `url` is the intended option name.
 * @param {object} [inputOptions={}] - Options when `input` is a URL:
 *   `html` (markup used when the request fails or no valid URL is given),
 *   `lang` (sent as Accept-Language), `timeout`, `maxRedirects`.
 * @returns {Promise<object>} The transformed recipe.
 * @throws {Error} "Url must start with http:// or https://" when there is
 *   neither a valid URL nor `html`; the request error's message (with the
 *   original error as `cause`) when fetching fails and no `html` was given;
 *   "HTML tags provided has no valid recipe schema" / "Recipe not found on
 *   page" when nothing can be extracted; "Recipe is not valid" when the
 *   extracted recipe fails schema validation.
 */
async function getRecipeData(input, inputOptions = {}) {
  let siteUrl = input;
  let callerOptions = inputOptions;
  if (input instanceof URL) {
    siteUrl = input.href;
  } else if (input !== null && typeof input === "object") {
    callerOptions = input;
    siteUrl = input.url;
  }
  const options = { ...DEFAULT_OPTIONS, ...callerOptions };

  const hasValidUrl = isValidHttpUrl(siteUrl);
  if (!hasValidUrl && !options.html)
    throw new Error("Url must start with http:// or https://");

  // Caller-supplied markup is the fallback; a successful download replaces it.
  let html = options.html;
  if (hasValidUrl) {
    try {
      const response = await axios.get(siteUrl, {
        responseType: "text",
        headers: {
          "Accept-Language": options.lang
        },
        timeout: options.timeout,
        maxRedirects: options.maxRedirects
      });
      html = response.data;
    } catch (err) {
      if (!options.html) {
        const message = err instanceof Error ? err.message : "Unknown error";
        throw new Error(message, { cause: err });
      }
    }
  }

  let recipe;
  try {
    recipe = findJsonLdRecipe(html);
  } catch {
    // Best effort: if the JSON-LD scan itself fails, microdata is tried next.
    recipe = void 0;
  }
  if (!recipe) {
    recipe = findMicrodataRecipe(html);
  }

  const prettifiedRecipe = prettifyRecipe(recipe, siteUrl);
  const validation = validate(
    { name: prettifiedRecipe.name, recipeIngredients: prettifiedRecipe.recipeIngredients },
    schema_default
  );
  if (!validation.valid)
    throw new Error("Recipe is not valid");
  return prettifiedRecipe;
}
export {
getRecipeData as default
};