UNPKG

@dimfu/recipe-scraper

Version:

Extract recipe data from the web effortlessly

261 lines (256 loc) 8.09 kB
// src/index.ts
import * as cheerio from "cheerio";
import axios from "axios";
import { validate } from "jsonschema";
import microdata from "microdata-node";
import { parse } from "iso8601-duration";

// src/schema.json
/** Minimal JSON schema a scraped recipe must satisfy before it is returned. */
const schema_default = {
  $schema: "http://json-schema.org/draft-07/schema#",
  description: "A recipe scraped from the web",
  type: "object",
  required: ["name", "recipeIngredients"],
  properties: {
    name: { type: "string", minLength: 1 },
    recipeIngredients: { type: "array", minItems: 1, items: { type: "string" } },
  },
};

// src/utils.ts
const MATCH_HTML_TAGS = /<[^>]*>?/gm;
const MATCH_LINE_BREAK = /(\r\n|\n|\r)/gm;
const MATCH_MULTI_SPACE = /&nbsp;|\s\s+/gm;

/**
 * Tells whether a value parses as an absolute http(s) URL.
 *
 * @param {*} string - Candidate URL (anything `new URL()` accepts or rejects).
 * @returns {boolean} `true` only for parseable URLs whose protocol is http: or https:.
 */
function isValidHttpUrl(string) {
  let url;
  try {
    url = new URL(string);
  } catch (_) {
    return false;
  }
  return url.protocol === "http:" || url.protocol === "https:";
}

// src/propertyTransforrmer.ts
/**
 * Reduces an image value to a single URL string where possible.
 *
 * Accepts a plain string, an object with a `url` field, an object wrapping
 * its fields under `properties` (the shape the microdata branch of this file
 * reads), or an array of any of those, in which case the first entry is used.
 *
 * @param {*} value - Raw image value.
 * @returns {*} The URL string, `undefined` for null/empty input, or the value
 *   unchanged when no URL can be located in it.
 */
function transformImage(value) {
  if (value == null) return undefined;
  if (typeof value === "string") return value;
  // Recurse so that an array of image objects yields a URL, not the object itself.
  if (Array.isArray(value)) return transformImage(value[0]);
  if (value.url) return transformImage(value.url);
  if (value.properties?.url) return transformImage(value.properties.url);
  return value;
}

/**
 * Normalises a value into a list.
 *
 * @param {*} value - A string (optionally comma separated), an array, or anything else.
 * @returns {*} Trimmed parts for a comma separated string, a one-element array
 *   for any other string, and the value itself otherwise.
 */
function transformToList(value) {
  if (typeof value === "string") {
    if (value.includes(",")) return value.split(",").map((item) => item.trim());
    return [value];
  }
  return value;
}

/**
 * Picks a single value: strings pass through, arrays yield their first entry.
 *
 * @param {*} value - Raw value.
 * @returns {*} The string, the first array entry, or the value unchanged.
 */
function transformToString(value) {
  if (typeof value === "string") return value;
  if (Array.isArray(value)) return value[0];
  return value;
}

/**
 * Renders a parsed duration as text such as "1 hour 30 minutes".
 *
 * Only `days`, `hours`, `minutes` and `seconds` are read; zero or missing
 * units are omitted.
 *
 * @param {{days?: number, hours?: number, minutes?: number, seconds?: number}} dateObj
 * @returns {string} Space separated units, or "" when every unit is zero/missing.
 */
function transformISOToString(dateObj) {
  const units = [
    ["days", "day"],
    ["hours", "hour"],
    ["minutes", "minute"],
    ["seconds", "second"],
  ];
  return units
    .filter(([plural]) => dateObj[plural])
    .map(([plural, singular]) => `${dateObj[plural]} ${dateObj[plural] > 1 ? plural : singular}`)
    .join(" ");
}

/**
 * Converts a duration value into readable text.
 *
 * @param {*} value - Duration string, or an array whose first entry is one.
 * @returns {*} Readable duration when `parse` accepts the value, otherwise the
 *   picked value unchanged.
 */
function transformToTime(value) {
  const time = transformToString(value);
  try {
    const parsedISODuration = parse(time);
    if (parsedISODuration) return transformISOToString(parsedISODuration);
  } catch {
    // Deliberate fallback: when `parse` rejects the value, keep it as written.
  }
  return time;
}

/**
 * Strips HTML tags, turns line breaks into spaces, collapses repeated
 * whitespace and `&nbsp;`, then trims.
 *
 * @param {string} str - Text to clean.
 * @returns {string} Cleaned text.
 */
function cleanString(str) {
  return str
    .replace(MATCH_HTML_TAGS, "")
    .replace(MATCH_LINE_BREAK, " ")
    .replace(MATCH_MULTI_SPACE, " ")
    .trim();
}

/**
 * Picks a single value (see `transformToString`) and cleans it when it is text.
 *
 * @param {*} value - Raw value.
 * @returns {*} Cleaned string, or the picked value unchanged when it is not a string.
 */
function transformToCleanString(value) {
  const text = transformToString(value);
  return typeof text === "string" ? cleanString(text) : text;
}

/**
 * Recursively gathers instruction lines from one step.
 *
 * Handles strings, arrays, objects exposing `text`, objects grouping steps
 * under `itemListElement`, and objects wrapping their fields under `properties`.
 *
 * @param {*} step - One instruction entry.
 * @returns {string[]} Cleaned lines; empty when nothing usable is found.
 */
function collectInstructionLines(step) {
  if (typeof step === "string") return [cleanString(step)];
  if (Array.isArray(step)) return step.flatMap(collectInstructionLines);
  if (step && typeof step === "object") {
    if (step.text) return collectInstructionLines(step.text);
    if (step.itemListElement) return collectInstructionLines(step.itemListElement);
    if (step.properties) return collectInstructionLines(step.properties);
  }
  return [];
}

/**
 * Normalises recipe instructions into a list of cleaned lines.
 *
 * @param {*} value - A string (split on ".," when present), an array of
 *   strings/step objects, or a single step object.
 * @returns {string[]|undefined} Instruction lines, or `undefined` when a
 *   non-array, non-string value yields no lines.
 */
function transformInstructions(value) {
  if (typeof value === "string") {
    const cleanedValue = cleanString(value);
    if (cleanedValue.includes(".,")) return cleanedValue.split(".,").map((item) => item.trim());
    return [cleanedValue];
  }
  if (Array.isArray(value)) return collectInstructionLines(value);
  const lines = collectInstructionLines(value);
  return lines.length ? lines : undefined;
}

/** Single-character fractions and their ASCII spelling. */
const VULGAR_FRACTIONS = {
  "¼": "1/4",
  "½": "1/2",
  "¾": "3/4",
  "⅓": "1/3",
  "⅔": "2/3",
  "⅕": "1/5",
  "⅖": "2/5",
  "⅗": "3/5",
  "⅘": "4/5",
  "⅙": "1/6",
  "⅚": "5/6",
  "⅛": "1/8",
  "⅜": "3/8",
  "⅝": "5/8",
  "⅞": "7/8",
};

// Optionally captures a digit glued to the fraction so "1½" is not turned into "11/2".
const MATCH_VULGAR_FRACTION = new RegExp(`(\\d)?([${Object.keys(VULGAR_FRACTIONS).join("")}])`, "g");

/**
 * Cleans one ingredient line: spells out single-character fractions, strips
 * HTML tags, collapses repeated whitespace and trims.
 *
 * @param {string} line - Raw ingredient text.
 * @returns {string} Cleaned ingredient text, e.g. "1½ cups" -> "1 1/2 cups".
 */
function cleanIngredientAmounts(line) {
  return line
    .replace(MATCH_VULGAR_FRACTION, (_match, digit, fraction) =>
      digit === undefined ? VULGAR_FRACTIONS[fraction] : `${digit} ${VULGAR_FRACTIONS[fraction]}`,
    )
    .replace(MATCH_HTML_TAGS, "")
    .replace(MATCH_MULTI_SPACE, " ")
    .trim();
}

/**
 * Returns the first entry of an array, or the value itself when it is not an array.
 *
 * @param {*} value
 * @returns {*}
 */
function firstOf(value) {
  return Array.isArray(value) ? value[0] : value;
}

/**
 * Normalises ingredients into a list of cleaned strings.
 *
 * @param {*} value - A single string, an array of strings, or a collection of
 *   items that expose `properties.name` / `properties.amount`.
 * @returns {string[]} Ingredient lines; empty when none can be extracted.
 */
function transformIngredients(value) {
  // A lone string is indexable, so it must be handled before the array check.
  if (typeof value === "string") return [cleanIngredientAmounts(value)];
  if (Array.isArray(value) && typeof value[0] === "string") {
    return value.filter((item) => typeof item === "string").map((item) => cleanIngredientAmounts(item));
  }
  const mappedItems = [];
  for (const item of Object.values(value)) {
    const properties = item?.properties;
    if (!properties) continue;
    // Keep only printable parts so a missing name never renders as "undefined".
    const singleLine = [firstOf(properties.amount), firstOf(properties.name)]
      .filter((part) => (typeof part === "string" && part !== "") || typeof part === "number")
      .join(" ");
    if (singleLine) mappedItems.push(cleanIngredientAmounts(singleLine));
  }
  return mappedItems;
}

/** Maps each output property to the function that normalises its raw value. */
const propertyTransformerMap = {
  name: transformToString,
  image: transformImage,
  description: transformToCleanString,
  cookTime: transformToTime,
  prepTime: transformToTime,
  totalTime: transformToTime,
  recipeYield: transformToString,
  recipeIngredients: transformIngredients,
  recipeInstructions: transformInstructions,
  recipeCategories: transformToList,
  recipeCuisines: transformToList,
  keywords: transformToList,
};

// src/index.ts
/** Request defaults; caller supplied options override them. */
const DEFAULT_OPTIONS = {
  maxRedirects: 5,
  timeout: 1e4,
};

/**
 * Copies the recipe fields this module cares about onto its own property names.
 *
 * @param {object} recipe - Raw recipe record.
 * @returns {object} Record keyed by this module's output property names.
 */
function consolidateRecipeProperties(recipe) {
  return {
    url: recipe.url,
    name: recipe.name,
    image: recipe.image || recipe.thumbnailUrl,
    description: recipe.description,
    cookTime: recipe.cookTime,
    prepTime: recipe.prepTime,
    totalTime: recipe.totalTime,
    recipeYield: recipe.recipeYield,
    recipeIngredients: recipe.recipeIngredient,
    recipeInstructions: recipe.recipeInstructions,
    recipeCategories: recipe.recipeCategory,
    recipeCuisines: recipe.recipeCuisine,
    keywords: recipe.keywords,
  };
}

/**
 * Builds the normalised recipe object from a raw recipe record.
 *
 * @param {object} recipe - Raw recipe record.
 * @param {*} url - Address the page was requested from; used as `url` when the
 *   record has none and this is a valid http(s) URL.
 * @returns {object} Normalised recipe; properties with falsy raw values are omitted.
 */
function prettifyRecipe(recipe, url) {
  const transformedRecipe = {};
  const consolidatedRecipe = consolidateRecipeProperties(recipe);
  const ownUrl = transformToString(recipe.url);
  if (typeof ownUrl === "string" && ownUrl) transformedRecipe.url = ownUrl;
  else transformedRecipe.url = isValidHttpUrl(url) ? url : undefined;
  for (const [key, value] of Object.entries(consolidatedRecipe)) {
    const propertyTransformer = propertyTransformerMap[key];
    // `url` has no transformer and was resolved above; calling a missing one would throw.
    if (typeof propertyTransformer !== "function" || !value) continue;
    transformedRecipe[key] = propertyTransformer(value);
  }
  return transformedRecipe;
}

/**
 * Tells whether a JSON-LD `@type` value names a Recipe.
 *
 * @param {*} type - A type string or an array of type strings.
 * @returns {boolean}
 */
function isRecipeType(type) {
  if (Array.isArray(type)) return type.some(isRecipeType);
  // Accept the bare name as well as prefixed or URL forms ending in "Recipe".
  return typeof type === "string" && /(^|[/:#])Recipe$/.test(type);
}

/**
 * Searches a parsed JSON-LD value for the first node typed as Recipe.
 *
 * Looks at the node itself, at the entries of a top-level array, and inside
 * `@graph` arrays.
 *
 * @param {*} node - Parsed JSON-LD value.
 * @returns {object|undefined} The matching node, if any.
 */
function findRecipeNode(node) {
  if (Array.isArray(node)) {
    for (const entry of node) {
      const match = findRecipeNode(entry);
      if (match) return match;
    }
    return undefined;
  }
  if (!node || typeof node !== "object") return undefined;
  if (isRecipeType(node["@type"])) return node;
  return Array.isArray(node["@graph"]) ? findRecipeNode(node["@graph"]) : undefined;
}

/**
 * Looks for a Recipe inside the page's `application/ld+json` script tags.
 *
 * @param {string} html - Page markup.
 * @returns {object|undefined} The first recipe node found, or `undefined` so
 *   the caller can fall back to microdata.
 */
function findJsonLdRecipe(html) {
  try {
    const $ = cheerio.load(html);
    const tags = $('script[type="application/ld+json"]');
    for (let i = 0; i < tags.length; i++) {
      const textContent = $(tags[i]).text();
      if (!textContent.trim()) continue;
      let data;
      try {
        data = JSON.parse(textContent);
      } catch {
        // One malformed block must not hide a valid recipe in another tag.
        continue;
      }
      const match = findRecipeNode(data);
      if (match) return match;
    }
  } catch {
    // Best effort: any failure here means "not found via JSON-LD".
  }
  return undefined;
}

/**
 * Looks for a Recipe among the page's microdata items.
 *
 * @param {string} html - Page markup.
 * @returns {object} The `properties` of the first item whose type mentions "Recipe".
 * @throws {Error} When the page has no microdata items, or none is a recipe
 *   with properties.
 */
function findMicrodataRecipe(html) {
  const data = microdata.toJson(html);
  if (!data || !data.items || !data.items[0]) throw new Error("HTML tags provided has no valid recipe schema");
  const recipeData = Object.values(data.items).find(
    (item) => Array.isArray(item?.type) && item.type.some((type) => typeof type === "string" && type.includes("Recipe")),
  );
  if (!recipeData?.properties) throw new Error("Recipe not found on page");
  return recipeData.properties;
}

/**
 * Obtains the markup to scrape: the fetched page when a valid URL is given,
 * otherwise (or when the request fails) the caller supplied `options.html`.
 *
 * @param {*} siteUrl - Target address.
 * @param {boolean} hasValidUrl - Whether `siteUrl` passed `isValidHttpUrl`.
 * @param {object} options - Merged options (`html`, `lang`, `timeout`, `maxRedirects`).
 * @returns {Promise<*>} Response body or `options.html`.
 * @throws {Error} When the request fails and no `options.html` is available.
 */
async function fetchHtml(siteUrl, hasValidUrl, options) {
  // Without a usable URL the request could only fail, so go straight to the fallback.
  if (!hasValidUrl) return options.html;
  try {
    const response = await axios.get(siteUrl, {
      responseType: "text",
      headers: options.lang ? { "Accept-Language": options.lang } : {},
      timeout: options.timeout,
      maxRedirects: options.maxRedirects,
    });
    return response.data;
  } catch (err) {
    if (options.html) return options.html;
    const message = err instanceof Error ? err.message : "Unknown error";
    throw new Error(message, { cause: err });
  }
}

/**
 * Scrapes a recipe from a web page, trying JSON-LD first and microdata second.
 *
 * @param {string|URL|object} input - Page URL, or an options object.
 *   NOTE(review): in the options form the address is read from `input.url`;
 *   that key name is assumed here, so confirm it against the package's public
 *   typings.
 * @param {object} [inputOptions={}] - Options used when `input` is a URL:
 *   `html` (markup used when the URL is missing or the request fails), `lang`
 *   (sent as Accept-Language), `timeout`, `maxRedirects`.
 * @returns {Promise<object>} The normalised recipe.
 * @throws {Error} When neither a valid http(s) URL nor `html` is supplied, the
 *   request fails with no `html` fallback, no recipe is found, or the recipe
 *   lacks a name or ingredients.
 */
async function getRecipeData(input, inputOptions = {}) {
  let siteUrl = input;
  let callerOptions = inputOptions;
  if (input instanceof URL) {
    siteUrl = input.href;
  } else if (typeof input === "object" && input !== null) {
    callerOptions = input;
    siteUrl = input.url;
  }
  const options = { ...DEFAULT_OPTIONS, ...callerOptions };
  const hasValidUrl = isValidHttpUrl(siteUrl);
  if (!hasValidUrl && !options.html) throw new Error("Url must start with http:// or https://");

  const html = await fetchHtml(siteUrl, hasValidUrl, options);
  const recipe = findJsonLdRecipe(html) ?? findMicrodataRecipe(html);

  const prettifiedRecipe = prettifyRecipe(recipe, siteUrl);
  const response = validate(
    {
      name: prettifiedRecipe.name,
      recipeIngredients: prettifiedRecipe.recipeIngredients,
    },
    schema_default,
  );
  if (!response.valid) throw new Error("Recipe is not valid");
  return prettifiedRecipe;
}

export { getRecipeData as default };