UNPKG

@dimfu/recipe-scraper

Version: 0.3.0

Extract recipe data from the web effortlessly

369 lines (360 loc) 11.5 kB
#!/usr/bin/env node
"use strict";

// ---------------------------------------------------------------------------
// Bundler-generated CommonJS/ESM interop helpers (left as emitted).
// ---------------------------------------------------------------------------
var __create = Object.create;
var __defProp = Object.defineProperty;
var __getOwnPropDesc = Object.getOwnPropertyDescriptor;
var __getOwnPropNames = Object.getOwnPropertyNames;
var __getProtoOf = Object.getPrototypeOf;
var __hasOwnProp = Object.prototype.hasOwnProperty;
var __copyProps = (to, from, except, desc) => {
  if (from && typeof from === "object" || typeof from === "function") {
    for (let key of __getOwnPropNames(from))
      if (!__hasOwnProp.call(to, key) && key !== except)
        __defProp(to, key, { get: () => from[key], enumerable: !(desc = __getOwnPropDesc(from, key)) || desc.enumerable });
  }
  return to;
};
var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__getProtoOf(mod)) : {}, __copyProps(
  // If the importer is in node compatibility mode or this is not an ESM
  // file that has been converted to a CommonJS file using a Babel-
  // compatible transform (i.e. "__esModule" has not been set), then set
  // "default" to the CommonJS "module.exports" for node compatibility.
  isNodeMode || !mod || !mod.__esModule ? __defProp(target, "default", { value: mod, enumerable: true }) : target,
  mod
));

// src/cli.ts
const import_commander = require("commander");

// package.json
const package_default = {
  name: "@dimfu/recipe-scraper",
  version: "0.3.0",
  description: "Extract recipe data from the web effortlessly",
  publishConfig: {
    access: "public"
  },
  license: "MIT",
  homepage: "https://github.com/dimfu/recipe-scraper",
  repository: {
    type: "git",
    url: "git+https://github.com/dimfu/recipe-scraper.git"
  },
  bugs: {
    url: "https://github.com/dimfu/recipe-scraper/issues"
  },
  keywords: [
    "recipe",
    "food-recipe",
    "scraper",
    "json-ld",
    "jsonld",
    "micro-data",
    "microdata"
  ],
  exports: {
    require: "./dist/index.js",
    import: "./dist/index.mjs"
  },
  main: "./dist/index.js",
  module: "./dist/index.mjs",
  types: "./dist/index.d.ts",
  bin: "./dist/cli.js",
  files: [
    "dist"
  ],
  scripts: {
    "build-fast": "tsup src/index.ts --format cjs,esm && tsup src/cli.ts",
    build: "pnpm run build-fast --dts-resolve",
    prepublishOnly: "pnpm run build"
  },
  devDependencies: {
    "@antfu/eslint-config": "^0.39.4",
    "@types/node": "^20.2.5",
    eslint: "^8.41.0",
    "lint-staged": "^13.2.2",
    "simple-git-hooks": "^2.8.1",
    tsup: "6.6.3",
    typescript: "4.9.5"
  },
  "simple-git-hooks": {
    "pre-commit": "pnpm lint-staged"
  },
  "lint-staged": {
    "*": "eslint --fix"
  },
  dependencies: {
    axios: "^1.4.0",
    cheerio: "1.0.0-rc.12",
    commander: "^10.0.1",
    "iso8601-duration": "^2.1.1",
    jsonschema: "^1.4.1",
    "microdata-node": "^2.0.0",
    "schema-dts": "^1.1.2"
  }
};

// src/index.ts
const cheerio = __toESM(require("cheerio"));
const import_axios = __toESM(require("axios"));
const import_jsonschema = require("jsonschema");
const import_microdata_node = __toESM(require("microdata-node"));

// src/schema.json
const schema_default = {
  $schema: "http://json-schema.org/draft-07/schema#",
  description: "A recipe scraped from the web",
  type: "object",
  required: ["name", "recipeIngredients"],
  properties: {
    name: {
      type: "string",
      minLength: 1
    },
    recipeIngredients: {
      type: "array",
      minItems: 1,
      items: {
        type: "string"
      }
    }
  }
};

// src/utils.ts
const MATCH_HTML_TAGS = /<[^>]*>?/gm;
const MATCH_LINE_BREAK = /(\r\n|\n|\r)/gm;
const MATCH_MULTI_SPACE = /&nbsp;|\s\s+/gm;

/**
 * Tells whether `string` parses as an absolute http(s) URL.
 *
 * @param {*} string - Candidate URL; anything `new URL()` rejects yields false.
 * @returns {boolean} True only for the `http:` and `https:` protocols.
 */
function isValidHttpUrl(string) {
  let url;
  try {
    url = new URL(string);
  } catch (_) {
    return false;
  }
  return url.protocol === "http:" || url.protocol === "https:";
}

// src/propertyTransforrmer.ts
const import_iso8601_duration = require("iso8601-duration");

/**
 * Reduces an image value to a single URL string where possible.
 * Accepts a string, an object carrying `url`, or an array of either; arrays
 * resolve to their first element (recursively, so an array of objects with
 * `url` yields a string instead of an object).
 *
 * @param {*} value - Raw image property.
 * @returns {*} The URL string, or the value unchanged when no URL is found.
 */
function transformImage(value) {
  if (value == null || typeof value === "string")
    return value;
  if (Array.isArray(value))
    return transformImage(value[0]);
  if (value.url)
    return transformImage(value.url);
  return value;
}

/**
 * Normalises a property into a list. Comma-separated strings are split and
 * trimmed (empty entries from stray commas are dropped); arrays and any
 * other value are returned unchanged.
 *
 * @param {*} value - Raw property value.
 * @returns {*} An array for string input, otherwise `value` itself.
 */
function transformToList(value) {
  if (typeof value === "string") {
    if (value.includes(","))
      return value.split(",").map((item) => item.trim()).filter(Boolean);
    return [value];
  }
  return value;
}

/**
 * Picks the first element of an array, otherwise returns the value as is.
 *
 * @param {*} value - Raw property value.
 * @returns {*} `value[0]` for arrays, `value` for everything else.
 */
function transformToString(value) {
  if (Array.isArray(value))
    return value[0];
  return value;
}

// Plural/singular labels for the duration fields, largest unit first.
const DURATION_UNITS = [
  ["years", "year"],
  ["months", "month"],
  ["weeks", "week"],
  ["days", "day"],
  ["hours", "hour"],
  ["minutes", "minute"],
  ["seconds", "second"]
];

/**
 * Formats a parsed duration object as readable text, e.g.
 * `{ hours: 1, minutes: 30 }` -> `"1 hour 30 minutes"`.
 * Zero/missing units are skipped; the plural label is used when the amount
 * is greater than 1.
 *
 * @param {Object} dateObj - Object with numeric unit fields (`days`, `hours`, ...).
 * @returns {string} Space-separated units, or "" when every unit is empty.
 */
function transformISOToString(dateObj) {
  return DURATION_UNITS
    .filter(([plural]) => dateObj[plural])
    .map(([plural, singular]) => `${dateObj[plural]} ${dateObj[plural] > 1 ? plural : singular}`)
    .join(" ");
}

/**
 * Converts an ISO 8601 duration into readable text.
 *
 * @param {*} value - Raw time property (string or array whose first item is used).
 * @returns {*} Readable duration, or the raw value when it is not a parsable duration.
 */
function transformToTime(value) {
  const time = transformToString(value);
  try {
    const parsedISODuration = (0, import_iso8601_duration.parse)(time);
    if (parsedISODuration)
      return transformISOToString(parsedISODuration);
  } catch {
    // Deliberate best-effort: not an ISO 8601 duration, so the raw value is returned below.
  }
  return time;
}

/**
 * Strips HTML tags, turns line breaks into spaces, collapses whitespace
 * runs / `&nbsp;` and trims.
 *
 * @param {string} str - Text to clean.
 * @returns {string} Cleaned text.
 */
function cleanString(str) {
  return str.replace(MATCH_HTML_TAGS, "").replace(MATCH_LINE_BREAK, " ").replace(MATCH_MULTI_SPACE, " ").trim();
}

/**
 * `transformToString` followed by `cleanString`. Non-string results are
 * returned untouched instead of throwing on `.replace`.
 *
 * @param {*} value - Raw property value.
 * @returns {*} Cleaned string, or the unwrapped value if it is not a string.
 */
function transformToCleanString(value) {
  const text = transformToString(value);
  return typeof text === "string" ? cleanString(text) : text;
}

/**
 * Collects the cleaned step texts held by one instruction entry.
 * Handles plain strings, objects with `text`, objects grouping steps under
 * `itemListElement`, and items that nest those fields under `properties`.
 *
 * @param {*} item - One entry of the instructions value.
 * @returns {string[]} Zero or more cleaned step strings.
 */
function collectInstructionTexts(item) {
  if (typeof item === "string")
    return [cleanString(item)];
  if (!item || typeof item !== "object")
    return [];
  const nestedSteps = item.itemListElement ?? item.properties?.itemListElement;
  if (nestedSteps)
    return [].concat(nestedSteps).flatMap(collectInstructionTexts);
  const text = transformToString(item.text ?? item.properties?.text);
  return typeof text === "string" ? [cleanString(text)] : [];
}

/**
 * Normalises recipe instructions into a flat list of cleaned strings.
 * A single string is split on ".," when that separator is present.
 *
 * @param {*} value - String, step object, or array of strings/step objects.
 * @returns {string[]} Instruction steps (entries without text are omitted).
 */
function transformInstructions(value) {
  if (typeof value === "string") {
    const cleanedValue = cleanString(value);
    if (cleanedValue.includes(".,"))
      return cleanedValue.split(".,").map((item) => item.trim());
    return [cleanedValue];
  }
  return [].concat(value).flatMap(collectInstructionTexts);
}

// ASCII spellings for single-glyph fractions commonly found in ingredient lines.
const VULGAR_FRACTIONS = {
  "¼": "1/4",
  "½": "1/2",
  "¾": "3/4",
  "⅓": "1/3",
  "⅔": "2/3",
  "⅛": "1/8",
  "⅜": "3/8",
  "⅝": "5/8",
  "⅞": "7/8"
};
const MATCH_VULGAR_FRACTION = /(\d?)([¼½¾⅓⅔⅛⅜⅝⅞])/g;

/**
 * Rewrites fraction glyphs as ASCII ("½" -> "1/2"), strips HTML tags and
 * collapses whitespace. A digit directly before a glyph is kept apart from
 * it so "1½" becomes "1 1/2" rather than "11/2".
 *
 * @param {string} line - One ingredient line.
 * @returns {string} Cleaned ingredient line.
 */
function cleanIngredientAmounts(line) {
  return line
    .replace(MATCH_VULGAR_FRACTION, (_, whole, glyph) => `${whole ? `${whole} ` : ""}${VULGAR_FRACTIONS[glyph]}`)
    .replace(MATCH_HTML_TAGS, "")
    .replace(MATCH_MULTI_SPACE, " ")
    .trim();
}

/**
 * Normalises ingredients into a list of cleaned strings.
 * Accepts a single string, an array of strings, or a collection of items
 * exposing `properties.name` / `properties.amount`.
 *
 * @param {*} value - Raw ingredients value.
 * @returns {string[]} Cleaned ingredient lines; empty when nothing usable is found.
 */
function transformIngredients(value) {
  if (typeof value === "string")
    return [cleanIngredientAmounts(value)];
  if (!value || typeof value !== "object")
    return [];
  if (typeof value[0] === "string")
    return value.filter((item) => typeof item === "string").map((item) => cleanIngredientAmounts(item));
  const mappedItems = [];
  for (const item of Object.values(value)) {
    if (!item?.properties)
      continue;
    const { name: name2, amount } = item.properties;
    // Join only the parts that exist so a missing name never renders as "undefined".
    const singleLine = [transformToString(amount), transformToString(name2)].filter(Boolean).join(" ");
    if (singleLine)
      mappedItems.push(cleanIngredientAmounts(singleLine));
  }
  return mappedItems;
}

// Maps each output property to the function that normalises its raw value.
const propertyTransformerMap = {
  name: transformToString,
  image: transformImage,
  description: transformToCleanString,
  cookTime: transformToTime,
  prepTime: transformToTime,
  totalTime: transformToTime,
  recipeYield: transformToString,
  recipeIngredients: transformIngredients,
  recipeInstructions: transformInstructions,
  recipeCategories: transformToList,
  recipeCuisines: transformToList,
  keywords: transformToList
};
const propertyTransforrmer_default = propertyTransformerMap;

// src/index.ts
const DEFAULT_OPTIONS = {
  maxRedirects: 5,
  timeout: 1e4
};

/**
 * Maps the raw recipe's property names onto the output property names
 * (e.g. `recipeIngredient` -> `recipeIngredients`).
 *
 * @param {Object} recipe - Raw recipe node.
 * @returns {Object} Object keyed by output property name, values still raw.
 */
function consolidateRecipeProperties(recipe) {
  return {
    url: recipe.url,
    name: recipe.name,
    image: recipe.image || recipe.thumbnailUrl,
    description: recipe.description,
    cookTime: recipe.cookTime,
    prepTime: recipe.prepTime,
    totalTime: recipe.totalTime,
    recipeYield: recipe.recipeYield,
    recipeIngredients: recipe.recipeIngredient,
    recipeInstructions: recipe.recipeInstructions,
    recipeCategories: recipe.recipeCategory,
    recipeCuisines: recipe.recipeCuisine,
    keywords: recipe.keywords
  };
}

/**
 * Builds the normalised recipe object from a raw recipe node.
 *
 * @param {Object} recipe - Raw recipe node.
 * @param {*} url - Requested page URL, used when the recipe carries no URL of its own.
 * @returns {Object} Recipe with `url` plus every truthy property run through its transformer.
 */
function prettifyRecipe(recipe, url) {
  const transformedRecipe = {};
  const consolidatedRecipe = consolidateRecipeProperties(recipe);
  const ownUrl = transformToString(recipe.url);
  transformedRecipe.url = typeof ownUrl === "string" && ownUrl ? ownUrl : isValidHttpUrl(url) ? url : undefined;
  for (const [key, value] of Object.entries(consolidatedRecipe)) {
    const propertyTransformer = propertyTransforrmer_default[key];
    // `url` has no transformer and is already resolved above; calling the
    // missing transformer would throw a TypeError for recipes that set `url`.
    if (value && typeof propertyTransformer === "function")
      transformedRecipe[key] = propertyTransformer(value);
  }
  return transformedRecipe;
}

/**
 * Tells whether a JSON-LD node declares the Recipe type, either as a string
 * or inside an array of types.
 *
 * @param {*} node - Parsed JSON-LD value.
 * @returns {boolean} True when `@type` is or contains "Recipe".
 */
function isRecipeNode(node) {
  if (!node || typeof node !== "object")
    return false;
  const matches = (type) => typeof type === "string" && (type === "Recipe" || type.endsWith("/Recipe"));
  const type = node["@type"];
  return Array.isArray(type) ? type.some(matches) : matches(type);
}

/**
 * Looks for a Recipe node inside one parsed JSON-LD document: the document
 * itself, the entries of a top-level array, or the entries of `@graph`.
 *
 * @param {*} data - Result of `JSON.parse` on a script tag's text.
 * @returns {Object|undefined} The Recipe node itself, or undefined.
 */
function pickJsonLdRecipe(data) {
  const candidates = Array.isArray(data) ? data : [data];
  for (const candidate of candidates) {
    if (isRecipeNode(candidate))
      return candidate;
    const graph = candidate?.["@graph"];
    if (Array.isArray(graph)) {
      const fromGraph = graph.find(isRecipeNode);
      if (fromGraph)
        return fromGraph;
    }
  }
  return undefined;
}

/**
 * Scans every `application/ld+json` script tag and returns the first Recipe
 * node found. Empty or unparsable tags are skipped so one broken tag does
 * not hide a valid recipe in another.
 *
 * @param {string} html - Page markup.
 * @returns {Object|undefined} Recipe node, or undefined when none is present.
 */
function findJsonLdRecipe(html) {
  let $;
  try {
    $ = cheerio.load(html);
  } catch {
    // Deliberate best-effort: unparsable markup means "no JSON-LD"; the caller tries microdata next.
    return undefined;
  }
  for (const tag of $('script[type="application/ld+json"]').toArray()) {
    const textContent = $(tag).text();
    if (!textContent)
      continue;
    let data;
    try {
      data = JSON.parse(textContent);
    } catch {
      // Invalid JSON in this tag; keep scanning the remaining tags.
      continue;
    }
    const found = pickJsonLdRecipe(data);
    if (found)
      return found;
  }
  return undefined;
}

/**
 * Extracts recipe properties from microdata markup.
 *
 * @param {string} html - Page markup.
 * @returns {Object} The `properties` of the first item whose type mentions "Recipe".
 * @throws {Error} When the page has no microdata items or no recipe item.
 */
function findMicrodataRecipe(html) {
  const data = import_microdata_node.default.toJson(html);
  if (!data || !data.items || !data.items[0])
    throw new Error("HTML tags provided has no valid recipe schema");
  const recipeData = Object.values(data.items).find(
    (item) => Array.isArray(item?.type) && item.type.some((type) => typeof type === "string" && type.includes("Recipe"))
  );
  if (!recipeData?.properties)
    throw new Error("Recipe not found on page");
  return recipeData.properties;
}

/**
 * Fetches a page (or uses supplied HTML) and extracts its recipe, trying
 * JSON-LD first and microdata second.
 *
 * @param {string|Object} input - Page URL, or an options object.
 *   NOTE(review): when an object is given its `url` field is used as the page
 *   URL — confirm against the public options type.
 * @param {Object} [inputOptions] - Options (`html`, `lang`, `timeout`,
 *   `maxRedirects`) used when `input` is a URL string.
 * @returns {Promise<Object>} The normalised recipe.
 * @throws {Error} On an invalid URL without `html`, on a failed request
 *   without `html`, when no recipe markup is found, or when the result lacks
 *   a name or ingredients.
 */
async function getRecipeData(input, inputOptions = {}) {
  const calledWithOptions = input !== null && typeof input === "object";
  const options = { ...DEFAULT_OPTIONS, ...(calledWithOptions ? input : inputOptions) };
  const siteUrl = calledWithOptions ? input.url : input;
  const hasValidUrl = isValidHttpUrl(siteUrl);
  if (!hasValidUrl && !options.html)
    throw new Error("Url must start with http:// or https://");

  let html;
  if (hasValidUrl) {
    try {
      const response = await import_axios.default.get(siteUrl, {
        responseType: "text",
        headers: options.lang ? { "Accept-Language": options.lang } : {},
        timeout: options.timeout,
        maxRedirects: options.maxRedirects
      });
      html = response.data;
    } catch (err) {
      // Supplied HTML acts as the fallback when the request fails.
      if (!options.html) {
        const message = err instanceof Error ? err.message : "Unknown error";
        throw new Error(message, { cause: err });
      }
      html = options.html;
    }
  } else {
    // No usable URL: skip the request that could only fail and use the supplied HTML.
    html = options.html;
  }

  const recipe = findJsonLdRecipe(html) ?? findMicrodataRecipe(html);
  const prettifiedRecipe = prettifyRecipe(recipe, siteUrl);
  const response = (0, import_jsonschema.validate)({
    name: prettifiedRecipe.name,
    recipeIngredients: prettifiedRecipe.recipeIngredients
  }, schema_default);
  if (!response.valid)
    throw new Error("Recipe is not valid");
  return prettifiedRecipe;
}

// src/cli.ts
const { name, description, version } = package_default;
import_commander.program
  .name(`npx ${name}`)
  .version(version)
  .allowExcessArguments(false)
  .arguments("<url>")
  .description(description, {
    url: "food recipe url"
  })
  .action(async (url) => {
    try {
      console.log(await getRecipeData(url));
    } catch (err) {
      console.error(err.message);
      process.exit(1);
    }
  });
import_commander.program.parseAsync().catch((e) => {
  console.error(e.message);
  process.exit(1);
});