UNPKG

@speed_of/imdbscraper

Version:

IMDb scraper for extracting movie reviews from IMDb pages.

129 lines (128 loc) 5.93 kB
"use strict";
// TypeScript interop shim: lets a CommonJS export be read through `.default`.
var __importDefault = (this && this.__importDefault) || function (mod) {
    return (mod && mod.__esModule) ? mod : { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.imdbScraper = void 0;
const axios_1 = __importDefault(require("axios"));

/** HTML entities recognised by `cleanHtmlContent`, mapped to their plain-text replacement. */
const ENTITY_REPLACEMENTS = {
    '&quot;': '"',
    '&apos;': "'",
    '&lt;': '<',
    '&gt;': '>',
    '&amp;': '&',
    '&#39;': "'",
    '&ndash;': '-',
    '&mdash;': '--',
    '&nbsp;': ' ',
    '&lsquo;': "'",
    '&rsquo;': "'",
    '&ldquo;': '"',
    '&rdquo;': '"',
};

/** Matches exactly the entities listed in `ENTITY_REPLACEMENTS`. */
const ENTITY_PATTERN = /&(?:quot|apos|lt|gt|amp|#39|ndash|mdash|nbsp|lsquo|rsquo|ldquo|rdquo);/g;

/**
 * Walks `keys` down from `root`, stopping with `undefined` as soon as an
 * intermediate value is `null` or `undefined` (same result as `root?.a?.b`).
 *
 * @param {*} root - Value to start from; may itself be nullish.
 * @param {...string} keys - Property names to follow in order.
 * @returns {*} The value found at the end of the path, or `undefined`.
 */
function dig(root, ...keys) {
    let current = root;
    for (const key of keys) {
        if (current === null || current === undefined) {
            return undefined;
        }
        current = current[key];
    }
    return current;
}

/**
 * Logs a request failure without dumping the response body.
 *
 * @param {string} label - First argument passed to `console.error`.
 * @param {*} error - The caught error; `error.response` is read when present.
 */
function reportFailure(label, error) {
    console.error(label, {
        message: error.message,
        status: dig(error.response, 'status'),
        data: dig(error.response, 'data') ? 'Response data available' : 'No response data',
    });
}

/**
 * Fetches IMDb reviews and title suggestions over HTTP (via axios) and
 * reshapes the responses into plain objects.
 */
class IMDbScraper {
    constructor() {
        this.baseUrl = 'https://www.imdb.com/title';
    }

    /**
     * Converts an HTML fragment into plain text.
     *
     * `<br>` tags become newlines, all other tags are dropped, the entities in
     * `ENTITY_REPLACEMENTS` are decoded, and whitespace is normalised while
     * line breaks are kept (at most one blank line in a row).
     *
     * @param {string} text - HTML fragment; any falsy value yields `''`.
     * @returns {string} The cleaned, trimmed text.
     */
    cleanHtmlContent(text) {
        if (!text) {
            return '';
        }
        return text
            // Replace BR tags with newlines.
            .replace(/<br\s*\/?>/gi, '\n')
            // Remove all remaining HTML tags.
            .replace(/<[^>]*>/g, '')
            // Decode entities in ONE pass so that escaped text such as
            // "&amp;#39;" becomes "&#39;" instead of being decoded twice.
            .replace(ENTITY_PATTERN, (entity) => ENTITY_REPLACEMENTS[entity])
            // Collapse runs of whitespace other than "\n"; collapsing "\n" too
            // would undo the BR conversion above.
            .replace(/[^\S\n]+/g, ' ')
            // Drop the single space that may now sit on either side of a newline.
            .replace(/ ?\n ?/g, '\n')
            // Allow at most one blank line between paragraphs.
            .replace(/\n{3,}/g, '\n\n')
            .trim();
    }

    /**
     * Requests up to 25 reviews for a title from `https://graphql.imdb.com/`
     * using the `TitleReviewsRefine` persisted query, sorted descending.
     *
     * @param {string} imdbId - Sent as the `const` query variable.
     * @param {string} [sortBy='HELPFULNESS_SCORE'] - Sent as `sort.by`.
     * @returns {Promise<Array<{title: string, author: string, rating: number,
     *   date: string, content: string, votes: {up: number, down: number},
     *   spoiler: boolean, reviewId: string}>>} One entry per returned edge;
     *   an empty array when the response carries no edges.
     * @throws {Error} `Failed to fetch reviews: …`, with the underlying error
     *   attached as `cause` on runtimes that support the option.
     */
    async getReviews(imdbId, sortBy = 'HELPFULNESS_SCORE') {
        try {
            console.log('Fetching reviews for:', imdbId);
            const payload = {
                operationName: 'TitleReviewsRefine',
                variables: {
                    const: imdbId,
                    filter: {},
                    first: 25,
                    isInProfileImageWeblab: true,
                    locale: 'en-US',
                    sort: {
                        by: sortBy,
                        order: 'DESC'
                    }
                },
                extensions: {
                    persistedQuery: {
                        sha256Hash: 'fb58a77d474033025bf28e1fe68f9b998111d3df58e08cd8405bd9265b1a9aff',
                        version: 1
                    }
                }
            };
            const requestConfig = {
                headers: {
                    'accept': 'application/graphql+json, application/json',
                    'content-type': 'application/json',
                    'x-imdb-client-name': 'imdb-web-next',
                }
            };
            const response = await axios_1.default.post('https://graphql.imdb.com/', payload, requestConfig);
            const found = dig(response.data, 'data', 'title', 'reviews', 'edges');
            const edges = (found === null || found === undefined) ? [] : found;
            return edges.map((edge) => {
                const node = edge.node;
                return {
                    title: dig(node.summary, 'originalText') || 'No summary',
                    author: dig(node.author, 'username', 'text') || 'Anonymous',
                    rating: node.authorRating || 0,
                    date: node.submissionDate || 'Unknown date',
                    content: this.cleanHtmlContent(dig(node.text, 'originalText', 'plaidHtml') || ''),
                    votes: {
                        up: dig(node.helpfulness, 'upVotes') || 0,
                        down: dig(node.helpfulness, 'downVotes') || 0
                    },
                    spoiler: node.spoiler || false,
                    reviewId: node.id || '',
                };
            });
        }
        catch (error) {
            reportFailure('Error fetching reviews:', error);
            // Keep the original error reachable for callers that need the HTTP details.
            throw new Error(`Failed to fetch reviews: ${error.message}`, { cause: error });
        }
    }

    /**
     * Looks up titles through the `v3.sg.media-imdb.com` suggestion endpoint.
     *
     * @param {string} title - Search text; URI-encoded before being placed in the URL.
     * @returns {Promise<Array<{id: *, titleNameText: string, titleReleaseText: string,
     *   titlePosterImageUrl: string, topCredits: string[]}>>} One entry per item
     *   of the response's `d` array; an empty array when `d` is absent.
     * @throws {Error} `Failed to search for movies: …`, with the underlying
     *   error attached as `cause` on runtimes that support the option.
     */
    async searchMovie(title) {
        try {
            const url = `https://v3.sg.media-imdb.com/suggestion/x/${encodeURIComponent(title)}.json`;
            const response = await axios_1.default.get(url, {
                headers: {
                    'accept': 'application/json',
                }
            });
            const found = dig(response.data, 'd');
            const results = (found === null || found === undefined) ? [] : found;
            return results.map((movie) => ({
                id: movie.id,
                titleNameText: movie.l || '',
                titleReleaseText: movie.y ? String(movie.y) : '',
                titlePosterImageUrl: dig(movie.i, 'imageUrl') || '',
                // `s` is split on ", " into individual credit strings.
                topCredits: movie.s ? movie.s.split(', ') : []
            }));
        }
        catch (error) {
            reportFailure('Error searching movies:', error);
            // Keep the original error reachable for callers that need the HTTP details.
            throw new Error(`Failed to search for movies: ${error.message}`, { cause: error });
        }
    }

    /**
     * Builds the permalink URL for a review.
     *
     * @param {string} id - Review identifier, inserted into the path as-is.
     * @returns {string} `https://www.imdb.com/review/<id>/?ref_=tturv_perm_1`.
     */
    getReviewUrl(id) {
        return `https://www.imdb.com/review/${id}/?ref_=tturv_perm_1`;
    }
}
exports.imdbScraper = new IMDbScraper();