/*
 * @speed_of/imdbscraper
 * Version:
 * IMDb scraper for extracting movie reviews from IMDb pages.
 * 129 lines (128 loc) • 5.93 kB
 * JavaScript
 */
;
// Interop helper: reuse an already-installed `__importDefault` when `this`
// carries one, otherwise wrap modules that are not flagged `__esModule` so
// their value is reachable through a `default` property.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        // Already shaped like an ES module: hand it back untouched.
        return mod;
    }
    return { "default": mod };
};
// Flag this CommonJS module with `__esModule` so interop helpers treat it as an ES module.
Object.defineProperty(exports, "__esModule", { value: true });
// Placeholder for the named export; the real value is assigned at the bottom of the file.
exports.imdbScraper = void 0;
// HTTP client used for every request below; accessed through `axios_1.default`.
const axios_1 = __importDefault(require("axios"));
/**
 * Walks `keys` starting from `root`, returning `undefined` as soon as an
 * intermediate value is `null` or `undefined` instead of throwing.
 *
 * @param {*} root - Value to start from.
 * @param {...string} keys - Property names to follow in order.
 * @returns {*} The value at the end of the path, or `undefined`.
 */
function readPath(root, ...keys) {
    let current = root;
    for (const key of keys) {
        if (current === null || current === undefined) {
            return undefined;
        }
        current = current[key];
    }
    return current;
}

/**
 * Builds the compact object that is logged when a request fails. Only the
 * presence of a response body is reported, not the body itself.
 *
 * @param {*} error - The caught error (an axios error exposes `response`).
 * @returns {{message: *, status: *, data: string}} Loggable summary.
 */
function summarizeRequestError(error) {
    return {
        message: error.message,
        status: readPath(error.response, 'status'),
        data: readPath(error.response, 'data') ? 'Response data available' : 'No response data',
    };
}

// Entity references (and the literal typographic characters they stand for)
// mapped to the plain-ASCII text used in cleaned review content.
const HTML_ENTITY_REPLACEMENTS = Object.freeze({
    '&quot;': '"',
    '&#34;': '"',
    '&#39;': "'",
    '&#x27;': "'",
    '&apos;': "'",
    '&lt;': '<',
    '&gt;': '>',
    '&amp;': '&',
    '&ndash;': '-',
    '&mdash;': '--',
    '&nbsp;': ' ',
    '&lsquo;': "'",
    '&rsquo;': "'",
    '&ldquo;': '"',
    '&rdquo;': '"',
    '\u2013': '-',
    '\u2014': '--',
    '\u00a0': ' ',
    '\u2018': "'",
    '\u2019': "'",
    '\u201c': '"',
    '\u201d': '"',
});

// Matches exactly the keys of HTML_ENTITY_REPLACEMENTS. Decoding happens in a
// single pass so that "&amp;lt;" becomes "&lt;" rather than being decoded twice.
const HTML_ENTITY_PATTERN = /&(?:quot|#34|#39|#x27|apos|lt|gt|amp|ndash|mdash|nbsp|lsquo|rsquo|ldquo|rdquo);|[\u2013\u2014\u00a0\u2018\u2019\u201c\u201d]/g;

/**
 * Client for IMDb's review GraphQL endpoint and title-suggestion endpoint.
 */
class IMDbScraper {
    constructor() {
        // Not referenced by any method in this file; kept for existing consumers.
        this.baseUrl = 'https://www.imdb.com/title';
    }

    /**
     * Converts an HTML fragment into single-line plain text.
     *
     * `<br>` tags become whitespace, all other tags are dropped, common HTML
     * entities are decoded, typographic dashes/quotes are normalised to ASCII,
     * and every run of whitespace (including line breaks) collapses to one
     * space.
     *
     * @param {string} text - HTML fragment; any falsy value yields ''.
     * @returns {string} Trimmed plain text without line breaks.
     */
    cleanHtmlContent(text) {
        if (!text) {
            return '';
        }
        return text
            // A <br> must leave a separator behind so adjacent words do not fuse.
            .replace(/<br\s*\/?>/gi, '\n')
            // Strip tags before decoding so an encoded "&lt;b&gt;" survives as text.
            .replace(/<[^>]*>/g, '')
            .replace(HTML_ENTITY_PATTERN, (match) => HTML_ENTITY_REPLACEMENTS[match])
            // Flattens line breaks too, so no separate newline clean-up is needed.
            .replace(/\s+/g, ' ')
            .trim();
    }

    /**
     * Fetches the first 25 reviews of a title from IMDb's GraphQL endpoint.
     *
     * @param {string} imdbId - Title identifier sent as the query's `const` variable.
     * @param {string} [sortBy='HELPFULNESS_SCORE'] - Sort key; order is always 'DESC'.
     * @returns {Promise<Array<{title: string, author: string, rating: number,
     *   date: string, content: string, votes: {up: number, down: number},
     *   spoiler: boolean, reviewId: string}>>} Normalised reviews; empty when
     *   the response carries no review edges.
     * @throws {Error} "Failed to fetch reviews: …" with the original error as `cause`.
     */
    async getReviews(imdbId, sortBy = 'HELPFULNESS_SCORE') {
        try {
            console.log('Fetching reviews for:', imdbId);
            const response = await axios_1.default.post('https://graphql.imdb.com/', {
                operationName: 'TitleReviewsRefine',
                variables: {
                    const: imdbId,
                    filter: {},
                    first: 25,
                    isInProfileImageWeblab: true,
                    locale: 'en-US',
                    sort: { by: sortBy, order: 'DESC' },
                },
                extensions: {
                    // NOTE(review): presumably this hash must match a query registered
                    // on IMDb's side — verify it if requests start being rejected.
                    persistedQuery: {
                        sha256Hash: 'fb58a77d474033025bf28e1fe68f9b998111d3df58e08cd8405bd9265b1a9aff',
                        version: 1,
                    },
                },
            }, {
                headers: {
                    'accept': 'application/graphql+json, application/json',
                    'content-type': 'application/json',
                    'x-imdb-client-name': 'imdb-web-next',
                },
            });
            const edges = readPath(response.data, 'data', 'title', 'reviews', 'edges');
            return (edges == null ? [] : edges).map((edge) => {
                const node = edge.node;
                const html = readPath(node.text, 'originalText', 'plaidHtml') || '';
                return {
                    title: readPath(node.summary, 'originalText') || 'No summary',
                    author: readPath(node.author, 'username', 'text') || 'Anonymous',
                    rating: node.authorRating || 0,
                    date: node.submissionDate || 'Unknown date',
                    content: this.cleanHtmlContent(html),
                    votes: {
                        up: readPath(node.helpfulness, 'upVotes') || 0,
                        down: readPath(node.helpfulness, 'downVotes') || 0,
                    },
                    spoiler: node.spoiler || false,
                    reviewId: node.id || '',
                };
            });
        }
        catch (error) {
            console.error('Error fetching reviews:', summarizeRequestError(error));
            // Keep the original error reachable for callers that need the details.
            throw new Error(`Failed to fetch reviews: ${error.message}`, { cause: error });
        }
    }

    /**
     * Looks up titles through IMDb's suggestion endpoint.
     *
     * @param {string} title - Search text; URI-encoded before being placed in the URL.
     * @returns {Promise<Array<{id: *, titleNameText: string, titleReleaseText: string,
     *   titlePosterImageUrl: string, topCredits: string[]}>>} One entry per item of
     *   the response's `d` array; empty when `d` is missing.
     * @throws {Error} "Failed to search for movies: …" with the original error as `cause`.
     */
    async searchMovie(title) {
        try {
            const response = await axios_1.default.get(`https://v3.sg.media-imdb.com/suggestion/x/${encodeURIComponent(title)}.json`, {
                headers: {
                    'accept': 'application/json',
                },
            });
            const results = readPath(response.data, 'd');
            return (results == null ? [] : results).map((movie) => ({
                id: movie.id,
                titleNameText: movie.l || '',
                titleReleaseText: movie.y ? String(movie.y) : '',
                titlePosterImageUrl: readPath(movie.i, 'imageUrl') || '',
                // `s` is split on ", " into individual credit strings.
                topCredits: movie.s ? movie.s.split(', ') : [],
            }));
        }
        catch (error) {
            console.error('Error searching movies:', summarizeRequestError(error));
            throw new Error(`Failed to search for movies: ${error.message}`, { cause: error });
        }
    }

    /**
     * Builds the permalink URL for a single review.
     *
     * @param {string} id - Review identifier (the `reviewId` returned by getReviews).
     * @returns {string} URL of the review page.
     */
    getReviewUrl(id) {
        return `https://www.imdb.com/review/${id}/?ref_=tturv_perm_1`;
    }
}
// Export a single IMDbScraper instance, created once when the module loads.
exports.imdbScraper = new IMDbScraper();