UNPKG

@speed_of/imdbscraper

Version:

IMDb scraper for extracting movie reviews from IMDb pages.

153 lines (134 loc) 6.07 kB
import axios from 'axios';

/** A single user review, normalised from the JSON embedded in an IMDb reviews page. */
interface IMDbReview {
  title: string;
  author: string;
  rating: number;
  date: string;
  content: string;
  votes: {
    up: number;
    down: number;
  };
  spoiler: boolean;
  reviewId: string;
}

/** A single title hit, normalised from the JSON embedded in an IMDb search page. */
interface MovieResult {
  id: string;
  titleNameText: string;
  titleReleaseText: string;
  titlePosterImageUrl: string;
  topCredits: string[];
}

/**
 * Scrapes IMDb HTML pages by reading the `__NEXT_DATA__` JSON script tag
 * that the pages embed, then mapping that payload to plain result objects.
 */
class IMDbScraper {
  private baseUrl = 'https://www.imdb.com/title';

  /** Headers sent with every page request. */
  private static readonly requestHeaders = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml',
  };

  /** Captures the JSON text inside the `__NEXT_DATA__` script tag. */
  private static readonly nextDataPattern =
    /<script id="__NEXT_DATA__" type="application\/json">(.*?)<\/script>/s;

  /** Named entities this scraper translates; any other named entity is left untouched. */
  private static readonly namedEntities = new Map<string, string>([
    ['quot', '"'],
    ['apos', "'"],
    ['lt', '<'],
    ['gt', '>'],
    ['amp', '&'],
    ['ndash', '-'],
    ['mdash', '--'],
    ['nbsp', ' '],
    ['lsquo', "'"],
    ['rsquo', "'"],
    ['ldquo', '"'],
    ['rdquo', '"'],
  ]);

  /**
   * Decodes HTML character references in a single left-to-right pass.
   *
   * A single pass matters: decoding `&amp;` in an earlier step than the other
   * entities would turn escaped text such as `&amp;#39;` into `'` (decoded
   * twice) instead of the intended literal `&#39;`.
   *
   * @param text - Text with tags already removed.
   * @returns The text with known named entities and valid numeric references decoded.
   */
  private decodeEntities(text: string): string {
    return text.replace(/&(#x[0-9a-f]+|#\d+|[a-z]+);/gi, (match: string, body: string) => {
      if (body.startsWith('#')) {
        const isHex = body[1] === 'x' || body[1] === 'X';
        const codePoint = parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
        // String.fromCodePoint throws a RangeError above U+10FFFF, so keep
        // out-of-range references as literal text rather than failing.
        const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
        return isValid ? String.fromCodePoint(codePoint) : match;
      }
      // Named lookups are case-sensitive (`&quot;` is decoded, `&QUOT;` is not).
      return IMDbScraper.namedEntities.get(body) ?? match;
    });
  }

  /**
   * Converts an HTML fragment into single-line plain text.
   *
   * Tags are stripped, character references are decoded, and every run of
   * whitespace (including the line breaks produced by `<br>`) is collapsed
   * to one space before trimming.
   *
   * @param text - HTML fragment; falsy input yields an empty string.
   * @returns The cleaned, trimmed text.
   */
  private cleanHtmlContent(text: string): string {
    if (!text) return '';
    // Payload fields are untyped at runtime; tolerate a non-string value.
    const html = typeof text === 'string' ? text : String(text);

    const withoutTags = html
      // A <br> separates words, so it must become whitespace rather than vanish.
      .replace(/<br\s*\/?>/gi, ' ')
      .replace(/<[^>]*>/g, '');

    // Decode only after stripping tags so that an escaped `&lt;b&gt;`
    // survives as the literal text `<b>`.
    return this.decodeEntities(withoutTags).replace(/\s+/g, ' ').trim();
  }

  /**
   * Downloads a page and parses the JSON held in its `__NEXT_DATA__` script tag.
   *
   * @param url - Absolute URL of the page to fetch.
   * @param missingDataLog - Message logged via `console.error` when the tag is absent.
   * @returns The parsed JSON payload (shape is not validated here).
   * @throws Error with message `JSON data not found` when the script tag is missing;
   *   request and JSON parsing errors propagate unchanged.
   */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any -- external, unvalidated JSON
  private async fetchNextData(url: string, missingDataLog: string): Promise<any> {
    const response = await axios.get(url, { headers: IMDbScraper.requestHeaders });

    // Guard against a non-string body so `.match` cannot throw a TypeError.
    const html = typeof response.data === 'string' ? response.data : '';
    const payload = html.match(IMDbScraper.nextDataPattern)?.[1];
    if (payload === undefined) {
      console.error(missingDataLog);
      throw new Error('JSON data not found');
    }
    return JSON.parse(payload);
  }

  /**
   * Summarises a caught value for logging and re-throwing without assuming its type.
   *
   * @param error - Whatever was thrown.
   * @returns The message, the HTTP status when a `response` object is attached,
   *   and a note saying whether that response carried data.
   */
  private describeError(error: unknown): { message: string; status: number | undefined; data: string } {
    const candidate = (typeof error === 'object' && error !== null ? error : {}) as {
      message?: unknown;
      response?: { status?: number; data?: unknown };
    };
    return {
      message: typeof candidate.message === 'string' ? candidate.message : String(error),
      status: candidate.response?.status,
      data: candidate.response?.data ? 'Response data available' : 'No response data',
    };
  }

  /**
   * Fetches the reviews listed on a title's reviews page.
   *
   * @param imdbId - Title identifier placed in the URL path (`<baseUrl>/<imdbId>/reviews`).
   * @returns One entry per review found in the page data, with placeholder
   *   defaults for missing fields.
   * @throws Error prefixed with `Failed to fetch reviews:` on any failure.
   */
  async getReviews(imdbId: string): Promise<IMDbReview[]> {
    try {
      console.log('Fetching reviews for:', imdbId);
      const jsonData = await this.fetchNextData(
        `${this.baseUrl}/${imdbId}/reviews`,
        'JSON data not found in response',
      );

      const reviews = jsonData?.props?.pageProps?.contentData?.reviews;
      if (!Array.isArray(reviews)) {
        throw new Error('Review list not found in page data');
      }

      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- external, unvalidated JSON
      return reviews.map((entry: any): IMDbReview => {
        const review = entry?.review ?? {};
        return {
          // NOTE(review): the summary is read from the entry itself first; the
          // nested `review` object is tried as a fallback because every other
          // field lives there — confirm against the live payload.
          title: this.cleanHtmlContent(entry?.reviewSummary || review.reviewSummary || 'No summary'),
          author: this.cleanHtmlContent(review.author?.nickName || 'Anonymous'),
          rating: review.authorRating || 0,
          date: review.submissionDate || 'Unknown date',
          content: this.cleanHtmlContent(review.reviewText || 'No content'),
          votes: {
            up: review.helpfulnessVotes?.upVotes ?? 0,
            down: review.helpfulnessVotes?.downVotes ?? 0,
          },
          spoiler: Boolean(review.spoiler),
          // Empty string (not 0) keeps the field a string, as the interface declares.
          reviewId: review.reviewId ? String(review.reviewId) : '',
        };
      });
    } catch (error: unknown) {
      const details = this.describeError(error);
      console.error('Error fetching reviews:', details);
      throw new Error(`Failed to fetch reviews: ${details.message}`);
    }
  }

  /**
   * Searches IMDb for titles matching the given text.
   *
   * @param title - Free-text query; it is URL-encoded before being sent.
   * @returns One entry per title result found in the page data. A result
   *   without a poster gets an empty `titlePosterImageUrl`, and a result
   *   without credits gets an empty `topCredits` list.
   * @throws Error prefixed with `Failed to search for movies:` on any failure.
   */
  async searchMovie(title: string): Promise<MovieResult[]> {
    try {
      const jsonData = await this.fetchNextData(
        `https://www.imdb.com/find?q=${encodeURIComponent(title)}&ref_=nv_sr_sm`,
        'JSON data not found in search response',
      );

      const movieResults = jsonData?.props?.pageProps?.titleResults?.results;
      if (!Array.isArray(movieResults)) {
        throw new Error('Search results not found in page data');
      }

      // eslint-disable-next-line @typescript-eslint/no-explicit-any -- external, unvalidated JSON
      return movieResults.map((movie: any): MovieResult => {
        const credits: unknown[] = Array.isArray(movie.topCredits) ? movie.topCredits : [];
        return {
          id: movie.id,
          titleNameText: this.cleanHtmlContent(movie.titleNameText),
          titleReleaseText: this.cleanHtmlContent(movie.titleReleaseText),
          // A missing poster must not fail the whole search.
          titlePosterImageUrl: movie.titlePosterImageModel?.url ?? '',
          topCredits: credits.map((credit) => this.cleanHtmlContent(credit as string)),
        };
      });
    } catch (error: unknown) {
      const details = this.describeError(error);
      console.error('Error searching movies:', details);
      throw new Error(`Failed to search for movies: ${details.message}`);
    }
  }

  /**
   * Builds the permalink URL for a single review.
   *
   * @param id - Review identifier, inserted into the URL as given.
   * @returns The review URL string.
   */
  getReviewUrl(id: string): string {
    return 'https://www.imdb.com/review/' + id + '/?ref_=tturv_perm_1';
  }
}

export const imdbScraper = new IMDbScraper();