/*
 * @speed_of/imdbscraper
 * Version:
 * IMDb scraper for extracting movie reviews from IMDb pages.
 * 153 lines (134 loc) • 6.07 kB
 * text/typescript
 */
import axios from 'axios';
/**
 * Normalized shape of one user review as returned by the scraper's
 * `getReviews` method.
 */
interface IMDbReview {
  /** Review headline, with HTML markup removed. */
  title: string;
  /** Display name of the reviewer, with HTML markup removed. */
  author: string;
  /** Numeric rating the reviewer attached to the review. */
  rating: number;
  /** Submission date exactly as delivered by the page payload. */
  date: string;
  /** Review body, with HTML markup removed. */
  content: string;
  /** Helpfulness vote counters for the review. */
  votes: { up: number; down: number };
  /** Whether the review is flagged as containing spoilers. */
  spoiler: boolean;
  /** Identifier of the review; accepted by the scraper's `getReviewUrl`. */
  reviewId: string;
}
/**
 * Normalized shape of one title hit as returned by the scraper's
 * `searchMovie` method.
 */
interface MovieResult {
  /** Identifier of the title as reported by the search payload. */
  id: string;
  /** Title name, with HTML markup removed. */
  titleNameText: string;
  /** Release text of the title, with HTML markup removed. */
  titleReleaseText: string;
  /** URL of the title's poster image. */
  titlePosterImageUrl: string;
  /** Top credit entries for the title, each with HTML markup removed. */
  topCredits: Array<string>;
}
/**
 * Scrapes IMDb HTML pages by reading the JSON document embedded in the
 * page's `__NEXT_DATA__` script tag and mapping it onto the module's
 * `IMDbReview` / `MovieResult` shapes.
 */
class IMDbScraper {
  private readonly baseUrl = 'https://www.imdb.com/title';

  /** Request headers shared by every page fetch. */
  private static readonly REQUEST_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Accept': 'text/html,application/xhtml+xml'
  };

  /**
   * Named HTML entities the scraper decodes, keyed by entity name (without
   * the leading ampersand and trailing semicolon). A `Map` is used so that
   * names such as `constructor` can never hit an inherited property.
   */
  private static readonly NAMED_ENTITIES = new Map<string, string>([
    ['quot', '"'],
    ['apos', "'"],
    ['lt', '<'],
    ['gt', '>'],
    ['amp', '&'],
    ['nbsp', '\u00A0'],
    ['ndash', '\u2013'],
    ['mdash', '\u2014'],
    ['lsquo', '\u2018'],
    ['rsquo', '\u2019'],
    ['ldquo', '\u201C'],
    ['rdquo', '\u201D']
  ]);

  /**
   * Resolves a single entity reference to its character.
   *
   * @param entity The full matched reference, returned unchanged when it
   *               cannot be resolved.
   * @param body   The text between the ampersand and the semicolon.
   */
  private decodeEntity(entity: string, body: string): string {
    if (body.charAt(0) !== '#') {
      return IMDbScraper.NAMED_ENTITIES.get(body) ?? entity;
    }
    const isHex = body.charAt(1).toLowerCase() === 'x';
    const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
    // String.fromCodePoint throws on out-of-range values; lone surrogates
    // would produce malformed strings, so both are left untouched.
    const isSurrogate = codePoint >= 0xd800 && codePoint <= 0xdfff;
    const isValid = codePoint > 0 && codePoint <= 0x10ffff && !isSurrogate;
    return isValid ? String.fromCodePoint(codePoint) : entity;
  }

  /**
   * Converts an HTML fragment to single-line plain text.
   *
   * Steps, in order: `<br>` becomes a line break (so adjacent words do not
   * fuse), remaining tags are removed, entity references are decoded in ONE
   * pass (so an escaped ampersand followed by `lt;` is not decoded twice),
   * typographic quotes/dashes are normalized to ASCII, and finally every
   * whitespace run, line breaks included, collapses to a single space.
   *
   * @param text Raw HTML text; empty or non-string input yields `''`.
   * @returns The cleaned, trimmed text.
   */
  private cleanHtmlContent(text: string): string {
    if (typeof text !== 'string' || text === '') return '';
    return text
      .replace(/<br\s*\/?>/gi, '\n')
      .replace(/<[^>]*>/g, '')
      .replace(
        /&(#x[0-9a-f]+|#\d+|[a-z][a-z0-9]*);/gi,
        (entity: string, body: string) => this.decodeEntity(entity, body)
      )
      .replace(/[\u2018\u2019]/g, "'")
      .replace(/[\u201C\u201D]/g, '"')
      .replace(/\u2013/g, '-')
      .replace(/\u2014/g, '--')
      .replace(/\s+/g, ' ')
      .trim();
  }

  /**
   * Downloads a page and returns the parsed `__NEXT_DATA__` JSON document.
   *
   * @param url            Page to fetch.
   * @param missingDataLog Message logged when the script tag is absent.
   * @throws Error with message `JSON data not found` when the response is
   *         not HTML text or carries no `__NEXT_DATA__` script; also
   *         propagates request and `JSON.parse` failures.
   */
  private async fetchNextData(url: string, missingDataLog: string): Promise<any> {
    const response = await axios.get(url, { headers: IMDbScraper.REQUEST_HEADERS });
    const html: unknown = response.data;
    const scriptMatch =
      typeof html === 'string'
        ? html.match(/<script id="__NEXT_DATA__" type="application\/json">(.*?)<\/script>/s)
        : null;
    if (!scriptMatch) {
      console.error(missingDataLog);
      throw new Error('JSON data not found');
    }
    return JSON.parse(scriptMatch[1]);
  }

  /**
   * Logs a failure in the scraper's structured format and returns a
   * printable message for it.
   *
   * @param context Log prefix describing the failed operation.
   * @param error   The caught value; may be anything.
   */
  private reportFailure(context: string, error: unknown): string {
    const message = error instanceof Error ? error.message : String(error);
    // HTTP client errors carry a `response` object; other errors do not.
    const response = (error as { response?: { status?: number; data?: unknown } } | null | undefined)
      ?.response;
    console.error(context, {
      message,
      status: response?.status,
      data: response?.data ? 'Response data available' : 'No response data'
    });
    return message;
  }

  /**
   * Fetches the reviews page of a title and returns its reviews.
   *
   * @param imdbId Title identifier used in the `/title/<id>/reviews` URL.
   * @returns One normalized entry per review found in the page payload.
   * @throws Error prefixed with `Failed to fetch reviews:` on any request,
   *         parsing, or payload-shape failure.
   */
  async getReviews(imdbId: string): Promise<IMDbReview[]> {
    try {
      console.log('Fetching reviews for:', imdbId);
      const jsonData = await this.fetchNextData(
        `${this.baseUrl}/${encodeURIComponent(imdbId)}/reviews`,
        'JSON data not found in response'
      );
      const reviews: unknown = jsonData?.props?.pageProps?.contentData?.reviews;
      if (!Array.isArray(reviews)) {
        throw new Error('Review list not found in page data');
      }
      return reviews.map((entry: any): IMDbReview => {
        const details = entry?.review ?? {};
        return {
          title: this.cleanHtmlContent(entry?.reviewSummary || 'No summary'),
          author: this.cleanHtmlContent(details.author?.nickName ?? 'Anonymous'),
          rating: details.authorRating || 0,
          date: details.submissionDate || 'Unknown date',
          content: this.cleanHtmlContent(details.reviewText || 'No content'),
          votes: {
            up: details.helpfulnessVotes?.upVotes ?? 0,
            down: details.helpfulnessVotes?.downVotes ?? 0
          },
          spoiler: details.spoiler || false,
          // Empty string (not 0) keeps the fallback consistent with the
          // declared string type while staying falsy for existing checks.
          reviewId: details.reviewId || ''
        };
      });
    } catch (error: unknown) {
      const message = this.reportFailure('Error fetching reviews:', error);
      throw new Error(`Failed to fetch reviews: ${message}`);
    }
  }

  /**
   * Runs a title search and returns the matching titles.
   *
   * @param title Free-text query; it is URL-encoded before being sent.
   * @returns One normalized entry per title result in the page payload.
   *          Results without a poster get an empty `titlePosterImageUrl`;
   *          results without credits get an empty `topCredits` list.
   * @throws Error prefixed with `Failed to search for movies:` on any
   *         request, parsing, or payload-shape failure.
   */
  async searchMovie(title: string): Promise<MovieResult[]> {
    try {
      const jsonData = await this.fetchNextData(
        `https://www.imdb.com/find?q=${encodeURIComponent(title)}&ref_=nv_sr_sm`,
        'JSON data not found in search response'
      );
      const movieResults: unknown = jsonData?.props?.pageProps?.titleResults?.results;
      if (!Array.isArray(movieResults)) {
        throw new Error('Title results not found in page data');
      }
      return movieResults.map((movie: any): MovieResult => ({
        id: movie.id,
        titleNameText: this.cleanHtmlContent(movie.titleNameText),
        titleReleaseText: this.cleanHtmlContent(movie.titleReleaseText),
        titlePosterImageUrl: movie.titlePosterImageModel?.url ?? '',
        topCredits: Array.isArray(movie.topCredits)
          ? movie.topCredits.map((credit: string) => this.cleanHtmlContent(credit))
          : []
      }));
    } catch (error: unknown) {
      const message = this.reportFailure('Error searching movies:', error);
      throw new Error(`Failed to search for movies: ${message}`);
    }
  }

  /**
   * Builds the permalink URL of a review.
   *
   * @param id Review identifier (the `reviewId` of a fetched review).
   */
  getReviewUrl(id: string): string {
    return 'https://www.imdb.com/review/' + encodeURIComponent(id) + '/?ref_=tturv_perm_1';
  }
}
/** Module-level `IMDbScraper` instance, created once at import time and exported for callers. */
export const imdbScraper = new IMDbScraper();