web-scrapify
Version:
A simple web scraper that can scrape product details from various e-commerce platforms.
194 lines (177 loc) • 8.07 kB
text/typescript
import axios from 'axios';
import * as cheerio from 'cheerio';
import { ProductScraper } from '../ProductScraper';
import { AMAZON_BASE_URL, AMAZON_SORT_OPTIONS } from './Constants';
import { Product } from '../Product';
import { HeaderOptions } from '../Headers';
import { Agents } from '../../agents/Agents';
import { timer } from '../../utils/Utils';
import { AmazonSearchOptions } from './AmazonSearchOptions';
import { AmazonSortOptions } from './AmazonSortOptions';
import { AmazonFilterOptions } from './AmazonFilterOptions';
/**
 * Scraper for Amazon search-result listings and product detail pages.
 *
 * Search URLs are assembled fluently: `search()`, `filter()` and `sort()` each
 * append query parameters to `this.searchURL` and return the instance, and
 * `buildURL()` turns the accumulated parameters into the final search URL.
 */
export class AmazonScraper extends ProductScraper {
  /**
   * Creates a scraper; every option is optional and is forwarded unchanged to
   * the `ProductScraper` constructor.
   *
   * @param options - Base URL, request headers, agent-rotation flag, logging
   *   flag and timeout. Defaults: `AMAZON_BASE_URL`, a desktop Chrome
   *   User-Agent, rotation off, logging off, timeout `30000`.
   */
  constructor({
    baseUrl = AMAZON_BASE_URL,
    enableAgentRotations = false,
    headers = {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    },
    enableLogging = false,
    timeout = 30000
  }: HeaderOptions = {}) {
    super({
      baseUrl,
      enableAgentRotations,
      enableLogging,
      headers,
      timeout
    });
  }

  /**
   * Appends keyword, category and page parameters to the search URL.
   *
   * Parameters are appended to whatever `searchURL` already holds, so calling
   * this repeatedly accumulates parameters rather than replacing them.
   *
   * @param query - Search options: `search` (keyword), `category` and `page`.
   *   Falsy fields are skipped.
   * @returns This instance, for chaining.
   */
  search(query: AmazonSearchOptions): AmazonScraper {
    if (query.search) {
      this.searchURL = `${this.searchURL}&k=${encodeURIComponent(query.search)}`;
    }
    if (query.category) {
      this.searchURL = `${this.searchURL}&i=${encodeURIComponent(query.category)}`;
    }
    if (query.page) {
      this.searchURL = `${this.searchURL}&page=${query.page}`;
    }
    return this;
  }

  /**
   * Appends a price-range filter to the search URL.
   *
   * @param query - Filter options; `price.min` maps to `low-price` and
   *   `price.max` to `high-price`. A falsy bound (including `0`) is skipped.
   * @returns This instance, for chaining.
   */
  filter(query: AmazonFilterOptions): AmazonScraper {
    if (query.price?.min) {
      this.searchURL = `${this.searchURL}&low-price=${query.price.min}`;
    }
    if (query.price?.max) {
      this.searchURL = `${this.searchURL}&high-price=${query.price.max}`;
    }
    return this;
  }

  /**
   * Appends a sort order to the search URL.
   *
   * Available sort options:
   * - relevanceblender for sorting by featured
   * - price-asc-rank for sorting by price low to high
   * - price-desc-rank for sorting by price high to low
   * - review-count for sorting by avg customer review
   * - date-desc-rank for sorting by newest arrivals
   * - exact-aware-popularity-rank for sorting by popularity rank (best sellers)
   *
   * @param query - Sort options; `sort` is written to the `s` parameter as-is.
   * @returns This instance, for chaining.
   */
  sort(query: AmazonSortOptions): AmazonScraper {
    if (query.sort) {
      this.searchURL = `${this.searchURL}&s=${query.sort}`;
    }
    return this;
  }

  /**
   * Builds the final URL for the Amazon search.
   *
   * @returns `<baseUrl>/s?<searchURL>`.
   */
  buildURL(): string {
    return `${this.baseUrl}/s?${this.searchURL}`;
  }

  /**
   * Scrapes the listings from an Amazon search results page.
   *
   * @param url - Either a full URL containing the base URL, or a bare query
   *   string that is appended to `<baseUrl>/s?`. Defaults to `buildURL()`.
   * @returns One partial product per `.s-result-item` element; an empty array
   *   if the request or parsing throws (the error is logged, not rethrown).
   */
  async scrapListings(url: string = this.buildURL()): Promise<Partial<Product>[]> {
    if (!url.includes(this.baseUrl)) {
      url = `${this.baseUrl}/s?${url}`;
    }
    this.logger.info(`Scraping Amazon listings.. ${url}`);

    const method = async (target: string) => {
      try {
        // Honour agent rotation here as well, matching scrape().
        this.applyAgentRotation();
        const { data } = await axios.get(target, { headers: { ...this.headers } });
        const $ = cheerio.load(data);
        const listings = $('.s-result-item').map((i, el) => {
          const item = $(el);
          return {
            id: item.attr('data-asin'),
            title: item.find('.a-text-normal').text().trim(),
            price: item.find('.a-price-whole').text().trim(),
            imageUrl: item.find('.a-link-normal').find('img').attr('src'),
            rating: item.find('.a-icon-alt').text().trim(),
            ratingCount: item.find('.a-size-small').text().trim(),
          };
        }).get();
        // Report through the scraper's logger instead of writing to stdout.
        this.logger.log('Scraped listings: ', listings);
        return listings;
      } catch (error) {
        this.logger.error('Error scraping Amazon listings:', error);
        return [];
      }
    };

    // Run through the shared `timer` utility with the configured timeout.
    return timer(() => method(url), this.timeout);
  }

  /**
   * Scrapes the product details from an Amazon product page.
   *
   * @param url - A path relative to the base URL (with or without a leading
   *   slash), or an absolute `http(s)://` URL, which is requested unchanged.
   * @returns The product details, or `null` if the request or parsing throws
   *   (the error is logged, not rethrown).
   */
  async scrape(url: string): Promise<Product | null> {
    const method = async (path: string) => {
      try {
        this.logger.log('Scraping Amazon...');
        this.applyAgentRotation();
        const target = this.resolveProductUrl(path);
        this.logger.log('Request Headers: ', this.headers);
        this.logger.log('Request URL: ', target);
        const { data } = await axios.get(target, { headers: { ...this.headers } });
        const $ = cheerio.load(data);

        const productInfo: Product = {
          id: $('input[name="ASIN"]').attr('value') || null, // Product ASIN
          title: $('#productTitle').text().trim() || null, // Product title
          imageUrl: $('#percolate-ui-ilm_div img').attr('src') || $('#dp-container img').attr('src') || null, // Product image URL
          rating: $('#acrPopover > span:first-child > a > span:first-child').text().trim().split(' ')[0] || null, // Product rating
          ratingCount: $('#acrCustomerReviewText').text().trim().split('ratings')[0].trim() || null, // Rating count
          price: $('.priceToPay').text().trim(), // Price
          // Every two-cell table row becomes a key/value entry; rows whose
          // first cell is empty are ignored.
          details: $('table tr').get().reduce((acc, el) => {
            const row = $(el);
            const key = row.find('td:first-child').text().trim();
            const value = row.find('td:nth-child(2)').text().trim();
            if (key) {
              acc[key] = value || null;
            }
            return acc;
          }, {} as Record<string, string | null>),
          description: $('#productDescription').text().trim() || null, // Product description
          features: $('#feature-bullets ul li').map((i, el) => $(el).text().trim()).get(), // Product features
          reviews: $('#cm-cr-dp-review-list li[data-hook="review"]').map((i, el) => {
            const review = $(el);
            const title = review.find('[data-hook="review-title"] > span').text().trim() || null; // Review title
            const date = review.find('[data-hook="review-date"]').text().trim() || null; // Review date
            const description = review.find('[data-hook="review-body"]').text().trim() || null; // Review body
            const author = review.find('span.a-profile-name').text().trim() || null; // Review author
            const rating = review.find('[data-hook="review-star-rating"] span.a-icon-alt').text().trim() || null; // Review rating
            return { title, date, description, author, rating };
          }).get(),
        };
        return productInfo;
      } catch (error) {
        this.logger.error('Error scraping Amazon:', error);
        return null;
      }
    };

    // Run through the shared `timer` utility with the configured timeout.
    return timer(() => method(url), this.timeout);
  }

  /** Replaces the User-Agent header with a fresh agent when rotation is enabled. */
  private applyAgentRotation(): void {
    if (this.enableAgentRotations) {
      const agent = new Agents();
      this.headers['User-Agent'] = agent.getAgent();
    }
  }

  /**
   * Resolves a product URL: absolute `http(s)://` URLs pass through untouched,
   * anything else is joined to the base URL with exactly one slash.
   */
  private resolveProductUrl(url: string): string {
    if (/^https?:\/\//i.test(url)) {
      return url;
    }
    return `${this.baseUrl}/${url.replace(/^\/+/, '')}`;
  }
}