UNPKG

@apify-scrapers/shared

Version:

Shared utilities and constants for Apify scrapers

163 lines (139 loc) 4.5 kB
import { stringify } from 'csv-stringify/sync';

/**
 * URL shapes that embed a numeric product ID, tried in order. The first
 * capture group of the first pattern that matches is returned as the ID.
 */
const PRODUCT_ID_PATTERNS = [
  /\/product\/(\d+)/,
  /i\.(\d+)\.(\d+)/,
  /item_id=(\d+)/,
  /product_id=(\d+)/,
  // Also matches "products/pdp-i<id>.html", so no separate pattern is needed.
  /pdp-i(\d+)\.html/,
  /item\/(\d+)/,
  /product\/(\d+)/,
  // Last resort: the first "<digits>.<digits>" pair anywhere in the URL.
  /(\d+)\.(\d+)/,
];

/**
 * Extract product ID from various URL patterns
 * @param {string} url - Product URL
 * @returns {string|null} Product ID (digits, as a string) or null when the
 *   URL is empty or matches none of the known patterns
 */
export function extractProductId(url) {
  if (!url) return null;

  for (const pattern of PRODUCT_ID_PATTERNS) {
    const match = pattern.exec(url);
    if (match) {
      return match[1];
    }
  }

  return null;
}

/**
 * Clean and parse price string
 *
 * Everything except digits, "." and "," is discarded. When several
 * separators remain, the last one is taken as the decimal mark and the
 * others as thousands grouping ("1,234.56" and "1.234,56" both give
 * 1234.56); a separator that repeats can only be grouping ("1.234.567"
 * gives 1234567). A single lone separator is always read as the decimal
 * mark, so "12,50" gives 12.5 and the ambiguous "1,234" gives 1.234.
 *
 * @param {string|number} priceText - Raw price text
 * @returns {number|null} Parsed price or null
 */
export function parsePrice(priceText) {
  if (!priceText) return null;

  // Trailing separators are punctuation ("12.50."), never part of the number.
  let candidate = String(priceText)
    .replace(/[^\d.,]/g, '')
    .replace(/[.,]+$/, '');

  // A leading separator next to other separators comes from a currency
  // abbreviation such as "Rs."; a lone leading one (".5") is a real decimal.
  const separatorCount = (candidate.match(/[.,]/g) ?? []).length;
  if (separatorCount > 1) {
    candidate = candidate.replace(/^[.,]+/, '');
  }

  const decimalIndex = Math.max(
    candidate.lastIndexOf(','),
    candidate.lastIndexOf('.'),
  );

  let normalized = candidate;
  if (decimalIndex !== -1) {
    const separator = candidate[decimalIndex];
    const isRepeated = candidate.indexOf(separator) !== decimalIndex;

    if (isRepeated) {
      // The same character cannot be a decimal mark twice: all grouping.
      normalized = candidate.replace(/[.,]/g, '');
    } else {
      const integerPart = candidate.slice(0, decimalIndex).replace(/[.,]/g, '');
      const fractionPart = candidate.slice(decimalIndex + 1);
      normalized = `${integerPart}.${fractionPart}`;
    }
  }

  const parsed = Number.parseFloat(normalized);
  return Number.isNaN(parsed) ? null : parsed;
}

/**
 * Clean and parse rating string
 *
 * Reads the first number found in the text, so "4.5/5" and "4.5 out of 5"
 * give 4.5 rather than the digits of both numbers glued together. A decimal
 * comma ("4,5") is accepted.
 *
 * @param {string|number} ratingText - Raw rating text
 * @returns {number|null} Parsed rating or null
 */
export function parseRating(ratingText) {
  if (!ratingText) return null;

  const token = String(ratingText).match(/\d*[.,]?\d+/);
  if (!token) return null;

  const parsed = Number.parseFloat(token[0].replace(',', '.'));
  return Number.isNaN(parsed) ? null : parsed;
}

/**
 * Clean and parse count string (reviews, sold, etc.)
 *
 * All non-digit characters are removed before parsing, so grouping
 * separators are ignored ("1,234 reviews" gives 1234). Abbreviated counts
 * are NOT expanded: "1.2k" gives 12.
 *
 * @param {string|number} countText - Raw count text
 * @returns {number|null} Parsed count or null
 */
export function parseCount(countText) {
  if (!countText) return null;

  const digitsOnly = String(countText).replace(/[^\d]/g, '');
  const parsed = Number.parseInt(digitsOnly, 10);
  return Number.isNaN(parsed) ? null : parsed;
}

/**
 * Normalize product URL
 *
 * Absolute http(s) URLs are returned untouched. Anything else is resolved
 * against baseUrl with the WHATWG URL resolver, which keeps the base port
 * and handles protocol-relative ("//host/path") and slash-less relative
 * paths.
 *
 * @param {string} url - Product URL
 * @param {string} baseUrl - Base URL for relative URLs
 * @returns {string|null} Normalized URL, or null when url is empty or
 *   cannot be resolved against baseUrl (the error is logged)
 */
export function normalizeProductUrl(url, baseUrl) {
  if (!url) return null;

  if (/^https?:\/\//i.test(url)) {
    return url;
  }

  try {
    return new URL(url, baseUrl).href;
  } catch (error) {
    console.error('Error normalizing URL:', error);
    return null;
  }
}

/**
 * Convert data to CSV format
 * @param {Array} data - Array of objects to convert
 * @param {Object} options - Options passed to csv-stringify; they override
 *   the defaults `header: true` and `columns: null`
 * @returns {string} CSV string
 */
export function convertToCSV(data, options = {}) {
  const csvOptions = {
    header: true,
    columns: null,
    ...options,
  };

  return stringify(data, csvOptions);
}

/**
 * Validate product data
 *
 * Checks that the product has a name or productId, that price is null or a
 * non-negative number, and that rating is null or a number from 0 to 5.
 * NaN is rejected for both. Note that a missing (undefined) price or rating
 * is reported as an error; only an explicit null means "absent".
 *
 * @param {Object} product - Product object to validate
 * @returns {Object} Validation result with isValid and errors
 */
export function validateProduct(product) {
  const errors = [];
  const { price, rating } = product;

  if (!product.name && !product.productId) {
    errors.push('Product must have either name or productId');
  }

  // typeof NaN is 'number' and NaN fails every comparison, so test it explicitly.
  if (price !== null && (typeof price !== 'number' || Number.isNaN(price) || price < 0)) {
    errors.push('Price must be a non-negative number or null');
  }

  if (
    rating !== null &&
    (typeof rating !== 'number' || Number.isNaN(rating) || rating < 0 || rating > 5)
  ) {
    errors.push('Rating must be a number between 0 and 5 or null');
  }

  return {
    isValid: errors.length === 0,
    errors,
  };
}

/**
 * Clean and standardize product data
 *
 * Text fields are trimmed, numeric fields are run through parsePrice,
 * parseRating and parseCount, and every missing or empty field becomes
 * null. scrapedAt defaults to the current time as an ISO 8601 string.
 *
 * @param {Object} product - Raw product data
 * @returns {Object} Cleaned product data
 */
export function cleanProductData(product) {
  return {
    productId: product.productId || null,
    name: product.name?.trim() || null,
    price: parsePrice(product.price),
    originalPrice: parsePrice(product.originalPrice),
    discount: product.discount?.trim() || null,
    rating: parseRating(product.rating),
    reviewCount: parseCount(product.reviewCount),
    soldCount: parseCount(product.soldCount),
    seller: product.seller?.trim() || null,
    location: product.location?.trim() || null,
    image: product.image || null,
    productUrl: product.productUrl || null,
    platform: product.platform || null,
    keyword: product.keyword || null,
    scrapedAt: product.scrapedAt || new Date().toISOString(),
    sourceUrl: product.sourceUrl || null,
  };
}