/*
 * Package: @apify-scrapers/shared
 * Shared utilities and constants for Apify scrapers.
 * Listing metadata: 163 lines (139 loc), 4.5 kB, JavaScript; version not specified.
 */
import { stringify } from 'csv-stringify/sync';
/**
 * Extract a numeric product ID from a product URL.
 *
 * The URL is tested against a fixed list of patterns in priority order; the
 * first capture group of the first pattern that matches is returned.
 *
 * @param {string} url - Product URL
 * @returns {string|null} The matched digits (as a string), or null when `url`
 *   is empty, is not a string, or matches none of the patterns
 */
export function extractProductId(url) {
  // Non-string input has no `.match`/regex semantics; report "no ID" instead of throwing.
  if (typeof url !== 'string' || url === '') return null;

  // Order matters: an earlier pattern wins when several could match.
  const patterns = [
    /\/product\/(\d+)/,
    // NOTE(review): only the first of the two numeric groups is returned.
    // Confirm that group is the product ID and not, e.g., a shop/seller ID.
    /i\.(\d+)\.(\d+)/,
    /item_id=(\d+)/,
    /product_id=(\d+)/,
    // Also covers ".../products/pdp-i<digits>.html", so no separate pattern is needed.
    /pdp-i(\d+)\.html/,
    /item\/(\d+)/,
    /product\/(\d+)/,
    // Last-resort fallback: first "<digits>.<digits>" pair anywhere in the URL.
    /(\d+)\.(\d+)/,
  ];

  for (const pattern of patterns) {
    const match = pattern.exec(url);
    if (match) {
      return match[1];
    }
  }
  return null;
}
/**
 * Clean and parse a price string.
 *
 * Only the first number in the text is parsed, so a range such as
 * "$10.00 - $20.00" yields 10. Separators are resolved as follows:
 * - both "." and "," present: the one occurring last is the decimal mark,
 *   the other is a grouping separator ("1,234.56" and "1.234,56" -> 1234.56);
 * - one kind of separator occurring several times: grouping ("1.250.000" -> 1250000);
 * - one separator occurring once: decimal mark ("12,50" -> 12.5). This case is
 *   ambiguous ("1,234" is read as 1.234), matching the previous behavior.
 *
 * @param {string|number} priceText - Raw price text (an already-numeric price is passed through)
 * @returns {number|null} Parsed price, or null when no number can be extracted
 */
export function parsePrice(priceText) {
  // Already-parsed prices pass through, so re-cleaning a record does not throw.
  if (typeof priceText === 'number') {
    return Number.isFinite(priceText) ? priceText : null;
  }
  if (typeof priceText !== 'string') return null;

  // Join digit groups split by one whitespace character ("1 234,56" -> "1234,56").
  const compact = priceText.replace(/(\d)\s(?=\d{3}(?!\d))/g, '$1');

  // Take the first run of digits and separators; drop dangling separators ("12,-", "9.99.").
  const tokenMatch = compact.match(/\d[\d.,]*/);
  if (!tokenMatch) return null;
  const token = tokenMatch[0].replace(/[.,]+$/, '');

  const lastDot = token.lastIndexOf('.');
  const lastComma = token.lastIndexOf(',');

  let normalized;
  if (lastDot !== -1 && lastComma !== -1) {
    // Mixed separators: the right-most one is the decimal mark.
    const decimalSep = lastDot > lastComma ? '.' : ',';
    const groupSep = decimalSep === '.' ? ',' : '.';
    normalized = token.split(groupSep).join('').replace(decimalSep, '.');
  } else {
    const separator = lastDot !== -1 ? '.' : ',';
    const parts = token.split(separator);
    // A repeated separator cannot be a decimal mark, so it must be grouping.
    normalized = parts.length > 2 ? parts.join('') : parts.join('.');
  }

  const parsed = Number.parseFloat(normalized);
  return Number.isNaN(parsed) ? null : parsed;
}
/**
 * Clean and parse a rating string.
 *
 * Only the first number in the text is used, so "4.5/5" and
 * "4.5 out of 5 stars" both yield 4.5 instead of merging the digits of
 * every number into one value. A decimal comma ("4,5") is accepted.
 *
 * @param {string|number} ratingText - Raw rating text (an already-numeric rating is passed through)
 * @returns {number|null} Parsed rating, or null when no number can be extracted
 */
export function parseRating(ratingText) {
  // Already-parsed ratings pass through instead of throwing on `.replace`.
  if (typeof ratingText === 'number') {
    return Number.isFinite(ratingText) ? ratingText : null;
  }
  if (typeof ratingText !== 'string') return null;

  // First integer or decimal number; "." or "," may act as the decimal mark.
  const token = ratingText.match(/\d+(?:[.,]\d+)?/);
  if (!token) return null;

  const parsed = Number.parseFloat(token[0].replace(',', '.'));
  return Number.isNaN(parsed) ? null : parsed;
}
/**
 * Clean and parse a count string (reviews, sold, etc.).
 *
 * Abbreviated counts with a "k" (thousand) or "m" (million) suffix directly
 * after the number are expanded ("1.2k sold" -> 1200, "2.5M" -> 2500000).
 * Otherwise every non-digit character is stripped and the remaining digits
 * are parsed as a base-10 integer ("1,234 reviews" -> 1234).
 *
 * @param {string|number} countText - Raw count text (an already-numeric count is truncated to an integer)
 * @returns {number|null} Parsed count, or null when the text contains no digits
 */
export function parseCount(countText) {
  // Already-parsed counts pass through instead of throwing on `.replace`.
  if (typeof countText === 'number') {
    return Number.isFinite(countText) ? Math.trunc(countText) : null;
  }
  if (typeof countText !== 'string') return null;

  // The suffix must touch the number and end a word, so "5kg" or "3 months" do not match.
  const abbreviated = countText.match(/(\d+(?:[.,]\d+)?)([km])\b/i);
  if (abbreviated) {
    const multiplier = abbreviated[2].toLowerCase() === 'k' ? 1e3 : 1e6;
    const mantissa = Number.parseFloat(abbreviated[1].replace(',', '.'));
    // Round to absorb floating-point error from the multiplication.
    return Math.round(mantissa * multiplier);
  }

  const digits = countText.replace(/\D/g, '');
  const parsed = Number.parseInt(digits, 10);
  return Number.isNaN(parsed) ? null : parsed;
}
/**
 * Normalize a product URL to an absolute http(s) URL.
 *
 * Absolute http(s) URLs are returned untouched. Anything else is resolved
 * against `baseUrl` with the standard URL resolution algorithm, which keeps
 * the base's port and handles root-relative ("/p/1"), path-relative ("p/1")
 * and protocol-relative ("//cdn.example.com/p/1") references.
 *
 * @param {string} url - Product URL (absolute or relative)
 * @param {string} baseUrl - Base URL used to resolve relative URLs
 * @returns {string|null} Absolute URL, or null when `url` is empty or not a
 *   string, resolution fails, or the result is not an http(s) URL
 */
export function normalizeProductUrl(url, baseUrl) {
  if (typeof url !== 'string' || url === '') return null;

  // Require a real scheme prefix; a bare `startsWith('http')` also matches
  // relative paths such as "httpdocs/item".
  if (/^https?:\/\//i.test(url)) {
    return url;
  }

  try {
    const resolved = new URL(url, baseUrl);
    // Links such as "javascript:;" or "mailto:" are not product pages.
    if (resolved.protocol !== 'http:' && resolved.protocol !== 'https:') {
      return null;
    }
    return resolved.href;
  } catch (error) {
    console.error('Error normalizing URL:', error);
    return null;
  }
}
/**
 * Serialize an array of records to a CSV string.
 *
 * The records are handed to `stringify` together with the defaults
 * `header: true` and `columns: null`; any key supplied in `options`
 * overrides the default of the same name.
 *
 * @param {Array} data - Array of objects to convert
 * @param {Object} options - CSV options merged over the defaults
 * @returns {string} CSV string
 */
export function convertToCSV(data, options = {}) {
  return stringify(data, {
    header: true,
    columns: null,
    ...options
  });
}
/**
 * Validate product data.
 *
 * Checks performed:
 * - the product has a truthy `name` or `productId`;
 * - `price`, when present, is a finite number >= 0;
 * - `rating`, when present, is a finite number between 0 and 5 inclusive.
 * A field counts as absent when it is `null` or `undefined`.
 *
 * @param {Object} product - Product object to validate
 * @returns {{isValid: boolean, errors: string[]}} Validation result
 */
export function validateProduct(product) {
  // Report a non-object product as invalid instead of throwing on property access.
  if (product === null || typeof product !== 'object') {
    return { isValid: false, errors: ['Product must be an object'] };
  }

  const errors = [];
  const { name, productId, price, rating } = product;
  const isPresent = (value) => value !== null && value !== undefined;

  if (!name && !productId) {
    errors.push('Product must have either name or productId');
  }

  // Number.isFinite rejects non-numbers as well as NaN/Infinity, which a
  // bare `typeof x === 'number'` check would let through.
  if (isPresent(price) && !(Number.isFinite(price) && price >= 0)) {
    errors.push('Price must be a non-negative number or null');
  }

  if (isPresent(rating) && !(Number.isFinite(rating) && rating >= 0 && rating <= 5)) {
    errors.push('Rating must be a number between 0 and 5 or null');
  }

  return {
    isValid: errors.length === 0,
    errors
  };
}
// Trim a text field: finite numbers are stringified; blank or unsupported values become null.
function toCleanText(value) {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? String(value) : null;
  }
  if (typeof value !== 'string') return null;
  return value.trim() || null;
}

// Keep an already-numeric field as is; run `parser` only on raw strings.
function toParsedNumber(value, parser) {
  if (typeof value === 'number') {
    return Number.isFinite(value) ? value : null;
  }
  return typeof value === 'string' ? parser(value) : null;
}

/**
 * Clean and standardize product data.
 *
 * Text fields (`name`, `discount`, `seller`, `location`) are trimmed, numeric
 * fields are parsed with `parsePrice`, `parseRating` and `parseCount`, and every
 * missing field becomes null. `scrapedAt` defaults to the current time in
 * ISO-8601 format. Fields that are already numbers are kept unchanged, so
 * passing an already-cleaned record through again returns an equal record
 * instead of throwing.
 *
 * @param {Object} product - Raw product data
 * @returns {Object} Cleaned product data with a fixed set of keys
 */
export function cleanProductData(product) {
  // Tolerate a missing record: every field falls back to null.
  const raw = product ?? {};

  return {
    productId: raw.productId || null,
    name: toCleanText(raw.name),
    price: toParsedNumber(raw.price, parsePrice),
    originalPrice: toParsedNumber(raw.originalPrice, parsePrice),
    discount: toCleanText(raw.discount),
    rating: toParsedNumber(raw.rating, parseRating),
    reviewCount: toParsedNumber(raw.reviewCount, parseCount),
    soldCount: toParsedNumber(raw.soldCount, parseCount),
    seller: toCleanText(raw.seller),
    location: toCleanText(raw.location),
    image: raw.image || null,
    productUrl: raw.productUrl || null,
    platform: raw.platform || null,
    keyword: raw.keyword || null,
    scrapedAt: raw.scrapedAt || new Date().toISOString(),
    sourceUrl: raw.sourceUrl || null
  };
}