focus-product-extractor2
Version:
Extract product information from chat/order data
133 lines (128 loc) • 4.88 kB
JavaScript
;
// Define a non-enumerable `__esModule: true` marker on the CommonJS exports object.
// NOTE(review): presumably emitted by a transpiler for ES-module interop — confirm against the build setup.
Object.defineProperty(exports, "__esModule", {
value: true
});
// Initialise both export slots to undefined; they receive their real values after the class definition.
exports.default = exports.RuleMatcher = void 0;
/**
 * Extracts product / SKU identifiers from URLs and free text using per-platform
 * regular-expression rules, optionally preceded by caller-supplied custom rules.
 *
 * Every built-in rule is a global, case-insensitive RegExp whose first capture
 * group is the product id.
 */
class RuleMatcher {
  constructor() {
    this.builtInRules = {
      jd: [
        /item\.jd\.com\/(\d+)\.html/ig,
        /item\.m\.jd\.com\/product\/(\d+)\.html/ig,
      ],
      tb: [
        // Taobao item detail page
        /item\.taobao\.com\/item\.htm\?id=(\d+)/ig,
        // Tmall item detail page
        /detail\.tmall\.com\/item\.htm\?id=(\d+)/ig,
        // Generic rule: any `id` query parameter
        /[?&]id=(\d+)/ig,
        // Short-link format
        /taobao\.com\/item\/\?id=(\d+)/ig,
        // URLs where other parameters precede `id`
        /(?:item|detail)\.(?:taobao|tmall)\.com\/item\.htm\?.*\bid=(\d+)/ig,
      ],
      pdd: [
        // Various PDD product URL formats
        /(?:yangkeduo\.com|pinduoduo\.com)\/group\.html\?goods_id=(\d+)/ig,
        /(?:m|mobile)\.(?:yangkeduo|pinduoduo)\.com\/goods\.html\?goods_id=(\d+)/ig,
        /(?:www\.)?(?:yangkeduo|pinduoduo)\.com\/goods\/\?goods_id=(\d+)/ig,
        // Generic rule: any `goods_id` query parameter
        /[?&]goods_id=(\d+)/ig,
        // Short-link format
        /(?:yangkeduo\.com|pinduoduo\.com)\/g\/(\d+)/ig,
      ],
      // Douyin rules
      dy: [
        // Douyin e-commerce product detail page
        /haohuo\.jinritemai\.com\/ecommerce\/trade\/detail\/index\.html\?id=(\d+)/ig,
        // Douyin e-commerce mobile URL
        /haohuo\.jinritemai\.com\/m\/trade\/detail\?id=(\d+)/ig,
        // Generic rule: any `id` query parameter
        /[?&]id=(\d+)/ig,
      ],
      ks: [
        // Kuaishou e-commerce product detail page
        /kwaishop\.com\/merchant\/shop\/detail\?id=(\d+)/ig,
        // Kuaishou e-commerce mobile URL
        /kwaishop\.com\/m\/shop\/detail\?id=(\d+)/ig,
        // Generic rule: any `id` query parameter
        /[?&]id=(\d+)/ig,
      ],
      xhs: [
        // Xiaohongshu goods detail page
        /xiaohongshu\.com\/goods-detail\/([a-f0-9]{24})/ig,
        // Same page followed by a query string
        /xiaohongshu\.com\/goods-detail\/([a-f0-9]{24})\?/ig,
        // Generic rule: any 24-character hex id
        /([a-f0-9]{24})/ig,
      ],
      lazada: [
        // Must match domains such as .co.th as well as .com
        /lazada\.[a-z]{2,3}(?:\.[a-z]{2})?\/(?:products\/)?[^\/]+-i(\d+)(?:-s\d+)?/ig,
        /lazada\.[a-z]{2,3}(?:\.[a-z]{2})?\/products\/[^\/]+\/i(\d+)(?:-s\d+)?/ig,
        // Looser fallback rule
        /lazada[^\/]+\/[^\/]+-i(\d+)/ig,
      ],
      shopee: [
        // Loosely match the product-name part of the path
        /shopee\.[a-z]{2,3}\/(?:[^?]+\/)?[^?]+i\.\d+\.(\d+)(?:\?|$)/ig,
        /xiapibuy\.com\/[^\/]+-i\.\d+\.(\d+)/ig,
        // Simple fallback rule
        /i\.\d+\.(\d+)/ig,
      ],
      tiktok: [
        // TikTok product id
        /tiktok\.com\/view\/product\/(\d+)/ig,
      ],
    };
  }

  /**
   * Look up the built-in rules registered for a platform.
   *
   * Uses an own-property check so that names inherited from Object.prototype
   * (for example "constructor" or "toString") are not mistaken for rule lists.
   *
   * @param {string} platform - Platform key such as "jd", "tb" or "pdd".
   * @returns {RegExp[]} The platform's rules, or an empty array when unknown.
   */
  getPlatformRules(platform) {
    const isOwn = Object.prototype.hasOwnProperty.call(this.builtInRules, platform);
    const rules = isOwn ? this.builtInRules[platform] : null;
    return Array.isArray(rules) ? rules : [];
  }

  /**
   * Extract a single product id from a URL.
   *
   * The platform's rules are tried in declaration order and the first match of
   * the first matching rule wins. If none match, a generic `sku=<digits>` /
   * `sku:<digits>` parameter is looked for.
   *
   * @param {string} url - URL to inspect.
   * @param {string} platform - Platform key selecting the built-in rules.
   * @returns {string|null} The captured id, or null when the URL is empty,
   *   not a string, or contains no recognisable id.
   */
  extractSkuIdFromUrl(url, platform) {
    if (typeof url !== 'string' || url === '') return null;

    // matchAll works on a clone of the regex, so the shared /g rules keep no
    // lastIndex state between calls.
    for (const pattern of this.getPlatformRules(platform)) {
      const first = url.matchAll(pattern).next().value;
      if (first) return first[1];
    }

    // Generic SKU id fallback
    const generalPattern = /[?&;]sku[=:]([\d]+)/ig;
    const fallback = url.matchAll(generalPattern).next().value;
    return fallback ? fallback[1] : null;
  }

  /**
   * Find product ids in free text.
   *
   * Custom rules take priority: if any of them produces a match, only the
   * custom matches are returned (as objects). Otherwise the platform's
   * built-in rules are applied and the captured ids are returned (as strings).
   *
   * @param {string} text - Text to scan.
   * @param {string} platform - Platform key selecting the built-in rules.
   * @param {Array<{pattern: string|RegExp, flags?: string}>} [customRules=[]]
   *   Caller-supplied rules. A rule that cannot be compiled is logged with
   *   console.error and skipped.
   * @returns {Array<{matchedText: string, metadata: object}>|string[]}
   *   Custom match objects, or the unique built-in ids in first-seen order;
   *   an empty array when nothing matches or `text` is not a string.
   */
  match(text, platform, customRules = []) {
    if (typeof text !== 'string') return [];

    // Custom rules are tried first
    const rules = Array.isArray(customRules) ? customRules : [];
    if (rules.length > 0) {
      const customMatches = [];
      for (const rule of rules) {
        try {
          // matchAll throws a TypeError for a non-global regex, so always add
          // the `g` flag on top of whatever flags the caller asked for.
          const requested = rule.flags || 'g';
          const flags = requested.includes('g') ? requested : `${requested}g`;
          const regex = new RegExp(rule.pattern, flags);
          for (const found of text.matchAll(regex)) {
            customMatches.push({
              // The matched text
              matchedText: found[0],
              metadata: {
                pattern: rule.pattern,
                // Reported exactly as supplied by the caller
                flags: rule.flags,
                // Capture groups, starting from group 1
                groups: found.slice(1),
                // The original text
                text,
              },
            });
          }
        } catch (error) {
          console.error('Invalid custom rule:', rule, error);
        }
      }
      if (customMatches.length > 0) return customMatches;
    }

    // Built-in rules. The rule lists overlap on purpose (specific URL shapes
    // plus generic fallbacks), so one URL can satisfy several rules; a Set
    // keeps each id once while preserving first-seen order.
    const ids = new Set();
    for (const pattern of this.getPlatformRules(platform)) {
      for (const found of text.matchAll(pattern)) {
        ids.add(found[1]);
      }
    }
    return [...ids];
  }
}
// Named export: the RuleMatcher class.
exports.RuleMatcher = RuleMatcher;
// Default export: one RuleMatcher instance constructed when this module is loaded.
var _default = exports.default = new RuleMatcher();