/*
 * nitter-scraper-v2
 * A Twitter scraper that uses Nitter to fetch tweets without authentication.
 * Package listing metadata: 713 lines (712 loc), 29.2 kB, JavaScript (version not shown).
 */
;
// Module interop helper (looks like the standard TypeScript CommonJS emit — confirm before
// hand-editing). Exposes property `k` of module `m` on object `o` under the name `k2`,
// which defaults to `k`. A helper already present on `this` is reused when available.
var __createBinding = (this && this.__createBinding) || (Object.create ? (function(o, m, k, k2) {
if (k2 === undefined) k2 = k;
var desc = Object.getOwnPropertyDescriptor(m, k);
// Replace the descriptor with a getter that reads m[k] on every access when: the property
// has no own descriptor, it is an accessor on a module not flagged __esModule, or it is a
// data property that is writable or configurable. Otherwise the original descriptor is reused.
if (!desc || ("get" in desc ? !m.__esModule : desc.writable || desc.configurable)) {
desc = { enumerable: true, get: function() { return m[k]; } };
}
Object.defineProperty(o, k2, desc);
}) : (function(o, m, k, k2) {
// Fallback chosen when Object.create is missing: copy the current value once.
if (k2 === undefined) k2 = k;
o[k2] = m[k];
}));
// Interop helper: stores `value` as the "default" member of `target`.
// Reuses a helper already present on `this`; otherwise picks, once, between a
// defineProperty-based implementation (enumerable, non-writable) and plain assignment.
var __setModuleDefault = (this && this.__setModuleDefault) || (function () {
    if (Object.create) {
        return function (target, value) {
            Object.defineProperty(target, "default", { enumerable: true, value: value });
        };
    }
    return function (target, value) {
        target["default"] = value;
    };
})();
// Interop helper for namespace-style imports: returns `mod` unchanged when it is flagged
// __esModule, otherwise builds a new object that re-exposes every own property of `mod`
// except "default" (via __createBinding) and sets `mod` itself as "default".
var __importStar = (this && this.__importStar) || (function () {
// Lazily resolved key lister: the first call replaces `ownKeys` with
// Object.getOwnPropertyNames (or a for-in/hasOwnProperty fallback) and then delegates to it.
var ownKeys = function(o) {
ownKeys = Object.getOwnPropertyNames || function (o) {
var ar = [];
for (var k in o) if (Object.prototype.hasOwnProperty.call(o, k)) ar[ar.length] = k;
return ar;
};
return ownKeys(o);
};
return function (mod) {
if (mod && mod.__esModule) return mod;
var result = {};
// `mod != null` skips both null and undefined; "default" is handled separately below.
if (mod != null) for (var k = ownKeys(mod), i = 0; i < k.length; i++) if (k[i] !== "default") __createBinding(result, mod, k[i]);
__setModuleDefault(result, mod);
return result;
};
})();
// Interop helper for default imports: a module flagged __esModule is returned as-is,
// anything else is wrapped so that it becomes the "default" member.
var __importDefault = (this && this.__importDefault) || function (mod) {
    if (mod && mod.__esModule) {
        return mod;
    }
    return { "default": mod };
};
Object.defineProperty(exports, "__esModule", { value: true });
exports.fetchTweets = fetchTweets;
const cheerio = __importStar(require("cheerio"));
const axios_1 = __importDefault(require("axios"));
const dateUtils_1 = require("./utils/dateUtils");
const https_proxy_agent_1 = require("https-proxy-agent");
// Nitter instance every page request and relative asset URL is resolved against.
const BASE_URL = "https://nitter.net";
const DELAY_BETWEEN_REQUESTS = 2000; // 2 seconds delay between requests
// Template for the canonical tweet URL; "{username}" and "{id}" are substituted per tweet.
const TWITTER_URL = "https://x.com/{username}/status/{id}";
// Module-level proxy pool: filled by loadProxies, shrunk by removeProxy.
let proxies = [];
/**
 * Parses one proxy entry written as `host:port:username:password`.
 *
 * The password may itself contain ':' (everything after the third colon is kept).
 * Returns null for anything that is not a usable entry: non-string input, fewer than
 * four fields, an empty field, or a port that is not a whole number in 1..65535.
 *
 * @param {*} line - Raw entry from the proxy list.
 * @returns {{host: string, port: number, username: string, password: string}|null}
 */
function parseProxyLine(line) {
    if (typeof line !== "string") {
        return null;
    }
    const parts = line.split(":");
    if (parts.length < 4) {
        return null;
    }
    const host = parts[0].trim();
    const portText = parts[1].trim();
    const username = parts[2].trim();
    const password = parts.slice(3).join(":").trim();
    // Digits only: parseInt alone would accept "8080abc" as 8080.
    if (!/^\d+$/.test(portText)) {
        return null;
    }
    const port = Number.parseInt(portText, 10);
    if (port < 1 || port > 65535) {
        return null;
    }
    if (!host || !username || !password) {
        return null;
    }
    return { host, port, username, password };
}
/**
 * Loads the module-level proxy pool, either from `proxyOptions.proxyList` or by
 * downloading a newline-separated list from `proxyOptions.proxyUrl`.
 *
 * Does nothing when `useProxies` is falsy. Unusable entries are skipped; the call
 * only fails when no source is given, the download fails, or no valid entry remains.
 *
 * @param {boolean} useProxies - Whether proxies are enabled for this run.
 * @param {{proxyList?: string[], proxyUrl?: string}} [proxyOptions] - Proxy source.
 * @returns {Promise<void>}
 * @throws {Error} When no source is provided, the download fails, or no valid proxy is found.
 */
async function loadProxies(useProxies, proxyOptions) {
    if (!useProxies) {
        return;
    }
    try {
        let proxyLines = [];
        if (proxyOptions?.proxyList && proxyOptions.proxyList.length > 0) {
            // A proxy list was supplied directly.
            proxyLines = proxyOptions.proxyList;
        }
        else if (proxyOptions?.proxyUrl) {
            // Otherwise download the list from the supplied URL.
            try {
                const response = await axios_1.default.get(proxyOptions.proxyUrl, {
                    timeout: 30000,
                    headers: {
                        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/19.0 Safari/605.1.15",
                    },
                });
                // The HTTP client may hand back a non-string body; normalise before splitting.
                const body = typeof response.data === "string"
                    ? response.data
                    : String(response.data ?? "");
                proxyLines = body
                    .split(/\r?\n/)
                    .filter((line) => line.trim() !== "");
            }
            catch (downloadError) {
                console.error(`Erreur lors du téléchargement depuis l'URL: ${downloadError}`);
                // Keep the underlying failure attached for callers that inspect `cause`.
                throw new Error(`Impossible de télécharger les proxies depuis l'URL fournie`, { cause: downloadError });
            }
        }
        else {
            throw new Error("Aucune source de proxy fournie. Veuillez fournir soit 'proxyList' soit 'proxyUrl' dans les options.");
        }
        proxies = proxyLines
            .map((line) => parseProxyLine(line))
            .filter((proxy) => proxy !== null);
        if (proxies.length === 0) {
            throw new Error("Aucun proxy valide trouvé dans les données fournies");
        }
    }
    catch (error) {
        console.error(`Erreur lors du chargement des proxies: ${error}`);
        throw error;
    }
}
/**
 * Picks one entry of the proxy pool at random.
 *
 * @param {boolean} useProxies - Whether proxies are enabled.
 * @returns {object|null} A pool entry, or null when proxies are disabled or the pool is empty.
 */
function getRandomProxy(useProxies) {
    if (useProxies && proxies.length !== 0) {
        const pick = Math.floor(Math.random() * proxies.length);
        return proxies[pick];
    }
    return null;
}
/**
 * Drops a failing proxy from the pool.
 *
 * Every pool entry sharing both host and port with `proxyToRemove` is removed;
 * the pool is replaced by a new array rather than mutated in place.
 *
 * @param {{host: string, port: number}} proxyToRemove - Proxy to discard.
 */
function removeProxy(proxyToRemove) {
    const survivors = [];
    for (const candidate of proxies) {
        const sameEndpoint = candidate.host === proxyToRemove.host && candidate.port === proxyToRemove.port;
        if (!sameEndpoint) {
            survivors.push(candidate);
        }
    }
    proxies = survivors;
}
/**
 * Builds an authenticated HttpsProxyAgent for the given proxy.
 *
 * Username and password are percent-encoded before being embedded in the proxy URL.
 *
 * @param {{host: string, port: number, username: string, password: string}} proxy
 * @returns {object} A new HttpsProxyAgent instance.
 */
function createProxyAgent(proxy) {
    const user = encodeURIComponent(proxy.username);
    const secret = encodeURIComponent(proxy.password);
    const endpoint = `${proxy.host}:${proxy.port}`;
    const { HttpsProxyAgent } = https_proxy_agent_1;
    return new HttpsProxyAgent(`http://${user}:${secret}@${endpoint}`);
}
/**
 * Reads the link-preview card rendered inside a tweet.
 *
 * @param {*} cardElement - The card node to inspect.
 * @param {Function} $ - Selector function used to wrap `cardElement`.
 * @returns {{type: string, url: (string|null), imageUrl: (string|null), title: string,
 *   description: string, destination: string}|null} The card, or null when fewer than
 *   two of its five fields could be filled.
 */
function extractCardInfo(cardElement, $) {
    const card = $(cardElement);
    // Trimmed text of the first matching selector, or "" when nothing matches.
    const textOf = (selector) => {
        const node = card.find(selector);
        return node.length > 0 ? node.text().trim() : "";
    };
    // The link lives either on .card-container itself or on the nested .card-content-container.
    let url = null;
    const outerContainer = card.find(".card-container");
    if (outerContainer.is("a")) {
        url = outerContainer.attr("href") || null;
    }
    else {
        const innerContainer = card.find(".card-content-container");
        if (innerContainer.is("a")) {
            url = innerContainer.attr("href") || null;
        }
    }
    // Preview image: .card-image first, then the attachment / gallery-video layouts.
    let preview = card.find(".card-image img");
    if (preview.length === 0) {
        preview = card.find(".attachments img, .gallery-video img");
    }
    let imageUrl = null;
    if (preview.length > 0) {
        const src = preview.attr("src") || null;
        if (src) {
            // Anything not starting with "http" is resolved against BASE_URL,
            // inserting the leading slash when it is missing.
            imageUrl = src.startsWith("http")
                ? src
                : `${BASE_URL}${src.startsWith("/") ? src : `/${src}`}`;
        }
    }
    const result = {
        type: "card",
        url,
        imageUrl,
        title: textOf(".card-title"),
        description: textOf(".card-description"),
        destination: textOf(".card-destination"),
    };
    // A card is only worth returning when at least two of its fields are populated.
    const filledCount = [
        result.url !== null,
        result.imageUrl !== null,
        result.title !== "",
        result.description !== "",
        result.destination !== "",
    ].filter(Boolean).length;
    return filledCount >= 2 ? result : null;
}
/**
 * Extract tweets and the next-page cursor from a timeline HTML page.
 *
 * Pinned tweets, tweets without an id and tweets whose id is already in
 * `existingTweets` are skipped; retweets are skipped unless `includeRetweets` is true.
 * A failure while reading one tweet is logged and does not stop the others.
 *
 * @param {string} html - Raw HTML of a timeline page.
 * @param {string} username - Account the page was requested for; copied onto every tweet.
 * @param {Set<string>} existingTweets - Ids already collected (only `.has` is used).
 * @param {boolean} [includeRetweets=false] - Keep retweets in the result.
 * @returns {{tweets: object[], nextCursor: (string|null)}} Parsed tweets and the decoded
 *   cursor of the "Load more" link, or null when the page has none.
 */
function extractTweetsFromHtml(html, username, existingTweets, includeRetweets = false) {
    const $ = cheerio.load(html);
    const tweets = [];
    let nextCursor = null;
    // Relative asset paths are resolved against the Nitter base URL.
    const toAbsoluteUrl = (url) => (url.startsWith("http") ? url : `${BASE_URL}${url}`);
    const buildVideoInfo = (posterUrl, dataUrl) => ({
        posterUrl: posterUrl ? toAbsoluteUrl(posterUrl) : null,
        videoUrl: dataUrl ? toAbsoluteUrl(dataUrl) : null,
    });
    // The href carries the cursor percent-encoded. Decode it here so that a caller
    // which encodes it again for the next request does not double-encode it.
    const decodeCursor = (rawCursor) => {
        try {
            return decodeURIComponent(rawCursor);
        }
        catch (error) {
            // Malformed escape sequence: deliberately fall back to the raw value.
            return rawCursor;
        }
    };
    // Icon class on a .tweet-stat -> field of the stats object.
    const statFieldsByIcon = [
        ["icon-comment", "comments"],
        ["icon-retweet", "retweets"],
        ["icon-quote", "quotes"],
        ["icon-heart", "likes"],
        ["icon-play", "views"],
    ];
    // Find the "Load more" link to get the next cursor (the last match wins).
    $("a").each((_, element) => {
        const link = $(element);
        const href = link.attr("href");
        const label = link.text().trim();
        if (!href || !href.includes("cursor=") || !label.includes("Load more")) {
            return;
        }
        const cursorMatch = href.match(/cursor=([^&]+)/);
        if (cursorMatch && cursorMatch[1]) {
            nextCursor = decodeCursor(cursorMatch[1]);
        }
    });
    // Extract tweets
    $(".timeline-item").each((_, element) => {
        try {
            const tweetElement = $(element);
            // Skip pinned tweets
            if (tweetElement.find(".pinned").length > 0) {
                return;
            }
            // Retweet detection: the header text reads "<name> retweeted".
            const retweetHeader = tweetElement.find(".retweet-header");
            const isRetweet = retweetHeader.length > 0;
            let retweetedBy = null;
            if (isRetweet) {
                if (!includeRetweets) {
                    return;
                }
                const retweetMatch = retweetHeader.text().trim().match(/(.+?)\s+retweeted/);
                if (retweetMatch) {
                    retweetedBy = retweetMatch[1].trim();
                }
            }
            // The tweet id is the last path segment of the permalink, minus the "#m" suffix.
            const permalink = tweetElement.find(".tweet-link").attr("href");
            const id = permalink ? permalink.split("/").pop() || "" : "";
            const cleanId = id.replace(/#m$/, "");
            if (!cleanId || existingTweets.has(cleanId)) {
                return;
            }
            const text = tweetElement.find(".tweet-content").text().trim();
            // Only the first match is read: an embedded quote repeats the same header
            // classes, and .text() on several matches would concatenate their texts.
            const timestampElement = tweetElement.find(".tweet-date a").first();
            const timestamp = timestampElement.text().trim();
            const dateStr = timestampElement.attr("title");
            const date = (0, dateUtils_1.getDateFromTimestamp)(timestamp, dateStr);
            const fullname = tweetElement.find(".fullname").first().text().trim() || username;
            // Verification status; the badge type is read from its CSS classes.
            const verifiedElement = tweetElement.find(".verified-icon");
            const isVerified = verifiedElement.length > 0;
            let verificationType = null;
            if (isVerified) {
                const classes = verifiedElement.attr("class") || "";
                if (classes.includes("business")) {
                    verificationType = "business";
                }
                else if (classes.includes("blue")) {
                    verificationType = "blue";
                }
                else {
                    verificationType = "verified"; // generic badge
                }
            }
            // Avatar URL
            const avatarElement = tweetElement.find(".avatar.round");
            let avatarUrl = null;
            if (avatarElement.length > 0) {
                const avatarSrc = avatarElement.attr("src") || null;
                avatarUrl = avatarSrc ? toAbsoluteUrl(avatarSrc) : null;
            }
            // Collect image URLs from the tweet
            const imageTweet = [];
            tweetElement
                .find(".attachment.image a.still-image")
                .each((_, imgElement) => {
                const imgSrc = $(imgElement).attr("href");
                if (imgSrc) {
                    imageTweet.push(toAbsoluteUrl(imgSrc));
                }
            });
            // Collect video URLs from the tweet (legacy flat list)
            const videoTweet = [];
            tweetElement.find("video, .gallery-video video, .attachment.video-container video").each((_, videoElement) => {
                const dataUrl = $(videoElement).attr("data-url");
                if (dataUrl) {
                    videoTweet.push(toAbsoluteUrl(dataUrl));
                }
            });
            // Detailed video information.
            const videos = [];
            // Pass 1: <video> tags carry both poster and data-url.
            tweetElement.find("video").each((_, videoElement) => {
                const $video = $(videoElement);
                const videoInfo = buildVideoInfo($video.attr("poster"), $video.attr("data-url"));
                if (videoInfo.posterUrl || videoInfo.videoUrl) {
                    videos.push(videoInfo);
                }
            });
            // Pass 2: video containers that only expose a preview image and/or a data-url.
            tweetElement.find(".gallery-video, .video-container").each((_, containerElement) => {
                const $container = $(containerElement);
                const videoInfo = buildVideoInfo($container.find("img").attr("src"), $container.find("[data-url]").attr("data-url"));
                if (!videoInfo.posterUrl && !videoInfo.videoUrl) {
                    return;
                }
                // Compare only fields that are actually set: two entries that both lack a
                // video URL (or both lack a poster) are not duplicates of each other.
                const isDuplicate = videos.some((known) => (videoInfo.videoUrl !== null && known.videoUrl === videoInfo.videoUrl) ||
                    (videoInfo.posterUrl !== null && known.posterUrl === videoInfo.posterUrl));
                if (!isDuplicate) {
                    videos.push(videoInfo);
                }
            });
            // Extract tweet statistics
            const stats = {
                comments: 0,
                retweets: 0,
                quotes: 0,
                likes: 0,
                views: 0
            };
            tweetElement.find(".tweet-stats .tweet-stat").each((_, statElement) => {
                const $stat = $(statElement);
                const iconElement = $stat.find(".icon-container span");
                // First run of digits/commas; "|| 0" covers a comma-only run (NaN).
                const numberMatch = $stat.text().trim().match(/[\d,]+/);
                const number = numberMatch
                    ? Number.parseInt(numberMatch[0].replace(/,/g, ''), 10) || 0
                    : 0;
                const statField = statFieldsByIcon.find(([iconClass]) => iconElement.hasClass(iconClass));
                if (statField) {
                    stats[statField[1]] = number;
                }
            });
            // Link-preview cards
            const cards = [];
            tweetElement.find(".card").each((_, cardElement) => {
                const cardInfo = extractCardInfo(cardElement, $);
                if (cardInfo) {
                    cards.push(cardInfo);
                }
            });
            // Create tweet object
            tweets.push({
                id: cleanId,
                text,
                username,
                fullname,
                isVerified,
                verificationType,
                created_at: date ? date.toISOString() : "",
                timestamp: date ? date.getTime() : null,
                imageTweet,
                videoTweet,
                videos,
                stats,
                avatarUrl,
                cards,
                isRetweet,
                retweetedBy,
                originalUrl: TWITTER_URL.replace("{username}", username).replace("{id}", cleanId),
            });
        }
        catch (error) {
            console.error(`Error extracting tweet: ${error}`);
        }
    });
    return { tweets, nextCursor };
}
/**
 * Fetch and parse a single timeline page.
 *
 * The request is tried up to three times. Each attempt picks a proxy through
 * getRandomProxy; after a failed attempt the proxy that was used (if any) is removed
 * from the pool and, unless it was the last attempt, the next try starts one second later.
 *
 * @param {string} username - Account whose timeline is requested.
 * @param {string|null} cursor - Pagination cursor, or a falsy value for the first page.
 * @param {boolean} useProxies - Whether requests go through a proxy.
 * @param {Set<string>} seenTweets - Ids already collected, forwarded to the parser.
 * @param {boolean} [includeRetweets=false] - Forwarded to the parser.
 * @returns {Promise<object>} The parser's result plus the raw page under `html`.
 * @throws The error of the last attempt when all attempts fail.
 */
async function fetchSinglePage(username, cursor, useProxies, seenTweets, includeRetweets = false) {
    const maxAttempts = 3;
    const profileUrl = `${BASE_URL}/${username}`;
    const url = cursor
        ? `${profileUrl}?cursor=${encodeURIComponent(cursor)}`
        : profileUrl;
    const requestHeaders = {
        Accept: "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "fr-FR,fr;q=0.9",
        Priority: "u=0, i",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/19.0 Safari/605.1.15",
    };
    let response;
    let currentProxy = null;
    for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
        try {
            currentProxy = getRandomProxy(useProxies);
            // A separate agent instance is built for each of the two agent options.
            const httpsAgent = currentProxy ? createProxyAgent(currentProxy) : undefined;
            const httpAgent = currentProxy ? createProxyAgent(currentProxy) : undefined;
            response = await axios_1.default.get(url, {
                headers: requestHeaders,
                timeout: 10000, // 10 seconds timeout
                httpsAgent,
                httpAgent,
            });
            break;
        }
        catch (error) {
            console.error(`Échec de la tentative ${attempt} pour ${url}: ${error}`);
            // The proxy used for a failed attempt is taken out of rotation.
            if (currentProxy && useProxies) {
                removeProxy(currentProxy);
            }
            if (attempt >= maxAttempts) {
                throw error;
            }
            // Short pause before retrying with another proxy.
            await new Promise((resolve) => setTimeout(resolve, 1000));
        }
    }
    const html = response.data;
    const parsed = extractTweetsFromHtml(html, username, seenTweets, includeRetweets);
    return Object.assign({}, parsed, { html });
}
/**
 * Fetch tweets for a given username, following "Load more" cursors page by page.
 *
 * At least one page is requested; paging stops when `maxPages` pages were processed,
 * when a page has no next cursor, or when a page fails to load. A page failure is
 * logged and the tweets collected so far are still returned. Tweets are de-duplicated
 * by id across and within pages. The user profile is extracted from the first page.
 *
 * @param {string} username - Account whose timeline is fetched.
 * @param {number} [maxPages=3] - Maximum number of pages to process.
 * @param {boolean} [useProxies=false] - Route requests through the proxy pool.
 * @param {object} [proxyOptions] - Proxy source, forwarded to loadProxies.
 * @param {boolean} [useConcurrency=false] - When true (and maxPages > 1) pages are still
 *   fetched one after another, but without the pause between requests.
 * @param {boolean} [includeRetweets=false] - Keep retweets in the result.
 * @returns {Promise<{userProfile: (object|null), tweets: object[]}>} Never rejects; any
 *   error outside page loading (e.g. proxy loading) yields a null profile and no tweets.
 */
async function fetchTweets(username, maxPages = 3, useProxies = false, proxyOptions, useConcurrency = false, includeRetweets = false) {
    try {
        // Load the proxy pool up front.
        await loadProxies(useProxies, proxyOptions);
        const allTweets = [];
        const seenTweets = new Set();
        let userProfile = null;
        // The only difference between the two modes is the pause between requests.
        const skipDelay = useConcurrency && maxPages > 1;
        let nextCursor = null;
        let pagesProcessed = 0;
        do {
            let page;
            try {
                page = await fetchSinglePage(username, nextCursor, useProxies, seenTweets, includeRetweets);
            }
            catch (error) {
                // Keep what was already collected instead of discarding earlier pages.
                console.error(`Erreur lors de la récupération de la page ${pagesProcessed + 1}:`, error);
                break;
            }
            // The profile header is read from the first page only.
            if (pagesProcessed === 0 && page.html) {
                userProfile = extractUserProfile(page.html, username);
            }
            for (const tweet of page.tweets) {
                if (!seenTweets.has(tweet.id)) {
                    allTweets.push(tweet);
                    seenTweets.add(tweet.id);
                }
            }
            nextCursor = page.nextCursor;
            pagesProcessed++;
            // Add delay between requests to avoid rate limiting
            if (!skipDelay && nextCursor && pagesProcessed < maxPages) {
                await new Promise((resolve) => setTimeout(resolve, DELAY_BETWEEN_REQUESTS));
            }
        } while (nextCursor && pagesProcessed < maxPages);
        return {
            userProfile,
            tweets: allTweets
        };
    }
    catch (error) {
        console.error(`Error fetching tweets: ${error}`);
        return {
            userProfile: null,
            tweets: []
        };
    }
}
/**
 * Extract the profile header (name, bio, badges, images, counters, …) from a
 * timeline HTML page.
 *
 * @param {string} html - Raw HTML of a profile/timeline page.
 * @param {string} username - Requested account; also the fallback for the full name.
 * @returns {object|null} The profile, or null when extraction throws (the error is logged).
 */
function extractUserProfile(html, username) {
    const $ = cheerio.load(html);
    try {
        // Trimmed text of a selector ("" when nothing matches).
        const trimmedText = (selector) => $(selector).text().trim();
        // `src` of an image selector, made absolute against BASE_URL; null when absent.
        const imageSource = (selector) => {
            const image = $(selector);
            if (image.length === 0) {
                return null;
            }
            const src = image.attr("src") || null;
            return src && !src.startsWith("http") ? `${BASE_URL}${src}` : src;
        };
        const fullname = trimmedText(".profile-card .profile-card-fullname") || username;
        const description = trimmedText(".profile-card .profile-bio") || "";
        // Verification badge; its type is read from the badge's CSS classes.
        const badge = $(".profile-card .verified-icon");
        const isVerified = badge.length > 0;
        let verificationType = null;
        if (isVerified) {
            const badgeClasses = badge.attr("class") || "";
            verificationType = badgeClasses.includes("business")
                ? "business"
                : badgeClasses.includes("blue")
                    ? "blue"
                    : "verified";
        }
        const avatarUrl = imageSource(".profile-card .profile-card-avatar img");
        const bannerUrl = imageSource(".profile-banner img");
        // Profile counters. Order matters: the first keyword contained in the header wins.
        const stats = {
            tweets: 0,
            following: 0,
            followers: 0,
            likes: 0
        };
        const statKeywords = [
            ["tweet", "tweets"],
            ["following", "following"],
            ["follower", "followers"],
            ["like", "likes"],
        ];
        $(".profile-statlist li").each((_, statElement) => {
            const $stat = $(statElement);
            const header = $stat.find(".profile-stat-header").text().trim().toLowerCase();
            const rawNumber = $stat.find(".profile-stat-num").text().trim();
            // Thousands separators are stripped ("2,303,191"); unparsable text counts as 0.
            const value = parseInt(rawNumber.replace(/,/g, ''), 10) || 0;
            const hit = statKeywords.find(([keyword]) => header.includes(keyword));
            if (hit) {
                stats[hit[1]] = value;
            }
        });
        // Join date: whatever follows "Joined" in the element text.
        let joinDate = null;
        const joinNode = $(".profile-joindate");
        if (joinNode.length > 0) {
            const joined = joinNode.text().trim().match(/joined\s+(.+)/i);
            if (joined) {
                joinDate = joined[1].trim();
            }
        }
        const location = trimmedText(".profile-location") || null;
        // Website: prefer the link target, then the link text.
        let website = null;
        const websiteLink = $(".profile-website a");
        if (websiteLink.length > 0) {
            website = websiteLink.attr("href") || websiteLink.text().trim() || null;
        }
        return {
            username,
            fullname,
            description,
            isVerified,
            verificationType,
            avatarUrl,
            bannerUrl,
            stats,
            joinDate,
            location,
            website
        };
    }
    catch (error) {
        console.error(`Error extracting user profile: ${error}`);
        return null;
    }
}