UNPKG

stihirus-reader

Version:

Fetches author profile information and optionally poems from stihirus.ru

437 lines (389 loc) 19.5 kB
import fetch from 'node-fetch';
import * as cheerio from 'cheerio';

const BASE_URL = 'https://stihirus.ru';
const API_BASE_URL = `${BASE_URL}/-zbb/api`;
// Page size used to detect the last page of the poem listing.
// NOTE(review): assumed to match the page size of the `pr_read_avtor` endpoint — confirm.
const POEMS_PER_PAGE = 20;
const DEFAULT_REQUEST_DELAY_MS = 500;
const USER_AGENT =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36';
// Maximum number of characters of an error response body echoed into error messages.
const ERROR_BODY_SNIPPET_LENGTH = 200;

// Numeric codes placed in the `code` field of returned error objects.
const ERROR_CODES = {
  NETWORK_ERROR: 503,
  HTTP_ERROR: 502,
  API_ERROR: 500,
  PARSING_ERROR: 500,
  NOT_FOUND: 404,
  INVALID_INPUT: 400,
  UNKNOWN_ERROR: 500,
};

/**
 * Resolves after `ms` milliseconds.
 * @param {number} ms - Delay in milliseconds.
 * @returns {Promise<void>}
 */
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

/**
 * Extracts the numeric status from a message containing `status: <digits>`.
 * @param {string} [message] - Error message to inspect; a missing message yields `null`.
 * @returns {number|null} The parsed status code, or `null` when none is present.
 */
function extractStatusCode(message) {
  const match = /status:\s*(\d+)/.exec(String(message ?? ''));
  return match ? Number.parseInt(match[1], 10) : null;
}

/**
 * Builds the plain error object returned to callers of `getAuthorData`.
 * @param {string} message - Human-readable description.
 * @param {number} code - One of `ERROR_CODES` or an HTTP status.
 * @param {{message?: string}|null} [originalError] - Underlying error; its message is kept
 *   as `originalMessage` only when it differs from `message`.
 * @returns {{code: number, message: string, originalMessage?: string}}
 */
function createErrorObject(message, code, originalError = null) {
  const errorObj = { code, message };
  if (originalError && originalError.message && originalError.message !== message) {
    errorObj.originalMessage = originalError.message;
  }
  return errorObj;
}

/**
 * Turns a protocol-relative (`//host/x`) or root-relative (`/x`) URL into an absolute one.
 * Any other value is returned unchanged.
 * @param {string} rawUrl
 * @returns {string}
 */
function toAbsoluteUrl(rawUrl) {
  if (rawUrl.startsWith('//')) {
    return `https:${rawUrl}`;
  }
  if (rawUrl.startsWith('/')) {
    return `${BASE_URL}${rawUrl}`;
  }
  return rawUrl;
}

/**
 * Reads a truncated copy of a failed response body for diagnostics.
 * Best-effort: a body that cannot be read yields an empty string.
 * @param {{text: () => Promise<string>}} response
 * @returns {Promise<string>}
 */
async function readErrorBodySnippet(response) {
  try {
    const body = await response.text();
    return body.substring(0, ERROR_BODY_SNIPPET_LENGTH);
  } catch {
    // The body is only used to enrich the error message, so a read failure is not fatal.
    return '';
  }
}

/**
 * Downloads a page as text using browser-like request headers.
 * @param {string} url - Absolute URL to fetch.
 * @returns {Promise<string>} The response body.
 * @throws {Error} With `statusCode` set to the HTTP status for non-2xx responses, or to
 *   `ERROR_CODES.NETWORK_ERROR` when the failure carried no status code.
 */
async function fetchHtml(url) {
  console.log(`[fetchHtml] Fetching: ${url}`);
  try {
    const response = await fetch(url, {
      headers: {
        'User-Agent': USER_AGENT,
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
        'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
        'Connection': 'keep-alive',
      },
      redirect: 'follow',
    });
    if (!response.ok) {
      const errorBody = await readErrorBodySnippet(response);
      const error = new Error(`HTTP error! status: ${response.status} for ${url}. Body: ${errorBody}`);
      error.statusCode = response.status;
      throw error;
    }
    return await response.text();
  } catch (error) {
    console.error(`[fetchHtml] Error fetching ${url}:`, error.message);
    if (!error.statusCode) {
      error.statusCode = ERROR_CODES.NETWORK_ERROR;
    }
    throw error;
  }
}

/**
 * Sends a form-encoded POST to a site API endpoint and returns the decoded JSON.
 * @param {string} endpoint - Endpoint name appended to `API_BASE_URL`.
 * @param {Object} data - Form fields, serialised with `URLSearchParams`.
 * @param {string} [refererUrl] - Value of the `Referer` header.
 * @returns {Promise<Object>} The parsed JSON response.
 * @throws {Error} With `statusCode` set to the HTTP status, `ERROR_CODES.API_ERROR`
 *   (API-level failure, also sets `apiStatus`) or `ERROR_CODES.NETWORK_ERROR`.
 */
async function fetchApi(endpoint, data, refererUrl = BASE_URL + '/') {
  const apiUrl = `${API_BASE_URL}/${endpoint}`;
  console.log(`[fetchApi] Posting to: ${apiUrl} with data:`, data);
  try {
    const response = await fetch(apiUrl, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': USER_AGENT,
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': BASE_URL,
        'Referer': refererUrl,
      },
      body: new URLSearchParams(data).toString(),
    });
    if (!response.ok) {
      const errorBody = await readErrorBodySnippet(response);
      const error = new Error(`API error! status: ${response.status} for ${apiUrl}. Body: ${errorBody}`);
      error.statusCode = response.status;
      throw error;
    }
    const result = await response.json();
    // An empty `data` array from `pr_read_avtor` is accepted even without a 'success' status;
    // the callers treat it as "no (more) poems".
    const isEmptyPoemList =
      endpoint === 'pr_read_avtor' && Array.isArray(result.data) && result.data.length === 0;
    if (result.status !== 'success' && !isEmptyPoemList) {
      const errorMessage = `API returned status: ${result.status}. Message: ${result.message || 'No message'}`;
      console.warn(`[fetchApi] ${errorMessage}`);
      const error = new Error(errorMessage);
      error.statusCode = ERROR_CODES.API_ERROR;
      error.apiStatus = result.status;
      throw error;
    }
    return result;
  } catch (error) {
    console.error(`[fetchApi] Error posting to ${apiUrl}:`, error.message);
    if (!error.statusCode) {
      error.statusCode = error.message.includes('API returned status')
        ? ERROR_CODES.API_ERROR
        : ERROR_CODES.NETWORK_ERROR;
    }
    throw error;
  }
}

/**
 * Looks up an author's username (`useruri`) from the first entry of their poem listing.
 * Never throws: any failure is logged and reported as `null`.
 * @param {number} authorId
 * @returns {Promise<string|null>} The username, or `null` when it cannot be determined
 *   (for example when the listing is empty).
 */
async function fetchUsernameForId(authorId) {
  console.log(`[fetchUsernameForId] Fetching username for ID: ${authorId}`);
  try {
    const apiResponse = await fetchApi('pr_read_avtor', { id: authorId, from: 0 });
    if (apiResponse.data && apiResponse.data.length > 0 && apiResponse.data[0].useruri) {
      return apiResponse.data[0].useruri;
    }
    console.warn(`[fetchUsernameForId] Could not find useruri for ID ${authorId} via API (status: ${apiResponse.status}).`);
    return null;
  } catch (error) {
    console.error(`[fetchUsernameForId] Error fetching username for ID ${authorId}:`, error.message);
    return null;
  }
}

/**
 * Reads one numeric statistic, preferring the labelled number in the card text and falling
 * back to a progress bar's `aria-valuenow` attribute.
 *
 * The two sources are handled separately on purpose: the fallback is a plain string, and
 * indexing it like a regex match (`value[1]`) would pick its second character.
 * @param {string} cardText - Text of the statistics card.
 * @param {RegExp} labelPattern - Pattern whose first capture group is the number.
 * @param {string|undefined} fallbackValue - Raw attribute value, if any.
 * @returns {number} The parsed integer, or 0 when nothing numeric is available.
 */
function parseStatValue(cardText, labelPattern, fallbackValue) {
  const match = cardText.match(labelPattern);
  const raw = match ? match[1] : fallbackValue;
  const value = Number.parseInt(raw ?? '0', 10);
  return Number.isNaN(value) ? 0 : value;
}

/**
 * Extracts author profile fields from the author page markup.
 * @param {string} html - Author page HTML.
 * @returns {{authorId: number|null, usernameFromHtml: string|null, description: string,
 *   avatarUrl: string|null, headerUrl: string|null,
 *   stats: {poems: number, reviewsSent: number, reviewsReceived: number},
 *   collections: Array<{name: string, url: string}>, lastVisit: string, status: string}}
 * @throws {Error} With `statusCode` `ERROR_CODES.PARSING_ERROR` when parsing fails.
 */
function parseAuthorInfoFromHtml(html) {
  try {
    const $ = cheerio.load(html);
    const info = {
      authorId: null,
      usernameFromHtml: null,
      description: '',
      avatarUrl: null,
      headerUrl: null,
      stats: { poems: 0, reviewsSent: 0, reviewsReceived: 0 },
      collections: [],
      lastVisit: '',
      status: '',
    };

    const authorIdStr = $('.avtorinfo.page_avatar').attr('data-userid');
    if (authorIdStr) {
      info.authorId = Number.parseInt(authorIdStr, 10);
    }

    // Preferred source: the direct text nodes of the username element.
    info.usernameFromHtml =
      $('.avtorinfo__avtor-username')
        .first()
        .contents()
        .filter((_, node) => node.type === 'text')
        .text()
        .trim() || null;

    // Fallbacks: derive the username from an `/avtor/<name>` profile link.
    const usernameLinkSelectors = ['.avtorinfo__avtor-username a', '.userinfo__avtor-username a'];
    for (const selector of usernameLinkSelectors) {
      if (info.usernameFromHtml) {
        break;
      }
      const href = $(selector).first().attr('href');
      const match = href ? href.match(/\/avtor\/([^\/?]+)/) : null;
      if (match && match[1]) {
        info.usernameFromHtml = match[1];
      }
    }

    info.description = $('.avtorinfo__userinfo.nl2br').text().trim();

    // The avatar is either an <img src> or a CSS `url(...)` in an inline style.
    const avatarSelectors = ['.page_avatar_img', '.userinfo__avtor-avatar'];
    for (const selector of avatarSelectors) {
      const element = $(selector).first();
      if (!element.length) {
        continue;
      }
      let rawUrl = element.is('img') ? element.attr('src') : null;
      if (!rawUrl) {
        const style = element.attr('style');
        const match = style ? style.match(/url\(([^)]+)\)/) : null;
        if (match && match[1]) {
          rawUrl = match[1].replace(/['"]/g, '');
        }
      }
      if (rawUrl) {
        info.avatarUrl = toAbsoluteUrl(rawUrl);
        break;
      }
    }

    const headerSrc = $('.page_header_img').attr('src');
    if (headerSrc) {
      info.headerUrl = toAbsoluteUrl(headerSrc);
    }

    const statsCard = $('#show_stat');
    if (statsCard.length) {
      const progressBars = statsCard.find('.progress-bar');
      const cardText = statsCard.find('.card-text').text();
      const barValue = (index) => $(progressBars[index]).attr('aria-valuenow');
      info.stats.poems = parseStatValue(cardText, /Произведений\s*([\d]+)/, barValue(0));
      info.stats.reviewsSent = parseStatValue(cardText, /Написано отзывов\s*([\d]+)/, barValue(1));
      info.stats.reviewsReceived = parseStatValue(cardText, /Получено отзывов\s*([\d]+)/, barValue(2));
    }

    $('#show_sborniki a.list-group-item').each((i, el) => {
      const name = $(el).text().trim();
      const href = $(el).attr('href');
      if (name && href) {
        // Already-absolute or protocol-relative links must not be prefixed with BASE_URL.
        const url = /^(?:https?:)?\/\//i.test(href) ? toAbsoluteUrl(href) : `${BASE_URL}${href}`;
        info.collections.push({ name, url });
      }
    });

    $('.card-footer .small.text-muted').each((i, el) => {
      const text = $(el).text().trim();
      if (text.startsWith('Последний визит:')) {
        info.lastVisit = text.replace('Последний визит:', '').trim();
      } else if (text.startsWith('Статус:')) {
        info.status = $(el).find('b').text().trim() || text.replace('Статус:', '').trim();
      }
    });

    return info;
  } catch (error) {
    console.error("[parseAuthorInfoFromHtml] Error parsing HTML:", error);
    const parseError = new Error("Failed to parse author page HTML.");
    parseError.statusCode = ERROR_CODES.PARSING_ERROR;
    throw parseError;
  }
}

/**
 * Derives the username and author page URL from a string identifier, which may be a bare
 * username, a subdomain URL (`<name>.stihirus.ru`) or a path URL (`stihirus.ru/avtor/<name>`).
 * @param {string} identifier
 * @returns {{username: string, authorPageUrl: string}}
 * @throws {{code: number, message: string}} `INVALID_INPUT` error object for unusable input.
 */
function resolveStringIdentifier(identifier) {
  try {
    const hasScheme = /^https?:\/\//i.test(identifier);
    const url = new URL(hasScheme ? identifier : `https://${identifier}`);
    if (url.hostname.endsWith('.stihirus.ru') && url.hostname.split('.').length > 2) {
      return { username: url.hostname.split('.')[0], authorPageUrl: url.origin + '/' };
    }
    if (url.hostname === 'stihirus.ru' && url.pathname.startsWith('/avtor/')) {
      const username = url.pathname.split('/')[2];
      if (!username) {
        throw new Error('Could not extract username from path URL');
      }
      return { username, authorPageUrl: `https://${username}.stihirus.ru/` };
    }
    if (url.hostname === 'stihirus.ru') {
      throw new Error(`Invalid path URL format. Expected /avtor/username: ${identifier}`);
    }
    throw new Error(`Unrecognized URL format: ${identifier}`);
  } catch (e) {
    // Not a recognised URL: accept it as a bare username when it has no URL-like characters.
    if (!identifier.includes('/') && !identifier.includes('.') && identifier.length > 0) {
      return { username: identifier, authorPageUrl: `https://${identifier}.stihirus.ru/` };
    }
    throw createErrorObject(
      `Invalid identifier format: ${identifier}. Expected ID, username, subdomain URL, or path URL.`,
      ERROR_CODES.INVALID_INPUT,
      e,
    );
  }
}

/**
 * Resolves any supported identifier to an author ID (when known), username and page URL.
 * @param {number|string} identifier
 * @returns {Promise<{authorId: number|null, username: string, authorPageUrl: string}>}
 * @throws {{code: number, message: string}} `INVALID_INPUT` or `NOT_FOUND` error object.
 */
async function resolveIdentifier(identifier) {
  if (typeof identifier === 'number') {
    // Reject NaN, fractions and non-positive values before spending an API call on them.
    if (!Number.isInteger(identifier) || identifier <= 0) {
      throw createErrorObject(`Invalid author ID: ${identifier}. Expected a positive integer.`, ERROR_CODES.INVALID_INPUT);
    }
    const username = await fetchUsernameForId(identifier);
    if (!username) {
      throw createErrorObject(`Could not find username for author ID ${identifier}. Author might not exist or have poems.`, ERROR_CODES.NOT_FOUND);
    }
    return { authorId: identifier, username, authorPageUrl: `https://${username}.stihirus.ru/` };
  }
  if (typeof identifier === 'string') {
    return { authorId: null, ...resolveStringIdentifier(identifier) };
  }
  throw createErrorObject('Identifier must be a number (ID) or a string (username/URL)', ERROR_CODES.INVALID_INPUT);
}

/**
 * Fetches and parses the author page, retrying on the path URL (`/avtor/<username>`) when the
 * subdomain URL answers 404.
 * @param {string} username
 * @param {string} initialUrl - First URL to try.
 * @returns {Promise<{parsedInfo: Object, authorPageUrl: string}>} Parsed profile and the URL
 *   that was last attempted successfully.
 * @throws {{code: number, message: string, originalMessage?: string}} Error object on failure.
 */
async function fetchAuthorPage(username, initialUrl) {
  try {
    const authorHtml = await fetchHtml(initialUrl);
    return { parsedInfo: parseAuthorInfoFromHtml(authorHtml), authorPageUrl: initialUrl };
  } catch (error) {
    const statusCode = error.statusCode || extractStatusCode(error.message);
    const canFallBack = statusCode === 404 && initialUrl.includes(`${username}.stihirus.ru`);
    if (!canFallBack) {
      console.error(`[getAuthorData] Failed to fetch or parse HTML from ${initialUrl}: ${error.message}`);
      throw createErrorObject(`Failed to fetch or parse author page: ${error.message}`, statusCode || ERROR_CODES.HTTP_ERROR, error);
    }

    console.warn(`[getAuthorData] Subdomain URL failed (404). Trying path URL...`);
    const fallbackUrl = `${BASE_URL}/avtor/${username}`;
    try {
      const authorHtmlFallback = await fetchHtml(fallbackUrl);
      const parsedInfo = parseAuthorInfoFromHtml(authorHtmlFallback);
      console.log(`[getAuthorData] Successfully fetched from fallback URL: ${fallbackUrl}`);
      return { parsedInfo, authorPageUrl: fallbackUrl };
    } catch (fallbackError) {
      const fallbackStatusCode =
        fallbackError.statusCode || extractStatusCode(fallbackError.message) || ERROR_CODES.NOT_FOUND;
      console.error(`[getAuthorData] Both subdomain and path URL failed for ${username}. Final error: ${fallbackError.message}`);
      throw createErrorObject(`Could not fetch author page for ${username}. Tried subdomain and path.`, fallbackStatusCode, fallbackError);
    }
  }
}

/**
 * Pages through the `pr_read_avtor` endpoint until a short or empty page is returned.
 * @param {number} authorId
 * @param {string} refererUrl - Sent as the `Referer` header of each request.
 * @param {number} delayMs - Pause between consecutive page requests.
 * @returns {Promise<Object[]>} Raw poem records in the order received.
 * @throws {Error} Propagates any `fetchApi` failure.
 */
async function fetchAllAuthorPoems(authorId, refererUrl, delayMs) {
  const collected = [];
  let from = 0;
  console.log(`[getAuthorData] Starting poem fetching for author ID: ${authorId}`);
  while (true) {
    const apiResponse = await fetchApi('pr_read_avtor', { id: authorId, from }, refererUrl);
    if (!apiResponse || !Array.isArray(apiResponse.data)) {
      console.error('[getAuthorData] Invalid API response structure received after success check:', apiResponse);
      break;
    }
    const poems = apiResponse.data;
    console.log(`[getAuthorData] Fetched ${poems.length} poems from offset ${from}`);
    if (poems.length === 0) {
      console.log('[getAuthorData] No more poems found.');
      break;
    }
    collected.push(...poems);
    from += poems.length;
    if (poems.length < POEMS_PER_PAGE) {
      console.log('[getAuthorData] Last page reached.');
      break;
    }
    await sleep(delayMs);
  }
  console.log(`[getAuthorData] Total poems fetched: ${collected.length}`);
  return collected;
}

/**
 * Converts a raw API poem record into the public poem shape.
 * @param {Object} poem - Record as returned by `pr_read_avtor`.
 * @returns {Object} Normalised poem.
 */
function mapPoem(poem) {
  return {
    id: Number.parseInt(poem.id, 10),
    title: poem.title || '***',
    text: poem.body,
    created: poem.created,
    rubric: {
      name: poem.razd_name,
      url: poem.razd_url ? `${BASE_URL}/razdel/${poem.razd_url}` : null,
    },
    collection: poem.urazd_name !== 'не в сборнике' ? poem.urazd_name : null,
    rating: Number.parseInt(poem.rating || '0', 10),
    commentsCount: Number.parseInt(poem.comments_count || '0', 10),
    // Same URL normalisation as avatar/header images, including protocol-relative URLs.
    imageUrl: poem.background ? toAbsoluteUrl(String(poem.background)) : null,
    hasCertificate: poem.have_certificate === '1',
  };
}

/**
 * Fetches an author's profile and, optionally, all of their poems from stihirus.ru.
 *
 * Never rejects: every failure is reported through the returned object.
 *
 * @param {number|string} identifier - Author ID, username, subdomain URL or `/avtor/` path URL.
 * @param {number} [requestDelayMs] - Pause between poem page requests; non-finite or negative
 *   values fall back to `DEFAULT_REQUEST_DELAY_MS`.
 * @param {boolean} [fetchAllPoems] - When false, `poems` is an empty array and the poem count
 *   comes from the page statistics only.
 * @returns {Promise<{status: 'success', data: Object}|
 *   {status: 'error', error: {code: number, message: string, originalMessage?: string}}>}
 */
export async function getAuthorData(identifier, requestDelayMs = DEFAULT_REQUEST_DELAY_MS, fetchAllPoems = true) {
  try {
    let delayMs = requestDelayMs;
    // Number.isFinite also rejects NaN/Infinity, which a bare typeof check lets through.
    if (!Number.isFinite(delayMs) || delayMs < 0) {
      console.warn(`[getAuthorData] Invalid requestDelayMs value (${requestDelayMs}). Using default: ${DEFAULT_REQUEST_DELAY_MS}ms.`);
      delayMs = DEFAULT_REQUEST_DELAY_MS;
    }

    const resolved = await resolveIdentifier(identifier);
    const { username } = resolved;
    let { authorId } = resolved;
    if (!username || !resolved.authorPageUrl) {
      throw createErrorObject(`Could not determine username or author page URL from identifier: ${identifier}`, ERROR_CODES.INVALID_INPUT);
    }
    console.log(`[getAuthorData] Identified username: ${username}, Target page URL: ${resolved.authorPageUrl}, Delay: ${delayMs}ms, Fetch poems: ${fetchAllPoems}`);

    const { parsedInfo, authorPageUrl } = await fetchAuthorPage(username, resolved.authorPageUrl);

    // The ID found on the page wins over the caller-supplied one.
    if (!authorId && parsedInfo.authorId) {
      authorId = parsedInfo.authorId;
    } else if (authorId && parsedInfo.authorId && authorId !== parsedInfo.authorId) {
      console.warn(`[getAuthorData] Initial ID ${identifier} (${typeof identifier === 'number' ? identifier : 'N/A'}) does not match ID found on page ${parsedInfo.authorId}. Using ID from page.`);
      authorId = parsedInfo.authorId;
    }
    if (!authorId) {
      throw createErrorObject(`Could not determine author ID for username: ${username}`, ERROR_CODES.NOT_FOUND);
    }

    let allPoems = [];
    if (fetchAllPoems) {
      allPoems = await fetchAllAuthorPoems(authorId, authorPageUrl, delayMs);
    } else {
      console.log(`[getAuthorData] Skipping poem fetching because fetchAllPoems is false.`);
    }

    const finalUsername = parsedInfo.usernameFromHtml || username;
    let finalPoemCount = parsedInfo.stats.poems;
    if (fetchAllPoems) {
      if (finalPoemCount === 0 && allPoems.length > 0) {
        finalPoemCount = allPoems.length;
        console.log(`[getAuthorData] Updated poem count from API data: ${finalPoemCount}`);
      } else if (finalPoemCount > 0 && allPoems.length !== finalPoemCount && allPoems.length > 0) {
        console.warn(`[getAuthorData] Poem count mismatch: HTML stats (${finalPoemCount}) vs API fetched (${allPoems.length}). Using API count.`);
        finalPoemCount = allPoems.length;
      }
    }
    // If !fetchAllPoems, we just keep the count from HTML parsing.

    const successData = {
      authorId,
      username: finalUsername,
      profileUrl: authorPageUrl,
      canonicalUsername: username,
      description: parsedInfo.description,
      avatarUrl: parsedInfo.avatarUrl,
      headerUrl: parsedInfo.headerUrl,
      status: parsedInfo.status,
      lastVisit: parsedInfo.lastVisit,
      stats: {
        ...parsedInfo.stats,
        poems: finalPoemCount,
      },
      collections: parsedInfo.collections,
      // Empty array when poems were not fetched.
      poems: fetchAllPoems ? allPoems.map(mapPoem) : [],
    };
    return { status: 'success', data: successData };
  } catch (error) {
    console.error(`[getAuthorData] Operation failed for identifier "${identifier}":`, error?.message || error);

    // Error objects built by createErrorObject are plain objects with a numeric code and can
    // be returned as-is. Error instances (e.g. a network error whose `code` is a string such
    // as 'ECONNRESET') must be converted so callers always receive the same shape.
    const isErrorObject =
      !(error instanceof Error) && typeof error?.code === 'number' && Boolean(error.message);
    if (isErrorObject) {
      return { status: 'error', error };
    }

    const errorCode =
      [error?.statusCode, error?.code].find((candidate) => typeof candidate === 'number') ??
      ERROR_CODES.UNKNOWN_ERROR;
    const errorMessage = error?.message || 'An unknown error occurred during processing.';
    const finalError = createErrorObject(errorMessage, errorCode, error instanceof Error ? error : null);
    return { status: 'error', error: finalError };
  }
}