// stihirus-reader
// Version: (not specified)
// Fetches author profile information and, optionally, poems from stihirus.ru.
// 437 lines (389 loc) • 19.5 kB
// JavaScript
import fetch from 'node-fetch';
import * as cheerio from 'cheerio';
/** Root URL of the site; path-relative links found on pages are prefixed with it. */
const BASE_URL = 'https://stihirus.ru';

/** Base URL of the site's API; an endpoint name is appended after a slash. */
const API_BASE_URL = `${BASE_URL}/-zbb/api`;

/** Page size assumed for the poem listing: a shorter page is treated as the last one. */
const POEMS_PER_PAGE = 20;

/** Default pause, in milliseconds, between consecutive poem-page requests. */
const DEFAULT_REQUEST_DELAY_MS = 500;

/**
 * Numeric codes attached to the errors this module produces.
 *
 * The table is frozen so that no code path can silently change the mapping
 * at runtime (a write throws a TypeError in module/strict code).
 */
const ERROR_CODES = Object.freeze({
  NETWORK_ERROR: 503,
  HTTP_ERROR: 502,
  API_ERROR: 500,
  PARSING_ERROR: 500,
  NOT_FOUND: 404,
  INVALID_INPUT: 400,
  UNKNOWN_ERROR: 500,
});
/**
 * Returns a promise that resolves (with no value) after roughly `ms` milliseconds.
 *
 * @param {number} ms - Delay handed to `setTimeout`.
 * @returns {Promise<void>}
 */
const sleep = (ms) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
/**
 * Pulls the numeric status out of a message such as
 * "HTTP error! status: 404 for ...".
 *
 * @param {unknown} message - Usually an `error.message`; anything that is not
 *   a string is treated as "no status present".
 * @returns {number|null} The status as an integer, or null when none is found.
 */
function extractStatusCode(message) {
  // Callers pass `error.message`, which is undefined for thrown values that
  // are not Error-like; report "no status" instead of throwing a TypeError.
  if (typeof message !== 'string') {
    return null;
  }
  const match = /status:\s*(\d+)/.exec(message);
  return match ? Number.parseInt(match[1], 10) : null;
}
/**
 * Builds the plain `{ code, message }` error record used by this module.
 *
 * `originalMessage` is added only when `originalError` carries a non-empty
 * message that differs from `message`.
 *
 * @param {string} message - Description of the failure.
 * @param {number} code - Code to report.
 * @param {?{message?: string}} [originalError=null] - Underlying error, if any.
 * @returns {{code: number, message: string, originalMessage?: string}}
 */
function createErrorObject(message, code, originalError = null) {
  const underlyingMessage = originalError ? originalError.message : undefined;
  const addsInformation = Boolean(underlyingMessage) && underlyingMessage !== message;

  return addsInformation
    ? { code, message, originalMessage: underlyingMessage }
    : { code, message };
}
/**
 * Downloads a page with browser-like headers and returns its body as text.
 *
 * @param {string} url - Address to request.
 * @returns {Promise<string>} The response body.
 * @throws {Error} Rethrows any failure with a `statusCode` property: the HTTP
 *   status for non-OK responses, otherwise `ERROR_CODES.NETWORK_ERROR`.
 */
async function fetchHtml(url) {
  console.log(`[fetchHtml] Fetching: ${url}`);

  const requestOptions = {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
      'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
      'Accept-Language': 'ru-RU,ru;q=0.9,en-US;q=0.8,en;q=0.7',
      'Connection': 'keep-alive',
    },
    redirect: 'follow',
  };

  try {
    const response = await fetch(url, requestOptions);
    if (response.ok) {
      return await response.text();
    }

    // Best effort only: the body is diagnostic detail for the message below,
    // so a failure to read it must not mask the HTTP error itself.
    let bodyText = '';
    try {
      bodyText = await response.text();
    } catch (readFailure) { /* ignore */ }

    const httpError = new Error(`HTTP error! status: ${response.status} for ${url}. Body: ${bodyText.substring(0, 200)}`);
    httpError.statusCode = response.status;
    throw httpError;
  } catch (error) {
    console.error(`[fetchHtml] Error fetching ${url}:`, error.message);
    // Failures that never produced an HTTP status are tagged as network errors.
    if (!error.statusCode) {
      error.statusCode = ERROR_CODES.NETWORK_ERROR;
    }
    throw error;
  }
}
/**
 * POSTs form-encoded data to a site API endpoint and returns the decoded JSON.
 *
 * A payload whose `status` is not 'success' is treated as a failure, except
 * for `pr_read_avtor` answering with an empty `data` array, which is returned
 * as-is so callers can treat it as "no rows".
 *
 * @param {string} endpoint - Endpoint name appended to `API_BASE_URL`.
 * @param {Object} data - Form fields; serialized with `URLSearchParams`.
 * @param {string} [refererUrl] - Value of the Referer header (defaults to the site root).
 * @returns {Promise<Object>} The decoded response payload.
 * @throws {Error} Always carries a `statusCode`:
 *   - the HTTP status for non-OK responses;
 *   - `ERROR_CODES.PARSING_ERROR` when the body is not valid JSON or not an object;
 *   - `ERROR_CODES.API_ERROR` (plus `apiStatus`) when the API reports a failure;
 *   - `ERROR_CODES.NETWORK_ERROR` for anything that never got that far.
 */
async function fetchApi(endpoint, data, refererUrl = BASE_URL + '/') {
  const apiUrl = `${API_BASE_URL}/${endpoint}`;
  console.log(`[fetchApi] Posting to: ${apiUrl} with data:`, data);
  try {
    const response = await fetch(apiUrl, {
      method: 'POST',
      headers: {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'X-Requested-With': 'XMLHttpRequest',
        'Origin': BASE_URL,
        'Referer': refererUrl,
      },
      body: new URLSearchParams(data).toString(),
    });

    if (!response.ok) {
      // Best effort only: the body is diagnostic detail for the message.
      let errorBody = '';
      try {
        errorBody = await response.text();
      } catch (readFailure) { /* ignore */ }
      const httpError = new Error(`API error! status: ${response.status} for ${apiUrl}. Body: ${errorBody.substring(0, 200)}`);
      httpError.statusCode = response.status;
      throw httpError;
    }

    let result;
    try {
      result = await response.json();
    } catch (decodeError) {
      // Only a malformed body is a parsing problem; a stream failure while
      // reading it is still a network error and is classified below.
      const isMalformedJson = decodeError instanceof SyntaxError || decodeError?.type === 'invalid-json';
      if (!isMalformedJson) {
        throw decodeError;
      }
      const parseError = new Error(`API returned invalid JSON from ${apiUrl}: ${decodeError.message}`);
      parseError.statusCode = ERROR_CODES.PARSING_ERROR;
      throw parseError;
    }

    // Valid JSON that is not an object (e.g. `null`) cannot carry a status.
    if (result === null || typeof result !== 'object') {
      const shapeError = new Error(`API returned an unexpected payload from ${apiUrl}.`);
      shapeError.statusCode = ERROR_CODES.PARSING_ERROR;
      throw shapeError;
    }

    const isEmptyAuthorListing = endpoint === 'pr_read_avtor'
      && Array.isArray(result.data)
      && result.data.length === 0;
    if (result.status !== 'success' && !isEmptyAuthorListing) {
      const errorMessage = `API returned status: ${result.status}. Message: ${result.message || 'No message'}`;
      console.warn(`[fetchApi] ${errorMessage}`);
      const apiError = new Error(errorMessage);
      apiError.statusCode = ERROR_CODES.API_ERROR;
      apiError.apiStatus = result.status;
      throw apiError;
    }
    return result;
  } catch (error) {
    console.error(`[fetchApi] Error posting to ${apiUrl}:`, error.message);
    // Every error raised above already has a statusCode; whatever reaches this
    // point without one failed before a usable response existed.
    if (!error.statusCode) {
      error.statusCode = ERROR_CODES.NETWORK_ERROR;
    }
    throw error;
  }
}
/**
 * Looks up an author's `useruri` by numeric ID.
 *
 * Requests the first `pr_read_avtor` page for the ID and reads `useruri` from
 * the first returned row. Every failure is logged and reported as null, so
 * this function does not reject.
 *
 * @param {number} authorId - Author ID sent to the API.
 * @returns {Promise<string|null>} The `useruri`, or null when it is missing
 *   or the request failed.
 */
async function fetchUsernameForId(authorId) {
  console.log(`[fetchUsernameForId] Fetching username for ID: ${authorId}`);
  try {
    const payload = await fetchApi('pr_read_avtor', { id: authorId, from: 0 });
    const rows = payload.data;
    const userUri = rows && rows.length > 0 ? rows[0].useruri : undefined;

    if (userUri) {
      return userUri;
    }

    console.warn(`[fetchUsernameForId] Could not find useruri for ID ${authorId} via API (status: ${payload.status}).`);
    return null;
  } catch (error) {
    console.error(`[fetchUsernameForId] Error fetching username for ID ${authorId}:`, error.message);
    return null;
  }
}
/**
 * Makes a link found on an author page absolute: protocol-relative links get
 * `https:`, root-relative links get `BASE_URL`, anything else is returned as-is.
 */
function resolveAuthorPageUrl(rawUrl) {
  if (rawUrl.startsWith('//')) {
    return `https:${rawUrl}`;
  }
  return rawUrl.startsWith('/') ? `${BASE_URL}${rawUrl}` : rawUrl;
}

/** Extracts the username segment from an `/avtor/<username>` link, or null. */
function usernameFromAuthorHref(href) {
  if (!href) {
    return null;
  }
  const match = href.match(/\/avtor\/([^\/?]+)/);
  return match ? match[1] : null;
}

/**
 * Reads one counter of the statistics card: the number following the label in
 * the card text when present, otherwise the value supplied by `readFallback`.
 * Anything that does not parse as an integer counts as 0.
 */
function readAuthorStat(cardText, labelPattern, readFallback) {
  const labelMatch = cardText.match(labelPattern);
  // The fallback is a whole attribute string (e.g. "25"); it must be parsed
  // in full, never indexed like a regex match array.
  const rawValue = labelMatch ? labelMatch[1] : readFallback();
  const count = Number.parseInt(rawValue ?? '0', 10);
  return Number.isNaN(count) ? 0 : count;
}

/**
 * Extracts profile data from the HTML of an author page.
 *
 * @param {string} html - Markup of the author page.
 * @returns {{
 *   authorId: ?number, usernameFromHtml: ?string, description: string,
 *   avatarUrl: ?string, headerUrl: ?string,
 *   stats: {poems: number, reviewsSent: number, reviewsReceived: number},
 *   collections: Array<{name: string, url: string}>,
 *   lastVisit: string, status: string
 * }} Fields that are not found keep their empty defaults.
 * @throws {Error} "Failed to parse author page HTML." with
 *   `statusCode = ERROR_CODES.PARSING_ERROR` when anything goes wrong.
 */
function parseAuthorInfoFromHtml(html) {
  try {
    const $ = cheerio.load(html);
    const info = {
      authorId: null,
      usernameFromHtml: null,
      description: '',
      avatarUrl: null,
      headerUrl: null,
      stats: { poems: 0, reviewsSent: 0, reviewsReceived: 0 },
      collections: [],
      lastVisit: '',
      status: '',
    };

    const authorIdStr = $('.avtorinfo.page_avatar').attr('data-userid');
    if (authorIdStr) {
      const parsedId = Number.parseInt(authorIdStr, 10);
      // A non-numeric attribute means "unknown", not NaN.
      info.authorId = Number.isNaN(parsedId) ? null : parsedId;
    }

    // Prefer the element's own text nodes (child elements are skipped), then
    // fall back to the username segment of the profile links.
    const ownText = $('.avtorinfo__avtor-username').first().contents().filter(function isTextNode() {
      return this.type === 'text';
    }).text().trim();
    info.usernameFromHtml = ownText
      || usernameFromAuthorHref($('.avtorinfo__avtor-username a').first().attr('href'))
      || usernameFromAuthorHref($('.userinfo__avtor-username a').first().attr('href'))
      || null;

    info.description = $('.avtorinfo__userinfo.nl2br').text().trim();

    // The avatar is either an <img src> or a CSS `url(...)` in a style attribute.
    const avatarSelectors = ['.page_avatar_img', '.userinfo__avtor-avatar'];
    for (const selector of avatarSelectors) {
      const element = $(selector).first();
      if (!element.length) {
        continue;
      }
      let rawUrl = element.is('img') ? element.attr('src') : null;
      if (!rawUrl) {
        const style = element.attr('style');
        const styleMatch = style ? style.match(/url\(([^)]+)\)/) : null;
        if (styleMatch && styleMatch[1]) {
          rawUrl = styleMatch[1].replace(/['"]/g, '');
        }
      }
      if (rawUrl) {
        info.avatarUrl = resolveAuthorPageUrl(rawUrl);
        break;
      }
    }

    const headerSrc = $('.page_header_img').attr('src');
    if (headerSrc) {
      info.headerUrl = resolveAuthorPageUrl(headerSrc);
    }

    const statsCard = $('#show_stat');
    if (statsCard.length) {
      const progressBars = statsCard.find('.progress-bar');
      const cardText = statsCard.find('.card-text').text();
      const barValue = (index) => () => $(progressBars[index]).attr('aria-valuenow');
      info.stats.poems = readAuthorStat(cardText, /Произведений\s*([\d]+)/, barValue(0));
      info.stats.reviewsSent = readAuthorStat(cardText, /Написано отзывов\s*([\d]+)/, barValue(1));
      info.stats.reviewsReceived = readAuthorStat(cardText, /Получено отзывов\s*([\d]+)/, barValue(2));
    }

    $('#show_sborniki a.list-group-item').each((index, el) => {
      const name = $(el).text().trim();
      const href = $(el).attr('href');
      if (name && href) {
        // Same URL rules as avatar/header, so absolute links are not mangled.
        info.collections.push({ name, url: resolveAuthorPageUrl(href) });
      }
    });

    $('.card-footer .small.text-muted').each((index, el) => {
      const text = $(el).text().trim();
      if (text.startsWith('Последний визит:')) {
        info.lastVisit = text.replace('Последний визит:', '').trim();
      } else if (text.startsWith('Статус:')) {
        info.status = $(el).find('b').text().trim() || text.replace('Статус:', '').trim();
      }
    });

    return info;
  } catch (error) {
    console.error("[parseAuthorInfoFromHtml] Error parsing HTML:", error);
    const parseError = new Error("Failed to parse author page HTML.");
    parseError.statusCode = ERROR_CODES.PARSING_ERROR;
    throw parseError;
  }
}
/**
 * Works out which author an identifier refers to.
 * Returns `{ authorId, username, authorPageUrl }`; `authorId` is null unless
 * the identifier was numeric. Throws a `createErrorObject` record otherwise.
 */
async function resolveAuthorTarget(identifier) {
  if (typeof identifier === 'number') {
    const username = await fetchUsernameForId(identifier);
    if (!username) {
      throw createErrorObject(`Could not find username for author ID ${identifier}. Author might not exist or have poems.`, ERROR_CODES.NOT_FOUND);
    }
    return { authorId: identifier, username, authorPageUrl: `https://${username}.stihirus.ru/` };
  }
  if (typeof identifier !== 'string') {
    throw createErrorObject('Identifier must be a number (ID) or a string (username/URL)', ERROR_CODES.INVALID_INPUT);
  }

  try {
    const url = new URL(identifier.startsWith('http') ? identifier : `https://${identifier}`);
    const hostLabels = url.hostname.split('.');
    if (url.hostname.endsWith('.stihirus.ru') && hostLabels.length > 2) {
      return { authorId: null, username: hostLabels[0], authorPageUrl: url.origin + '/' };
    }
    if (url.hostname === 'stihirus.ru' && url.pathname.startsWith('/avtor/')) {
      const username = url.pathname.split('/')[2];
      if (!username) throw new Error('Could not extract username from path URL');
      return { authorId: null, username, authorPageUrl: `https://${username}.stihirus.ru/` };
    }
    if (url.hostname === 'stihirus.ru') {
      throw new Error(`Invalid path URL format. Expected /avtor/username: ${identifier}`);
    }
    throw new Error(`Unrecognized URL format: ${identifier}`);
  } catch (urlError) {
    // Not a recognised URL: a non-empty string without '/' or '.' is taken to
    // be a bare username; anything else is rejected.
    const isBareUsername = identifier.length > 0 && !identifier.includes('/') && !identifier.includes('.');
    if (!isBareUsername) {
      throw createErrorObject(`Invalid identifier format: ${identifier}. Expected ID, username, subdomain URL, or path URL.`, ERROR_CODES.INVALID_INPUT, urlError);
    }
    return { authorId: null, username: identifier, authorPageUrl: `https://${identifier}.stihirus.ru/` };
  }
}

/**
 * Fetches and parses the author page, retrying on the `/avtor/<username>` path
 * URL when the subdomain URL answers 404. Returns the parsed info together
 * with the URL that actually worked.
 */
async function loadAuthorProfilePage(username, initialUrl) {
  try {
    const authorHtml = await fetchHtml(initialUrl);
    return { parsedInfo: parseAuthorInfoFromHtml(authorHtml), authorPageUrl: initialUrl };
  } catch (error) {
    const statusCode = error.statusCode || extractStatusCode(error.message);
    const canTryPathUrl = statusCode === 404 && initialUrl.includes(`${username}.stihirus.ru`);
    if (!canTryPathUrl) {
      console.error(`[getAuthorData] Failed to fetch or parse HTML from ${initialUrl}: ${error.message}`);
      throw createErrorObject(`Failed to fetch or parse author page: ${error.message}`, statusCode || ERROR_CODES.HTTP_ERROR, error);
    }

    console.warn(`[getAuthorData] Subdomain URL failed (404). Trying path URL...`);
    const pathUrl = `${BASE_URL}/avtor/${username}`;
    try {
      const authorHtmlFallback = await fetchHtml(pathUrl);
      const parsedInfo = parseAuthorInfoFromHtml(authorHtmlFallback);
      console.log(`[getAuthorData] Successfully fetched from fallback URL: ${pathUrl}`);
      return { parsedInfo, authorPageUrl: pathUrl };
    } catch (fallbackError) {
      const fallbackStatusCode = fallbackError.statusCode || extractStatusCode(fallbackError.message) || ERROR_CODES.NOT_FOUND;
      console.error(`[getAuthorData] Both subdomain and path URL failed for ${username}. Final error: ${fallbackError.message}`);
      throw createErrorObject(`Could not fetch author page for ${username}. Tried subdomain and path.`, fallbackStatusCode, fallbackError);
    }
  }
}

/**
 * Pages through `pr_read_avtor` until an empty or short page is returned,
 * pausing `delayMs` between full pages. Returns the raw API records.
 */
async function collectAuthorPoems(authorId, refererUrl, delayMs) {
  const collected = [];
  let from = 0;
  console.log(`[getAuthorData] Starting poem fetching for author ID: ${authorId}`);
  for (;;) {
    const apiResponse = await fetchApi('pr_read_avtor', { id: authorId, from: from }, refererUrl);
    if (!apiResponse || !Array.isArray(apiResponse.data)) {
      console.error('[getAuthorData] Invalid API response structure received after success check:', apiResponse);
      break;
    }
    const poems = apiResponse.data;
    console.log(`[getAuthorData] Fetched ${poems.length} poems from offset ${from}`);
    if (poems.length === 0) {
      console.log('[getAuthorData] No more poems found.');
      break;
    }
    collected.push(...poems);
    from += poems.length;
    if (poems.length < POEMS_PER_PAGE) {
      console.log('[getAuthorData] Last page reached.');
      break;
    }
    await sleep(delayMs);
  }
  console.log(`[getAuthorData] Total poems fetched: ${collected.length}`);
  return collected;
}

/**
 * Makes a poem's `background` value absolute; null when it is missing or not
 * a string. Protocol-relative values get `https:` rather than `BASE_URL`.
 */
function resolvePoemImageUrl(background) {
  if (typeof background !== 'string' || background === '') {
    return null;
  }
  if (background.startsWith('//')) {
    return `https:${background}`;
  }
  return background.startsWith('/') ? `${BASE_URL}${background}` : background;
}

/** Converts one raw API poem record into the public poem shape. */
function toAuthorPoem(poem) {
  return {
    id: Number.parseInt(poem.id, 10),
    title: poem.title || '***',
    text: poem.body,
    created: poem.created,
    rubric: {
      name: poem.razd_name,
      url: poem.razd_url ? `${BASE_URL}/razdel/${poem.razd_url}` : null,
    },
    collection: poem.urazd_name !== 'не в сборнике' ? poem.urazd_name : null,
    rating: Number.parseInt(poem.rating || '0', 10),
    commentsCount: Number.parseInt(poem.comments_count || '0', 10),
    imageUrl: resolvePoemImageUrl(poem.background),
    hasCertificate: poem.have_certificate === '1',
  };
}

/**
 * Normalises anything caught by `getAuthorData` into a plain error record.
 * Records built by `createErrorObject` pass through untouched. Error
 * instances are always converted, because a transport error may carry a
 * string `code` (e.g. 'ECONNRESET') that must not leak out as the result code.
 */
function toAuthorErrorRecord(error) {
  const isErrorRecord = error !== null
    && typeof error === 'object'
    && !(error instanceof Error)
    && typeof error.code === 'number'
    && Boolean(error.message);
  if (isErrorRecord) {
    return error;
  }
  const message = error?.message || 'An unknown error occurred during processing.';
  const numericCode = [error?.code, error?.statusCode].find((candidate) => Number.isInteger(candidate) && candidate !== 0);
  return createErrorObject(message, numericCode ?? ERROR_CODES.UNKNOWN_ERROR, error instanceof Error ? error : null);
}

/**
 * Fetches an author's profile and, optionally, all of their poems.
 *
 * @param {number|string} identifier - Numeric author ID, bare username,
 *   `<username>.stihirus.ru` URL, or `stihirus.ru/avtor/<username>` URL.
 * @param {number} [requestDelayMs=DEFAULT_REQUEST_DELAY_MS] - Pause between
 *   poem-page requests; a negative or non-finite value falls back to the default.
 * @param {boolean} [fetchAllPoems=true] - When false, `poems` is an empty
 *   array and the poem count comes from the profile page alone.
 * @returns {Promise<{status: 'success', data: Object}|{status: 'error', error: {code: number, message: string, originalMessage?: string}}>}
 *   Never rejects: every failure is reported through the `error` branch.
 */
export async function getAuthorData(identifier, requestDelayMs = DEFAULT_REQUEST_DELAY_MS, fetchAllPoems = true) {
  try {
    let delayMs = requestDelayMs;
    // Number.isFinite also rejects non-numbers, NaN and Infinity.
    if (!Number.isFinite(delayMs) || delayMs < 0) {
      console.warn(`[getAuthorData] Invalid requestDelayMs value (${requestDelayMs}). Using default: ${DEFAULT_REQUEST_DELAY_MS}ms.`);
      delayMs = DEFAULT_REQUEST_DELAY_MS;
    }

    const target = await resolveAuthorTarget(identifier);
    const { username } = target;
    let { authorId } = target;
    if (!username || !target.authorPageUrl) {
      throw createErrorObject(`Could not determine username or author page URL from identifier: ${identifier}`, ERROR_CODES.INVALID_INPUT);
    }
    console.log(`[getAuthorData] Identified username: ${username}, Target page URL: ${target.authorPageUrl}, Delay: ${delayMs}ms, Fetch poems: ${fetchAllPoems}`);

    const { parsedInfo, authorPageUrl } = await loadAuthorProfilePage(username, target.authorPageUrl);

    // The ID printed on the page wins over the one supplied by the caller.
    if (!authorId && parsedInfo.authorId) {
      authorId = parsedInfo.authorId;
    } else if (authorId && parsedInfo.authorId && authorId !== parsedInfo.authorId) {
      console.warn(`[getAuthorData] Initial ID ${identifier} (${typeof identifier === 'number' ? identifier : 'N/A'}) does not match ID found on page ${parsedInfo.authorId}. Using ID from page.`);
      authorId = parsedInfo.authorId;
    }
    if (!authorId) {
      throw createErrorObject(`Could not determine author ID for username: ${username}`, ERROR_CODES.NOT_FOUND);
    }

    let allPoems = [];
    if (fetchAllPoems) {
      allPoems = await collectAuthorPoems(authorId, authorPageUrl, delayMs);
    } else {
      console.log(`[getAuthorData] Skipping poem fetching because fetchAllPoems is false.`);
    }

    // When poems were fetched, their number overrides the count shown on the
    // page; otherwise the page count is kept.
    let finalPoemCount = parsedInfo.stats.poems;
    if (fetchAllPoems && allPoems.length > 0) {
      if (finalPoemCount === 0) {
        finalPoemCount = allPoems.length;
        console.log(`[getAuthorData] Updated poem count from API data: ${finalPoemCount}`);
      } else if (finalPoemCount > 0 && allPoems.length !== finalPoemCount) {
        console.warn(`[getAuthorData] Poem count mismatch: HTML stats (${finalPoemCount}) vs API fetched (${allPoems.length}). Using API count.`);
        finalPoemCount = allPoems.length;
      }
    }

    return {
      status: 'success',
      data: {
        authorId,
        username: parsedInfo.usernameFromHtml || username,
        profileUrl: authorPageUrl,
        canonicalUsername: username,
        description: parsedInfo.description,
        avatarUrl: parsedInfo.avatarUrl,
        headerUrl: parsedInfo.headerUrl,
        status: parsedInfo.status,
        lastVisit: parsedInfo.lastVisit,
        stats: {
          ...parsedInfo.stats,
          poems: finalPoemCount,
        },
        collections: parsedInfo.collections,
        poems: fetchAllPoems ? allPoems.map(toAuthorPoem) : [],
      },
    };
  } catch (error) {
    console.error(`[getAuthorData] Operation failed for identifier "${identifier}":`, error?.message || error);
    return {
      status: 'error',
      error: toAuthorErrorRecord(error),
    };
  }
}