scoopit
Version:
A tool that generates content files from website routes in multiple formats (text, JSON, markdown)
701 lines (599 loc) • 22.8 kB
JavaScript
const axios = require("axios");
const fs = require("fs-extra");
const path = require("path");
const readline = require("readline");
const logger = require("./utils/logger");
const {
extractMetaInfo,
extractContent,
convertToMarkdown,
} = require("./utils/contentProcessor");
// Default configuration
const DEFAULT_BASE_URL = "https://icjia.illinois.gov";
const DEFAULT_ROUTES = ["/about", "/researchHub"];
const DEFAULT_FORMAT = "text"; // Default format is 'text'
const VALID_FORMATS = ["text", "json", "markdown", "all"]; // Valid output formats
// Application version
const APP_VERSION = require("./package.json").version;
// Output directory location
let OUTPUT_DIR = path.join(process.cwd(), "output");
// Create output directory if it doesn't exist
if (!fs.existsSync(OUTPUT_DIR)) {
fs.ensureDirSync(OUTPUT_DIR);
logger.fileSystem("created", OUTPUT_DIR, { recursive: true });
}
/**
* Asks the user if they want to delete previous output files
* @returns {Promise<boolean>} - True if user wants to delete previous files, false otherwise
*/
async function shouldDeletePreviousOutputs() {
// Skip user prompt in test mode
if (process.env.NODE_ENV === 'test') {
return false;
}
// Check if output directory already has content
const outputDirs = ['text', 'json', 'markdown'].map(dir => path.join(OUTPUT_DIR, dir));
let hasExistingOutput = false;
for (const dir of outputDirs) {
if (fs.existsSync(dir)) {
const files = fs.readdirSync(dir);
if (files.length > 0) {
hasExistingOutput = true;
break;
}
}
}
// If no existing output, no need to ask
if (!hasExistingOutput) {
return false;
}
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout
});
const answer = await new Promise(resolve => {
rl.question('Previous output files detected. Delete them before proceeding? (y/N): ', response => {
rl.close();
resolve(response.toLowerCase());
});
});
return answer === 'y' || answer === 'yes';
}
/**
* Deletes previous output files but preserves logs
* @returns {Promise<void>}
*/
async function deletePreviousOutputs() {
logger.info('Deleting previous output files');
// Directories to clean (but not the log directory)
const outputDirs = ['text', 'json', 'markdown'].map(dir => path.join(OUTPUT_DIR, dir));
for (const dir of outputDirs) {
if (fs.existsSync(dir)) {
try {
await fs.emptyDir(dir);
logger.fileSystem("emptied", dir);
} catch (error) {
logger.error(`Failed to empty directory ${dir}: ${error.message}`);
}
} else {
fs.ensureDirSync(dir);
logger.fileSystem("created", dir);
}
}
logger.info('Previous output files deleted');
}
/**
* Converts a string to a slug format
* @param {string} text - The text to convert to slug
* @returns {string} - Text in slug format
*/
function convertToSlug(text) {
return text
.toLowerCase()
.replace(/[^\w\s-]/g, '') // Remove non-word chars
.replace(/[\s_-]+/g, '-') // Replace spaces and underscores with hyphens
.replace(/^-+|-+$/g, '') // Remove leading/trailing hyphens
.trim();
}
/**
* Generates a filename with title in slug form and datetime stamp
* @param {string} route - The route path
* @param {string} title - The page title
* @param {boolean} isLog - Whether this is a log file (no title/datetime needed)
* @param {boolean} isTest - Whether this is in test mode (no timestamps/titles)
* @returns {string} - Generated filename without extension
*/
function generateFilename(route, title, isLog = false, isTest = process.env.NODE_ENV === 'test') {
// Check if this is being called from test scripts
const callerStack = new Error().stack || '';
const isTestScript = callerStack.includes('verifyOutputs.js') ||
callerStack.includes('testRunner.js') ||
callerStack.includes('runAllTests.js');
// For log files or test mode or test scripts, just use the route-based name
if (isLog || isTest || isTestScript) {
return route.replace(/^\//, "").replace(/\//g, "-") || "index";
}
// Get current datetime in format YYYYMMDD-HHMMSS
const now = new Date();
const dateTimeStr = now.toISOString()
.replace(/[T]/g, '-')
.replace(/[:.]/g, '')
.substring(0, 15); // Gets YYYYMMDD-HHMMSS format
// Create slug from title
const titleSlug = title ? convertToSlug(title) : '';
// Create route slug (as fallback)
const routeSlug = route.replace(/^\//, "").replace(/\//g, "-") || "index";
// Use title slug if available, otherwise use route slug
const baseSlug = titleSlug || routeSlug;
// Combine slug with datetime
return `${baseSlug}_${dateTimeStr}`;
}
/**
* Validates a URL format
* @param {string} url - The URL to validate
* @returns {boolean} - True if valid URL format, false otherwise
*/
function isValidUrl(url) {
try {
new URL(url);
return true;
} catch (error) {
return false;
}
}
/**
* Fetch content from a URL with improved error handling
* @param {string} url - The URL to fetch content from
* @returns {Promise<string|null>} - The HTML content or null if fetch fails
*/
async function fetchContent(url) {
// Validate URL first
if (!isValidUrl(url)) {
logger.error(`Invalid URL format: ${url}`);
return null;
}
const fetchContext = logger.startOperation("fetch_content", { url });
try {
const startTime = Date.now();
const response = await axios.get(url, {
timeout: 30000, // 30s timeout
headers: {
'User-Agent': `ScoopIt Content Generator/${APP_VERSION}`,
},
validateStatus: status => status >= 200 && status < 300, // Only consider 2xx as success
});
const duration = Date.now() - startTime;
logger.httpSuccess(url, response.status, duration, {
contentLength: response.headers["content-length"],
contentType: response.headers["content-type"],
});
logger.endOperation(fetchContext, "success", {
status: response.status,
duration,
});
return response.data;
} catch (error) {
const errorDetails = {
message: error.message,
code: error.code || 'UNKNOWN',
status: error.response?.status,
statusText: error.response?.statusText,
};
logger.httpError(url, error, errorDetails);
logger.endOperation(fetchContext, "error", errorDetails);
return null;
}
}
/**
* Generate files for a route with improved error handling
* @param {string} baseUrl - The base URL
* @param {string} route - The route path
* @param {string} format - The output format (text, json, markdown, or all)
* @returns {Promise<Object|null>} - Route data or null if processing fails
*/
async function generateFilesForRoute(baseUrl, route, format = DEFAULT_FORMAT) {
// Input validation
if (!baseUrl) {
throw new Error('Base URL is required');
}
if (!route) {
throw new Error('Route is required');
}
if (!VALID_FORMATS.includes(format)) {
throw new Error(`Invalid format: ${format}. Valid formats are: ${VALID_FORMATS.join(', ')}`);
}
// Handle baseUrl as either string or object (from options)
const resolvedBaseUrl = typeof baseUrl === 'object' && baseUrl.baseUrl
? baseUrl.baseUrl
: baseUrl;
// Normalize the route path
const normalizedRoute = route.startsWith("/") ? route : `/${route}`;
const fullUrl = `${resolvedBaseUrl}${normalizedRoute}`;
const routeContext = logger.startOperation("generate_files_for_route", {
baseUrl: resolvedBaseUrl,
route: normalizedRoute,
fullUrl,
format,
});
logger.processing(`Processing ${fullUrl}...`);
// Fetch the content (could be HTML or JSON)
const content = await fetchContent(fullUrl);
if (!content) {
logger.error(`Failed to fetch content for ${fullUrl}`);
logger.endOperation(routeContext, "error", {
reason: "fetch_failed",
});
return null;
}
// Detect if the content is JSON
const { isJsonContent } = require('./utils/contentProcessor');
const isJson = isJsonContent(content);
try {
// Extract meta information
logger.processing(`Extracting metadata from ${fullUrl}`);
const metaInfo = extractMetaInfo(content, isJson);
logger.debug(`Extracted metadata`, {
title: metaInfo.title,
descriptionLength: metaInfo.description?.length || 0,
contentType: isJson ? 'JSON' : 'HTML'
});
// Extract main content
logger.processing(`Extracting content from ${fullUrl}`);
const { cleanHtml: contentHtml, textContent, isJson: detectedJson } = extractContent(content);
logger.debug(`Extracted content`, {
textLength: textContent?.length || 0,
htmlLength: contentHtml?.length || 0,
isJson: detectedJson
});
// Convert to markdown
logger.processing(`Converting content to markdown`);
const markdownContent = convertToMarkdown(contentHtml, detectedJson || isJson);
logger.debug(`Converted to markdown`, {
markdownLength: markdownContent?.length || 0,
});
// Generate filename with title slug and datetime stamp
const safeFilename = generateFilename(normalizedRoute, metaInfo.title);
logger.debug(`Generated safe filename`, { safeFilename });
// Create JSON data
const jsonData = {
url: fullUrl,
route: normalizedRoute,
title: metaInfo.title,
description: metaInfo.description,
textContent,
markdownContent,
timestamp: new Date().toISOString(),
};
// Create the appropriate output directories as needed
try {
// Write files based on format
if (format === "json" || format === "all") {
const jsonDir = path.join(OUTPUT_DIR, "json");
await fs.ensureDir(jsonDir);
const jsonFilePath = path.join(jsonDir, `${safeFilename}.json`);
await fs.writeFile(jsonFilePath, JSON.stringify(jsonData, null, 2));
logger.fileSystem("write", jsonFilePath, {
format: "json",
size: Buffer.byteLength(JSON.stringify(jsonData, null, 2)),
});
logger.info(`Generated JSON file: output/json/${safeFilename}.json`);
}
if (format === "text" || format === "all") {
const textDir = path.join(OUTPUT_DIR, "text");
await fs.ensureDir(textDir);
const textFilePath = path.join(textDir, `${safeFilename}.txt`);
await fs.writeFile(textFilePath, textContent);
logger.fileSystem("write", textFilePath, {
format: "text",
size: Buffer.byteLength(textContent),
});
logger.info(`Generated text file: output/text/${safeFilename}.txt`);
}
if (format === "markdown" || format === "all") {
const markdownDir = path.join(OUTPUT_DIR, "markdown");
await fs.ensureDir(markdownDir);
const markdownFilePath = path.join(markdownDir, `${safeFilename}.md`);
await fs.writeFile(markdownFilePath, markdownContent);
logger.fileSystem("write", markdownFilePath, {
format: "markdown",
size: Buffer.byteLength(markdownContent),
});
logger.info(
`Generated markdown file: output/markdown/${safeFilename}.md`
);
}
} catch (fileError) {
logger.error(`File system error: ${fileError.message}`, {
error: {
message: fileError.message,
code: fileError.code,
stack: fileError.stack,
},
});
throw new Error(`Failed to save output files: ${fileError.message}`);
}
logger.info(`Generated files for ${fullUrl}`);
logger.endOperation(routeContext, "success", {
formats:
format === "all" ? VALID_FORMATS.filter((f) => f !== "all") : [format],
});
// Return data for testing purposes
return {
route: normalizedRoute,
url: fullUrl,
data: jsonData
};
} catch (error) {
logger.error(
`Error processing route ${normalizedRoute}: ${error.message}`,
{
error: {
message: error.message,
stack: error.stack,
},
}
);
logger.endOperation(routeContext, "error", {
error: error.message,
});
throw error;
}
}
/**
* Find the routes file based on provided path or defaults
* @param {string|null} routePath - Optional custom path to the routes file
* @returns {string|null} - Path to the routes file or null if not found
*/
function findRoutesFile(routePath = null) {
// If a specific path is provided, use it directly
if (routePath) {
if (fs.existsSync(routePath)) {
return routePath;
}
return null; // Specific path provided but not found
}
// Default path is routes.json in the project root
const defaultPath = path.join(process.cwd(), 'routes.json');
if (fs.existsSync(defaultPath)) {
return defaultPath;
}
return null; // No routes file found in default location
}
/**
* Load routes from a JSON file
* @param {string} filePath - Path to the JSON file containing routes
* @returns {Promise<string[]>} - Array of routes from the file
*/
async function loadRoutesFromFile(filePath) {
try {
if (!fs.existsSync(filePath)) {
throw new Error(`Routes file not found: ${filePath}`);
}
const fileContent = await fs.readFile(filePath, 'utf8');
const routesData = JSON.parse(fileContent);
if (!Array.isArray(routesData)) {
throw new Error('Routes file must contain a JSON array of routes');
}
// Validate each route is a string
const validRoutes = routesData.filter(route => typeof route === 'string');
// If file exists but has no valid routes, default to a single route
if (validRoutes.length === 0) {
logger.warn(`Routes file ${filePath} exists but contains no valid routes. Using default route.`);
return ['/'];
}
return validRoutes;
} catch (error) {
logger.error(`Failed to load routes from file: ${error.message}`);
throw error;
}
}
/**
* Process multiple routes with improved error handling and monitoring
* @param {string} baseUrl - The base URL
* @param {string[]} routes - Array of routes to process
* @param {string} format - The output format (text, json, markdown, or all)
* @returns {Promise<Array>} - Array of results for testing purposes
*/
async function processRoutes(options = {}) {
// Handle options parameter to support both object and positional arguments
let baseUrl, routes, format, outputDir, quiet;
if (typeof options === 'object') {
// Object parameter style
baseUrl = options.baseUrl || DEFAULT_BASE_URL;
// Handle routePath if provided
if (options.routePath && typeof options.routePath === 'string') {
try {
routes = await loadRoutesFromFile(options.routePath);
} catch (error) {
logger.error(`Failed to load routes from path: ${options.routePath}`);
routes = DEFAULT_ROUTES;
}
} else {
routes = options.routes || DEFAULT_ROUTES;
}
format = options.format || DEFAULT_FORMAT;
outputDir = options.outputDir;
quiet = options.quiet;
} else {
// Legacy positional parameters style
baseUrl = arguments[0] || DEFAULT_BASE_URL;
routes = arguments[1] || DEFAULT_ROUTES;
format = arguments[2] || DEFAULT_FORMAT;
}
// Override OUTPUT_DIR if specified
if (outputDir) {
OUTPUT_DIR = outputDir;
}
// Input validation
if (!baseUrl) {
throw new Error('Base URL is required');
}
if (!Array.isArray(routes) || routes.length === 0) {
throw new Error('Routes must be a non-empty array');
}
// Check format validity - use default format if invalid
const validFormat = VALID_FORMATS.includes(format) ? format : DEFAULT_FORMAT;
if (!VALID_FORMATS.includes(format)) {
logger.warn(`Invalid format: ${format}. Using default format: ${DEFAULT_FORMAT}`);
}
// Resolve the baseUrl if it's an object
const resolvedBaseUrl = typeof baseUrl === 'object' && baseUrl.baseUrl
? baseUrl.baseUrl
: baseUrl;
logger.startup(APP_VERSION, {
baseUrl,
routeCount: routes.length,
format: validFormat,
});
logger.info(`Starting to process ${routes.length} routes from ${baseUrl}`);
logger.debug("Routes to process", { routes });
// Track results and errors
const results = [];
const errors = [];
for (let i = 0; i < routes.length; i++) {
const route = routes[i];
logger.progress("processing_routes", i + 1, routes.length, {
currentRoute: route,
});
try {
const result = await generateFilesForRoute(resolvedBaseUrl, route, validFormat);
if (result) {
results.push(result);
}
} catch (error) {
logger.error(`Failed to process route ${route}: ${error.message}`);
errors.push({
route,
error: error.message,
});
}
}
// Log a summary of results
logger.info(`Completed processing ${routes.length} routes`, {
successful: results.length,
failed: errors.length,
errors: errors.length > 0 ? errors : undefined,
});
return results;
}
/**
* Process a single page URL
* @param {string} url - Full URL to process
* @param {string} format - Output format
* @returns {Promise<Object|null>} - Result or null if processing fails
*/
async function processSinglePage(url, format = DEFAULT_FORMAT) {
try {
if (!isValidUrl(url)) {
throw new Error(`Invalid URL: ${url}`);
}
// Extract baseUrl and route from the full URL
const urlObj = new URL(url);
const baseUrl = `${urlObj.protocol}//${urlObj.hostname}${urlObj.port ? `:${urlObj.port}` : ''}`;
const route = urlObj.pathname;
logger.info(`Processing single page: ${url}`);
return await generateFilesForRoute(baseUrl, route, format);
} catch (error) {
logger.error(`Failed to process single page: ${error.message}`);
throw error;
}
}
// If this file is run directly (not imported)
if (require.main === module) {
(async () => {
try {
// Check for previous outputs and ask user if they want to delete them
const shouldDelete = await shouldDeletePreviousOutputs();
if (shouldDelete) {
await deletePreviousOutputs();
}
// Command line arguments handling
const args = process.argv.slice(2);
// Extract route path flag if specified
let routePathIndex = args.findIndex(arg => arg === '-routePath');
let routePathValue = null;
if (routePathIndex !== -1 && routePathIndex + 1 < args.length) {
routePathValue = args[routePathIndex + 1];
// Remove the flag and its value from args
args.splice(routePathIndex, 2);
}
// If the first argument is a full URL, process it as a single page
if (args.length > 0 && isValidUrl(args[0])) {
const url = args[0];
const format = args[1] && VALID_FORMATS.includes(args[1]) ? args[1] : DEFAULT_FORMAT;
const result = await processSinglePage(url, format);
logger.info(`\nGenerated files for page: ${url}\nCheck the output directory: ${OUTPUT_DIR}`);
}
// Look for routes file
else {
const routesFilePath = findRoutesFile(routePathValue);
// If a specific path was provided but file wasn't found, exit with error
if (routePathValue && !routesFilePath) {
console.error(`\x1b[31mError: Routes file not found at specified path: ${routePathValue}\x1b[0m`);
process.exit(1);
}
// If we have a routes file, use it
if (routesFilePath) {
// Get format from first arg if not a URL, or second arg if URL was provided
const format = args[0] && VALID_FORMATS.includes(args[0]) ? args[0] :
(args[1] && VALID_FORMATS.includes(args[1]) ? args[1] : DEFAULT_FORMAT);
// Get baseUrl from second arg if not a format, or from third arg
const baseUrl = (args[0] && !VALID_FORMATS.includes(args[0])) ? args[0] :
(args[1] && !VALID_FORMATS.includes(args[1])) ? args[1] :
(args[2]) ? args[2] : DEFAULT_BASE_URL;
try {
// Use the new options object format
await processRoutes({
baseUrl: baseUrl,
routePath: routesFilePath,
format: format
});
logger.info(`\nAll routes from file processed. Check the output directory: ${OUTPUT_DIR}`);
} catch (error) {
console.error(`\x1b[31mError loading routes file: ${error.message}\x1b[0m`);
process.exit(1);
}
}
// Otherwise, use the provided arguments or defaults
else {
const baseUrl = args[0] || DEFAULT_BASE_URL;
const routes = args[1] ? JSON.parse(args[1]) : DEFAULT_ROUTES;
const format = args[2] && VALID_FORMATS.includes(args[2]) ? args[2] : DEFAULT_FORMAT;
// Use the new options object format
const results = await processRoutes({
baseUrl: baseUrl,
routes: routes,
format: format
});
logger.info(`\nAll routes processed. Check the output directory: ${OUTPUT_DIR}`);
}
}
} catch (error) {
console.error(`\x1b[31m╔═════════════════════════════════════════════════════════╗`);
console.error(`║ ERROR ║`);
console.error(`╚═════════════════════════════════════════════════════════╝\x1b[0m`);
console.error(`\x1b[31m${error.message}\x1b[0m`);
console.error(`\x1b[33mUsage:\x1b[0m`);
console.error(` 1. Process single page: \x1b[36mnpx scoopit https://wikipedia.org/page\x1b[0m [format]`);
console.error(` 2. Use routes.json: \x1b[36mnpx scoopit\x1b[0m [format] [baseUrl]`);
console.error(` 3. Specify custom routes file: \x1b[36mnpx scoopit -routePath path/to/routes.json\x1b[0m [format] [baseUrl]`);
console.error(` 4. Process with inline routes: \x1b[36mnpx scoopit\x1b[0m [baseUrl] [routes] [format]`);
console.error(`\nCheck the logs for more details.`);
process.exit(1);
}
})();
}
module.exports = {
fetchContent,
generateFilesForRoute,
processRoutes,
processSinglePage,
loadRoutesFromFile,
isValidUrl,
shouldDeletePreviousOutputs,
deletePreviousOutputs,
DEFAULT_BASE_URL,
DEFAULT_ROUTES,
DEFAULT_FORMAT,
VALID_FORMATS,
};