apx-toolkit
Version:
Automatically discover APIs and generate complete integration packages: code in 12 languages, TypeScript types, test suites, SDK packages, API documentation, mock servers, performance reports, and contract tests. Saves 2-4 weeks of work in seconds.
336 lines (301 loc) • 13.6 kB
text/typescript
/**
* Core Runner - Decoupled execution logic for APX
* This module contains the core APX functionality without Apify Actor dependencies
* Can be used by CLI, test scripts, or other integrations
*
* Key Decoupling Features:
* - Accepts native TypeScript objects (ActorInput) instead of reading from Apify KeyValueStore
* - Uses Crawlee's local storage (works without Apify platform)
* - Returns structured results instead of pushing to global Dataset
* - Fully executable outside Apify environment
*/
import { PlaywrightCrawler, HttpCrawler, Router, RequestQueue, Dataset } from 'crawlee';
import type { ActorInput, DiscoveredAPI } from './types.js';
import { REQUEST_LABELS } from './types.js';
import { handleDiscovery } from './handlers/discovery-handler.js';
import { handleAPIProcessing } from './handlers/api-handler.js';
import { StatisticsCollector } from './utils/statistics.js';
import { setStatistics } from './utils/statistics.js';
import { ProgressTracker, type ProgressCallback } from './utils/progress-tracker.js';
/**
 * Structured outcome of a single APX run, as returned by `runAPXCore`.
 */
export interface APXResult {
  /** Headline counters and timings for the run. */
  summary: {
    /** Number of API endpoints found during the discovery phase. */
    apisDiscovered: number;
    /** Number of requests the shared request queue reports as handled. */
    requestsProcessed: number;
    /** Number of items the dataset reports as stored. */
    itemsExtracted: number;
    /** Wall-clock time spent in the discovery phase, in seconds. */
    discoveryDuration: number;
    /** Wall-clock time of the whole run, in seconds. */
    totalDuration: number;
  };

  /** Generated artifacts, grouped by kind (sorted out of the dataset items by their `_type` tag). */
  artifacts: {
    /** Snippet lists merged from `code_snippets` items, keyed as those items key them. */
    codeSnippets: Record<string, any[]>;
    /** Content of the `typescript_types` item (empty string when none was produced). */
    typescriptTypes: string;
    /** Entries collected from `test_suites` items. */
    testSuites: any[];
    /** One entry per `sdk_package` item. */
    sdkPackages: any[];
    /** One entry per `api_documentation` item. */
    documentation: any[];
    /** Entries collected from `api_examples` items. */
    examples: any[];
  };

  /** Every item read back from the dataset, artifacts included. */
  data: any[];

  /** Statistics gathered by the statistics collector during the run. */
  statistics: any;
}
/**
 * Checks an APX input object and throws on the first problem it finds.
 *
 * Checks run in a fixed order: start URLs (presence, string type, parseable
 * by `URL`), then the numeric lower bounds, then the pagination type, then
 * the export formats. Optional fields that are `undefined` are not checked.
 *
 * @param input - Configuration to validate
 * @throws Error describing the first invalid field
 */
function validateInput(input: ActorInput): void {
  const { startUrls } = input;
  if (!startUrls || startUrls.length === 0) {
    throw new Error('startUrls is required and must contain at least one URL');
  }

  for (const entry of startUrls) {
    const candidate = entry.url;
    if (!candidate || typeof candidate !== 'string') {
      throw new Error('Each startUrl must have a valid url string');
    }

    // The URL constructor throws on anything it cannot parse.
    let parseable = true;
    try {
      new URL(candidate);
    } catch {
      parseable = false;
    }
    if (!parseable) {
      throw new Error(`Invalid URL format: ${candidate}`);
    }
  }

  // Optional numeric settings: only a value that is present and below its floor is rejected.
  const requireAtLeast = (value: number | undefined, floor: number, message: string): void => {
    if (value !== undefined && value < floor) {
      throw new Error(message);
    }
  };
  requireAtLeast(input.minResponseSize, 0, 'minResponseSize must be >= 0');
  requireAtLeast(input.discoveryTimeout, 1000, 'discoveryTimeout must be >= 1000ms');
  requireAtLeast(input.maxPages, 1, 'maxPages must be >= 1');
  requireAtLeast(input.maxConcurrency, 1, 'maxConcurrency must be >= 1');

  const paginationModes = ['auto', 'offset', 'page', 'cursor'];
  if (input.paginationType && !paginationModes.includes(input.paginationType)) {
    throw new Error('paginationType must be one of: auto, offset, page, cursor');
  }

  if (input.exportFormats) {
    const validFormats = ['openapi', 'postman', 'curl', 'insomnia'];
    for (const format of input.exportFormats) {
      if (validFormats.includes(format)) {
        continue;
      }
      throw new Error(`Invalid export format: ${format}. Must be one of: ${validFormats.join(', ')}`);
    }
  }
}
/**
 * Groups raw dataset items into the artifact buckets returned by runAPXCore.
 *
 * Items are routed by their `_type` tag; items carrying any other tag are left
 * out of the artifacts (they remain part of the raw `data` array).
 *
 * @param items - Items read back from the dataset
 * @returns Artifact collections shaped like `APXResult.artifacts`
 */
function collectArtifactsFromItems(items: any[]): APXResult['artifacts'] {
  const artifacts = {
    codeSnippets: {} as Record<string, any[]>,
    typescriptTypes: '',
    testSuites: [] as any[],
    sdkPackages: [] as any[],
    documentation: [] as any[],
    examples: [] as any[],
  };

  for (const item of items) {
    switch (item._type) {
      case 'code_snippets':
        // Stored as a single object whose `snippets` key holds the snippet lists.
        if (item.snippets) {
          Object.assign(artifacts.codeSnippets, item.snippets);
        }
        break;
      case 'typescript_types':
        // A later item replaces the content taken from an earlier one.
        artifacts.typescriptTypes = item.content || '';
        break;
      case 'test_suites':
        // Prefer the nested `suites` array; fall back to keeping the item itself.
        if (Array.isArray(item.suites)) {
          artifacts.testSuites.push(...item.suites);
        } else {
          artifacts.testSuites.push(item);
        }
        break;
      case 'sdk_package':
        artifacts.sdkPackages.push({
          language: item.language,
          packageName: item.packageName,
          files: item.files,
          description: item.description,
        });
        break;
      case 'api_documentation':
        artifacts.documentation.push({
          format: item.format,
          filename: item.filename,
          content: item.content,
          mimeType: item.mimeType,
        });
        break;
      case 'api_examples':
        if (Array.isArray(item.examples)) {
          artifacts.examples.push(...item.examples);
        } else {
          artifacts.examples.push(item);
        }
        break;
      default:
        break;
    }
  }

  return artifacts;
}

/**
 * Core APX execution function.
 * Runs the complete APX workflow without Apify Actor dependencies.
 *
 * Decoupling strategy:
 * 1. Input: accepts a native TypeScript object (ActorInput) instead of reading
 *    from the Apify KeyValueStore.
 * 2. Crawlee setup: relies on Crawlee's default storage, which works locally
 *    when not running on the Apify platform.
 * 3. Output: reads the dataset back after processing and returns everything
 *    as one structured object (APXResult).
 *
 * Workflow:
 * 1. Validate the input. This happens before the guarded section, so a
 *    validation failure is thrown directly and `onError` is not invoked for it.
 * 2. Discovery phase: a PlaywrightCrawler visits the start URLs.
 * 3. Processing phase: an HttpCrawler drains the same request queue.
 * 4. Read all dataset items, sort them into artifacts and build the summary.
 *
 * This allows APX to run in multiple environments:
 * - Apify Actor (via main.ts)
 * - CLI tool (via cli.ts)
 * - Test scripts (via test-main.ts)
 * - Any Node.js environment
 *
 * @param input - Configuration input (native TypeScript object)
 * @param options - Optional execution options: `onProgress` receives every log
 *   line (defaults to `console.log`), `onError` is called with any error raised
 *   during the crawl/collection phases before it is rethrown (defaults to
 *   printing the message with `console.error`), `progressTracker` replaces the
 *   internally created tracker
 * @returns Structured result with all generated artifacts and data
 * @throws Error when the input is invalid, and rethrows any error raised while
 *   crawling or collecting results
 */
export async function runAPXCore(
  input: ActorInput,
  options?: {
    onProgress?: (message: string) => void;
    onError?: (error: Error) => void;
    progressTracker?: ProgressTracker;
  }
): Promise<APXResult> {
  const startTime = Date.now();
  const log = options?.onProgress || ((msg: string) => console.log(msg));
  const onError = options?.onError || ((err: Error) => console.error(err.message));

  // Forward every progress event to the log sink, prefixed with the percentage when one is reported.
  const progressTracker = options?.progressTracker || new ProgressTracker();
  progressTracker.onProgress((event) => {
    const progressMsg = event.progress !== undefined
      ? `[${event.progress}%] ${event.message}`
      : event.message;
    log(progressMsg);
  });

  // Validate input
  validateInput(input);

  // Initialize statistics collector
  const statistics = new StatisticsCollector();
  setStatistics(statistics);

  const maxPages = input.maxPages || 100;
  const maxConcurrency = input.maxConcurrency || 5;

  // The request queue deduplicates requests, so a start URL listed more than
  // once is stored, handled and counted only once. Every calculation that
  // relates queue counters to the start URLs must use the distinct count.
  const distinctStartUrlCount = new Set(input.startUrls.map((urlObj) => urlObj.url)).size;

  // Create a shared request queue
  // Crawlee automatically uses local storage when not on Apify platform
  // Storage location: ./storage/request_queues/default (local) or Apify cloud (on platform)
  const requestQueue = await RequestQueue.open();

  // One router serves both crawlers; the request label selects the handler.
  const router = Router.create();
  // START_DISCOVERY requests are handled by the Playwright-based discovery handler
  router.addHandler(REQUEST_LABELS.START_DISCOVERY, async (context) => {
    await handleDiscovery(context as any, input);
  });
  // API_PROCESS requests are handled by the HTTP-based API handler
  router.addHandler(REQUEST_LABELS.API_PROCESS, async (context) => {
    await handleAPIProcessing(context as any, input);
  });

  // Discovery crawler: capped at the number of distinct start URLs so it stops
  // before picking up API requests that discovery added to the shared queue.
  const playwrightCrawler = new PlaywrightCrawler({
    requestHandler: router,
    requestQueue,
    maxRequestsPerCrawl: distinctStartUrlCount,
    launchContext: {
      launchOptions: {
        headless: true,
      },
    },
    requestHandlerTimeoutSecs: 60,
  });

  // Processing crawler: works through whatever is left in the shared queue.
  const httpCrawler = new HttpCrawler({
    requestHandler: router,
    requestQueue,
    maxRequestsPerCrawl: maxPages * (input.startUrls.length || 1),
    maxConcurrency,
    requestHandlerTimeoutSecs: 30,
  });

  // Prepare initial requests with START_DISCOVERY label
  const initialRequests = input.startUrls.map((urlObj) => ({
    url: urlObj.url,
    label: REQUEST_LABELS.START_DISCOVERY,
  }));

  const divider = '='.repeat(60);
  log('🚀 Starting APX - The API Toolkit');
  log(divider);
  log(`📋 Configuration:`);
  log(` Start URLs: ${input.startUrls.length}`);
  log(` Max Pages: ${maxPages}`);
  log(` Max Concurrency: ${maxConcurrency}`);
  log(` Generate Documentation: ${input.generateDocumentation !== false ? 'Yes' : 'No'}`);
  log(` Export Formats: ${input.exportFormats?.join(', ') || 'openapi, postman, curl'}`);
  log(divider);
  log('');

  try {
    const discoveryStartTime = Date.now();
    progressTracker.discovery('Starting API discovery phase...');

    // Run PlaywrightCrawler for discovery
    await playwrightCrawler.run(initialRequests);

    const discoveryDuration = (Date.now() - discoveryStartTime) / 1000;
    progressTracker.discovery(`Discovery phase complete (${discoveryDuration.toFixed(1)}s)`, undefined, undefined, 100);
    log(`✅ Discovery phase complete (${discoveryDuration.toFixed(1)}s)`);
    log('');

    // Everything in the queue beyond the start URLs was added by discovery.
    // Clamp at zero so the reported count can never be negative.
    const queueInfo = await requestQueue.getInfo();
    const queuedTotal = queueInfo?.totalRequestCount || 0;
    const queuedHandled = queueInfo?.handledRequestCount || 0;
    const apisDiscovered = Math.max(0, queuedTotal - distinctStartUrlCount);

    if (apisDiscovered > 0) {
      statistics.recordDiscovery(apisDiscovered, discoveryDuration);
      progressTracker.discovery(`Discovered ${apisDiscovered} API endpoint(s)`, undefined, apisDiscovered, apisDiscovered);
      log(`🔍 Discovered ${apisDiscovered} API endpoint(s)`);

      const pendingRequests = queuedTotal - queuedHandled;
      log(`📋 Queue status: ${queuedTotal} total, ${queuedHandled} handled, ${pendingRequests} pending`);
      progressTracker.processing('Starting API processing phase...', undefined, pendingRequests, 0);
      log('⚡ Starting API processing phase...');
      log('');
    } else {
      progressTracker.discovery('No APIs discovered', undefined, 0, 0);
      log('⚠️ No APIs discovered. The site may not use API calls or they may require user interaction.');
      log('');
    }

    // Run HttpCrawler to process all API_PROCESS requests
    await httpCrawler.run();

    // A single snapshot of the queue serves both the progress event and the summary.
    const finalQueueInfo = await requestQueue.getInfo();
    const requestsProcessed = finalQueueInfo?.handledRequestCount || 0;
    progressTracker.processing('API processing phase complete', undefined, finalQueueInfo?.totalRequestCount || 0, requestsProcessed);
    log('✅ API processing phase complete.');
    log('');

    // Open dataset - Crawlee automatically uses local storage when not on Apify platform
    const dataset = await Dataset.open();
    const datasetInfo = await dataset.getInfo();
    const itemsExtracted = datasetInfo?.itemCount || 0;

    // Use the returned array directly: spreading it into push() passes every
    // item as a call argument, which can overflow the stack on large datasets.
    const data: any[] = itemsExtracted > 0
      ? (await dataset.getData({ limit: itemsExtracted })).items
      : [];

    // Collect generated artifacts from dataset
    const artifacts = collectArtifactsFromItems(data);

    const totalDuration = (Date.now() - startTime) / 1000;
    log('📊 Execution Summary');
    log(divider);
    log(` APIs Discovered: ${apisDiscovered}`);
    log(` Requests Processed: ${requestsProcessed}`);
    log(` Items Extracted: ${itemsExtracted}`);
    log(` Total Duration: ${totalDuration.toFixed(1)}s`);
    log('');

    const stats = statistics.getStats();
    const summary = await statistics.saveSummary();

    return {
      summary: {
        apisDiscovered,
        requestsProcessed,
        itemsExtracted,
        discoveryDuration,
        totalDuration,
      },
      artifacts,
      data,
      statistics: {
        stats,
        summary: summary.summary,
      },
    };
  } catch (error) {
    // Report to the caller-supplied handler, then let the failure propagate.
    onError(error instanceof Error ? error : new Error(String(error)));
    throw error;
  }
}