@plust/datasleuth
Version:
Build LLM-powered research pipelines and output structured data.
353 lines • 14.2 kB
JavaScript
/**
* Web search step for the research pipeline
* Uses @plust/search-sdk to perform web searches
*/
import { webSearch as performWebSearch, } from '@plust/search-sdk';
import { createStep } from '../utils/steps.js';
import { z } from 'zod';
import { SearchError, NetworkError, ConfigurationError, ValidationError } from '../types/errors.js';
import { logger, createStepLogger } from '../utils/logging.js';
/**
 * Zod schema describing a single search result in this module's internal shape.
 * Only `url` (which must parse as a URL) and `title` are required; every other
 * field is optional.
 * NOTE(review): not referenced anywhere else in the visible part of this file -
 * confirm whether it is still used before relying on it.
 */
const searchResultSchema = z.object({
url: z.string().url(),
title: z.string(),
snippet: z.string().optional(),
domain: z.string().optional(),
publishedDate: z.string().optional(),
provider: z.string().optional(),
// Accepts any value; no structure is enforced on the raw payload.
raw: z.any().optional(),
});
/**
 * Normalises a search provider so it can be handed to the search SDK.
 *
 * An object that already exposes both `search` and `config` is returned
 * untouched. Anything else is treated as a plain configuration object
 * (`name`, `apiKey`, optional `baseUrl`, plus arbitrary extra settings) and is
 * wrapped in a minimal provider-shaped object.
 *
 * @param provider SDK provider or plain provider configuration
 * @returns The original provider, or a wrapper built from the configuration
 * @throws ConfigurationError when a plain configuration has no `apiKey`
 */
function ensureSDKProvider(provider) {
    const isSDKProvider = 'search' in provider && 'config' in provider;
    if (isSDKProvider) {
        return provider;
    }
    // From here on the argument is handled as our own configuration format.
    const settings = provider;
    if (!settings.apiKey) {
        throw new ConfigurationError({
            message: `Missing API key for search provider "${settings.name}"`,
            step: 'WebSearch',
            suggestions: [
                'Provide an API key in the provider configuration',
                'Check environment variables for API keys',
                'Use a different search provider with valid credentials',
            ],
        });
    }
    // Everything besides `apiKey` and `name` is forwarded into `config` as-is.
    const { apiKey, name, ...extraSettings } = settings;
    return {
        name,
        config: {
            apiKey,
            baseUrl: settings.baseUrl,
            ...extraSettings,
        },
        // Placeholder only: it logs a warning and yields no results.
        search: async (options) => {
            logger.warn('Mock provider search called - this should not happen in production');
            return [];
        },
    };
}
/**
 * Maps SDK search results onto this module's internal result shape.
 *
 * Only the known fields are copied; any other property on an SDK result is
 * dropped. A falsy `raw` value is normalised to `undefined`.
 *
 * @param sdkResults Array of results as returned by the search SDK
 * @returns A new array of plain result objects, in the same order
 */
function convertSearchResults(sdkResults) {
    const toInternalResult = ({ url, title, snippet, domain, publishedDate, provider, raw }) => ({
        url,
        title,
        snippet,
        domain,
        publishedDate,
        provider,
        raw: raw || undefined,
    });
    return sdkResults.map((entry) => toInternalResult(entry));
}
/**
 * Checks that a search query is usable and returns it trimmed.
 *
 * Only leading/trailing whitespace is removed; the inner content of the query
 * is left exactly as given.
 *
 * @param query Query string to check
 * @returns The trimmed query
 * @throws ValidationError when the trimmed query is empty or longer than
 *   2000 characters
 */
function validateQuery(query) {
    // Shared way to raise a validation failure for this step.
    const reject = (message, suggestions) => {
        throw new ValidationError({ message, step: 'WebSearch', suggestions });
    };
    const trimmed = query.trim();
    if (!trimmed) {
        reject('Invalid search query: Empty or whitespace only', [
            'Provide a non-empty search query',
            'Check if query generation is functioning correctly',
        ]);
    }
    const charCount = trimmed.length;
    if (charCount > 2000) {
        reject(`Search query too long (${charCount} chars)`, [
            'Shorten the search query to under 2000 characters',
            'Split long queries into multiple smaller queries',
        ]);
    }
    return trimmed;
}
/**
 * Executes the web search step: resolves the provider and the queries to run,
 * performs one SDK search per query, then de-duplicates, truncates and stores
 * the results on a copy of the pipeline state.
 *
 * Query selection order: `options.query`, then the research plan's
 * `searchQueries` (when `useQueriesFromPlan` is on), then `state.query`.
 * A failure of an individual search is logged and recorded in
 * `data.searchMetadata.errors`; it does not abort the remaining queries.
 *
 * @param state Current research state. Reads `defaultSearchProvider`, `query`,
 *   `data` (including `data.researchPlan.searchQueries`) and, when
 *   `includeInResults` is set, `results`.
 * @param options Step options: `provider`, `query`, `maxResults` (default 10,
 *   used both per query and as the overall cap), `language`, `region`,
 *   `safeSearch` (default 'moderate'), `useQueriesFromPlan` (default true),
 *   `includeRawResults` (default false), `includeInResults` (default false),
 *   `requireResults` (default false).
 * @returns A new state object with `data.searchResults` and
 *   `data.searchMetadata` populated; the `state` argument is not reassigned
 *   or written to.
 * @throws ConfigurationError when no usable provider is available.
 * @throws ValidationError when a query is rejected by validateQuery.
 * @throws SearchError when `requireResults` is set and nothing was found, or
 *   for any other unexpected failure.
 * @throws NetworkError when an unexpected failure's message contains "network".
 */
async function executeWebSearchStep(state, options) {
    const stepLogger = createStepLogger('WebSearch');
    const {
        provider: optionsProvider,
        query: customQuery,
        maxResults = 10,
        language,
        region,
        safeSearch = 'moderate',
        useQueriesFromPlan = true,
        includeRawResults = false,
        includeInResults = false,
        requireResults = false,
    } = options;
    stepLogger.info('Starting web search execution');
    try {
        // A provider given in the options wins over the state's default provider.
        const provider = optionsProvider || state.defaultSearchProvider;
        if (!provider) {
            throw new ConfigurationError({
                message: 'No search provider specified for web search',
                step: 'WebSearch',
                suggestions: [
                    'Provide a search provider in the searchWeb options',
                    'Set a defaultSearchProvider in the research function',
                    'Example: research({ query, outputSchema, defaultSearchProvider: google.configure({...}) })',
                ],
            });
        }
        // Determine which queries to use
        let queries = [];
        if (customQuery) {
            queries.push(validateQuery(customQuery));
        }
        else if (useQueriesFromPlan && state.data.researchPlan?.searchQueries) {
            const planQueries = state.data.researchPlan.searchQueries;
            // searchQueries may be a single string or an array of strings
            if (Array.isArray(planQueries)) {
                queries = planQueries.map((q) => validateQuery(q)).filter(Boolean);
            }
            else if (typeof planQueries === 'string') {
                queries = [validateQuery(planQueries)];
            }
            stepLogger.debug(`Using ${queries.length} queries from research plan`);
        }
        // Fall back to the main research query when nothing else produced one
        if (queries.length === 0) {
            queries = [validateQuery(state.query)];
            stepLogger.debug('Using main research query');
        }
        // Ensure we have a valid SDK provider
        let sdkProvider;
        try {
            sdkProvider = ensureSDKProvider(provider);
            stepLogger.debug(`Using search provider: ${sdkProvider.name}`);
        }
        catch (error) {
            if (error instanceof ConfigurationError) {
                throw error; // Already formatted correctly
            }
            throw new ConfigurationError({
                message: `Invalid search provider configuration: ${error instanceof Error ? error.message : String(error)}`,
                step: 'WebSearch',
                details: { error },
                suggestions: [
                    'Check provider name and API key',
                    "Ensure you're using a supported search provider",
                    'Verify the structure of your provider configuration',
                ],
            });
        }
        const allResults = [];
        const errors = [];
        // Counts only the searches that returned at least one result
        let successfulSearches = 0;
        stepLogger.info(`Executing ${queries.length} search queries`);
        // Queries run one after another; a failing query does not stop the rest.
        for (const query of queries) {
            try {
                const searchParams = {
                    query,
                    maxResults,
                    language,
                    region,
                    safeSearch,
                    provider: sdkProvider,
                };
                stepLogger.debug(`Searching for: "${query}"`);
                const searchResults = await performWebSearch(searchParams);
                const convertedResults = convertSearchResults(searchResults);
                if (convertedResults.length > 0) {
                    successfulSearches++;
                    stepLogger.info(`Query "${query}" returned ${convertedResults.length} results`);
                    allResults.push(...convertedResults);
                }
                else {
                    stepLogger.warn(`Query "${query}" returned no results`);
                }
            }
            catch (error) {
                const errorMessage = error instanceof Error ? error.message : String(error);
                stepLogger.error(`Search failed for query "${query}": ${errorMessage}`);
                // Keep the failure so it can be reported in the metadata / SearchError details
                if (error instanceof Error) {
                    errors.push(error);
                }
                else {
                    errors.push(new Error(`Unknown error: ${String(error)}`));
                }
            }
        }
        if (allResults.length === 0) {
            if (requireResults) {
                throw new SearchError({
                    message: 'No search results found for any queries',
                    step: 'WebSearch',
                    details: {
                        queries,
                        errors: errors.map((e) => e.message),
                    },
                    retry: true,
                    suggestions: [
                        'Try different search queries',
                        'Check if the search provider is working correctly',
                        'Verify API keys and rate limits',
                        'Consider using a different search provider',
                    ],
                });
            }
            else {
                stepLogger.warn('No search results found for any queries, continuing anyway');
            }
        }
        // Deduplicate by URL in a single pass, keeping the first occurrence of each URL
        const seenUrls = new Set();
        const uniqueResults = allResults.filter(({ url }) => {
            if (seenUrls.has(url)) {
                return false;
            }
            seenUrls.add(url);
            return true;
        });
        stepLogger.debug(`Deduplicated ${allResults.length} results to ${uniqueResults.length} unique URLs`);
        // Limit to maxResults
        const limitedResults = uniqueResults.slice(0, maxResults);
        if (uniqueResults.length > maxResults) {
            stepLogger.debug(`Limited to ${maxResults} results (dropped ${uniqueResults.length - maxResults})`);
        }
        // Remove raw property if not needed
        if (!includeRawResults) {
            limitedResults.forEach((result) => {
                delete result.raw;
            });
        }
        stepLogger.info(`Found ${limitedResults.length} search results after processing`);
        // Build the updated state as a shallow copy with the new data merged in
        const newState = {
            ...state,
            data: {
                ...state.data,
                searchResults: limitedResults,
                searchMetadata: {
                    successfulQueries: successfulSearches,
                    totalQueries: queries.length,
                    provider: sdkProvider.name,
                    timestamp: new Date().toISOString(),
                    ...(errors.length > 0 ? { errors: errors.map((e) => e.message) } : {}),
                },
            },
        };
        if (includeInResults) {
            return {
                ...newState,
                // Tolerate a state without a `results` array instead of failing on the spread
                results: [...(newState.results ?? []), { searchResults: limitedResults }],
            };
        }
        return newState;
    }
    catch (error) {
        if (error instanceof ConfigurationError ||
            error instanceof ValidationError ||
            error instanceof SearchError) {
            // These are already properly formatted, just throw them
            throw error;
        }
        else if (error instanceof Error && error.message.includes('network')) {
            throw new NetworkError({
                message: `Network error during web search: ${error.message}`,
                step: 'WebSearch',
                details: { originalError: error },
                retry: true,
                suggestions: [
                    'Check your internet connection',
                    "Verify the search provider's API endpoint is accessible",
                    'Try again later if this might be a temporary issue',
                ],
            });
        }
        else {
            throw new SearchError({
                message: `Error during web search: ${error instanceof Error ? error.message : String(error)}`,
                step: 'WebSearch',
                details: { originalError: error },
                retry: true,
                suggestions: [
                    'Check search provider configuration',
                    'Verify API key is valid and has sufficient permissions',
                    'Check query format and content',
                    'Inspect the error details for more specific guidance',
                ],
            });
        }
    }
}
/**
 * Creates a web search step for the research pipeline.
 *
 * The step uses the provider given in `options`, or falls back to the
 * `defaultSearchProvider` from the research state. At least one of the two
 * must be available when the step runs.
 *
 * The step is registered as retryable with a 2000 retry delay and a backoff
 * factor of 2. `options.maxRetries` sets the retry count and defaults to 3
 * only when it is `null`/`undefined`, so an explicit `0` is passed through.
 *
 * @param options Configuration options for the web search (optional; defaults
 *   to an empty object)
 * @returns A web search step for the research pipeline
 *
 * @example
 * ```typescript
 * // Using a specific provider in options
 * searchWeb({
 *   provider: google.configure({
 *     apiKey: process.env.GOOGLE_API_KEY,
 *     cx: process.env.GOOGLE_CX
 *   }),
 *   maxResults: 10
 * })
 *
 * // Or relying on the defaultSearchProvider from the research function
 * searchWeb({
 *   maxResults: 10,
 *   useQueriesFromPlan: true
 * })
 * ```
 */
export function searchWeb(options = {}) {
    return createStep('WebSearch', 
    // Wrapper matching the expected executor signature; it always runs with the
    // options captured when the step was created.
    async (state, opts) => {
        return executeWebSearchStep(state, options);
    }, options, {
        // Mark as retryable by default
        retryable: true,
        // `??` rather than `||` so that maxRetries: 0 is not replaced by 3
        maxRetries: options.maxRetries ?? 3,
        retryDelay: 2000,
        backoffFactor: 2,
    });
}
//# sourceMappingURL=searchWeb.js.map