rehiver

Version:

Supercharge your S3 Hive-partitioned file operations with intelligent pattern matching, change detection, optimized data fetching, and out-of-the-box time-series support.

github.com/TimMikeladze/rehiver

TimMikeladze/rehiver

863 lines (860 loc) • 29.3 kB

text/typescript

import micromatch from 'micromatch';
import { z } from 'zod';

/**
 * Validate an S3 bucket name according to AWS naming rules.
 * @param bucketName Name of the bucket to validate
 * @returns True if valid, false otherwise
 */
declare function isValidBucketName(bucketName: string): boolean;

/**
 * Log levels for the logger.
 */
declare enum LogLevel {
  Debug = 0,
  Info = 1,
  Warn = 2,
  Error = 3,
  None = 100
}

/**
 * Logger interface for custom logging implementations.
 */
interface Logger {
  debug(message: string, ...args: unknown[]): void;
  info(message: string, ...args: unknown[]): void;
  warn(message: string, ...args: unknown[]): void;
  error(message: string, ...args: unknown[]): void;
  setLevel(level: LogLevel): void;
  getLevel(): LogLevel;
}

/**
 * Content type detection and handling utilities.
 */
declare const ContentType: {
  /**
   * Detect content type from a file path or key.
   * @param path Path or key to determine content type from
   * @returns The detected MIME type or application/octet-stream if unknown
   */
  detect(path: string): string;
  /**
   * Get charset for a given content type.
   * @param contentType MIME type to get charset for
   * @returns Charset string or null if not applicable
   */
  charset(contentType: string): string | null;
  /**
   * Get file extension for a content type.
   * @param contentType MIME type
   * @returns File extension (with dot) or false if not found
   */
  extension(contentType: string): string | false;
  /**
   * Check if a content type represents text data.
   * @param contentType MIME type to check
   * @returns True if content type is text-based
   */
  isText(contentType: string): boolean;
  /**
   * Check if a content type represents binary data.
   * @param contentType MIME type to check
   * @returns True if content type is binary
   */
  isBinary(contentType: string): boolean;
};

/**
 * Default logger implementation using console.
 */
declare class ConsoleLogger implements Logger {
  private level;
  constructor(level?: LogLevel);
  debug(message: string, ...args: unknown[]): void;
  info(message: string, ...args: unknown[]): void;
  warn(message: string, ...args: unknown[]): void;
  error(message: string, ...args: unknown[]): void;
  setLevel(level: LogLevel): void;
  getLevel(): LogLevel;
}

/**
 * No-op logger that discards all log messages.
 */
declare class NoopLogger implements Logger {
  debug(): void;
  info(): void;
  warn(): void;
  error(): void;
  setLevel(): void;
  getLevel(): LogLevel;
}

/**
 * Set the global logger instance.
 * @param logger The logger instance to use globally
 */
declare function setGlobalLogger(logger: Logger): void;

/**
 * Get the current global logger instance.
 * @returns The current global logger
 */
declare function getGlobalLogger(): Logger;

/**
 * Safely encodes S3 keys for use in URLs and handling special characters.
 * @param key S3 key to encode
 * @returns URL-encoded key
 */
declare function encodeS3Key(key: string): string;

/**
 * Safely decodes S3 keys from URLs.
 * @param encodedKey URL-encoded S3 key
 * @returns Decoded key
 */
declare function decodeS3Key(encodedKey: string): string;

/**
 * Interface for object metadata.
 */
interface ObjectMetadata {
  key: string;
  size: number;
  etag: string;
  lastModified: Date;
  contentType?: string;
}

/**
 * S3 object interface matching AWS SDK structure with camelCase properties.
 */
interface S3Object {
  key?: string;
  size?: number;
  etag?: string;
  lastModified?: Date;
}

/**
 * Change types for tracking object modifications.
 */
declare enum ChangeType {
  Added = "added",
  Modified = "modified",
  Deleted = "deleted",
  Unchanged = "unchanged"
}

/**
 * Result of a change detection comparison.
 */
interface ChangeResult {
  changeType: ChangeType;
  object: ObjectMetadata;
  previousVersion?: ObjectMetadata;
}

/**
 * Options for change detection.
 */
interface ChangeDetectionOptions {
  stateFilePath?: string;
  compareMode?: "quick" | "full";
  ignoreEtagOnSize?: boolean;
  trackDeleted?: boolean;
}

/**
 * Engine for detecting changes in S3 objects between runs.
 */
declare class ChangeDetectionEngine {
  private previousState;
  private currentState;
  private options;
  /**
   * Creates a new ChangeDetectionEngine.
   * @param options Engine configuration options
   */
  constructor(options?: ChangeDetectionOptions);
  /**
   * Loads the previous state from a JSON file or creates a new state if none exists.
   */
  loadPreviousState(): Promise<void>;
  /**
   * Saves the current state to a JSON file.
   */
  saveCurrentState(): Promise<void>;
  /**
   * Adds an object to the current state.
   * @param object Object metadata to track
   */
  addObject(object: ObjectMetadata): void;
  /**
   * Adds multiple objects to the current state.
   * @param objects Array of object metadata
   */
  addObjects(objects: ObjectMetadata[]): void;
  /**
   * Converts an S3 object to our metadata format.
   * @param s3Object S3 object from AWS SDK
   * @returns Normalized object metadata
   */
  static fromS3Object(s3Object: S3Object): ObjectMetadata;
  /**
   * Determines if an object has changed between states.
   * @param current Current object metadata
   * @param previous Previous object metadata
   * @returns True if the object has changed
   */
  private hasObjectChanged;
  /**
   * Compares current state with previous state to detect changes.
   * @returns Array of change results
   */
  detectChanges(): ChangeResult[];
  /**
   * Filter changes by type.
   * @param changes Array of change results
   * @param types Types of changes to include
   * @returns Filtered array of change results
   */
  static filterChangesByType(changes: ChangeResult[], types: ChangeType[]): ChangeResult[];
  /**
   * Creates a new state from the current state.
   */
  commitChanges(): void;
  /**
   * Clears the current state.
   */
  resetCurrentState(): void;
  /**
   * Clears all state (previous and current).
   */
  resetAllState(): void;
}

/**
 * A utility class for efficient path lookups using micromatch.
 */
declare class PathMatcher {
  private readonly options;
  private readonly patternCache;
  /**
   * Creates a new PathMatcher instance.
   * @param options Default options to use for all matches
   */
  constructor(options?: micromatch.Options);
  /**
   * Check if a path matches a glob pattern.
   * @param path Path to check
   * @param pattern Glob pattern to match against
   * @param options Additional options to override defaults
   * @returns True if the path matches the pattern
   */
  isMatch(path: string, pattern: string | string[], options?: micromatch.Options): boolean;
  /**
   * Filter an array of paths based on one or more glob patterns.
   * @param paths Array of paths to filter
   * @param patterns One or more glob patterns
   * @param options Additional options to override defaults
   * @returns Filtered array of paths that match the patterns
   */
  match(paths: string[], patterns: string | string[], options?: micromatch.Options): string[];
  /**
   * Create and cache a regular expression for a glob pattern.
   * @param pattern Glob pattern to convert to regex
   * @param options Additional options to override defaults
   * @returns Regular expression for the pattern
   */
  getRegex(pattern: string, options?: micromatch.Options): RegExp;
  /**
   * Filter paths using pre-compiled regex patterns for maximum performance.
   * @param paths Array of paths to filter
   * @param patterns One or more glob patterns
   * @param options Additional options to override defaults
   * @returns Filtered array of paths that match the patterns
   */
  matchFast(paths: string[], patterns: string | string[], options?: micromatch.Options): string[];
  /**
   * Filter paths that don't match any of the provided patterns.
   * @param paths Array of paths to filter
   * @param patterns One or more glob patterns to exclude
   * @param options Additional options to override defaults
   * @returns Filtered array of paths that don't match any pattern
   */
  not(paths: string[], patterns: string | string[], options?: micromatch.Options): string[];
  /**
   * Check if all of the provided patterns match the path.
   * @param path Path to check
   * @param patterns One or more glob patterns
   * @param options Additional options to override defaults
   * @returns True if all patterns match the path
   */
  all(path: string, patterns: string | string[], options?: micromatch.Options): boolean;
  /**
   * Capture values from a path based on a glob pattern.
   * @param pattern Glob pattern with capture groups
   * @param path Path to extract values from
   * @param options Additional options to override defaults
   * @returns Array of captured values or null if no match
   */
  capture(pattern: string, path: string, options?: micromatch.Options): string[] | null;
}

/**
 * Configuration interface for S3 client.
 */
interface S3ClientConfig {
  region?: string;
  endpoint?: string;
  credentials?: {
    accessKeyId: string;
    secretAccessKey: string;
    sessionToken?: string;
  };
  maxRetries?: number;
  requestTimeout?: number;
  connectTimeout?: number;
}

/**
 * Metadata cache configuration.
 */
interface MetadataCacheConfig {
  enabled?: boolean;
  maxSize?: number;
  ttl?: number;
  refreshThreshold?: number;
}

/**
 * S3 client options accepted by the `S3PathMatcher` and `Rehiver` constructors.
 * Declared once so both constructors stay in sync. This is a distinct shape
 * from the exported `S3ClientConfig` and is kept separate on purpose.
 */
interface S3ConnectionOptions {
  region?: string;
  endpoint?: string;
  credentials?: {
    accessKeyId: string;
    secretAccessKey: string;
  };
  forcePathStyle?: boolean;
  maxRetries?: number;
}

/**
 * Counters passed to `onProgress` callbacks of the find/stream operations.
 */
interface MatchProgressStats {
  processed: number;
  total: number;
  matched: number;
  skippedExisting?: number;
}

/**
 * Local cache settings accepted by the find/stream operations.
 */
interface LocalCacheOptions {
  enabled: boolean;
  basePath: string;
  skipExisting?: boolean;
}

/**
 * Options for `findMatchingObjects` when the bucket and patterns are passed
 * positionally.
 */
interface FindMatchingObjectsOptions {
  prefix?: string;
  maxConcurrentRequests?: number;
  maxKeysPerRequest?: number;
  matchOptions?: micromatch.Options;
  useNegation?: boolean;
  abortSignal?: AbortSignal;
  onProgress?: (stats: MatchProgressStats) => void;
  concurrency?: {
    requestLimit?: number;
    processingLimit?: number;
  };
  localCache?: LocalCacheOptions;
  handleSpecialChars?: boolean;
}

/**
 * Single-argument form of `findMatchingObjects`: the positional options plus
 * the bucket and patterns.
 */
interface FindMatchingObjectsRequest extends FindMatchingObjectsOptions {
  bucket: string;
  patterns: string | string[];
}

/**
 * Options for `streamMatchingObjects` when the bucket, patterns and processor
 * are passed positionally.
 */
interface StreamMatchingObjectsOptions {
  prefix?: string;
  batchSize?: number;
  maxConcurrentRequests?: number;
  maxKeysPerRequest?: number;
  matchOptions?: micromatch.Options;
  maxConcurrentProcessing?: number;
  abortSignal?: AbortSignal;
  onProgress?: (stats: MatchProgressStats) => void;
  localCache?: LocalCacheOptions;
  handleSpecialChars?: boolean;
}

/**
 * Single-argument form of `streamMatchingObjects`: the positional options plus
 * the bucket, patterns and processor.
 */
interface StreamMatchingObjectsRequest extends StreamMatchingObjectsOptions {
  bucket: string;
  patterns: string | string[];
  processor: (path: string) => Promise<void>;
}

/**
 * Statistics returned by `streamMatchingObjects`.
 */
interface StreamMatchingObjectsResult {
  processed: number;
  matched: number;
  skipped: number;
  skippedExisting: number;
}

/**
 * A utility class for working with S3 paths and glob patterns.
 */
declare class S3PathMatcher extends PathMatcher {
  private readonly s3Client;
  private readonly metadataCache;
  private readonly cacheConfig;
  private readonly pendingRefreshes;
  protected readonly logger: Logger;
  /**
   * Creates a new S3PathMatcher with S3 client and caching configuration.
   * @param options Micromatch options for path matching
   * @param s3Options S3 client configuration
   * @param cacheOptions Metadata cache configuration
   * @param logger Custom logger to use (defaults to global logger)
   */
  constructor(
    options?: micromatch.Options,
    s3Options?: S3ConnectionOptions,
    cacheOptions?: MetadataCacheConfig,
    logger?: Logger
  );
  /**
   * Validates a bucket name according to AWS rules.
   * @param bucketName Name of the bucket to validate
   * @throws Error if bucket name is invalid
   */
  protected validateBucket(bucketName: string): void;
  /**
   * Gets or fetches object metadata from cache or S3.
   * @param bucketName S3 bucket name
   * @param key Object key
   * @returns Object metadata or null if not found
   */
  getObjectMetadata(bucketName: string, key: string): Promise<ObjectMetadata | null>;
  /**
   * Refreshes metadata in the background without blocking.
   * @param bucketName Bucket name
   * @param key Object key
   * @param cacheKey Cache key
   */
  private refreshMetadataInBackground;
  /**
   * Invalidates a specific cache entry.
   * @param bucketName Bucket name
   * @param key Object key
   */
  invalidateCache(bucketName: string, key: string): void;
  /**
   * Clears the entire metadata cache.
   */
  clearCache(): void;
  /**
   * Lists all objects in an S3 bucket, with optional prefix.
   * @param bucketName S3 bucket name
   * @param prefix Key prefix to filter by
   * @param options Additional options for listing
   * @returns Array of object keys
   */
  listObjects(
    bucketName: string,
    prefix?: string,
    options?: {
      maxConcurrentRequests?: number;
      maxKeysPerRequest?: number;
      abortSignal?: AbortSignal;
    }
  ): Promise<string[]>;
  /**
   * Uploads content to S3 with automatic content-type detection.
   * @param bucketName S3 bucket name
   * @param key Object key
   * @param body Object content
   * @param options Additional upload options
   * @returns ETag of the uploaded object
   */
  putObject(
    bucketName: string,
    key: string,
    body: Buffer | Uint8Array | string | Blob | ReadableStream,
    options?: {
      contentType?: string;
      contentEncoding?: string;
      contentDisposition?: string;
      cacheControl?: string;
      metadata?: Record<string, string>;
      tagging?: string;
    }
  ): Promise<string>;
  /**
   * Find objects in an S3 bucket that match the given patterns.
   * @param bucketOrOptions Bucket name or options object
   * @param patterns Glob patterns to match against object keys
   * @param options Additional options for listing and matching
   * @returns Array of matching object keys
   */
  findMatchingObjects(
    bucketOrOptions: string | FindMatchingObjectsRequest,
    patterns?: string | string[],
    options?: FindMatchingObjectsOptions
  ): Promise<string[]>;
  /**
   * Streams matching objects through a processor function.
   * @param bucketOrOptions Bucket name or options object
   * @param patterns Glob patterns to match
   * @param processor Function to process each matching object
   * @param options Additional options
   * @returns Processing statistics
   */
  streamMatchingObjects(
    bucketOrOptions: string | StreamMatchingObjectsRequest,
    patterns?: string | string[],
    processor?: (path: string) => Promise<void>,
    options?: StreamMatchingObjectsOptions
  ): Promise<StreamMatchingObjectsResult>;
}

/**
 * HivePartitionParser - A utility for parsing Hive partition paths and validating them with Zod schemas.
 */
declare class HivePartitionParser<T extends z.ZodTypeAny> {
  private schema;
  private partitionKeys;
  /**
   * Create a new HivePartitionParser with a Zod schema.
   * @param schema A Zod schema that defines the structure and types of partition keys
   */
  constructor(schema: T);
  /**
   * Parse a Hive partition path and return a typed object.
   * @param path The partition path to parse (e.g., "/table/year=2023/month=12/day=25")
   * @returns A validated object matching the schema type
   * @throws If validation fails or required partitions are missing
   */
  parse(path: string): z.infer<T>;
  /**
   * Try to parse a path, returning a result object with success/error information.
   * @param path The partition path to parse
   * @returns A Zod parse result with either data or error
   */
  safeParse(path: string): z.SafeParseReturnType<unknown, z.infer<T>>;
  /**
   * Format a typed object into a Hive partition path.
   * @param data An object matching the schema
   * @returns A formatted partition path string
   */
  format(data: z.infer<T>): string;
  /**
   * Create a glob pattern from partial partition data.
   * @param partialData Partial data with some partition keys specified
   * @returns A glob pattern that can match multiple partitions
   */
  createGlobPattern(partialData: Partial<z.infer<T>>): string;
  /**
   * Check if a partition path is valid according to the schema.
   * @param path The partition path to validate
   * @returns True if the path is valid, false otherwise
   */
  isValid(path: string): boolean;
  /**
   * Get validation errors for a path.
   * @param path The partition path to validate
   * @returns Array of error messages or empty array if valid
   */
  getValidationErrors(path: string): string[];
  /**
   * Find missing partition keys in a path.
   * @param path The partition path to check
   * @returns Array of missing key names
   */
  getMissingKeys(path: string): string[];
  /**
   * Extract only specific keys from a partition path.
   * @param path The partition path
   * @param keys Keys to extract
   * @returns Object containing only the specified keys
   */
  extractKeys(path: string, keys: string[]): Partial<z.infer<T>>;
  /**
   * Apply a transformation function to partition values.
   * @param path Original partition path
   * @param transformFn Function that takes current values and returns new values
   * @returns New partition path with transformed values
   */
  transform(path: string, transformFn: (data: z.infer<T>) => Partial<z.infer<T>>): string;
  /**
   * Check if a path matches a glob pattern.
   * Uses simplified glob matching logic for * wildcards.
   * @param path The path to check
   * @param pattern The glob pattern
   * @returns True if the path matches the pattern
   */
  matchesGlob(path: string, pattern: string): boolean;
  private segmentMatchesPattern;
}

/**
 * Supported time granularity types.
 */
declare enum TimeGranularity {
  Hourly = "hourly",
  Daily = "daily",
  Monthly = "monthly",
  Yearly = "yearly"
}

/**
 * Configuration for time-based partition path generation.
 */
interface TimePartitionConfig {
  granularity: TimeGranularity;
  includeHour?: boolean;
  includeMinute?: boolean;
  format?: "hive" | "path";
  prefix?: string;
  dateFormat?: {
    year?: string;
    month?: string;
    day?: string;
    hour?: string;
    minute?: string;
  };
}

/**
 * Utility for generating time-based partition paths.
 */
declare class TimePartitionGenerator {
  private config;
  /**
   * Creates a new TimePartitionGenerator.
   * @param config Configuration for partition path generation
   */
  constructor(config: TimePartitionConfig);
  /**
   * Generate a partition path for a specific date.
   * @param date The date to generate the partition for
   * @returns Formatted partition path string
   */
  generatePath(date?: Date): string;
  /**
   * Generate partition paths for a time range.
   * @param startDate Start of the range
   * @param endDate End of the range (inclusive)
   * @returns Array of partition paths
   */
  generatePathsForRange(startDate: Date, endDate: Date): string[];
  /**
   * Generate a partition path for the current time.
   * @returns Formatted partition path for current time
   */
  generateCurrentPath(): string;
}

/**
 * Shape of the `partition` helper exposed both statically and per instance on `Rehiver`.
 */
interface PartitionFactory {
  create: <T extends z.ZodTypeAny>(schema: T) => HivePartitionParser<T>;
}

/**
 * Shape of the `time` helper exposed both statically and per instance on `Rehiver`.
 */
interface TimePartitionFactory {
  daily: (options?: Partial<TimePartitionConfig>) => TimePartitionGenerator;
  hourly: (options?: Partial<TimePartitionConfig>) => TimePartitionGenerator;
  monthly: (options?: Partial<TimePartitionConfig>) => TimePartitionGenerator;
  yearly: (options?: Partial<TimePartitionConfig>) => TimePartitionGenerator;
  custom: (config: TimePartitionConfig) => TimePartitionGenerator;
}

/**
 * Shape of the `changes` helper exposed both statically and per instance on `Rehiver`.
 */
interface ChangeDetectionFactory {
  detect: (options?: ChangeDetectionOptions) => ChangeDetectionEngine;
}

/**
 * Main Rehiver class for S3 operations with pattern matching.
 */
declare class Rehiver extends S3PathMatcher {
  static partition: PartitionFactory;
  static time: TimePartitionFactory;
  static changes: ChangeDetectionFactory;
  partition: PartitionFactory;
  time: TimePartitionFactory;
  changes: ChangeDetectionFactory;
  protected logger: Logger;
  constructor(options?: {
    matchOptions?: micromatch.Options;
    s3Options?: S3ConnectionOptions;
    cacheOptions?: MetadataCacheConfig;
    loggerOptions?: {
      logger?: Logger;
      level?: LogLevel;
    };
  });
  /**
   * Set the logger for this instance.
   * @param logger Logger to use
   */
  setLogger(logger: Logger): void;
  /**
   * Get the current logger.
   * @returns The current logger instance
   */
  getLogger(): Logger;
  /**
   * Create a HivePartitionParser with a given schema.
   * @param schema The Zod schema defining the partition structure
   */
  partitionParser<T extends z.ZodTypeAny>(schema: T): HivePartitionParser<T>;
  /**
   * Create a TimePartitionGenerator with the given configuration.
   * @param config Configuration for time partitioning
   */
  timePartitioner(config: TimePartitionConfig): TimePartitionGenerator;
  /**
   * Create a ChangeDetectionEngine for tracking object changes.
   * @param options Configuration options for change detection
   */
  changeDetector(options?: ChangeDetectionOptions): ChangeDetectionEngine;
  /**
   * Check if a path matches a pattern.
   * @param path The path to check
   * @param pattern The pattern to match against
   * @param options Additional match options
   */
  isMatch(path: string, pattern: string | string[], options?: micromatch.Options): boolean;
  /**
   * Match an array of paths against a pattern.
   * @param paths The paths to filter
   * @param patterns The pattern(s) to match against
   * @param options Additional match options
   */
  match(paths: string[], patterns: string | string[], options?: micromatch.Options): string[];
  /**
   * Match paths using precompiled regular expressions for better performance.
   * @param paths The paths to filter
   * @param patterns The pattern(s) to match against
   * @param options Additional match options
   */
  matchFast(paths: string[], patterns: string | string[], options?: micromatch.Options): string[];
  /**
   * Get paths that do NOT match the pattern.
   * @param paths The paths to filter
   * @param patterns The pattern(s) to match against
   * @param options Additional match options
   */
  not(paths: string[], patterns: string | string[], options?: micromatch.Options): string[];
  /**
   * Capture values from a path using a pattern with named placeholders.
   * @param pattern The pattern with placeholders (e.g., 'year=:year/month=:month')
   * @param path The path to extract values from
   * @param options Additional match options
   */
  capture(pattern: string, path: string, options?: micromatch.Options): string[] | null;
  /**
   * Find objects in S3 that match the specified patterns with support for both positional
   * and object parameter styles.
   */
  findMatchingObjects(
    bucketOrOptions: string | FindMatchingObjectsRequest,
    patterns?: string | string[],
    options?: FindMatchingObjectsOptions
  ): Promise<string[]>;
  /**
   * Stream and process objects in S3 that match the specified patterns with support
   * for both positional and object parameter styles.
   *
   * The option types are shared with `S3PathMatcher.streamMatchingObjects`, so
   * `handleSpecialChars` is accepted here exactly as on the base class.
   * NOTE(review): confirm the implementation forwards `handleSpecialChars` to the
   * base method; the previous declaration of this override omitted it.
   */
  streamMatchingObjects(
    bucketOrOptions: string | StreamMatchingObjectsRequest,
    patterns?: string | string[],
    processor?: (path: string) => Promise<void>,
    options?: StreamMatchingObjectsOptions
  ): Promise<StreamMatchingObjectsResult>;
}

export { ChangeDetectionEngine, type ChangeDetectionOptions, type ChangeResult, ChangeType, ConsoleLogger, ContentType, HivePartitionParser, LogLevel, type Logger, type MetadataCacheConfig, NoopLogger, type ObjectMetadata, PathMatcher, Rehiver, type S3ClientConfig, type S3Object, S3PathMatcher, TimeGranularity, type TimePartitionConfig, TimePartitionGenerator, decodeS3Key, Rehiver as default, encodeS3Key, getGlobalLogger, isValidBucketName, setGlobalLogger };