UNPKG

@hashgraph/solo

Version:

An opinionated CLI tool to deploy and manage private Hedera Networks.

671 lines (606 loc) 24.7 kB
// SPDX-License-Identifier: Apache-2.0

import {
  type CoreV1Event,
  type CoreV1Api,
  type KubeConfig,
  Metrics,
  type PodMetricsList,
  V1Container,
  V1ExecAction,
  V1ObjectMeta,
  V1Pod,
  type V1PodList,
  V1PodSpec,
  V1Probe,
  type V1ContainerStatus,
  type V1ContainerStateWaiting,
  type V1ContainerStateTerminated,
} from '@kubernetes/client-node';
import {type Pods} from '../../../resources/pod/pods.js';
import {NamespaceName} from '../../../../../types/namespace/namespace-name.js';
import {PodReference} from '../../../resources/pod/pod-reference.js';
import {type Pod} from '../../../resources/pod/pod.js';
import {K8ClientPod} from './k8-client-pod.js';
import {Duration} from '../../../../../core/time/duration.js';
import {K8ClientBase} from '../../k8-client-base.js';
import {SoloError} from '../../../../../core/errors/solo-error.js';
import {MissingArgumentError} from '../../../../../core/errors/missing-argument-error.js';
import * as constants from '../../../../../core/constants.js';
import {type SoloLogger} from '../../../../../core/logging/solo-logger.js';
import {container} from 'tsyringe-neo';
import {type ContainerName} from '../../../resources/container/container-name.js';
import {PodName} from '../../../resources/pod/pod-name.js';
import {InjectTokens} from '../../../../../core/dependency-injection/inject-tokens.js';
import {KubeApiResponse} from '../../../kube-api-response.js';
import {ResourceOperation} from '../../../resources/resource-operation.js';
import {ResourceType} from '../../../resources/resource-type.js';
import {type PodMetricsItem} from '../../../resources/pod/pod-metrics-item.js';
import yaml from 'yaml';
import {sleep} from '../../../../../core/helpers.js';

/**
 * Waiting reasons for container states that are non-recoverable (image unavailable in registry).
 */
const FATAL_WAITING_REASONS: ReadonlySet<string> = new Set([
  'ImagePullBackOff',
  'ErrImagePull',
  'InvalidImageName',
  'ImageInspectError',
  'RegistryUnavailable',
]);

/**
 * Terminated reasons for container states that are non-recoverable (e.g. out-of-memory kill).
 */
const FATAL_TERMINATED_REASONS: ReadonlySet<string> = new Set(['OOMKilled']);

/**
 * Number of consecutive polls that must report the same fatal error for a pod before
 * {@link K8ClientPods.waitForRunningPhase} gives up and rejects.
 */
const FATAL_ERROR_RETRY_THRESHOLD: number = 3;

/**
 * Message fragments that mark an image pull failure as non-recoverable. A pull error whose message
 * matches none of these patterns is treated as transient by {@link detectFatalContainerError}.
 */
const NON_RECOVERABLE_IMAGE_PULL_PATTERNS: ReadonlyArray<RegExp> = [
  /not found/i,
  /manifest unknown/i,
  /pull access denied/i,
  /requested access to the resource is denied/i,
  /insufficient_scope/i,
  /unauthorized/i,
  /authentication required/i,
  /invalid reference format/i,
];

/** Number of bytes in one mebibyte (MiB). */
const BYTES_PER_MEBIBYTE: number = 1024 * 1024;

/**
 * Kubernetes memory quantity suffixes mapped to the number of bytes one unit represents.
 * Binary (power-of-two) suffixes end in `i`; decimal (power-of-ten) suffixes are a single letter.
 */
const MEMORY_SUFFIX_BYTES: ReadonlyArray<readonly [string, number]> = [
  ['Ki', 1024],
  ['Mi', 1024 ** 2],
  ['Gi', 1024 ** 3],
  ['Ti', 1024 ** 4],
  ['Pi', 1024 ** 5],
  ['Ei', 1024 ** 6],
  ['k', 1e3],
  ['M', 1e6],
  ['G', 1e9],
  ['T', 1e12],
  ['P', 1e15],
  ['E', 1e18],
];

/**
 * Kubernetes CPU quantity suffixes mapped to the number of such units contained in one millicore
 * (nano-cores, micro-cores and milli-cores respectively).
 */
const CPU_SUFFIX_UNITS_PER_MILLICORE: ReadonlyArray<readonly [string, number]> = [
  ['n', 1_000_000],
  ['u', 1000],
  ['m', 1],
];

/**
 * Timeout, in seconds, passed as `timeoutSeconds` to the pod list calls in this file.
 *
 * The Kubernetes API expects this value in seconds, so the five-minute duration is converted
 * from milliseconds here. Evaluated on demand rather than at module load.
 */
function podListTimeoutSeconds(): number {
  return Math.floor(Duration.ofMinutes(5).toMillis() / 1000);
}

/**
 * Inspect a V1Pod's container statuses for non-recoverable error states and return a descriptive
 * error message if one is detected, or undefined if no fatal error is present.
 *
 * Covered states:
 *  - Waiting: ImagePullBackOff, ErrImagePull, InvalidImageName, ImageInspectError,
 *    RegistryUnavailable (image unavailable in registry)
 *  - Terminated: OOMKilled (container killed due to out-of-memory)
 *
 * ErrImagePull, ImagePullBackOff and ImageInspectError are only reported when the waiting message
 * matches one of {@link NON_RECOVERABLE_IMAGE_PULL_PATTERNS}; otherwise that container is skipped.
 * Init containers are inspected before regular containers and the first match wins.
 *
 * @param pod - the pod whose init-container and container statuses are inspected
 * @returns a human-readable description of the first fatal state found, or undefined
 */
export function detectFatalContainerError(pod: V1Pod): string | undefined {
  const podName: string = pod.metadata?.name ?? '<unknown>';
  const allContainerStatuses: V1ContainerStatus[] = [
    ...(pod.status?.initContainerStatuses ?? []),
    ...(pod.status?.containerStatuses ?? []),
  ];

  for (const containerStatus of allContainerStatuses) {
    const containerName: string = containerStatus.name ?? '<unknown>';

    const waitingState: V1ContainerStateWaiting | undefined = containerStatus.state?.waiting;
    if (waitingState?.reason && FATAL_WAITING_REASONS.has(waitingState.reason)) {
      // Pull/inspect failures may be transient; only treat them as fatal when the message says so.
      if (
        (waitingState.reason === 'ErrImagePull' ||
          waitingState.reason === 'ImagePullBackOff' ||
          waitingState.reason === 'ImageInspectError') &&
        !isNonRecoverableImagePullError(waitingState.message)
      ) {
        continue;
      }

      const detail: string = waitingState.message ? `: ${waitingState.message}` : '';
      return (
        `Pod "${podName}" container "${containerName}" is in a non-recoverable state: ` +
        `${waitingState.reason}${detail}`
      );
    }

    const terminatedState: V1ContainerStateTerminated | undefined = containerStatus.state?.terminated;
    if (terminatedState?.reason && FATAL_TERMINATED_REASONS.has(terminatedState.reason)) {
      return (
        `Pod "${podName}" container "${containerName}" was terminated due to: ` +
        `${terminatedState.reason} (exit code ${terminatedState.exitCode ?? 'unknown'})`
      );
    }
  }

  return undefined;
}

/**
 * Decide whether an image pull failure message describes a non-recoverable condition.
 *
 * @param message - the waiting-state message reported for the container, if any
 * @returns true when the message matches one of {@link NON_RECOVERABLE_IMAGE_PULL_PATTERNS};
 *   false when the message is empty/absent or matches none of them
 */
function isNonRecoverableImagePullError(message?: string): boolean {
  if (!message) {
    return false;
  }

  return NON_RECOVERABLE_IMAGE_PULL_PATTERNS.some((pattern): boolean => pattern.test(message));
}

/**
 * {@link Pods} implementation backed by the Kubernetes `CoreV1Api` client.
 */
export class K8ClientPods extends K8ClientBase implements Pods {
  private readonly logger: SoloLogger;

  /**
   * @param kubeClient - CoreV1 API client used for all pod and event calls
   * @param kubeConfig - kube configuration, forwarded to created pods and used for the metrics client
   * @param kubectlInstallationDirectory - forwarded unchanged to every {@link K8ClientPod} created here
   */
  public constructor(
    private readonly kubeClient: CoreV1Api,
    private readonly kubeConfig: KubeConfig,
    private readonly kubectlInstallationDirectory: string,
  ) {
    super();
    this.logger = container.resolve(InjectTokens.SoloLogger);
  }

  /**
   * Build a pod handle for the given reference without calling the Kubernetes API.
   *
   * @param podReference - reference of the pod (may be null; passed through as-is)
   */
  public readByReference(podReference: PodReference | null): Pod {
    return new K8ClientPod(podReference, this, this.kubeClient, this.kubeConfig, this.kubectlInstallationDirectory);
  }

  /**
   * Look up a pod by listing the namespace with a `metadata.name` field selector.
   *
   * NOTE(review): what is returned when no pod matches depends on `filterItem` and
   * `K8ClientPod.fromV1Pod`, which are defined outside this file.
   *
   * @param podReference - namespace and name of the pod to read
   */
  public async read(podReference: PodReference): Promise<Pod> {
    const ns: NamespaceName = podReference.namespace;
    const fieldSelector: string = `metadata.name=${podReference.name}`;

    const resp: V1PodList = await this.kubeClient.listNamespacedPod({
      namespace: ns.name,
      fieldSelector,
      timeoutSeconds: podListTimeoutSeconds(),
    });

    return K8ClientPod.fromV1Pod(
      this.filterItem(resp.items, {name: podReference.name.toString()}),
      this,
      this.kubeClient,
      this.kubeConfig,
      this.kubectlInstallationDirectory,
    );
  }

  /**
   * List the pods of a namespace that match the given labels, newest first.
   *
   * @param namespace - namespace to list
   * @param labels - label selector terms; joined with `,` (no selector is sent when falsy)
   * @returns pods sorted by creation timestamp descending; empty when the response has no items
   */
  public async list(namespace: NamespaceName, labels: string[]): Promise<Pod[]> {
    const labelSelector: string = labels ? labels.join(',') : undefined;

    const result: V1PodList = await this.kubeClient.listNamespacedPod({
      namespace: namespace.name,
      labelSelector,
      timeoutSeconds: podListTimeoutSeconds(),
    });

    const sortedItems: V1Pod[] = result?.items
      ? // eslint-disable-next-line unicorn/no-array-sort
        [...result.items].sort(
          (a, b): number =>
            new Date(b.metadata?.creationTimestamp || 0).getTime() -
            new Date(a.metadata?.creationTimestamp || 0).getTime(),
        )
      : [];

    return sortedItems.map(
      (item: V1Pod): Pod =>
        K8ClientPod.fromV1Pod(item, this, this.kubeClient, this.kubeConfig, this.kubectlInstallationDirectory),
    );
  }

  /**
   * Wait until the newest matching pod is running and reports the Ready condition as true.
   *
   * @param namespace - namespace to poll
   * @param labels - label selector terms
   * @param maxAttempts - maximum number of polls (default 10)
   * @param delay - milliseconds between polls (default 500)
   * @param createdAfter - if provided, only pods created strictly after this date are considered
   * @param excludeMarkedForDeletion - if true, pods with a deletionTimestamp are ignored
   * @throws SoloError wrapping the underlying failure when the pod does not become ready
   */
  public async waitForReadyStatus(
    namespace: NamespaceName,
    labels: string[],
    maxAttempts: number = 10,
    delay: number = 500,
    createdAfter?: Date,
    excludeMarkedForDeletion: boolean = false,
  ): Promise<Pod[]> {
    const podReadyCondition: Map<string, string> = new Map<string, string>().set(
      constants.POD_CONDITION_READY,
      constants.POD_CONDITION_STATUS_TRUE,
    );

    try {
      return await this.waitForPodConditions(
        namespace,
        podReadyCondition,
        labels,
        maxAttempts,
        delay,
        createdAfter,
        excludeMarkedForDeletion,
      );
    } catch (error: Error | unknown) {
      const errorMessage: string = error instanceof Error ? error.message : String(error);
      this.logger.showUser(`Pod readiness check failed: ${errorMessage}`);
      throw new SoloError(`Pod with labels [${labels.join(', ')}] not ready [maxAttempts = ${maxAttempts}]`, error);
    }
  }

  /**
   * Wait until the pod identified by `podReference` appears in the Kubernetes API.
   *
   * Use this when the exact pod name is known. If the pod must be discovered by labels,
   * use {@link waitForReadyStatus} with an appropriate label selector instead.
   *
   * @param podReference - exact reference of the pod to wait for
   * @param maxAttempts - maximum polling attempts before throwing (default 20)
   * @param delay - milliseconds between attempts (default 3000)
   * @throws SoloError when the pod is still not visible after `maxAttempts` attempts
   */
  public async waitForPodByReference(
    podReference: PodReference,
    maxAttempts: number = 20,
    delay: number = 3000,
  ): Promise<void> {
    const podName: string = podReference.name.toString();

    for (let attempt: number = 1; attempt <= maxAttempts; attempt++) {
      const pod: Pod = await this.read(podReference);
      if (pod) {
        return;
      }

      this.logger.debug(
        `waitForPodByReference: pod ${podName} not yet visible in API, attempt ${attempt}/${maxAttempts}`,
      );

      // No point in sleeping after the final attempt; fail immediately instead.
      if (attempt < maxAttempts) {
        await sleep(Duration.ofMillis(delay));
      }
    }

    throw new SoloError(`Pod ${podName} not found after ${maxAttempts} attempts`);
  }

  /**
   * Check pods for conditions
   *
   * The newest eligible pod satisfies the check as soon as any one of its conditions matches any
   * one (type, status) entry of `conditionsMap`.
   *
   * @param namespace - namespace
   * @param conditionsMap - a map of conditions and values
   * @param [labels] - pod labels
   * @param [maxAttempts] - maximum attempts to check
   * @param [delay] - delay between checks in milliseconds
   * @param [createdAfter] - if provided, only pods created strictly after this date are considered
   * @param [excludeMarkedForDeletion] - if true, pods with deletionTimestamp are ignored
   * @throws MissingArgumentError when `conditionsMap` is missing or empty
   */
  private async waitForPodConditions(
    namespace: NamespaceName,
    conditionsMap: Map<string, string>,
    labels: string[] = [],
    maxAttempts: number = 10,
    delay: number = 500,
    createdAfter?: Date,
    excludeMarkedForDeletion: boolean = false,
  ): Promise<Pod[]> {
    if (!conditionsMap || conditionsMap.size === 0) {
      throw new MissingArgumentError('pod conditions are required');
    }

    return await this.waitForRunningPhase(
      namespace,
      labels,
      maxAttempts,
      delay,
      (pod): boolean => {
        if (pod.conditions?.length > 0) {
          for (const cond of pod.conditions) {
            for (const entry of conditionsMap.entries()) {
              const condType: string = entry[0];
              const condStatus: string = entry[1];
              if (cond.type === condType && cond.status === condStatus) {
                this.logger.info(
                  `Pod condition met for ${pod.podReference.name.name} [type: ${cond.type} status: ${cond.status}]`,
                );
                return true;
              }
            }
          }
        }
        // condition not found
        return false;
      },
      createdAfter,
      excludeMarkedForDeletion,
    );
  }

  /**
   * Poll the namespace until the newest eligible pod is in the Running phase and, when a predicate
   * is supplied, also satisfies that predicate.
   *
   * On every poll each eligible pod is inspected with {@link detectFatalContainerError}. When the
   * same fatal error is seen for a pod on {@link FATAL_ERROR_RETRY_THRESHOLD} consecutive polls the
   * wait is rejected with that error. Errors thrown by the list call are logged and retried.
   *
   * @param namespace - namespace to poll
   * @param labels - label selector terms; joined with `,`
   * @param maxAttempts - maximum number of polls before rejecting
   * @param delay - milliseconds between polls
   * @param podItemPredicate - optional extra check applied to the newest eligible pod
   * @param createdAfter - if provided, only pods created strictly after this date are considered
   * @param excludeMarkedForDeletion - if true, pods with a deletionTimestamp are ignored
   * @returns a single-element array holding the pod that satisfied the wait
   */
  public async waitForRunningPhase(
    namespace: NamespaceName,
    labels: string[],
    maxAttempts: number,
    delay: number,
    podItemPredicate?: (items: Pod) => boolean,
    createdAfter?: Date,
    excludeMarkedForDeletion: boolean = false,
  ): Promise<Pod[]> {
    const phases: string[] = [constants.POD_PHASE_RUNNING];
    const labelSelector: string = labels ? labels.join(',') : undefined;

    this.logger.info(
      `waitForRunningPhase [labelSelector: ${labelSelector}, namespace:${namespace.name}, maxAttempts: ${maxAttempts}]`,
    );

    return new Promise<Pod[]>((resolve, reject): void => {
      let attempts: number = 0;
      // Tracks, per pod name, how many polls in a row reported the same fatal error.
      const fatalErrorStreakByPod: Map<string, {count: number; error: string}> = new Map<
        string,
        {count: number; error: string}
      >();

      const check: (resolve: (items: Pod[]) => void, reject: (reason?: Error) => void) => Promise<void> = async (
        resolve: (items: Pod[]) => void,
        reject: (reason?: Error) => void,
      ): Promise<void> => {
        // wait for the pod to be available with the given status and labels
        try {
          const response: V1PodList = await this.kubeClient.listNamespacedPod({
            namespace: namespace.name,
            labelSelector,
            timeoutSeconds: podListTimeoutSeconds(),
          });

          this.logger.debug(
            `[attempt: ${attempts}/${maxAttempts}] ${response.items?.length} pod(s) found [labelSelector: ${labelSelector}, namespace:${namespace.name}]`,
          );

          if (response.items?.length > 0) {
            // Sort pods by creation timestamp descending (newest first)
            // eslint-disable-next-line unicorn/no-array-sort
            const sortedItems: V1Pod[] = [...response.items].sort((a, b): number => {
              const aTime: number = a.metadata?.creationTimestamp?.getTime() || 0;
              const bTime: number = b.metadata?.creationTimestamp?.getTime() || 0;
              return bTime - aTime;
            });

            // When a createdAfter cutoff is provided, skip pods that existed before the
            // cutoff (e.g. a terminating predecessor from a recreate migration).
            const createdAfterEligibleItems: V1Pod[] = createdAfter
              ? sortedItems.filter(
                  (pod): boolean => (pod.metadata?.creationTimestamp?.getTime() || 0) > createdAfter.getTime(),
                )
              : sortedItems;

            const eligibleItems: V1Pod[] = excludeMarkedForDeletion
              ? createdAfterEligibleItems.filter((pod): boolean => !pod.metadata?.deletionTimestamp)
              : createdAfterEligibleItems;

            // Allow transient startup states to recover; only fail after repeated fatal detections.
            for (const item of eligibleItems) {
              const fatalError: string | undefined = detectFatalContainerError(item);
              const podName: string = item.metadata?.name ?? '<unknown>';

              if (fatalError) {
                const previous: {count: number; error: string} | undefined = fatalErrorStreakByPod.get(podName);
                // A different error message restarts the streak at 1.
                const nextCount: number = previous?.error === fatalError ? previous.count + 1 : 1;
                fatalErrorStreakByPod.set(podName, {count: nextCount, error: fatalError});

                if (nextCount >= FATAL_ERROR_RETRY_THRESHOLD) {
                  return reject(new SoloError(fatalError));
                }

                this.logger.info(
                  `Detected fatal pod state for "${podName}" (${nextCount}/${FATAL_ERROR_RETRY_THRESHOLD}); retrying`,
                );
              } else {
                fatalErrorStreakByPod.delete(podName);
              }
            }

            if (eligibleItems.length > 0) {
              // Only check the newest eligible pod
              const newestItem: V1Pod = eligibleItems[0];
              const pod: Pod = K8ClientPod.fromV1Pod(
                newestItem,
                this,
                this.kubeClient,
                this.kubeConfig,
                this.kubectlInstallationDirectory,
              );
              if (phases.includes(newestItem.status?.phase) && (!podItemPredicate || podItemPredicate(pod))) {
                return resolve([pod]);
              }
            }
          }
        } catch (error) {
          this.logger.info('Error occurred while waiting for pods, retrying', error);
        }

        if (++attempts < maxAttempts) {
          setTimeout((): Promise<void> => check(resolve, reject), delay);
        } else {
          return reject(
            new SoloError(
              `Expected at least 1 pod not found for labels: ${labelSelector}, phases: ${phases.join(',')} [attempts = ${attempts}/${maxAttempts}]`,
            ),
          );
        }
      };

      check(resolve, reject);
    });
  }

  /**
   * List pods matching the given labels across all namespaces.
   *
   * @param labels - label selector terms; joined with `,` (no selector is sent when falsy)
   * @returns pod handles built from each item's namespace and name
   */
  public async listForAllNamespaces(labels: string[]): Promise<Pod[]> {
    const labelSelector: string = labels ? labels.join(',') : undefined;
    const pods: Pod[] = [];

    try {
      const response: V1PodList = await this.kubeClient.listPodForAllNamespaces({labelSelector});

      if (response?.items?.length > 0) {
        for (const item of response.items) {
          pods.push(
            new K8ClientPod(
              PodReference.of(NamespaceName.of(item.metadata?.namespace), PodName.of(item.metadata?.name)),
              this,
              this.kubeClient,
              this.kubeConfig,
              this.kubectlInstallationDirectory,
            ),
          );
        }
      }
    } catch (error) {
      KubeApiResponse.throwError(error, ResourceOperation.LIST, ResourceType.POD, undefined, '');
    }

    return pods;
  }

  /**
   * Create a single-container pod with an exec startup probe.
   *
   * @param podReference - namespace and name of the pod to create
   * @param labels - labels applied to the pod metadata
   * @param containerName - name of the pod's only container
   * @param containerImage - image of the container
   * @param containerCommand - command run by the container
   * @param startupProbeCommand - command executed by the container's startup probe
   * @returns a handle for the created pod
   * @throws SoloError when the create call yields no result
   */
  public async create(
    podReference: PodReference,
    labels: Record<string, string>,
    containerName: ContainerName,
    containerImage: string,
    containerCommand: string[],
    startupProbeCommand: string[],
  ): Promise<Pod> {
    const v1Metadata: V1ObjectMeta = new V1ObjectMeta();
    v1Metadata.name = podReference.name.toString();
    v1Metadata.namespace = podReference.namespace.toString();
    v1Metadata.labels = labels;

    const v1ExecAction: V1ExecAction = new V1ExecAction();
    v1ExecAction.command = startupProbeCommand;

    const v1Probe: V1Probe = new V1Probe();
    v1Probe.exec = v1ExecAction;

    const v1Container: V1Container = new V1Container();
    v1Container.name = containerName.name;
    v1Container.image = containerImage;
    v1Container.command = containerCommand;
    v1Container.startupProbe = v1Probe;

    const v1Spec: V1PodSpec = new V1PodSpec();
    v1Spec.containers = [v1Container];

    const v1Pod: V1Pod = new V1Pod();
    v1Pod.metadata = v1Metadata;
    v1Pod.spec = v1Spec;

    let result: V1Pod;
    try {
      result = await this.kubeClient.createNamespacedPod({namespace: podReference.namespace.toString(), body: v1Pod});
    } catch (error) {
      if (error instanceof SoloError) {
        throw error;
      }
      KubeApiResponse.throwError(
        error,
        ResourceOperation.CREATE,
        ResourceType.POD,
        podReference.namespace,
        podReference.name.toString(),
      );
    }

    if (result) {
      return new K8ClientPod(podReference, this, this.kubeClient, this.kubeConfig, this.kubectlInstallationDirectory);
    } else {
      throw new SoloError('Error creating pod', result);
    }
  }

  /**
   * Delete the referenced pod.
   *
   * @param podReference - namespace and name of the pod to delete
   */
  public async delete(podReference: PodReference): Promise<void> {
    try {
      await this.kubeClient.deleteNamespacedPod({
        namespace: podReference.namespace.toString(),
        name: podReference.name.toString(),
      });
    } catch (error) {
      KubeApiResponse.throwError(
        error,
        ResourceOperation.DELETE,
        ResourceType.POD,
        podReference.namespace,
        podReference.name.toString(),
      );
    }
  }

  /**
   * Read the logs of every container in a pod (init, regular and ephemeral, in that order).
   *
   * Each container's log is prefixed with a `===== Container: <name> =====` header. A failure to
   * read one container's log is recorded in the output instead of aborting the whole read.
   *
   * @param podReference - namespace and name of the pod
   * @param timestamps - whether to request timestamps on log lines (default true)
   * @returns the container logs joined by blank lines, or the pod-level log when the pod spec
   *   lists no container names
   */
  public async readLogs(podReference: PodReference, timestamps: boolean = true): Promise<string> {
    const namespace: string = podReference.namespace.toString();
    const name: string = podReference.name.toString();

    const pod: V1Pod = await this.kubeClient.readNamespacedPod({name, namespace});
    const containerNames: string[] = [
      ...(pod.spec?.initContainers?.map((container: V1Container): string => container.name) ?? []),
      ...(pod.spec?.containers?.map((container: V1Container): string => container.name) ?? []),
      ...(pod.spec?.ephemeralContainers?.map((container: V1Container): string => container.name) ?? []),
    ].filter(Boolean);

    if (containerNames.length === 0) {
      const log: string = await this.kubeClient.readNamespacedPodLog({
        name,
        namespace,
        timestamps,
      });
      return log ?? '';
    }

    const containerLogs: string[] = [];
    for (const containerName of containerNames) {
      try {
        const containerLog: string = await this.kubeClient.readNamespacedPodLog({
          name,
          namespace,
          container: containerName,
          timestamps,
        });
        containerLogs.push(`===== Container: ${containerName} =====\n${containerLog ?? ''}`.trimEnd());
      } catch (error) {
        containerLogs.push(
          `===== Container: ${containerName} =====\nFailed to read logs: ${error instanceof Error ? error.message : String(error)}`,
        );
      }
    }

    return containerLogs.join('\n\n');
  }

  /**
   * Produce a YAML document describing a pod together with its events.
   *
   * @param podReference - namespace and name of the pod
   * @returns YAML with a `pod` key (the pod object) and an `events` key (events involving the pod,
   *   sorted oldest first)
   */
  public async readDescribe(podReference: PodReference): Promise<string> {
    const namespace: string = podReference.namespace.toString();
    const name: string = podReference.name.toString();

    const pod: V1Pod = await this.kubeClient.readNamespacedPod({name, namespace});
    const events: {items?: CoreV1Event[]} = await this.kubeClient.listNamespacedEvent({
      namespace,
      fieldSelector: `involvedObject.name=${name},involvedObject.namespace=${namespace}`,
    });

    // Order by the most specific timestamp available on each event, ascending.
    // eslint-disable-next-line unicorn/no-array-sort
    const sortedEvents: CoreV1Event[] = [...(events?.items ?? [])].sort((left, right): number => {
      const leftTime: number = new Date(
        left.lastTimestamp ?? left.eventTime ?? left.firstTimestamp ?? left.metadata?.creationTimestamp ?? 0,
      ).getTime();
      const rightTime: number = new Date(
        right.lastTimestamp ?? right.eventTime ?? right.firstTimestamp ?? right.metadata?.creationTimestamp ?? 0,
      ).getTime();
      return leftTime - rightTime;
    });

    const describeData: {pod: V1Pod; events: typeof sortedEvents} = {
      pod,
      events: sortedEvents,
    };

    return yaml.stringify(describeData);
  }

  /**
   * Report CPU and memory usage per pod, summed over each pod's containers.
   *
   * @param namespace - namespace to query; all namespaces when omitted
   * @param labelSelector - when provided, only pods matching this selector are reported
   * @returns one entry per pod with CPU in millicores and memory in mebibytes
   */
  public async topPods(namespace?: NamespaceName, labelSelector?: string): Promise<PodMetricsItem[]> {
    const metrics: Metrics = new Metrics(this.kubeConfig);
    const podMetricsList: PodMetricsList = await metrics.getPodMetrics(namespace?.name);

    // The metrics call is not filtered by label here, so resolve the selector to a set of
    // "namespace/name" keys and filter the metrics against it.
    let allowedPodKeys: Set<string> | undefined;
    if (labelSelector) {
      const podList: V1PodList = namespace
        ? await this.kubeClient.listNamespacedPod({
            namespace: namespace.name,
            labelSelector,
            timeoutSeconds: podListTimeoutSeconds(),
          })
        : await this.kubeClient.listPodForAllNamespaces({labelSelector});
      allowedPodKeys = new Set(
        podList.items.map((p): string => `${p.metadata?.namespace ?? ''}/${p.metadata?.name ?? ''}`),
      );
    }

    return podMetricsList.items
      .filter((podMetric): boolean => {
        if (!allowedPodKeys) {
          return true;
        }
        return allowedPodKeys.has(`${podMetric.metadata.namespace}/${podMetric.metadata.name}`);
      })
      .map((podMetric): PodMetricsItem => {
        let cpuInMillicores: number = 0;
        let memoryInMebibytes: number = 0;

        for (const c of podMetric.containers) {
          cpuInMillicores += K8ClientPods.parseMillicores(c.usage.cpu);
          memoryInMebibytes += K8ClientPods.parseMebibytes(c.usage.memory);
        }

        return {
          namespace: NamespaceName.of(podMetric.metadata.namespace),
          podName: PodName.of(podMetric.metadata.name),
          cpuInMillicores,
          memoryInMebibytes,
        };
      });
  }

  /**
   * Parse a Kubernetes CPU quantity string into millicores, rounded to the nearest integer.
   * Examples: "100m" -> 100, "1" -> 1000, "0.5" -> 500, "100000n" -> 0 (rounded), "1500u" -> 2
   *
   * @param quantity - CPU quantity with an optional `n`, `u` or `m` suffix; no suffix means cores
   * @returns millicores, or 0 when the quantity is empty or its numeric part cannot be parsed
   */
  private static parseMillicores(quantity: string): number {
    if (!quantity) {
      return 0;
    }

    for (const entry of CPU_SUFFIX_UNITS_PER_MILLICORE) {
      const suffix: string = entry[0];
      const unitsPerMillicore: number = entry[1];
      if (quantity.endsWith(suffix)) {
        const value: number = Number.parseFloat(quantity.slice(0, -suffix.length));
        return Number.isFinite(value) ? Math.round(value / unitsPerMillicore) : 0;
      }
    }

    // No suffix: the value is expressed in whole cores.
    const cores: number = Number.parseFloat(quantity);
    return Number.isFinite(cores) ? Math.round(cores * 1000) : 0;
  }

  /**
   * Parse a Kubernetes memory quantity string into mebibytes (MiB), rounded to the nearest integer.
   * Examples: "50Mi" -> 50, "1Gi" -> 1024, "1.5Gi" -> 1536, "52428800" -> 50, "256Ki" -> 0 (rounded)
   *
   * @param quantity - memory quantity with an optional binary (`Ki`..`Ei`) or decimal (`k`..`E`)
   *   suffix; no suffix means bytes
   * @returns mebibytes, or 0 when the quantity is empty or its numeric part cannot be parsed
   */
  private static parseMebibytes(quantity: string): number {
    if (!quantity) {
      return 0;
    }

    for (const entry of MEMORY_SUFFIX_BYTES) {
      const suffix: string = entry[0];
      const bytesPerUnit: number = entry[1];
      if (quantity.endsWith(suffix)) {
        const value: number = Number.parseFloat(quantity.slice(0, -suffix.length));
        return Number.isFinite(value) ? Math.round((value * bytesPerUnit) / BYTES_PER_MEBIBYTE) : 0;
      }
    }

    // Plain number (bytes)
    const bytes: number = Number.parseFloat(quantity);
    return Number.isFinite(bytes) ? Math.round(bytes / BYTES_PER_MEBIBYTE) : 0;
  }
}