@hashgraph/solo
Version:
An opinionated CLI tool to deploy and manage private Hedera Networks.
204 lines (203 loc) • 10.3 kB
TypeScript
import { type SoloLogger } from '../../core/logging/solo-logger.js';
/**
 * Category assigned to every diagnostics finding.
 *
 * Categories are listed from most to least severe, which is also the order
 * in which findings appear in the report:
 *
 * 1. `image-pull`       — container image could not be pulled; pod will never start.
 * 2. `oom`              — container was killed by the kernel due to memory exhaustion.
 * 3. `pod-readiness`    — pod is not Running or its readiness probe is failing.
 * 4. `consensus-active` — consensus node did not reach ACTIVE platform status.
 * 5. `log-exception`    — an exception/stack-trace was found in an application log.
 * 6. `app-error`        — an ERROR line was found in a pod's raw container log.
 */
export type DiagnosticsFindingCategory =
  | 'image-pull'
  | 'oom'
  | 'pod-readiness'
  | 'consensus-active'
  | 'log-exception'
  | 'app-error';
/**
* A single detected problem with its supporting evidence lines.
*
* Findings that share the same `category`, `title`, and `source` are treated
* as duplicates and reported only once.
*/
export type DiagnosticsFinding = {
/** Failure category; also determines where the finding is placed in the severity-ordered report. */
category: DiagnosticsFindingCategory;
/** Headline for the finding; together with `category` and `source` it forms the de-duplication key. */
title: string;
/** Relative path of the source file (or "archive:entry") that triggered this finding. */
source: string;
/**
* Lines from the source that support the finding, deduplicated and capped at
* 14 entries. Entries produced by the snippet extractors are formatted as
* `"line <N>: <content>"` rather than being raw, verbatim log lines.
*/
evidence: string[];
};
/**
* DiagnosticsAnalyzer scans a previously-collected diagnostics output directory
* (produced by `deployment diagnostics logs`) and identifies common failure
* signatures without requiring a live cluster connection.
*
* ## Input sources
*
* ### 1. Solo CLI log (`solo.log`)
* The Solo CLI's own Pino log file (`~/.solo/logs/solo.log` by default, or
* `solo.log` found recursively under `customOutputDirectory`). Lines
* matching `] ERROR:` are captured as `app-error` findings. ANSI escape
* codes and `[traceId="..."]` suffixes are stripped before matching.
*
* ### 2. Pod describe files (`*.describe.txt`)
* Written by `downloadHieroComponentLogs()` for every pod across all clusters.
* These are the output of `kubectl describe pod <name> -n <namespace>` and
* contain the pod's status, container states, events, and resource usage.
*
* Detectable errors:
*
* | Category | Detected keywords / conditions |
* |-----------------|----------------------------------------------------------------------------------------|
* | `image-pull` | `ErrImagePull`, `ImagePullBackOff`, `Back-off pulling image`, |
* | | `failed to pull and unpack image`, `unexpected EOF` (truncated layer), |
* | | `toomanyrequests`, `rate limit exceeded`, `429 Too Many Requests` |
* | `oom` | `OOMKilled`, `out of memory`, `reason: OOMKilled` |
* | `pod-readiness` | Pod `Status` field is not `Running`, or `Ready: False` is present in container status; |
* | | supporting `Reason:` / `Message:` lines are captured as evidence |
*
* ### 3. Pod log files (`*.log`)
* Raw container logs downloaded by `downloadHieroComponentLogs()` alongside
* the `*.describe.txt` files and scanned by `analyzePodLogFiles`. Each file
* is checked for lines containing `ERROR`; the first matching block (up to
* 8 lines) is captured as an `app-error` finding.
*
* ### 4. Consensus node log archives (`*-log-config.zip`)
* Written by `getNodeLogsAndConfigs()` under `~/.solo/logs/<namespace>/`.
* Each zip contains the node's log and config snapshot. Only two log files
* inside the archive are inspected:
*
* - `output/swirlds.log` — Hashgraph platform log
* - `output/hgcaa.log` — Hedera application log
*
* Detectable errors:
*
* | Category | Detected keywords / conditions |
* |--------------------|-------------------------------------------------------------------------------------|
* | `consensus-active` | `swirlds.log` never contains the word `ACTIVE` — the node stalled during |
* | | startup (e.g. stuck in `STARTING_UP`, `OBSERVING`, or `REPLAYING_EVENTS`); |
* | | status-transition lines are captured as evidence |
* | `log-exception` | Any line in `swirlds.log` or `hgcaa.log` matching `Exception`, `Error`, |
* | | or `Caused by:` — the first matching stack-trace block (up to 14 lines) is |
* | | captured as evidence |
*
* ## Output
* All findings are written to `diagnostics-analysis.txt` inside the input
* directory. Up to 10 findings are also printed to the terminal in severity
* order. Duplicate findings (same category + title + source) are suppressed.
*/
export declare class DiagnosticsAnalyzer {
/** Logger used by the analyzer; presumably the instance passed to the constructor — confirm in the implementation. */
private readonly logger;
/**
* NOTE(review): presumably describes the consensus log files inspected inside
* each archive (`output/swirlds.log`, `output/hgcaa.log`); the value is not
* visible in this declaration — confirm in the implementation.
*/
private static readonly CONSENSUS_LOG_DEFINITIONS;
/**
* @param logger - Solo logger made available to the analyzer.
*/
constructor(logger: SoloLogger);
/**
* Run the full analysis against `customOutputDirectory` (or the default
* `~/.solo/logs/hiero-components-logs` when empty).
*
* Consensus node zip archives are looked up under
* `~/.solo/logs/<namespaceName>/` when `namespaceName` is provided, or
* directly under `~/.solo/logs/` otherwise.
*
* @param customOutputDirectory - Directory holding the collected diagnostics;
* pass an empty string to use the default location.
* @param namespaceName - Namespace whose sub-directory of `~/.solo/logs/`
* holds the consensus node archives, or `undefined` to search
* `~/.solo/logs/` directly.
*/
analyze(customOutputDirectory: string, namespaceName: string | undefined): void;
/**
* Recursively scans `rootDirectory` for `*.describe.txt` files (one per pod)
* and checks each for image-pull failures, OOM kills, and pod-readiness
* problems.
*
* Detected errors:
* - `image-pull` ErrImagePull / ImagePullBackOff / rate-limit / unexpected EOF
* - `oom` OOMKilled / out of memory
* - `pod-readiness` Status != Running OR Ready: False
*/
private analyzeDescribeFiles;
/**
* Recursively scans `rootDirectory` for `*.log` pod log files and checks each
* for application-level ERROR lines (category: `app-error`).
*
* These are the raw container logs downloaded by `downloadHieroComponentLogs()`
* alongside the `*.describe.txt` files. Each file is scanned for lines
* containing `ERROR` and the first matching block (up to 8 lines) is captured.
*/
private analyzePodLogFiles;
/**
* Searches for `solo.log` in `hieroOutputDirectory` (recursively) and, when
* no custom output directory was specified, also checks the standard
* `~/.solo/logs/solo.log` location. ERROR lines are extracted and reported
* as `app-error` findings.
*/
private analyzeSoloLogFiles;
/**
* Recursively scans `archiveRootDirectory` for `*-log-config.zip` archives
* produced by `getNodeLogsAndConfigs()` and inspects two log files inside
* each archive:
*
* - `output/swirlds.log` — checked for absence of the `ACTIVE` platform
* status marker (category: `consensus-active`) and for exception blocks
* (category: `log-exception`).
* - `output/hgcaa.log` — checked for exception blocks only
* (category: `log-exception`).
*
* Only the first exception block per log file is captured (up to 14 lines)
* to keep the report readable.
*/
private analyzeConsensusNodeArchives;
/**
* NOTE(review): presumably resolves an archive entry to its matching entry in
* `CONSENSUS_LOG_DEFINITIONS`; the signature is not visible in this
* declaration — confirm in the implementation.
*/
private findConsensusLogDefinition;
/**
* NOTE(review): presumably analyzes a single log entry extracted from a
* consensus node archive; the signature is not visible in this declaration —
* confirm in the implementation.
*/
private analyzeConsensusLogEntry;
/**
* A healthy consensus node transitions through STARTING_UP → OBSERVING →
* REPLAYING_EVENTS → ACTIVE. If `ACTIVE` never appears in swirlds.log,
* the node likely stalled before becoming ready for transactions.
*/
private analyzeConsensusActiveStatus;
/**
* Captures the first exception/stack-trace block from a consensus log file.
*/
private analyzeExceptionBlocks;
/**
* Adds `finding` to `findings` unless an identical entry (same category,
* title, and source) already exists. Evidence lines are deduplicated and
* capped at 14 entries to keep the report compact.
*/
private addDiagnosticsFinding;
/**
* Walks `rootDirectory` recursively and returns all file paths for which
* `matcher` returns `true`.
*/
private collectFilesRecursively;
/**
* Extracts up to `maxBlocks` ERROR blocks from a solo.log file.
*
* Each block starts on a line matching `] ERROR:` and continues while
* subsequent lines are indented (part of the Pino `err:` object dump).
* A new log entry — any line starting with `[HH:MM:SS` — terminates the
* current block. Each block is capped at `maxLinesPerBlock` lines.
*
* Evidence lines are returned flat (one string per line) in
* `"line <N>: <content>"` format so they render consistently with other
* findings.
*/
private extractSoloLogErrorBlocks;
/**
* Returns up to `maxMatches` lines from `content` that match `pattern`,
* formatted as `"line <N>: <trimmed line>"`.
*
* The global (`g`) flag is stripped before matching so the RegExp lastIndex
* does not interfere with repeated calls against the same pattern instance.
*/
private extractMatchSnippets;
/**
* Like {@link extractMatchSnippets} but joins indented continuation lines
* (YAML/kubectl-describe multi-line values) into a single evidence entry.
*
* When a matching key line is found, any immediately following lines whose
* leading whitespace is strictly greater than the key line's indentation are
* appended (space-separated) before the snippet is recorded. This collapses
* a multi-line `message:` value into one readable line instead of surfacing
* only the truncated first line.
*/
private extractMatchSnippetsJoiningContinuations;
/**
* Extracts up to `maxBlocks` exception/stack-trace blocks from `content`.
*
* A block starts on any line matching `Exception`, `Error`, or `Caused by:`
* and continues as long as subsequent lines are stack frames (`at …`),
* chained causes (`Caused by:`), or truncation markers (`… N more`).
* Each block is capped at `maxLinesPerBlock` lines.
*/
private extractExceptionBlocks;
/**
* Renders all findings into a human-readable plain-text report, sorted by
* severity (image-pull → oom → pod-readiness → consensus-active →
* log-exception → app-error, i.e. the ordering documented on
* `DiagnosticsFindingCategory`). Returns the report as a string ready to be
* written to `diagnostics-analysis.txt`.
*/
private renderDiagnosticsFindings;
}