/**
* Connection Health Monitor
*
* Manages per-instance GitLab connection lifecycle using XState v5 state machines.
* Handles startup timeouts, automatic reconnection with exponential backoff,
* health checks, and dynamic tool availability based on connection state.
*
* State machine per instance:
* CONNECTING → HEALTHY | DEGRADED | DISCONNECTED | FAILED
* HEALTHY ↔ DEGRADED (introspection succeeds/fails)
* HEALTHY → DISCONNECTED (consecutive transient failures)
* DEGRADED → DISCONNECTED (consecutive transient failures)
* DISCONNECTED → CONNECTING (reconnect timer fires)
* FAILED (auth/config error — no auto-reconnect, only manual RECONNECT event)
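*
* @example
* // Typical startup wiring (a sketch; the real call sites live in handlers.ts,
* // and the URL is illustrative):
* const monitor = HealthMonitor.getInstance();
* await monitor.initialize('https://gitlab.example.com'); // resolves once 'connecting' settles
* if (!monitor.isInstanceReachable('https://gitlab.example.com')) {
* // gate instance-specific tools until a reconnect succeeds
* }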
*/
import {
setup,
assign,
createActor,
fromPromise,
type ActorRefFrom,
type SnapshotFrom,
} from 'xstate';
import { ConnectionManager } from './ConnectionManager';
import { normalizeInstanceUrl } from '../utils/url';
import { InstanceRegistry } from './InstanceRegistry';
import { classifyError, parseGitLabApiError, type ErrorCategory } from '../utils/error-handler';
import { enhancedFetch } from '../utils/fetch';
import { logInfo, logWarn, logError, logDebug } from '../logger';
import {
INIT_TIMEOUT_MS,
RECONNECT_BASE_DELAY_MS,
RECONNECT_MAX_DELAY_MS,
HEALTH_CHECK_INTERVAL_MS,
FAILURE_THRESHOLD,
GITLAB_BASE_URL,
GITLAB_TOKEN,
} from '../config';
import { isOAuthEnabled } from '../oauth/index';
// ============================================================================
// Types
// ============================================================================
export type ConnectionState = 'connecting' | 'healthy' | 'degraded' | 'disconnected' | 'failed';
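/** Point-in-time health view of one instance, as returned by HealthMonitor.getSnapshot(). */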
export interface InstanceHealthSnapshot {
state: ConnectionState;
consecutiveFailures: number;
reconnectAttempt: number;
lastSuccessAt: number | null;
lastFailureAt: number | null;
lastError: string | null;
}
/** Dedicated error for initialization timeouts — replaces fragile string matching. */
export class InitializationTimeoutError extends Error {
constructor(timeoutMs: number) {
super(`Initialization timeout after ${timeoutMs}ms`);
this.name = 'InitializationTimeoutError';
}
}
// ============================================================================
// XState Machine Context & Events
// ============================================================================
interface MachineContext {
instanceUrl: string;
consecutiveFailures: number;
reconnectAttempt: number;
lastSuccessAt: number | null;
lastFailureAt: number | null;
lastError: string | null;
}
type MachineEvent =
| { type: 'TOOL_SUCCESS' }
| { type: 'TOOL_FAILURE'; error: string; category: ErrorCategory }
| { type: 'RECONNECT' };
// ============================================================================
// Backoff calculation
// ============================================================================
/**
* Calculate reconnect delay with exponential backoff and ±10% jitter,
* clamped to [RECONNECT_BASE_DELAY_MS, RECONNECT_MAX_DELAY_MS].
* Assumes BASE <= MAX (invalid config yields BASE as constant delay).
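*
* @example
* // Illustrative, assuming RECONNECT_BASE_DELAY_MS = 1_000 and
* // RECONNECT_MAX_DELAY_MS = 30_000 (real values come from ../config):
* // attempt 0 → ~1s, 1 → ~2s, 2 → ~4s, 3 → ~8s, 4 → ~16s, then capped at ~30s, each ±10%
* calculateBackoffDelay(2); // e.g. 4_123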
*/
export function calculateBackoffDelay(attempt: number): number {
const exponential = Math.min(
RECONNECT_BASE_DELAY_MS * Math.pow(2, attempt),
RECONNECT_MAX_DELAY_MS,
);
// Math.random is safe here — jitter is for load distribution, not security
const jitter = exponential * 0.1 * (Math.random() * 2 - 1); // ±10%
return Math.max(
RECONNECT_BASE_DELAY_MS,
Math.min(Math.round(exponential + jitter), RECONNECT_MAX_DELAY_MS),
);
}
// ============================================================================
// Async actors for XState
// ============================================================================
/**
* Check if schema introspection data is available for a URL.
* Used to distinguish healthy (version + schema) from degraded (version only).
*/
function hasSchemaInfo(connectionManager: ConnectionManager, instanceUrl: string): boolean {
try {
connectionManager.getSchemaInfo(instanceUrl);
return true;
} catch {
return false;
}
}
/**
* Check if an instance is running in degraded mode.
* Degraded = version unknown (REST/OAuth fallback) OR schema introspection incomplete.
*/
function isDegradedInstance(connectionManager: ConnectionManager, instanceUrl: string): boolean {
try {
const info = connectionManager.getInstanceInfo(instanceUrl);
return info.version === 'unknown' || !hasSchemaInfo(connectionManager, instanceUrl);
} catch {
return true;
}
}
// performConnect handles three phases in one function: (1) fast-path for already-connected
// instances, (2) full initialization with timeout budget, (3) degraded-path reachability probe.
// SonarCloud flags cognitive complexity=17 (limit 15). Extracting phase helpers would split
// the shared `deadline` variable across callsites and obscure the single-budget invariant.
// The three phases are demarcated by inline comments; complexity is intentional here.
const performConnect = fromPromise<{ degraded: boolean }, { instanceUrl: string }>(
async ({ input }) => {
const connectionManager = ConnectionManager.getInstance();
// Fast-path: if already initialized for this URL, verify with health check.
// Use HEALTH_CHECK_PROBE_MS (not INIT_TIMEOUT_MS) — init timeout may be
// configured very low to speed up startup, which would cause spurious disconnects.
if (connectionManager.isConnected(input.instanceUrl)) {
const healthy = await quickHealthCheck(input.instanceUrl, HEALTH_CHECK_PROBE_MS);
if (!healthy) {
// Intentionally a new Error (not the original fetch cause) — health check
// failures are always transient regardless of the underlying cause.
// classifyError maps this to 'transient' → disconnected → auto-reconnect.
throw new Error(`Health check failed for ${input.instanceUrl}`);
}
// Re-validate the token on reconnect, not just during steady-state polls.
// Without this, forceReconnect() while the token is still revoked would
// bounce failed → healthy until the next health-check interval.
await authenticatedTokenCheck(input.instanceUrl, HEALTH_CHECK_PROBE_MS);
return { degraded: isDegradedInstance(connectionManager, input.instanceUrl) };
}
// Full initialization with timeout.
// On timeout, clearInflight removes the hung promise so the next reconnect
// starts a fresh doInitialize() instead of re-awaiting the stale one.
// Single timeout budget for the entire connect flow (init + degraded probe)
const deadline = Date.now() + INIT_TIMEOUT_MS;
let timeoutId: ReturnType<typeof setTimeout> | undefined;
const timeoutPromise = new Promise<never>((_, reject) => {
timeoutId = setTimeout(
() => reject(new InitializationTimeoutError(INIT_TIMEOUT_MS)),
INIT_TIMEOUT_MS,
);
});
try {
await Promise.race([connectionManager.initialize(input.instanceUrl), timeoutPromise]);
} catch (error) {
// Only clear the inflight promise on timeout — for auth/network errors the
// underlying doInitialize() has already settled and cleaned up normally.
// Clearing on non-timeout errors could race with a concurrent doInitialize().
const isTimeout = error instanceof InitializationTimeoutError;
if (isTimeout) {
connectionManager.clearInflight(input.instanceUrl);
}
throw error;
} finally {
clearTimeout(timeoutId);
}
// Degraded = version unknown OR schema introspection incomplete.
// When degraded, verify actual reachability — OAuth/REST-only init can
// succeed with fallback data even when GitLab is unreachable. Without this
// check, the state machine would report "degraded" (reachable) instead of
// "disconnected", keeping all tools exposed.
// OAuth deferred: init may succeed but instanceInfo not yet available → degraded
const isDegraded = isDegradedInstance(connectionManager, input.instanceUrl);
if (isDegraded) {
// Verify reachability — OAuth/REST-only init can succeed with fallback
// data even when GitLab is unreachable. Throwing here lands in disconnected.
// Keep the degraded-path probe within the original startup budget.
// If the budget is exhausted or nearly exhausted (< 500ms), skip the probe —
// init already succeeded, and deadline jitter or a near-zero timeout would
// almost certainly fail and cause a false disconnect.
const remainingMs = deadline - Date.now();
if (remainingMs <= 0) {
// Budget exhausted — init succeeded so treat as reachable (avoids false disconnect
// from timer/event-loop jitter that can flip remainingMs negative by a few ms).
return { degraded: isDegraded };
}
if (remainingMs < 500) {
// Not enough time for a meaningful probe — assume reachable since init succeeded
return { degraded: isDegraded };
}
const reachable = await quickHealthCheck(input.instanceUrl, remainingMs);
if (!reachable) {
throw new Error(
`Health check failed for ${input.instanceUrl}: instance unreachable after degraded init`,
);
}
}
return { degraded: isDegraded };
},
);
const performHealthCheck = fromPromise<{ degraded: boolean }, { instanceUrl: string }>(
async ({ input }) => {
const healthy = await quickHealthCheck(input.instanceUrl);
if (!healthy) {
throw new Error(`Health check failed for ${input.instanceUrl}`);
}
// Detect mid-session token revocation in static token mode.
// Throws GitLab API 401/403 when the token is invalid or lacks required scope.
// healthCheckErrorIsAuth guard detects these by parsing the error message
// and routes to '#connection.failed' (no auto-reconnect).
// No-op in OAuth mode (no global token) and when GITLAB_TOKEN is unset.
await authenticatedTokenCheck(input.instanceUrl, HEALTH_CHECK_PROBE_MS);
const connectionManager = ConnectionManager.getInstance();
return { degraded: isDegradedInstance(connectionManager, input.instanceUrl) };
},
);
// Steady-state probes use a shorter timeout than startup init
const HEALTH_CHECK_PROBE_MS = 3000;
/**
* Lightweight health check: HEAD request to /api/v4/version with a short timeout.
* Uses enhancedFetch to respect proxy/TLS/custom CA settings.
*/
async function quickHealthCheck(
instanceUrl: string,
timeoutMs: number = HEALTH_CHECK_PROBE_MS,
): Promise<boolean> {
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
try {
// Health probes are intentionally unauthenticated — 401 still confirms
// the server is alive. skipAuth prevents OAuth "no token context" warnings.
const response = await enhancedFetch(`${instanceUrl}/api/v4/version`, {
method: 'HEAD',
signal: controller.signal,
retry: false,
skipAuth: true,
rateLimit: false,
});
// Any non-5xx response means the server is reachable. The probe measures
// connectivity, not API correctness: 401/403 = auth needed, 3xx = redirect,
// 400/404 = unusual but still a responding HTTP endpoint. Actual API errors
// are caught at tool-call level, not here.
return response.status < 500;
} catch {
// Intentionally swallows the error — health checks are lightweight probes.
// Error classification (transient vs permanent) happens in performConnect
// during the full init/reconnect path, not during periodic probes.
return false;
} finally {
clearTimeout(timeoutId);
}
}
/**
* Authenticated token validity check: HEAD /api/v4/user with the static token.
* Detects mid-session token revocation that the unauthenticated reachability check
* cannot see (401 from /api/v4/version is treated as "server alive").
*
* Only runs in static token mode — OAuth tokens are per-request context and are
* not available during background health checks.
*
* Throws a GitLab API 401 or 403 error when the token is invalid, revoked,
* expired, or lacks the required scope. The healthCheckErrorIsAuth guard detects
* these by parsing the status code and transitions to 'failed' (no auto-reconnect).
*
* AbortError (our own timeout) and transient connectivity failures are swallowed:
* reachability was already confirmed by quickHealthCheck. Unexpected errors are
* logged and re-thrown so programming bugs don't silently leave the instance healthy.
*/
async function authenticatedTokenCheck(instanceUrl: string, timeoutMs: number): Promise<void> {
// OAuth mode: token is per-request context, unavailable during background checks
if (isOAuthEnabled()) return;
// No static token configured — nothing to validate
if (!GITLAB_TOKEN) return;
const controller = new AbortController();
const timeoutId = setTimeout(() => controller.abort(), timeoutMs);
try {
const response = await enhancedFetch(`${instanceUrl}/api/v4/user`, {
method: 'HEAD',
signal: controller.signal,
retry: false,
rateLimit: false,
// skipAuth suppresses auto-injected credentials (session cookies, getAuthHeaders()).
// The explicit PRIVATE-TOKEN header ensures we validate ONLY the static token —
// a valid session cookie must not mask a revoked token and keep the probe alive.
skipAuth: true,
headers: { 'PRIVATE-TOKEN': GITLAB_TOKEN },
});
if (response.status === 401 || response.status === 403) {
// Both 401 (invalid/revoked token) and 403 (insufficient scope) mean the configured
// token cannot authenticate — include the actual status for accurate log messages.
throw new Error(
`GitLab API error: ${response.status} - token invalid or lacks required scope`,
);
}
if (!response.ok) {
// Non-auth, non-2xx response (e.g. 429 rate-limit, 5xx server error) — throw so
// the catch block can classify it as transient and swallow appropriately, rather
// than letting the probe silently succeed with a broken status code.
throw new Error(`GitLab API error: ${response.status} - authenticated health probe failed`);
}
} catch (error) {
// Re-throw auth errors from the token probe (401 = invalid, 403 = insufficient scope).
if (error instanceof Error) {
const parsed = parseGitLabApiError(error.message);
if (parsed?.status === 401 || parsed?.status === 403) throw error;
// Swallow our own AbortController timeout and transient connectivity failures.
// Reachability was already confirmed by quickHealthCheck; failures on this
// second request are noise, not signal.
if (error.name === 'AbortError' || classifyError(error) === 'transient') return;
}
// Unexpected error (programming bug, invalid URL, etc.) — log and rethrow so it
// doesn't silently leave the instance healthy with a broken probe.
logError('Unexpected error during authenticated token health check', {
err: error instanceof Error ? error : new Error(String(error)),
});
throw error;
} finally {
clearTimeout(timeoutId);
}
}
// ============================================================================
// XState Machine Definition
// ============================================================================
// Shared onError handler for health-check substates (healthy.checking, degraded.checking).
// Auth errors (401/403 from the authenticated probe) → failed, no auto-reconnect.
// All other errors → idle via recordFailure (transient failures accumulate toward threshold).
const healthCheckOnError = [
{
guard: 'healthCheckErrorIsAuth' as const,
target: '#connection.failed' as const,
actions: 'recordFailure' as const,
},
{
target: 'idle' as const,
actions: 'recordFailure' as const,
},
] as const;
const connectionMachine = setup({
types: {
context: {} as MachineContext,
events: {} as MachineEvent,
input: {} as { instanceUrl: string },
},
actors: {
performConnect,
performHealthCheck,
},
delays: {
reconnectDelay: ({ context }) => calculateBackoffDelay(context.reconnectAttempt),
healthCheckInterval: () => HEALTH_CHECK_INTERVAL_MS,
degradedCheckInterval: () => Math.min(HEALTH_CHECK_INTERVAL_MS, 30_000),
},
guards: {
isTransient: (_, params: { category: ErrorCategory }) => params.category === 'transient',
thresholdReached: ({ context }) => context.consecutiveFailures >= FAILURE_THRESHOLD,
// Classify connect/health-check errors: only transient → reconnect
connectErrorIsTransient: ({ event }) => {
const error = (event as { error?: unknown }).error;
return classifyError(error) === 'transient';
},
// Auth error during periodic health check → failed (no auto-reconnect).
// Uses parseGitLabApiError to extract the status code: both 401 (invalid token)
// and 403 (insufficient scope) from the authenticated probe are terminal failures.
// Direct message parsing is used because classifyError maps 403 → 'permanent',
// not 'auth', so we can't rely on classifyError for the 403 path.
healthCheckErrorIsAuth: ({ event }) => {
const error = (event as { error?: unknown }).error;
/* istanbul ignore if */
if (!(error instanceof Error)) return false;
const parsed = parseGitLabApiError(error.message);
return parsed?.status === 401 || parsed?.status === 403;
},
},
actions: {
recordSuccess: assign({
consecutiveFailures: 0,
reconnectAttempt: 0,
lastSuccessAt: () => Date.now(),
lastError: null,
}),
incrementReconnectAttempt: assign({
reconnectAttempt: ({ context }) => context.reconnectAttempt + 1,
}),
// Shared action for TOOL_FAILURE and health check onError — increments
// failure counter and records the error for threshold-based disconnect.
recordFailure: assign({
consecutiveFailures: ({ context }) => context.consecutiveFailures + 1,
lastFailureAt: () => Date.now(),
lastError: ({ event }) => {
const e = (event as { error?: unknown }).error;
return e instanceof Error ? e.message : typeof e === 'string' ? e : String(e);
},
}),
},
}).createMachine({
id: 'connection',
initial: 'connecting',
context: ({ input }: { input: { instanceUrl: string } }) => ({
instanceUrl: input.instanceUrl,
consecutiveFailures: 0,
reconnectAttempt: 0,
lastSuccessAt: null,
lastFailureAt: null,
lastError: null,
}),
states: {
connecting: {
invoke: {
src: 'performConnect',
input: ({ context }) => ({ instanceUrl: context.instanceUrl }),
onDone: [
{
guard: ({ event }) => event.output.degraded,
target: 'degraded',
actions: 'recordSuccess',
},
{
target: 'healthy',
actions: 'recordSuccess',
},
],
onError: [
{
// Transient errors (network, timeout, 5xx) → disconnected → auto-reconnect
guard: 'connectErrorIsTransient',
target: 'disconnected',
actions: 'recordFailure',
},
{
// Auth/permanent errors (401, config) → failed, no auto-reconnect
target: 'failed',
actions: assign({
lastFailureAt: () => Date.now(),
lastError: ({ event }) =>
event.error instanceof Error ? event.error.message : String(event.error),
}),
},
],
},
},
healthy: {
initial: 'idle',
// Tool success/failure handlers on parent state so they're active in both
// idle AND checking substates (events during health check probe aren't dropped).
on: {
TOOL_SUCCESS: {
actions: 'recordSuccess',
},
TOOL_FAILURE: [
{
// Only transient errors (network, 5xx) affect connection health.
// Auth errors (401/403) during tool calls are intentionally ignored here —
// mid-session token revocation requires authenticated health checks (#370).
guard: {
type: 'isTransient',
params: ({ event }) => ({ category: event.category }),
},
actions: 'recordFailure',
},
],
},
// XState v5: always transitions are re-evaluated after any context change
// (including assign from TOOL_FAILURE), so thresholdReached fires correctly.
always: [
{
guard: 'thresholdReached',
target: '#connection.disconnected',
},
],
states: {
idle: {
after: {
healthCheckInterval: 'checking',
},
},
checking: {
invoke: {
src: 'performHealthCheck',
input: ({ context }) => ({ instanceUrl: context.instanceUrl }),
onDone: [
{
guard: ({ event }) => event.output.degraded,
target: '#connection.degraded',
actions: 'recordSuccess',
},
{
target: 'idle',
actions: 'recordSuccess',
},
],
onError: healthCheckOnError,
},
},
},
},
degraded: {
initial: 'idle',
on: {
TOOL_SUCCESS: {
actions: 'recordSuccess',
},
TOOL_FAILURE: [
{
guard: {
type: 'isTransient',
params: ({ event }) => ({ category: event.category }),
},
actions: 'recordFailure',
},
],
},
// XState v5: always re-evaluated after assign from TOOL_FAILURE
always: [
{
guard: 'thresholdReached',
target: '#connection.disconnected',
},
],
states: {
idle: {
after: {
degradedCheckInterval: 'checking',
},
},
checking: {
invoke: {
src: 'performHealthCheck',
input: ({ context }) => ({ instanceUrl: context.instanceUrl }),
onDone: [
{
guard: ({ event }) => !event.output.degraded,
target: '#connection.healthy',
actions: 'recordSuccess',
},
{
target: 'idle',
actions: 'recordSuccess',
},
],
onError: healthCheckOnError,
},
},
},
},
disconnected: {
after: {
reconnectDelay: 'connecting',
},
exit: ['incrementReconnectAttempt'],
on: {
RECONNECT: {
target: 'connecting',
},
},
},
// Terminal state: auth/config errors that won't fix themselves.
// No auto-reconnect. Only RECONNECT event (manual forceReconnect) can retry.
failed: {
on: {
RECONNECT: {
target: 'connecting',
},
},
},
},
});
// ============================================================================
// HealthMonitor Service
// ============================================================================
type ConnectionActor = ActorRefFrom<typeof connectionMachine>;
/**
* Callback invoked when any instance changes connection state.
* Used by HealthMonitor to trigger tool list updates and logging.
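*
* @example
* // Sketch of a listener; the production callback (registered from handlers.ts)
* // also triggers a tools/list_changed broadcast:
* const onChange: StateChangeCallback = (url, from, to) => {
* if (to === 'disconnected') logWarn('Instance went offline', { url, from });
* };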
*/
type StateChangeCallback = (
instanceUrl: string,
from: ConnectionState,
to: ConnectionState,
) => void;
/**
* Singleton service that manages per-instance GitLab connection health using XState state machines.
* Tracks connectivity state, drives automatic reconnection, and notifies listeners of state changes.
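*
* @example
* // Reporting tool outcomes so failures count toward the disconnect threshold
* // (a sketch; executeTool and its arguments are illustrative):
* const monitor = HealthMonitor.getInstance();
* try {
* const result = await executeTool(args);
* monitor.reportSuccess(instanceUrl);
* return result;
* } catch (error) {
* monitor.reportError(instanceUrl, error as Error);
* throw error;
* }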
*/
export class HealthMonitor {
private static instance: HealthMonitor | null = null;
private readonly actors = new Map<string, ConnectionActor>();
private readonly previousStates = new Map<string, ConnectionState>();
private stateChangeCallbacks: StateChangeCallback[] = [];
private readonly subscriptions = new Map<string, { unsubscribe: () => void }>();
private constructor() {}
/** Return the singleton instance, creating it on first call. */
public static getInstance(): HealthMonitor {
HealthMonitor.instance ??= new HealthMonitor();
return HealthMonitor.instance;
}
/**
* Register a callback for connection state changes.
*/
// Registered once from handlers.ts at startup (guarded by healthMonitorStartup promise).
// No unregister needed — callbacks are cleared on shutdown().
public onStateChange(callback: StateChangeCallback): void {
this.stateChangeCallbacks.push(callback);
}
/**
* Initialize health monitoring for an instance.
* Returns a promise that resolves when the initial connection attempt completes
* (success or timeout — never blocks indefinitely).
*/
public async initialize(instanceUrl?: string): Promise<void> {
const url = this.resolveUrl(instanceUrl);
// Don't create duplicate actors for the same instance.
// If the existing actor is still connecting, wait for the initial outcome.
const existingActor = this.actors.get(url);
if (existingActor) {
logDebug('HealthMonitor: actor already exists for instance', { url });
if (this.getActorState(existingActor) === 'connecting') {
await this.waitForInitialState(existingActor);
}
return;
}
logInfo('HealthMonitor: initializing connection monitoring', { url });
const actor = createActor(connectionMachine, {
input: { instanceUrl: url },
});
this.actors.set(url, actor);
this.previousStates.set(url, 'connecting');
// Subscribe to state changes
const subscription = actor.subscribe((snapshot) => {
this.handleStateChange(url, snapshot);
});
this.subscriptions.set(url, subscription);
// Start the actor (begins connecting)
actor.start();
// Wait for the initial connection attempt to resolve
// This ensures setupHandlers() doesn't return until we know the state
await this.waitForInitialState(actor);
}
/**
* Wait for the actor to leave the 'connecting' state (success or failure).
*/
private waitForInitialState(actor: ConnectionActor): Promise<void> {
return new Promise<void>((resolve) => {
// Subscribe first, then check — avoids race where state transitions
// between getActorState() and subscribe() (the subscribe callback
// wouldn't fire for the missed transition).
const sub = actor.subscribe((snapshot) => {
const state = this.extractState(snapshot);
if (state !== 'connecting') {
sub.unsubscribe();
resolve();
}
});
// Check current state AFTER subscribing — if already past connecting,
// the subscribe callback may not fire (XState v5 doesn't replay current
// snapshot to new subscribers), so resolve immediately.
const currentState = this.getActorState(actor);
if (currentState !== 'connecting') {
sub.unsubscribe();
resolve();
}
});
}
/**
* Handle state transitions — log, update InstanceRegistry, fire callbacks.
*/
private handleStateChange(
instanceUrl: string,
snapshot: SnapshotFrom<typeof connectionMachine>,
): void {
const newState = this.extractState(snapshot);
const previousState = this.previousStates.get(instanceUrl);
if (previousState === newState) return;
const context = snapshot.context;
logInfo('Connection state changed', {
instanceUrl,
from: previousState,
to: newState,
consecutiveFailures: context.consecutiveFailures,
reconnectAttempt: context.reconnectAttempt,
lastError: context.lastError,
});
// Update InstanceRegistry connection status
try {
const registry = InstanceRegistry.getInstance();
if (registry.isInitialized()) {
let registryStatus: 'healthy' | 'degraded' | 'offline';
if (newState === 'healthy') {
registryStatus = 'healthy';
} else if (newState === 'degraded') {
registryStatus = 'degraded';
} else {
registryStatus = 'offline';
}
registry.updateConnectionStatus(instanceUrl, registryStatus);
}
} catch {
// InstanceRegistry may not be initialized yet
}
// Fire callbacks — use 'connecting' as default previous for the first emission
// so broadcastToolsListChanged fires on the initial connecting→healthy transition.
const effectivePrevious = previousState ?? 'connecting';
for (const callback of this.stateChangeCallbacks) {
try {
callback(instanceUrl, effectivePrevious, newState);
} catch (error) {
logError('State change callback error', { err: error as Error });
}
}
// Update previousStates AFTER callbacks so they see the pre-transition value
this.previousStates.set(instanceUrl, newState);
}
/**
* Extract top-level state name from XState snapshot.
* Handles compound states (e.g., "healthy.idle" → "healthy").
*/
private extractState(snapshot: SnapshotFrom<typeof connectionMachine>): ConnectionState {
const value = snapshot.value;
if (typeof value === 'string') {
return value as ConnectionState;
}
// Compound state: { healthy: 'idle' } or { healthy: 'checking' }
const topLevel = Object.keys(value)[0];
return topLevel as ConnectionState;
}
/** Return the current top-level state for an actor. */
private getActorState(actor: ConnectionActor): ConnectionState {
return this.extractState(actor.getSnapshot());
}
/** Resolve and normalize an optional instance URL to a consistent Map key */
private resolveUrl(instanceUrl?: string): string {
return normalizeInstanceUrl(instanceUrl ?? GITLAB_BASE_URL);
}
/** Look up actor for an instance URL (returns undefined if untracked) */
private getActor(instanceUrl?: string): ConnectionActor | undefined {
return this.actors.get(this.resolveUrl(instanceUrl));
}
// ============================================================================
// Public API — called from handlers.ts
// ============================================================================
/**
* Get connection state for an instance.
*/
// Note: returns 'disconnected' for untracked URLs (no actor). This differs from
// isInstanceReachable() which treats untracked URLs as reachable. Use
// isInstanceReachable() for gate decisions; use getState() only for status display.
public getState(instanceUrl?: string): ConnectionState {
const actor = this.getActor(instanceUrl);
if (!actor) return 'disconnected';
return this.getActorState(actor);
}
/**
* Get health snapshot for an instance.
*/
public getSnapshot(instanceUrl?: string): InstanceHealthSnapshot {
const actor = this.getActor(instanceUrl);
if (!actor) {
return {
state: 'disconnected',
consecutiveFailures: 0,
reconnectAttempt: 0,
lastSuccessAt: null,
lastFailureAt: null,
lastError: null,
};
}
const snapshot = actor.getSnapshot();
const context = snapshot.context;
return {
state: this.extractState(snapshot),
consecutiveFailures: context.consecutiveFailures,
reconnectAttempt: context.reconnectAttempt,
lastSuccessAt: context.lastSuccessAt,
lastFailureAt: context.lastFailureAt,
lastError: context.lastError,
};
}
/**
* Check if at least one monitored instance is healthy, degraded, or connecting.
* Connecting is included to avoid context-only tools/list during startup.
* Used by registry-manager to decide tool filtering.
*/
public isAnyInstanceHealthy(): boolean {
// No actors = HealthMonitor not yet initialized, don't restrict tools
if (this.actors.size === 0) return true;
for (const actor of this.actors.values()) {
const state = this.getActorState(actor);
// connecting = init in progress — include to avoid context-only tools/list
// during startup (first session would see empty tool list for 5s otherwise)
if (state === 'healthy' || state === 'degraded' || state === 'connecting') {
return true;
}
}
return false;
}
/**
* Check if a specific instance is reachable (healthy or degraded).
* Untracked instances (no actor) are assumed reachable — we don't block
* tool calls for instances the monitor hasn't seen yet (e.g., OAuth context switch).
*/
public isInstanceReachable(instanceUrl?: string): boolean {
const actor = this.getActor(instanceUrl);
if (!actor) return true; // Untracked = assume reachable
const state = this.getActorState(actor);
return state === 'healthy' || state === 'degraded';
}
/**
* Report a successful tool execution.
*/
public reportSuccess(instanceUrl?: string): void {
const actor = this.getActor(instanceUrl);
if (actor) {
actor.send({ type: 'TOOL_SUCCESS' });
}
}
/**
* Report a failed tool execution.
* Error is classified to determine if it affects connection health.
*/
public reportError(instanceUrl?: string, error?: Error): void {
const actor = this.getActor(instanceUrl);
if (!actor || !error) return;
const category = classifyError(error);
actor.send({
type: 'TOOL_FAILURE',
error: error.message,
category,
});
if (category === 'transient') {
logWarn('Transient error reported to health monitor', {
instanceUrl: this.resolveUrl(instanceUrl),
error: error.message,
});
}
}
/**
* Force an immediate reconnection attempt.
*/
public forceReconnect(instanceUrl?: string): void {
const actor = this.getActor(instanceUrl);
if (actor) {
actor.send({ type: 'RECONNECT' });
}
}
/**
* Get all monitored instance URLs.
*/
public getMonitoredInstances(): string[] {
return [...this.actors.keys()];
}
/**
* Stop all actors and clear state.
*/
public shutdown(): void {
for (const [url, actor] of this.actors) {
try {
actor.stop();
} catch {
// Actor may already be stopped
}
logDebug('HealthMonitor: stopped actor', { url });
}
for (const sub of this.subscriptions.values()) {
try {
sub.unsubscribe();
} catch {
// Subscription may already be cleaned up
}
}
this.actors.clear();
this.subscriptions.clear();
this.previousStates.clear();
this.stateChangeCallbacks = [];
logInfo('HealthMonitor shut down');
}
/**
* Reset singleton (for testing).
*/
public static resetInstance(): void {
if (HealthMonitor.instance) {
HealthMonitor.instance.shutdown();
HealthMonitor.instance = null;
}
}
}