From 7a2a48f4f1eb88b1bb6e362aa7e57954d3575b53 Mon Sep 17 00:00:00 2001 From: Alexander Whitestone Date: Sat, 11 Apr 2026 00:21:12 +0000 Subject: [PATCH] fix: [MONITORING] Integrate Kimi Heartbeat status into Nexus Watchdog (#1192) --- .../nexus_watchdog.cpython-312.pyc | Bin 23972 -> 28637 bytes bin/nexus_watchdog.py | 117 ++++++++++++++++++ 2 files changed, 117 insertions(+) diff --git a/bin/__pycache__/nexus_watchdog.cpython-312.pyc b/bin/__pycache__/nexus_watchdog.cpython-312.pyc index 20fd6b1b1b76a1dcb5ec31d3efaa9c8304c62316..e28700065fe45c7a055a48f3c72bdd990e2bfb01 100644 GIT binary patch delta 8977 zcma)Bdw5jGb-#DtFRkA1*VPI=R!A!lAV44)0eXP|Aqi{*c(v@kl2)vC<+)cDTCcKl z>a-{!L3r}jN^Sa)#9sq}G^Fw$P8+vQ?4(Z8eqBYlk#9(YTQ{!1q={maHX-eoo-_N< z!#H0TbY||%nKNh3oSFHZnHT?t|M5?F`}b2)EFApqKKqt$>diD(}v z6;lTUs7)K)PiN{1ccg`^94DqD9dL*l1Lb1oz$P(kphCOC3Lp6_V({@Ywfo*ulrg0Mq^!VMf8uqLFyf>~@s5bW zVSz~FL88bOPt2~JtCsWl{1XI>=MM&kWuGXqc0p)N1n-FCJulN&%zs(nxVuZUkUqEb4OQyTdVs-Z+EC-L{Y}&ruzD!AQ_fi zFpLYHfGCm45u9En7?51v;MhW~b%y6l=~rwHOCE?JNM`9ro@aO$pV`bw*TajYwEo`k_^Noe2KtlE&5I2mvw-w=GsdLZ#B!OD!7N@Ko&=mvhTB+DcW z6+c~wI?n3(2h>&FqCX7<59kxs`KdQFWClLS86ZbVE&W_|Ie&z{Qr!!jX-E(3t7?!g_+&+nDT?ZoE_&LzBc%eTK)oj&+(HiQHKb8drPMxa5JyzABkE({? z(2y)C#1B;j2Y1w&O3+zVliQ7aZntU}hYcy9kc-H}mhUH#!3Ya`95_fm{ZW0nbslS( zm}TXDlEKOqYoSoYeubf zu%y67v|-J(KCmmSjTH<*fffoXl$5w!PSlGA(I}df)I>>GljvnlnKn^2tnu-;%s1nD z^>JZ+n0uZREmxV|(}u7Cy1KThxNCh6*02G%Y*)2n${MMLgw*vi&76{+&}KA)Ug^xF zuQg<&d2$jBrj1Hrf+cLEzuJ{umM&%_!ArujW(9XN_KR`lW91<U_8}%NkJ|I|Kd=a@KZ+wXixq z+*brTOAFgc)drs_3a7$|io-n6l5iT((lxp#5i^tvVe9kURWR~ezgZ`(Omr}5{P5;)>w0Ft#G9DF#692|7IgouDhmw{K%9PyPR1q6>09P@cW$dqty z3iV@x*5fWb8z$CcUU|Zcr_a!Y|3k1D5OCZgtwP*P3_M1J$q@;VGTKTI1_u$Jx{y>m zg*qXLR)rRUvAf46{EBbf?*nXDlZ2WH_&PYq*T z1F%}64jL5e8GnXYQVGu5<>j;0#NiEqq9rc_g6U9leBFpa8V@%APAj+u_PjP|?f*>l6Rc-F9NPP<$(TXM&o zznq?jpXrxNXG`yx3zpOKXASS9LRz+*mA{AIVmF}MA>T~~HRi(I$)rObxNQtGyu&gJy%=kl)P%~xORT1t1#9J*_< zUv|zqKec(@yl5$#IdIRII=A)X)AurSFPiS93D&sl{^|&{SZqJJwExj)>toT!2JW1GJUTcM zJs(^g3`UhF7u`=Tot}&iT!>zniarvW>3-XonzY}&jN`BN(#UsbwXS|2Uha$9qI z?`_ws+Sd=fVSeps^gtif_1pNu<-&^j+7-Pfy_){ACMQz7oR+nmQF(3hn(un#xy9Sy{+8uy_W^9KG0By)pN~c5+pr<-yYYaCG*Z{z8veiNslCPgPCB`_{Ma(ohK%(-Sq4Q^eZOyP zYzo3x?2;@3QOzD%_6-LlNxs3eEy!tzh17?U;SsxdeKWTp{$ z7piY)*Y=798rarw5bnUl*g1)i|A4}^V^=dK#>I^)ybnw|Fp#_hGBeye^lr;Grp>$A zNlnOqLLK=9l0|CU*UP_4-TMmbe~ASvK-P7V?)3-d^~3yc;7QTY8~er!t03z|!z6wH zoH>|S{)4}u+gndK6pUfn?`Aw5)=%pcYy5*#Qs@hnEZn27v>x8fj#U>_I*shEfW_u4 z#$_=i+o)r=9BtIKzq;`1JELO#TOW%g{3I^wR#SO?OR2OQw<) ztHExVX}@phY0f1=0Qa|{|dfEqOP zeET~vU$6rBxrNb=IiAm>zw2~q0_C)|tGz_o1dZ5_6hLf<=4U^QTyL3ZX;=9XeiV0y z4Zl7PLI%7+U_S^6>^ygh3x5P}63d(E=eu@CehZDr10=sg!i?U6xkrG6%oVawAveLF zlP4SuLbV5^3lp+CahpjnRDrrLVUo?}fvk!|hgl;RC!Z7nQH}oK@GuxlrOk&@BFy|+ zC|Ywn$L0ae7V9?C-PzjbKG1vY=qkcgv)vE72PJHK)2liU8J1(Sq-SnIY}o%XAnQFU z7w$^XrGf#kgtp_3peJc~`qe{!9{Fce<8n&+VtQaj(0~ZF=pubv#fnmks5KpB*x6`Kb$64@Y_0+?`dNuiy^MeqzCD z9s~)o(t2~Z6#Vh%+qgnX4%P0O48{@%kV)* zTLM)p+p_j2t`Xpcq4bl+i^d^d6rM4`MNdUSsnMiQCrUVO)z3#O*{w(2QJeGs+oC$& z!b)e3>dQ7Kq=u8%OgyL5t(Kyi&t{s64svJVB4t`TYlMmA0=WwM7yFHF!69ucy9pDI zKc2f(tm9^lgum1}#Z7U_L;9H3!Rl(D>7AMUD>QISpnrNS$FA&&3E*()?WMoSC_t_j zy4aCf=13TKTCcPw*pvf_+~`3Zl~zC69#1@G=ajP;hH>=}PP9*y~iS5gDaK}>>;NdkaIrb@#!819NoU@v~ z-}6t#bf_d5JbmN%w=|Gk>RpUvLy@Wz5esBMHe{)*L)2XdVMEBl;;v+9$b#Ge#-Y`s zG~|J(wMk&%ODH$qh{653)p$1K+8}{3z;(PV#F^|E^{$Ge{=Tmna1SBBr{(=sJiH_H zZ{x4gNWZWHH_`WSbVP^4I1M?#3%Ccx!aTMm?ZeMjB#1towe(JZ6aN{SdLo<8rxhnU zA{qEbwz~1S4;|ajq9(O;Lq%Ud1i|hjC%XH)db+#XdQ}Ttz2XTATal_~nN3VRtLg-|S; z;1f?e`0b0KC*7FG7a-fAatt@pZ!E|s!Eb(k(w32Rr)0}=>ef3A2cuitUOgU7>7Z|) z%*`&lrN6WDf0gk(@&E z86?M%;JsMl>4oJDF?7pLq00~9HIQ3a%4B^7bGX!3WswGq%AKYdP-a{+Nq!7(AJ~tv zHxiHkwCM9HXJbu`P?J!j5sVOr3NdKI6lJ9b@+%a^)MqBG!Oso#k%uy==L8kWs#qZwCvmc85ngfsHZV@y?jKIO(Ab@Y(SJ^j_CdSibVO2Tn(( z9`ZXMVeeR~1s)kD;Nc6N83>$B94SGG%iy1%VE7B-+q#pZAr zb73UUBKZQ6uOnfR#oL(s84`vV8Fplgm4O+A7i?*;SHbNY9{C=a_Qoy~TSzM~!hcTQ z18ZnS%k%uNbjNvK_v;~g=k(^x6^o7+eyr2nH%*h-8`4(ST1Q^(f9K_Tw&N&w3xY<;u8*)7dVDexS{NOK18-`-+y+75rvpGJ`W^0yg4><$}^B7!8b8 ah(V%nnoR8)?mHS|yD3Hc9UI@CqWwP@q4^g8 delta 4832 zcmb6c3s6+o_1)dKA6Wi~zyiy{@_T@M1r3Q-0*D|VCE#Zhv#$Fdun4=G`yNrKh)GS2 zjuSOEGj&?qq|}mABkQhLkE4+m_I`tF+FSU)*k7F@r_0M`Oa|TS`5N;wsi(PhKJC1HS_JZs1qSRdTgl0F)JSja&$ns%g1a_5e~X*U3fD*U0s9G4!>#09Pl* zlr8r1O1T7-)X5EUqg)DDy}U{;gIOzO>7d|kP#Wd(-DQG6?GT!V*p`SeVV*EctH+%- zL5S~B_KvIWtgw8+ z^;NYut*)wDRaM(q-&9{yQxiR!cz}>RmY7@^zZ|Af1iRTy$(7b(K4iO-b&}22r(9g_ z1j1>PZmO!$NkGLgy4|#dm8PyC`znfM?_D7#i+a!`S=FEJhvK(b_fyJ4+)gOr{A(>_& zIR`-#f;?7`m0gWWb(5y*R#o$BKscmYLB!1)*cr?!C-+7l%GyV! zar-@zNw-o(8>b;!0b?+;7do1Wcsz9>o{zwdpa4N30&e1Jz;t_rh9e5qCJd{3kXwX+ zd$$BJh_eASKvXpTKH&|K9doWSLQod{qw}iC!S{#0qB-ohdCisAjFFfJU@~bF4ewIi zgZ|*CA~(4uGu?u`t*pkiF%KiF$F=PZC=o3f4jG%U#3W9-@^jG~JtZ)#1a|xOsJMfF4@TJ}dW-jV!OC4|ABP7JYy_aj0pPYHpxmX&hW*2epK7}lzebOO%%?(hXJxGgy!K%2=51*}kwje_ zgvjY49#Y*J9voB^jgA0|;_iB+WQwO%xBGl3=kw{-h@UDUjoyng7+`u31#E%vSOg?! zCcC-L<9HAmT#d?tbvc#X$M`fXSeR~Ds7{15|6U&$I;suRQy@_lJzG~wqRs1O&EyD6 zYwaVunbNvBJq4GsQdOhD2yI9G8`#sWE|SXLZ7p@ENca{4FLSn)r16Ny(}$75Lvw`n zwUt(JJK})|cFZ@!VM)HBpr-hJRJm(hfi<24nStnIZLgc`+)+0ABzs^(AvwgJ*swb3 zDd4IL+hPCO;IjMx|l@shLdGuptKH+p){1AEd|9aL2?P`C64$Hpe$x}hob z8~{Deud2bJkfNxk2|WT7$!es44ixz{0^Ak7fS`v(Hx^d%eTXndUl@)^3s5*Oiq8+R z)O>t3`X`Xs&q&u@us5~qob~sp;XGT>oxK`Yrq2L4ZN64J5=?pmpY98dkL^+@y$HmG zy|ew@S=JYTa*VK3-D^Ox-u+I7p%|!i7QrP1=h%xqedHM?_2yJxM#6Id7W4qFrv^sD z>h%`uKtD;C+8fEiv=_iV#u^@Ih#mRpA@*hOfOANk5;eR+ylIXYHWJTfOLN)xHg%Mg z1C8f@@F=Dxo(lMt;wRR~r!6dbb6MlznCQ}sl1j%z^T*uBkG@J@2iY?i`E_h;bE#`C z&XQz3u>Ru&AtCG0qQ|5kN_+&Q-( zwg$jtT(RmdR-51_5&9NTq%`gmx;PDEbj?@Ja{@9|g7g`cv+;5vKf^f+89YG^3F()^A z^7a|R?kPxTS9T<3VLn~b=s7J*UPnlSx z(UqWK#lYK$o1I;k!H(S-&+cl>i(6puxE;MSfnB-N;wg?v?<29Iod{a#B6TE}^<+4w zTZJ7^_)M7(+f)+~LYbNr6B)tFvZO&ZJb#3Ua5PU84%;X>+B_jl2-*!6oDyN580jUA zUeK!NNtptOU|wC)*a6>icGBl&J2ULlAhLQ1ABJc0X6BQ@ltsHa#@Fyb@;0(IhAsB{ zEI0ZZ0qU)bGBFIOtsH0LJKfXkW8}qtpLJxz;wXjzwKIk<_CW1g9K|r%bxJ&8I$AFZ z0n?OK&Oc?A-De77$6(RbS}$H!-q^2pW)z6jBo~Irwb7UW6@;tVBS;?B=YQ7r0ZgDD z3ao6`Z%oi#-E}GX6CmlLjPK%yfU-{c2`k<|`X(bvDJfx+QV$jT4#nMo@q7=1P2WL) zMFc!jPQ(|oC`A|3Zy>;@P?Ffsl%~x40o9XR+txP^boKkT-saoX+t&}}gn6)iIv$D+ z%lW${LGjdr zb1dqpqD1IF0ssFgN4G?1a2o2Jb`-_H87X&hGdHM;b+bC5>LQePnq=n{lM3(ORh;}E z06Os*9>Eupe;Ckz1EylM@F9k_&UlQ)F@G>b)8oLQ?_)+9ihg@2jYR)Gyn{?XEjD&~ zQm`^ZA)gQ4rY)$C7m`~MLysC?513bCUlJU*8buLLuRJSXFKg5+cB6da3a1uS9|7$1 zq~2*>()&kmmxBRqhp~+9nq@M;UVL?MUKxU31e|K6&m+#4{Ca#{>JJZ}lfudqkPJmxDxI(uS`_rFSHfwJ#^8%$cFT0=)(LIjfM7J!eDAE-cTU zi$g44u%>`#H7+BCA69h}QZl#GB!~(3PhK`>|5>!(w|>qnh)!<$e*w$Z BF_-`V diff --git a/bin/nexus_watchdog.py b/bin/nexus_watchdog.py index 28df7a83..f2f5cfa3 100644 --- a/bin/nexus_watchdog.py +++ b/bin/nexus_watchdog.py @@ -60,6 +60,23 @@ If the heartbeat is older than --stale-threshold seconds, the mind is considered dead even if the process is still running (e.g., hung on a blocking call). +KIMI HEARTBEAT +============== +The Kimi triage pipeline writes a cron heartbeat file after each run: + + /var/run/bezalel/heartbeats/kimi-heartbeat.last + (fallback: ~/.bezalel/heartbeats/kimi-heartbeat.last) + { + "job": "kimi-heartbeat", + "timestamp": 1711843200.0, + "interval_seconds": 900, + "pid": 12345, + "status": "ok" + } + +If the heartbeat is stale (>2x declared interval), the watchdog reports +a Kimi Heartbeat failure alongside the other checks. + ZERO DEPENDENCIES ================= Pure stdlib. No pip installs. Same machine as the nexus. @@ -104,6 +121,10 @@ DEFAULT_HEARTBEAT_PATH = Path.home() / ".nexus" / "heartbeat.json" DEFAULT_STALE_THRESHOLD = 300 # 5 minutes without a heartbeat = dead DEFAULT_INTERVAL = 60 # seconds between checks in watch mode +# Kimi Heartbeat — cron job heartbeat file written by the triage pipeline +KIMI_HEARTBEAT_JOB = "kimi-heartbeat" +KIMI_HEARTBEAT_STALE_MULTIPLIER = 2.0 # stale at 2x declared interval + GITEA_URL = os.environ.get("GITEA_URL", "https://forge.alexanderwhitestone.com") GITEA_TOKEN = os.environ.get("GITEA_TOKEN", "") GITEA_REPO = os.environ.get("NEXUS_REPO", "Timmy_Foundation/the-nexus") @@ -345,6 +366,93 @@ def check_syntax_health() -> CheckResult: ) +def check_kimi_heartbeat( + job: str = KIMI_HEARTBEAT_JOB, + stale_multiplier: float = KIMI_HEARTBEAT_STALE_MULTIPLIER, +) -> CheckResult: + """Check if the Kimi Heartbeat cron job is alive. + + Reads the ``.last`` file from the standard Bezalel heartbeat + directory (``/var/run/bezalel/heartbeats/`` or fallback + ``~/.bezalel/heartbeats/``). The file is written atomically by the + cron_heartbeat module after each successful triage pipeline run. + + A job is stale when: + ``time.time() - timestamp > stale_multiplier * interval_seconds`` + (same rule used by ``check_cron_heartbeats.py``). + """ + # Resolve heartbeat directory — same logic as cron_heartbeat._resolve + primary = Path("/var/run/bezalel/heartbeats") + fallback = Path.home() / ".bezalel" / "heartbeats" + env_dir = os.environ.get("BEZALEL_HEARTBEAT_DIR") + if env_dir: + hb_dir = Path(env_dir) + elif primary.exists(): + hb_dir = primary + elif fallback.exists(): + hb_dir = fallback + else: + return CheckResult( + name="Kimi Heartbeat", + healthy=False, + message="Heartbeat directory not found — no triage pipeline deployed yet", + details={"searched": [str(primary), str(fallback)]}, + ) + + hb_file = hb_dir / f"{job}.last" + if not hb_file.exists(): + return CheckResult( + name="Kimi Heartbeat", + healthy=False, + message=f"No heartbeat file at {hb_file} — Kimi triage pipeline has never reported", + details={"path": str(hb_file)}, + ) + + try: + data = json.loads(hb_file.read_text()) + except (json.JSONDecodeError, OSError) as e: + return CheckResult( + name="Kimi Heartbeat", + healthy=False, + message=f"Heartbeat file corrupt: {e}", + details={"path": str(hb_file), "error": str(e)}, + ) + + timestamp = float(data.get("timestamp", 0)) + interval = int(data.get("interval_seconds", 0)) + raw_status = data.get("status", "unknown") + age = time.time() - timestamp + + if interval <= 0: + # No declared interval — use raw timestamp age (30 min default) + interval = 1800 + + threshold = stale_multiplier * interval + is_stale = age > threshold + + age_str = f"{int(age)}s" if age < 3600 else f"{int(age // 3600)}h {int((age % 3600) // 60)}m" + interval_str = f"{int(interval)}s" if interval < 3600 else f"{int(interval // 3600)}h {int((interval % 3600) // 60)}m" + + if is_stale: + return CheckResult( + name="Kimi Heartbeat", + healthy=False, + message=( + f"Silent for {age_str} " + f"(threshold: {stale_multiplier}x {interval_str} = {int(threshold)}s). " + f"Status: {raw_status}" + ), + details=data, + ) + + return CheckResult( + name="Kimi Heartbeat", + healthy=True, + message=f"Alive — last beat {age_str} ago (interval {interval_str}, status={raw_status})", + details=data, + ) + + # ── Gitea alerting ─────────────────────────────────────────────────── def _gitea_request(method: str, path: str, data: Optional[dict] = None) -> Any: @@ -446,6 +554,7 @@ def run_health_checks( check_mind_process(), check_heartbeat(heartbeat_path, stale_threshold), check_syntax_health(), + check_kimi_heartbeat(), ] return HealthReport(timestamp=time.time(), checks=checks) @@ -545,6 +654,14 @@ def main(): "--json", action="store_true", dest="output_json", help="Output results as JSON (for integration with other tools)", ) + parser.add_argument( + "--kimi-job", default=KIMI_HEARTBEAT_JOB, + help=f"Kimi heartbeat job name (default: {KIMI_HEARTBEAT_JOB})", + ) + parser.add_argument( + "--kimi-stale-multiplier", type=float, default=KIMI_HEARTBEAT_STALE_MULTIPLIER, + help=f"Kimi heartbeat staleness multiplier (default: {KIMI_HEARTBEAT_STALE_MULTIPLIER})", + ) args = parser.parse_args()