[{"data":1,"prerenderedAt":265},["ShallowReactive",2],{"navigation":3,"posts-undefined-复盘-0-999":20},[4,8,12,16],{"title":5,"path":6,"stem":7},"首页","\u002F","00.index",{"title":9,"path":10,"stem":11},"文章","\u002Fposts","01.posts",{"title":13,"path":14,"stem":15},"动态","\u002Fmoments","02.moments",{"title":17,"path":18,"stem":19},"关于","\u002Fabout","09.about",[21],{"id":22,"title":23,"body":24,"class":243,"cover":244,"coverSize":243,"date":245,"description":236,"draft":246,"extension":247,"hideComments":246,"location":243,"meta":248,"navigation":249,"path":250,"readingTime":251,"seo":256,"sitemap":257,"stem":258,"tags":259,"time":243,"weather":243,"__hash__":264},"posts\u002Fposts\u002F2025\u002F20250707.homelab-disaster-postmortem.md","一次 HomeLab 灾难级事故的复盘",{"type":25,"value":26,"toc":235},"minimark",[27,31,149,152,198,201,221,224],[28,29,30],"h2",{"id":30},"时间线",[32,33,34,42,48,54,60,66,72,78,84,90,96,101,107,113,119,125,131,137,143],"ul",{},[35,36,37,41],"li",{},[38,39,40],"strong",{},"2025-07-07 09:33",": TP-LINK 主路由设备上线告警（上次离线原因：设备重启）",[35,43,44,47],{},[38,45,46],{},"2025-07-07 09:34",": 收到群晖异常关机的邮件通知（收到该通知说明群晖已经重启过了，实际重启时间会更早一点）",[35,49,50,53],{},[38,51,52],{},"2025-07-07 09:36",": 尝试登录群晖 DSM，发现域名解析有问题，无法登录；尝试 ToDesk 远程连接家里的 PC，发现不在线（未开机）",[35,55,56,59],{},[38,57,58],{},"2025-07-07 09:40",": 收到 Uptime Kuma 监控服务的各种告警通知，多项服务不可用",[35,61,62,65],{},[38,63,64],{},"2025-07-07 09:52",": 通过 TP-LINK 商用云平台远程查看主路由，发现可连接，但由于之前为了 IPTV 改为了光猫的子路由（非桥接），无法查看到公网 IP；尝试通过电信的小翼管家查看公网 IP，发现没有入口可查",[35,67,68,71],{},[38,69,70],{},"2025-07-07 09:54",": 尝试通过群晖的 QuickConnect 远程访问，发现之前被我关闭了",[35,73,74,77],{},[38,75,76],{},"2025-07-07 10:30",": 查看自己写的 bots 服务代码（含 ddns 功能），请求失败时，有 backoff 策略，首次失败休眠 1 分钟，然后再失败休眠 10 分钟，再失败休眠 1 小时，决定再等一小时看看",[35,79,80,83],{},[38,81,82],{},"2025-07-07 11:00",": 在 TP-LINK 主路由管理页面尝试通过网络唤醒服务唤醒家里的 PC，发现无法唤醒（事后发现之前记录的网卡 MAC 不对）",[35,85,86,89],{},[38,87,88],{},"2025-07-07 11:30",": 通过米家控制办公桌的智能插座电源重启，尝试唤醒 PC，未成功；打算通过控制机柜的智能插座重启，实现所有服务的重启，但还打算再等等 bots 的 ddns 能否生效",[35,91,92,95],{},[38,93,94],{},"2025-07-07 11:44",": 等了 2 个多小时了，感觉 bots 服务可能已经不在运行，再等下去也没用了，经过深思熟虑决定重启整个机柜电源",[35,97,98,100],{},[38,99,94],{},": 通过米家控制智能插座关闭电源，发现状态未更新，再次点击发现操作失败，此时发现智能插座设备已离线，意识到机柜一旦断电，所有米家设备也无法控制了，再也无法打开",[35,102,103,106],{},[38,104,105],{},"2025-07-07 11:50",": 出发回家，准备手动重启机柜电源",[35,108,109,112],{},[38,110,111],{},"2025-07-07 12:49",": 到家，手动开启机柜智能插座电源",[35,114,115,118],{},[38,116,117],{},"2025-07-07 12:50",": 打开 PC，发现 主板 PCI-E 设备唤醒是 Enabled",[35,120,121,124],{},[38,122,123],{},"2025-07-07 12:51",": 进入 PC 系统，发现网卡的允许设备唤醒也是启用的，但网卡 MAC 地址和之前配置的不一样，原因后面详述",[35,126,127,130],{},[38,128,129],{},"2025-07-07 12:53",": 通过 PC 内网登录 portainer，发现 bots 容器处于 stopped 状态（Stopped for 3 hours with exit code 127），finished 时间为 09:33:52",[35,132,133,136],{},[38,134,135],{},"2025-07-07 12:54",": 手动重新启动 bots 容器，正常启动",[35,138,139,142],{},[38,140,141],{},"2025-07-07 12:55",": bots 服务已正常更新域名解析，手机切换到蜂窝测试，已经可正常访问",[35,144,145,148],{},[38,146,147],{},"2025-07-07 13:01",": 出门赶回公司",[28,150,151],{"id":151},"原因分析",[32,153,154,160,166,172,192],{},[35,155,156,159],{},[38,157,158],{},"导火索","：家里异常断电（TP-LINK 和群晖都在机柜里，他俩同时重启，可断定机柜掉电了；光猫在弱电箱里，查看光猫的启动时间，也在同一时间重启过，可判断是全屋断电了）",[35,161,162,165],{},[38,163,164],{},"直接原因","：自建的 DDNS 服务在光猫重启后公网 IP 发生变化的情况下未更新解析，导致所有服务无法远程访问",[35,167,168,171],{},[38,169,170],{},"根本原因","：包含了 DDNS 服务的 bots 容器在宿主机重启后未能重启成功，经过分析发现因为 bots 容器启动过程中挂载了群晖中的一个目录，用来更新 clash 的配置文件，但是群晖启动会比 bots 容器所在的宿主机慢，可能导致了启动失败",[35,173,174,177,178],{},[38,175,176],{},"处理慢的原因（多种补救措施失效）","：\n",[32,179,180,183,186,189],{},[35,181,182],{},"家里的 PC 未开机，无法通过 ToDesk 远程连接处理（之前几次类似问题都是通过 ToDesk 远程修复）",[35,184,185],{},"家里没人，无法帮忙手动启动 PC",[35,187,188],{},"PC 的远程唤醒功能失效，原因是网卡 MAC 地址记录不正确，这是因为之前记录的是一个虚拟网卡的 MAC，上次去掉了虚拟网卡，直接走的物理网卡，但是忘记记录 MAC 地址",[35,190,191],{},"群晖的 QuickConnect 远程访问服务失效，之前感觉用不到被我手动关闭了",[35,193,194,197],{},[38,195,196],{},"故障升级原因","：由于多个补救方案失效，尝试通过机柜断电重启的方式补救，结果所有设备断电，断绝了任何远程补救的可能",[28,199,200],{"id":200},"改进措施",[32,202,203,206,209,212,215,218],{},[35,204,205],{},"✅ 购买 UPS，确保机柜设备在短暂断电时能够继续供电，避免意外断电导致的服务中断（07-08 更新: 已购买山特 SANTAK TG-BOX850 UPS）",[35,207,208],{},"✅ 提升 DDNS 服务的核心程度，从 bots 项目中独立出来，减少其他依赖（07-08 更新: 已完成）",[35,210,211],{},"✅ 启用群晖的 QuickConnect 服务， DDNS 失效后可连接到群晖上进行一些处理",[35,213,214],{},"✅ 确保 PC 的网络远程唤醒功能正常，可通过远程连接到 PC 解决问题",[35,216,217],{},"✅ 部署一个 Cloudflare Tunnel 容器，作为 DDNS 失效后的备用方案",[35,219,220],{},"✅ 把机柜的米家插座从米家 APP 首页移除，避免误操作关闭电源，吸取教训，以后不要再给机柜断电了",[28,222,223],{"id":223},"经验教训",[32,225,226,229,232],{},[35,227,228],{},"之前出现过一次机柜断电后 DDNS 服务不可用导致无法访问的问题，当时通过 ToDesk 远程连接到 PC，然后通过内网重启了 bots 服务解决了问题，但应该更进一步，看看为什么 bots 服务没有自动重启成功，从而可以避免这次的事故",[35,230,231],{},"核心的服务需要保障高可用，例如公网访问这件事，除了自建的 DDNS 之外，还需要通过 QuickConnect、Cloudflare Tunnel 等多种手段保证可用性",[35,233,234],{},"任何情况下都不要尝试给整个机柜断电这种操作，应该优先考虑其他补救措施",{"title":236,"searchDepth":237,"depth":237,"links":238},"",2,[239,240,241,242],{"id":30,"depth":237,"text":30},{"id":151,"depth":237,"text":151},{"id":200,"depth":237,"text":200},{"id":223,"depth":237,"text":223},null,"jpg","2025-07-07",false,"md",{},true,"\u002Fposts\u002F2025\u002Fhomelab-disaster-postmortem",{"text":252,"minutes":253,"time":254,"words":255},"8 min read",7.38,442800,1476,{"title":23,"description":236},{"loc":250,"lastmod":245},"posts\u002F2025\u002F20250707.homelab-disaster-postmortem",[260,261,262,263],"技术","HomeLab","运维","复盘","ICyfDes8hfks9c7nlNUEMy8kuWktGhB5Iu1TriECazU",1777580270848]