Coverage for src/server_list/spec/data_collector.py: 48%

738 statements  

« prev     ^ index     » next       coverage.py v7.13.1, created at 2026-01-31 11:45 +0000

1#!/usr/bin/env python3 

2""" 

3Unified data collector for server-list. 

4Collects ESXi VM data and uptime, caches to SQLite. 

5Runs periodically every 5 minutes. 

6""" 

7 

8import atexit 

9import logging 

10import sqlite3 

11import ssl 

12import threading 

13from collections.abc import Generator 

14from contextlib import contextmanager 

15from datetime import datetime 

16from typing import Any 

17 

18import requests 

19from pyVim.connect import Disconnect, SmartConnect 

20from pyVmomi import vim 

21 

22import my_lib.config 

23import my_lib.safe_access 

24import my_lib.webapp.event 

25 

26import server_list.spec.cpu_benchmark as cpu_benchmark 

27import server_list.spec.db as db 

28import server_list.spec.db_config as db_config 

29import server_list.spec.models as models 

30import server_list.spec.ups_collector as ups_collector 

31 

# Interval between periodic collection runs (matches the module docstring's
# "every 5 minutes").
UPDATE_INTERVAL_SEC = 300  # 5 minutes

# Background updater thread handle; started elsewhere in this module
# (the starter/stopper code is outside this chunk — TODO confirm).
_update_thread: threading.Thread | None = None
# Event used to request that the background thread stop.
_should_stop = threading.Event()

@contextmanager
def _get_connection() -> Generator[sqlite3.Connection, None, None]:
    """Yield a connection to the server-data SQLite database.

    SQLite provides its own file-level locking, so no additional
    application-side locking is needed here.
    """
    db_path = db_config.get_server_data_db_path()
    with db.get_connection(db_path) as conn:
        yield conn

47 

48 

def init_db():
    """Create the server-data SQLite schema from the bundled schema file."""
    db_path = db_config.get_server_data_db_path()
    db.init_schema_from_file(db_path, db.SQLITE_SCHEMA_PATH)

52 

53 

def load_secret() -> dict:
    """Load secret.yaml containing ESXi credentials.

    The path is built from db.BASE_DIR so tests can redirect it.

    Returns:
        Parsed secret dict, or an empty dict when the file is absent.
    """
    path = db.BASE_DIR / "secret.yaml"
    if path.exists():
        return my_lib.config.load(path, db.SECRET_SCHEMA_PATH)
    return {}

64 

65 

def load_config() -> dict:
    """Load config.yaml containing machine definitions.

    The path is built from db.BASE_DIR so tests can redirect it.

    Returns:
        Parsed config dict, or an empty dict when the file is absent.
    """
    path = db.BASE_DIR / "config.yaml"
    if path.exists():
        return my_lib.config.load(path, db.CONFIG_SCHEMA_PATH)
    return {}

76 

77 

def connect_to_esxi(host: str, username: str, password: str, port: int = 443) -> Any | None:
    """Open a pyVmomi connection to an ESXi host.

    Certificate verification is disabled because ESXi hosts commonly use
    self-signed certificates.  A Disconnect for the returned service
    instance is registered via atexit.

    Returns:
        The connected ServiceInstance, or None when the connection fails.
    """
    ssl_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ssl_ctx.check_hostname = False
    ssl_ctx.verify_mode = ssl.CERT_NONE

    try:
        service_instance: Any = SmartConnect(
            host=host,
            user=username,
            pwd=password,
            port=port,
            sslContext=ssl_ctx,
        )
    except Exception as e:  # pyVmomi can raise various unexpected exceptions
        logging.warning("Failed to connect to %s: %s", host, e)
        return None

    atexit.register(Disconnect, service_instance)
    return service_instance

97 

98 

def get_vm_storage_size(vm) -> float:
    """Return the total virtual-disk capacity of a VM in GB.

    Sums capacityInBytes over every VirtualDisk device; disks without a
    reported capacity are ignored.
    """
    capacity_bytes = 0

    try:
        for dev in vm.config.hardware.device:
            if not isinstance(dev, vim.vm.device.VirtualDisk):
                continue
            if dev.capacityInBytes is not None:
                capacity_bytes += dev.capacityInBytes
    except Exception as e:
        # pyVmomi attribute access can fail for various reasons
        logging.debug("Failed to get storage size for VM: %s", e)

    return capacity_bytes / (1024 ** 3)

113 

114 

def fetch_vm_data(si, esxi_host: str) -> list[models.VMInfo]:
    """Fetch per-VM inventory and quick-stats from an ESXi host.

    Args:
        si: Connected pyVmomi ServiceInstance.
        esxi_host: Host name used to tag the returned VMInfo records.

    Returns:
        List of models.VMInfo; VMs whose attributes cannot be read are
        skipped with a warning.
    """
    content = si.RetrieveContent()
    container = content.rootFolder
    view_type = [vim.VirtualMachine]
    recursive = True

    container_view = content.viewManager.CreateContainerView(
        container, view_type, recursive
    )

    vms: list[models.VMInfo] = []
    try:
        for vm in container_view.view:
            try:
                # Get CPU and memory usage from quickStats
                cpu_usage_mhz = None
                memory_usage_mb = None
                if vm.summary and vm.summary.quickStats:
                    stats = vm.summary.quickStats
                    cpu_usage_mhz = stats.overallCpuUsage
                    memory_usage_mb = stats.guestMemoryUsage

                vm_info = models.VMInfo(
                    esxi_host=esxi_host,
                    vm_name=vm.name,
                    cpu_count=vm.config.hardware.numCPU if vm.config else None,
                    ram_mb=vm.config.hardware.memoryMB if vm.config else None,
                    storage_gb=get_vm_storage_size(vm) if vm.config else None,
                    power_state=str(vm.runtime.powerState) if vm.runtime else None,
                    cpu_usage_mhz=cpu_usage_mhz,
                    memory_usage_mb=memory_usage_mb,
                )
                vms.append(vm_info)
            except Exception as e:  # pyVmomi attribute access can fail unexpectedly
                logging.warning("Error getting VM info for %s: %s", vm.name, e)
    finally:
        # Always release the container view — without the finally, an
        # exception during iteration would leak the server-side view object.
        container_view.Destroy()

    return vms

153 

154 

def _extract_cpu_info(host_system) -> tuple[int | None, int | None]:
    """Extract CPU thread and core counts from an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object.

    Returns:
        Tuple of (cpu_threads, cpu_cores); either element may be None.
    """
    cpu = my_lib.safe_access.safe(host_system).hardware.cpuInfo
    threads = cpu.numCpuThreads.value()
    cores = cpu.numCpuCores.value()
    return threads, cores

167 

168 

def _extract_memory_total(host_system) -> float | None:
    """Extract total physical memory from an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object.

    Returns:
        Memory size in bytes, or None when missing (or reported as 0).
    """
    size = my_lib.safe_access.safe(host_system).hardware.memorySize.value()
    if not size:
        return None
    return float(size)

181 

182 

def _extract_usage_from_quickstats(
    host_system, memory_total_bytes: float | None
) -> tuple[float | None, float | None, float | None]:
    """Derive CPU/memory usage figures from ESXi quickStats.

    Args:
        host_system: pyVmomi HostSystem object.
        memory_total_bytes: Total host memory in bytes (may be None).

    Returns:
        (cpu_usage_percent, memory_usage_percent, memory_used_bytes);
        any element may be None when the underlying data is unavailable.
    """
    cpu_pct: float | None = None
    mem_pct: float | None = None
    mem_used: float | None = None

    safe_host = my_lib.safe_access.safe(host_system)
    stats = safe_host.summary.quickStats.value()
    if not stats:
        return cpu_pct, mem_pct, mem_used

    # quickStats reports CPU usage in MHz; convert to a percentage of the
    # host's total capacity (per-core Hz * core count).
    cpu_info = safe_host.hardware.cpuInfo
    hz_per_core = cpu_info.hz.value()
    core_count = cpu_info.numCpuCores.value()
    if stats.overallCpuUsage is not None and hz_per_core and core_count:
        capacity_mhz = (hz_per_core / 1_000_000) * core_count
        cpu_pct = (stats.overallCpuUsage / capacity_mhz) * 100

    # quickStats reports memory usage in MB; convert to bytes and percent.
    if stats.overallMemoryUsage is not None and memory_total_bytes:
        mem_used = float(stats.overallMemoryUsage * 1024 * 1024)
        mem_pct = (mem_used / memory_total_bytes) * 100

    return cpu_pct, mem_pct, mem_used

218 

219 

def _extract_os_version(host_system) -> str | None:
    """Extract the hypervisor product string from an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object.

    Returns:
        Full product name (e.g. "VMware ESXi 8.0.0 build-xxx") or None.
    """
    safe_host = my_lib.safe_access.safe(host_system)
    return safe_host.config.product.fullName.value()

231 

232 

def fetch_host_info(si, host: str) -> models.HostInfo | None:
    """Fetch host info including uptime, CPU, memory usage, and ESXi version from ESXi host.

    Walks the inventory tree (datacenter -> hostFolder -> cluster/standalone)
    and returns a HostInfo for the FIRST host system that reports a boot
    time.  Returns None when nothing usable is found or the query fails.
    """
    try:
        content = si.RetrieveContent()

        for datacenter in content.rootFolder.childEntity:
            # Skip inventory entries that are not datacenters.
            if not hasattr(datacenter, "hostFolder"):
                continue

            for cluster in datacenter.hostFolder.childEntity:
                hosts = []
                # ClusterComputeResource / ComputeResource expose .host
                # directly; folders nest one level deeper via childEntity.
                if hasattr(cluster, "host"):
                    hosts = cluster.host
                elif hasattr(cluster, "childEntity"):
                    for child in cluster.childEntity:
                        if hasattr(child, "host"):
                            hosts.extend(child.host)

                for host_system in hosts:
                    try:
                        boot_time = host_system.runtime.bootTime
                        # No boot time => host not running; try the next one.
                        if not boot_time:
                            continue

                        # Extract each piece of data via the helper functions.
                        cpu_threads, cpu_cores = _extract_cpu_info(host_system)
                        memory_total_bytes = _extract_memory_total(host_system)
                        cpu_usage, mem_usage, mem_used = _extract_usage_from_quickstats(
                            host_system, memory_total_bytes
                        )
                        os_version = _extract_os_version(host_system)

                        return models.HostInfo(
                            host=host,
                            boot_time=boot_time.isoformat(),
                            # Uptime is computed in the boot time's own tzinfo
                            # so the subtraction mixes no naive/aware values.
                            uptime_seconds=(datetime.now(boot_time.tzinfo) - boot_time).total_seconds(),
                            status="running",
                            cpu_threads=cpu_threads,
                            cpu_cores=cpu_cores,
                            os_version=os_version,
                            cpu_usage_percent=cpu_usage,
                            memory_usage_percent=mem_usage,
                            memory_total_bytes=memory_total_bytes,
                            memory_used_bytes=mem_used,
                        )
                    except Exception as e:  # pyVmomi attribute access can fail unexpectedly
                        logging.warning("Error getting host info: %s", e)

    except Exception as e:  # pyVmomi can raise various unexpected exceptions
        logging.warning("Failed to get host info for %s: %s", host, e)

    return None

285 

286 

287# ============================================================================= 

288# iLO Power Meter functions (via Redfish API) 

289# ============================================================================= 

290 

291 

def fetch_ilo_power(host: str, username: str, password: str) -> models.PowerInfo | None:
    """Fetch power consumption data from HP iLO via Redfish API.

    Args:
        host: iLO hostname or IP address
        username: iLO username
        password: iLO password

    Returns:
        PowerInfo or None if failed
    """
    endpoint = f"https://{host}/redfish/v1/Chassis/1/Power"

    try:
        # iLO uses self-signed certificates, so verification is disabled.
        resp = requests.get(
            endpoint,
            auth=(username, password),
            verify=False,  # noqa: S501 - iLO uses self-signed certs
            timeout=30,
        )
        if resp.status_code != 200:
            logging.warning("iLO API returned status %d for %s", resp.status_code, host)
            return None

        payload = resp.json()

        controls = payload.get("PowerControl", [])
        if not controls:
            logging.warning("No PowerControl data in iLO response for %s", host)
            return None

        # The first PowerControl entry describes the main chassis power.
        entry = controls[0]
        metrics = entry.get("PowerMetrics", {})

        return models.PowerInfo(
            power_watts=entry.get("PowerConsumedWatts"),
            power_average_watts=metrics.get("AverageConsumedWatts"),
            power_max_watts=metrics.get("MaxConsumedWatts"),
            power_min_watts=metrics.get("MinConsumedWatts"),
        )

    except requests.exceptions.Timeout:
        logging.warning("Timeout connecting to iLO at %s", host)
        return None
    except requests.exceptions.ConnectionError as e:
        logging.warning("Connection error to iLO at %s: %s", host, e)
        return None
    except (requests.RequestException, ValueError, KeyError) as e:
        # JSON parsing errors and unexpected response formats land here.
        logging.warning("Error fetching power data from iLO at %s: %s", host, e)
        return None

353 

354 

def save_power_info(host: str, power_data: models.PowerInfo):
    """Upsert one host's power readings into the SQLite cache."""
    row = (
        host,
        power_data.power_watts,
        power_data.power_average_watts,
        power_data.power_max_watts,
        power_data.power_min_watts,
        datetime.now().isoformat(),
    )

    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO power_info
            (host, power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            row,
        )
        conn.commit()

376 

377 

def get_power_info(host: str) -> models.PowerInfo | None:
    """Read one host's cached power readings; None when not cached."""
    query = """
        SELECT power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at
        FROM power_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (host,)).fetchone()

    if row is None:
        return None
    return models.PowerInfo.parse_row(row)

391 

392 

def get_all_power_info() -> dict[str, models.PowerInfo]:
    """Read cached power readings for every host, keyed by host name."""
    query = """
        SELECT host, power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at
        FROM power_info
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()

    return dict(models.PowerInfo.parse_row_with_host(r) for r in rows)

404 

405 

def collect_ilo_power_data():
    """Collect power data from every iLO host configured in secret.yaml.

    Hosts with incomplete credentials are skipped with a warning instead of
    aborting the whole collection run (previously a missing "username" or
    "password" key raised KeyError and stopped all remaining hosts).
    """
    secret = load_secret()
    ilo_auth = secret.get("ilo_auth", {})

    if not ilo_auth:
        return

    for host, credentials in ilo_auth.items():
        # An explicit "host" entry overrides the mapping key as the target.
        ilo_host = credentials.get("host", host)
        logging.info("Collecting power data from iLO %s...", ilo_host)

        try:
            username = credentials["username"]
            password = credentials["password"]
        except KeyError as e:
            logging.warning("Missing iLO credential %s for %s; skipping", e, host)
            continue

        power_data = fetch_ilo_power(host=ilo_host, username=username, password=password)

        if power_data:
            save_power_info(host, power_data)
            logging.info(" Cached power data for %s: %s W", host, power_data.power_watts)

427 

428 

429# ============================================================================= 

430# Prometheus common helpers 

431# ============================================================================= 

432 

433 

def _prometheus_request(prometheus_url: str, query: str) -> list[dict]:
    """Run a PromQL instant query against the Prometheus HTTP API.

    Args:
        prometheus_url: Prometheus server URL.
        query: PromQL query string.

    Returns:
        The "result" entries from the response; empty list on any failure.
    """
    try:
        resp = requests.get(
            f"{prometheus_url}/api/v1/query",
            params={"query": query},
            timeout=30,
        )
        resp.raise_for_status()
        payload = resp.json()
    except requests.RequestException as e:
        logging.warning("Prometheus query failed: %s", e)
        return []

    if payload.get("status") != "success":
        return []

    return payload.get("data", {}).get("result", [])

461 

462 

def _execute_prometheus_query(prometheus_url: str, query: str) -> dict | None:
    """Run a PromQL query and return only the first result entry.

    Args:
        prometheus_url: Prometheus server URL.
        query: PromQL query string.

    Returns:
        First result entry (dict), or None when the query failed or
        returned nothing.
    """
    entries = _prometheus_request(prometheus_url, query)
    if not entries:
        return None
    return entries[0]

475 

476 

def _fetch_prometheus_metric(prometheus_url: str, query: str) -> float | None:
    """Fetch a single scalar metric value from Prometheus.

    Args:
        prometheus_url: Prometheus server URL.
        query: PromQL query string.

    Returns:
        Metric value as float, or None when unavailable or unparsable.
    """
    entry = _execute_prometheus_query(prometheus_url, query)
    if entry:
        try:
            # A result's "value" is a [timestamp, value-string] pair.
            raw = entry.get("value", [None, None])[1]
            if raw is not None:
                return float(raw)
        except (ValueError, TypeError, IndexError) as e:
            logging.debug("Prometheus metric parsing failed: %s", e)
    return None

495 

496 

def _fetch_prometheus_metric_with_timestamp(prometheus_url: str, query: str) -> tuple[float, float] | None:
    """Fetch a metric value together with its sample timestamp.

    Args:
        prometheus_url: Prometheus server URL.
        query: PromQL query string.

    Returns:
        (timestamp, value) tuple, or None when unavailable or unparsable.
    """
    entry = _execute_prometheus_query(prometheus_url, query)
    if entry:
        pair = entry.get("value", [])
        try:
            if len(pair) >= 2:
                return float(pair[0]), float(pair[1])
        except (ValueError, TypeError, IndexError) as e:
            logging.debug("Prometheus metric with timestamp parsing failed: %s", e)
    return None

516 

517 

518# ============================================================================= 

519# Prometheus uptime functions (for Linux servers) 

520# ============================================================================= 

521 

522 

def fetch_prometheus_uptime(
    prometheus_url: str, instance: str, is_windows: bool = False
) -> models.UptimeData | None:
    """Fetch boot time and uptime for one host from Prometheus.

    Shared Linux/Windows uptime fetcher:
    Linux uses node_boot_time_seconds; Windows uses
    windows_system_system_up_time.

    Args:
        prometheus_url: Prometheus server URL (e.g., http://192.168.0.20:9090)
        instance: Prometheus instance label
        is_windows: Use the windows_exporter metric when True

    Returns:
        UptimeData object or None if failed
    """
    metric_name = "windows_system_system_up_time" if is_windows else "node_boot_time_seconds"
    query = f'{metric_name}{{instance=~"{instance}.*"}}'

    sample = _fetch_prometheus_metric_with_timestamp(prometheus_url, query)
    if sample is None:
        os_label = "Windows" if is_windows else "Linux"
        logging.warning("No Prometheus %s uptime data for instance %s", os_label, instance)
        return None

    sample_time, boot_epoch = sample

    # NOTE(review): fromtimestamp() uses local time here — presumably the
    # rest of the app also works in local time; verify if TZ matters.
    return models.UptimeData(
        boot_time=datetime.fromtimestamp(boot_epoch).isoformat(),
        uptime_seconds=sample_time - boot_epoch,
        status="running",
    )

559 

560 

def fetch_prometheus_usage(
    prometheus_url: str, instance: str, is_windows: bool = False
) -> models.UsageMetrics | None:
    """Fetch CPU and memory usage for one host from Prometheus.

    Shared Linux/Windows usage fetcher.  CPU usage is computed as
    100 minus the per-instance averaged idle rate over 5 minutes;
    memory usage is total minus available/free bytes.

    Linux (node_exporter):
        node_cpu_seconds_total, node_memory_MemTotal_bytes,
        node_memory_MemAvailable_bytes
    Windows (windows_exporter):
        windows_cpu_time_total, windows_cs_physical_memory_bytes,
        windows_os_physical_memory_free_bytes

    Args:
        prometheus_url: Prometheus server URL
        instance: Prometheus instance label
        is_windows: Use windows_exporter metrics when True

    Returns:
        UsageMetrics object with cpu/memory usage, or None if no data available
    """
    # Pick OS-specific metric names.
    if is_windows:
        cpu_metric = "windows_cpu_time_total"
        mem_total_metric = "windows_cs_physical_memory_bytes"
        mem_avail_metric = "windows_os_physical_memory_free_bytes"
    else:
        cpu_metric = "node_cpu_seconds_total"
        mem_total_metric = "node_memory_MemTotal_bytes"
        mem_avail_metric = "node_memory_MemAvailable_bytes"

    # CPU usage = 100 - idle percentage.
    cpu_usage_percent = _fetch_prometheus_metric(
        prometheus_url,
        f'100 - (avg by (instance) (rate({cpu_metric}{{instance=~"{instance}.*",mode="idle"}}[5m])) * 100)',
    )

    memory_total_bytes = _fetch_prometheus_metric(
        prometheus_url, f'{mem_total_metric}{{instance=~"{instance}.*"}}'
    )
    available_bytes = _fetch_prometheus_metric(
        prometheus_url, f'{mem_avail_metric}{{instance=~"{instance}.*"}}'
    )

    memory_usage_percent: float | None = None
    memory_used_bytes: float | None = None
    if available_bytes is not None and memory_total_bytes is not None:
        memory_used_bytes = memory_total_bytes - available_bytes
        memory_usage_percent = (memory_used_bytes / memory_total_bytes) * 100

    # Nothing at all collected -> report failure.
    gathered = (cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes)
    if all(v is None for v in gathered):
        return None

    return models.UsageMetrics(
        cpu_usage_percent=cpu_usage_percent,
        memory_usage_percent=memory_usage_percent,
        memory_total_bytes=memory_total_bytes,
        memory_used_bytes=memory_used_bytes,
    )

624 

625 

def get_prometheus_instance(host: str, instance_map: dict) -> str:
    """Get Prometheus instance name for a host.

    An explicit instance_map entry wins; otherwise the instance is derived
    from the FQDN by taking the part before the first dot.

    Args:
        host: Host name (e.g., "tanzania.green-rabbit.net")
        instance_map: Optional mapping of host -> instance

    Returns:
        Prometheus instance name (e.g., "tanzania")
    """
    if host in instance_map:
        return instance_map[host]

    # partition() returns the whole string unchanged when no dot exists,
    # which covers both the FQDN and the bare-hostname cases.
    return host.partition(".")[0]

647 

648 

def collect_prometheus_uptime_data() -> bool:
    """Collect uptime and usage data from Prometheus for configured hosts.

    Automatically collects uptime and CPU/memory usage for machines without
    ESXi configuration.  Uses node_boot_time_seconds for Linux and
    windows_system_system_up_time for Windows.  Instance name is derived
    from FQDN (first part) unless explicitly configured in
    prometheus.instance_map.

    Calls save_host_info / save_host_info_failed, which are defined
    elsewhere in this module (not visible in this chunk).

    Returns:
        True if any data was collected, False otherwise
    """
    config = load_config()
    cfg = my_lib.config.accessor(config)

    # No Prometheus URL configured => nothing to do.
    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    # Find machines without ESXi (servers that need Prometheus uptime)
    machines = cfg.get_list("machine")
    target_machines = [(m["name"], m.get("os", "")) for m in machines if not m.get("esxi")]

    if not target_machines:
        return False

    updated = False

    for host, os_type in target_machines:
        instance = get_prometheus_instance(host, instance_map)
        logging.info("Collecting uptime and usage from Prometheus for %s (instance: %s, os: %s)...", host, instance, os_type)

        # Use appropriate fetch function based on OS
        is_windows = os_type.lower() == "windows"
        uptime_data = fetch_prometheus_uptime(prometheus_url, instance, is_windows=is_windows)
        usage_data = fetch_prometheus_usage(prometheus_url, instance, is_windows=is_windows)

        if uptime_data:
            # Usage fields are best-effort: a host can still be cached with
            # uptime even when usage metrics are unavailable.
            host_info = models.HostInfo(
                host=host,
                boot_time=uptime_data.boot_time,
                uptime_seconds=uptime_data.uptime_seconds,
                status=uptime_data.status,
                cpu_threads=None,
                cpu_cores=None,
                os_version=None,
                cpu_usage_percent=usage_data.cpu_usage_percent if usage_data else None,
                memory_usage_percent=usage_data.memory_usage_percent if usage_data else None,
                memory_total_bytes=usage_data.memory_total_bytes if usage_data else None,
                memory_used_bytes=usage_data.memory_used_bytes if usage_data else None,
            )
            save_host_info(host_info)
            logging.info(" Cached uptime for %s: %.1f days",
                host, uptime_data.uptime_seconds / 86400)
            updated = True
        else:
            # Record the failure so stale data can be distinguished downstream.
            save_host_info_failed(host)

    return updated

709 

710 

711# ============================================================================= 

712# Prometheus ZFS pool functions (for Linux servers with ZFS) 

713# ============================================================================= 

714 

715 

def fetch_prometheus_zfs_pools(prometheus_url: str, instance: str) -> list[models.ZfsPoolInfo]:
    """Fetch per-pool ZFS metrics for one host from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        instance: Prometheus instance label

    Returns:
        List of ZfsPoolInfo objects (empty list if no data)
    """
    metric_names = ["zfs_pool_size_bytes", "zfs_pool_allocated_bytes", "zfs_pool_free_bytes", "zfs_pool_health"]
    per_pool: dict[str, dict[str, float | None]] = {}

    for metric_name in metric_names:
        entries = _prometheus_request(prometheus_url, f'{metric_name}{{instance=~"{instance}.*"}}')

        for entry in entries:
            pool = entry.get("metric", {}).get("pool", "unknown")
            raw = entry.get("value", [None, None])[1]
            fields = per_pool.setdefault(pool, {})

            # The metric name minus its "zfs_pool_" prefix is the field name.
            if raw is not None:
                try:
                    fields[metric_name.replace("zfs_pool_", "")] = float(raw)
                except (ValueError, TypeError) as e:
                    logging.debug("ZFS metric value conversion failed: %s", e)

    return [
        models.ZfsPoolInfo(
            pool_name=pool,
            size_bytes=fields.get("size_bytes"),
            allocated_bytes=fields.get("allocated_bytes"),
            free_bytes=fields.get("free_bytes"),
            health=fields.get("health"),
        )
        for pool, fields in per_pool.items()
    ]

761 

762 

def save_zfs_pool_info(host: str, pools: list[models.ZfsPoolInfo]):
    """Replace the cached ZFS pool rows for one host."""
    collected_at = datetime.now().isoformat()

    with _get_connection() as conn:
        cursor = conn.cursor()

        # Replace semantics: drop the host's old rows, then insert the new set.
        cursor.execute("DELETE FROM zfs_pool_info WHERE host = ?", (host,))
        cursor.executemany(
            """
            INSERT INTO zfs_pool_info
            (host, pool_name, size_bytes, allocated_bytes, free_bytes, health, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (host, p.pool_name, p.size_bytes, p.allocated_bytes, p.free_bytes, p.health, collected_at)
                for p in pools
            ],
        )

        conn.commit()

790 

791 

def get_zfs_pool_info(host: str) -> list[models.ZfsPoolInfo]:
    """Read cached ZFS pool rows for one host."""
    query = """
        SELECT pool_name, size_bytes, allocated_bytes, free_bytes, health, collected_at
        FROM zfs_pool_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (host,)).fetchall()

    return [models.ZfsPoolInfo.parse_row(r) for r in rows]

804 

805 

def collect_prometheus_zfs_data() -> bool:
    """Collect ZFS pool data from Prometheus for configured hosts.

    Targets machines whose config lists "zfs" in their filesystem entry.
    Instance name is derived from FQDN (first part) unless explicitly
    configured in prometheus.instance_map.

    Returns:
        True if any data was collected, False otherwise
    """
    cfg = my_lib.config.accessor(load_config())

    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    # Machines declaring a ZFS filesystem are the collection targets.
    zfs_hosts = [
        machine["name"]
        for machine in cfg.get_list("machine")
        if "zfs" in machine.get("filesystem", [])
    ]
    if not zfs_hosts:
        return False

    any_cached = False

    for host in zfs_hosts:
        instance = get_prometheus_instance(host, instance_map)
        logging.info("Collecting ZFS pool data from Prometheus for %s (instance: %s)...", host, instance)

        pools = fetch_prometheus_zfs_pools(prometheus_url, instance)
        if pools:
            save_zfs_pool_info(host, pools)
            logging.info(" Cached %d ZFS pools for %s", len(pools), host)
            any_cached = True

    return any_cached

846 

847 

848# ============================================================================= 

849# Prometheus mount point functions (for Linux servers) 

850# ============================================================================= 

851 

852 

def fetch_btrfs_uuid(prometheus_url: str, label: str) -> str | None:
    """Resolve a btrfs filesystem label to its UUID via node_btrfs_info.

    Args:
        prometheus_url: Prometheus server URL
        label: Btrfs filesystem label

    Returns:
        UUID string or None if not found
    """
    entry = _execute_prometheus_query(prometheus_url, f'node_btrfs_info{{label="{label}"}}')
    if not entry:
        return None

    uuid = entry.get("metric", {}).get("uuid")
    if uuid is None:
        return None
    return str(uuid)

869 

870 

def fetch_btrfs_metrics(prometheus_url: str, uuid: str) -> models.StorageMetrics | None:
    """Fetch btrfs capacity and usage for one filesystem UUID.

    Args:
        prometheus_url: Prometheus server URL
        uuid: Btrfs filesystem UUID

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes or None if failed
    """
    # Total capacity: sum of all device sizes for this filesystem.
    size_bytes = _fetch_prometheus_metric(
        prometheus_url, f'sum(node_btrfs_device_size_bytes{{uuid="{uuid}"}})'
    )
    # Used space: summed across all block-group types.
    used_bytes = _fetch_prometheus_metric(
        prometheus_url, f'sum(node_btrfs_used_bytes{{uuid="{uuid}"}})'
    )

    if size_bytes is None or used_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=size_bytes - used_bytes,
        used_bytes=used_bytes,
    )

897 

898 

def fetch_windows_disk_metrics(
    prometheus_url: str, volume: str, instance: str
) -> models.StorageMetrics | None:
    """Fetch Windows logical disk metrics from Prometheus.

    Uses windows_exporter metrics:
    - windows_logical_disk_size_bytes{volume="C:"}
    - windows_logical_disk_free_bytes{volume="C:"}

    Args:
        prometheus_url: Prometheus server URL
        volume: Windows volume label (e.g., "C:")
        instance: Prometheus instance name (derived from FQDN)

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes or None if failed
    """
    size_bytes = _fetch_prometheus_metric(
        prometheus_url,
        f'windows_logical_disk_size_bytes{{volume="{volume}",instance=~"{instance}.*"}}',
    )
    avail_bytes = _fetch_prometheus_metric(
        prometheus_url,
        f'windows_logical_disk_free_bytes{{volume="{volume}",instance=~"{instance}.*"}}',
    )

    if size_bytes is None or avail_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=avail_bytes,
        used_bytes=size_bytes - avail_bytes,
    )

932 

933 

def _fetch_filesystem_mount_metrics(prometheus_url: str, label: str, path: str) -> models.StorageMetrics | None:
    """Fetch mount point usage from node_filesystem_* metrics.

    Args:
        prometheus_url: Prometheus server URL
        label: Prometheus instance label (matched as a prefix)
        path: Mount point path

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes, or None if
        either metric could not be fetched.
    """
    selector = f'{{instance=~"{label}.*",mountpoint="{path}"}}'

    size_bytes = _fetch_prometheus_metric(prometheus_url, f"node_filesystem_size_bytes{selector}")
    avail_bytes = _fetch_prometheus_metric(prometheus_url, f"node_filesystem_avail_bytes{selector}")

    if size_bytes is None or avail_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=avail_bytes,
        used_bytes=size_bytes - avail_bytes,
    )

959 

960 

def _fetch_mount_for_config(
    prometheus_url: str, mount_config: dict, host: str, instance_map: dict
) -> models.MountInfo | None:
    """Fetch mount info for a single mount configuration entry.

    Args:
        prometheus_url: Prometheus server URL
        mount_config: Mount configuration (label, path, type)
        host: Host name (FQDN)
        instance_map: Mapping of host -> Prometheus instance

    Returns:
        MountInfo, or None if the config has no label or no data was found.
    """
    label = mount_config.get("label", "")
    if not label:
        return None

    # Display path defaults to the label itself.
    path = mount_config.get("path", label)
    mount_type = mount_config.get("type", "filesystem")

    if mount_type == "btrfs":
        # Btrfs metrics are keyed by UUID, so resolve the label first.
        uuid = fetch_btrfs_uuid(prometheus_url, label)
        metrics = fetch_btrfs_metrics(prometheus_url, uuid) if uuid else None
    elif mount_type == "windows":
        instance = get_prometheus_instance(host, instance_map)
        metrics = fetch_windows_disk_metrics(prometheus_url, label, instance)
    else:
        metrics = _fetch_filesystem_mount_metrics(prometheus_url, label, path)

    if not metrics:
        return None

    return models.MountInfo(
        mountpoint=path,
        size_bytes=metrics.size_bytes,
        avail_bytes=metrics.avail_bytes,
        used_bytes=metrics.used_bytes,
    )

1003 

1004 

def fetch_prometheus_mount_info(
    prometheus_url: str, mount_configs: list[dict], host: str, instance_map: dict
) -> list[models.MountInfo]:
    """Fetch mount point data from Prometheus.

    Supports filesystem (node_filesystem_*), btrfs (node_btrfs_*),
    and windows (windows_logical_disk_*) metrics.

    Args:
        prometheus_url: Prometheus server URL
        mount_configs: List of mount configs with 'label', 'path', and optional 'type'
        host: Host name (FQDN) for deriving Prometheus instance
        instance_map: Optional mapping of host -> instance

    Returns:
        List of MountInfo (empty list if no data collected)
    """
    # Entries whose metrics could not be fetched are silently dropped.
    candidates = (
        _fetch_mount_for_config(prometheus_url, mount_config, host, instance_map)
        for mount_config in mount_configs
    )
    return [mount for mount in candidates if mount]

1030 

1031 

def save_mount_info(host: str, mounts: list[models.MountInfo]):
    """Replace the cached mount info for *host* with *mounts*.

    Existing rows for the host are removed first so mounts that disappeared
    are dropped from the cache.
    """
    collected_at = datetime.now().isoformat()
    rows = [
        (host, m.mountpoint, m.size_bytes, m.avail_bytes, m.used_bytes, collected_at)
        for m in mounts
    ]

    with _get_connection() as conn:
        cursor = conn.cursor()

        cursor.execute("DELETE FROM mount_info WHERE host = ?", (host,))
        cursor.executemany("""
            INSERT INTO mount_info
            (host, mountpoint, size_bytes, avail_bytes, used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?)
        """, rows)

        conn.commit()

1058 

1059 

def get_mount_info(host: str) -> list[models.MountInfo]:
    """Return the cached mount info rows for *host*."""
    with _get_connection() as conn:
        rows = conn.execute("""
            SELECT mountpoint, size_bytes, avail_bytes, used_bytes, collected_at
            FROM mount_info
            WHERE host = ?
        """, (host,)).fetchall()

    return [models.MountInfo.parse_row(row) for row in rows]

1072 

1073 

1074# ============================================================================= 

1075# UPS functions (from NUT - Network UPS Tools) 

1076# ============================================================================= 

1077 

1078 

def save_ups_info(ups_info_list: list[models.UPSInfo]):
    """Save UPS info to SQLite cache (upsert keyed on ups_name/host)."""
    collected_at = datetime.now().isoformat()
    rows = [
        (
            ups.ups_name,
            ups.host,
            ups.model,
            ups.battery_charge,
            ups.battery_runtime,
            ups.ups_load,
            ups.ups_status,
            ups.ups_temperature,
            ups.input_voltage,
            ups.output_voltage,
            collected_at,
        )
        for ups in ups_info_list
    ]

    with _get_connection() as conn:
        conn.cursor().executemany("""
            INSERT OR REPLACE INTO ups_info
            (ups_name, host, model, battery_charge, battery_runtime,
             ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, rows)
        conn.commit()

1107 

1108 

def save_ups_clients(clients: list[models.UPSClient]):
    """Save UPS client info to SQLite cache.

    Rows for each (ups_name, host) pair present in *clients* are replaced
    wholesale, so clients that disappeared are dropped from the cache.
    """
    collected_at = datetime.now().isoformat()

    with _get_connection() as conn:
        cursor = conn.cursor()

        # Deleting once per unique (ups_name, host) pair is equivalent to
        # deleting per client; DELETE is idempotent.
        for ups_name, host in {(c.ups_name, c.host) for c in clients}:
            cursor.execute(
                "DELETE FROM ups_client WHERE ups_name = ? AND host = ?",
                (ups_name, host)
            )

        cursor.executemany("""
            INSERT INTO ups_client
            (ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
        """, [
            (
                client.ups_name,
                client.host,
                client.client_ip,
                client.client_hostname,
                client.esxi_host,
                client.machine_name,
                collected_at,
            )
            for client in clients
        ])

        conn.commit()

1140 

1141 

def get_all_ups_info() -> list[models.UPSInfo]:
    """Return every cached UPS info row."""
    with _get_connection() as conn:
        rows = conn.execute("""
            SELECT ups_name, host, model, battery_charge, battery_runtime,
                   ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at
            FROM ups_info
        """).fetchall()

    return [models.UPSInfo.parse_row(row) for row in rows]

1154 

1155 

def get_ups_info(ups_name: str, host: str) -> models.UPSInfo | None:
    """Return the cached info for one UPS identified by (ups_name, host)."""
    with _get_connection() as conn:
        row = conn.execute("""
            SELECT ups_name, host, model, battery_charge, battery_runtime,
                   ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at
            FROM ups_info
            WHERE ups_name = ? AND host = ?
        """, (ups_name, host)).fetchone()

    return models.UPSInfo.parse_row(row) if row else None

1170 

1171 

def get_ups_clients(ups_name: str, host: str) -> list[models.UPSClient]:
    """Return the cached clients of one UPS identified by (ups_name, host)."""
    with _get_connection() as conn:
        rows = conn.execute("""
            SELECT ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at
            FROM ups_client
            WHERE ups_name = ? AND host = ?
        """, (ups_name, host)).fetchall()

    return [models.UPSClient.parse_row(row) for row in rows]

1184 

1185 

def get_all_ups_clients() -> list[models.UPSClient]:
    """Return every cached UPS client row."""
    with _get_connection() as conn:
        rows = conn.execute("""
            SELECT ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at
            FROM ups_client
        """).fetchall()

    return [models.UPSClient.parse_row(row) for row in rows]

1197 

1198 

def _resolve_hostname(ip: str) -> str | None:
    """Resolve an IP address to a short hostname via reverse DNS.

    Args:
        ip: IP address to resolve.

    Returns:
        The resolved hostname with any domain suffix stripped
        (e.g., "server.local" -> "server"), or None on failure.
    """
    import socket
    try:
        hostname, _, _ = socket.gethostbyaddr(ip)
    except OSError:
        # socket.herror and socket.gaierror are OSError subclasses; catching
        # OSError additionally covers timeouts and other low-level resolver
        # failures that would otherwise crash the collector thread.
        logging.debug("Could not resolve hostname for IP: %s", ip)
        return None

    # Remove domain suffix if present (e.g., "server.local" -> "server")
    return hostname.split(".")[0]

1209 

1210 

def _find_vm_esxi_host(hostname: str) -> str | None:
    """Find the ESXi host running a VM with the given hostname.

    Matches either the exact VM name or a name beginning with
    "<hostname>." (the FQDN form of the same host).

    Args:
        hostname: The VM name to search for

    Returns:
        ESXi host name if found, None otherwise
    """
    with _get_connection() as conn:
        row = conn.execute("""
            SELECT esxi_host FROM vm_info
            WHERE vm_name = ? OR vm_name LIKE ?
        """, (hostname, f"{hostname}.%")).fetchone()

    return row[0] if row else None

1228 

1229 

1230def _apply_domain(hostname: str, domain: str | None) -> str: 

1231 """Apply domain to a short hostname if needed. 

1232 

1233 Args: 

1234 hostname: The hostname (may be short or FQDN) 

1235 domain: The domain to append (e.g., "green-rabbit.net") 

1236 

1237 Returns: 

1238 FQDN if domain is provided and hostname is short, otherwise hostname as-is 

1239 """ 

1240 if not domain: 

1241 return hostname 

1242 # If hostname already contains a dot, assume it's already a FQDN 

1243 if "." in hostname: 

1244 return hostname 

1245 return f"{hostname}.{domain}" 

1246 

1247 

def _enrich_ups_clients(
    clients: list[models.UPSClient],
    domain: str | None = None,
) -> list[models.UPSClient]:
    """Enrich UPS clients with hostname resolution and VM info.

    For each client:
    1. Replace localhost with UPS master host
    2. Resolve IP to hostname if not already set
    3. Check if hostname is a VM and get its ESXi host
    4. Set machine_name for linking (with domain appended if needed)

    Args:
        clients: List of UPS clients to enrich
        domain: Domain to append to short hostnames for linking

    Returns:
        New UPSClient objects (input objects are not mutated) with
        client_hostname, esxi_host, and machine_name filled in where
        resolution succeeded.
    """
    enriched: list[models.UPSClient] = []

    for client in clients:
        hostname = client.client_hostname
        ups_master_host = client.host

        # Handle localhost - replace with UPS master host
        if hostname == "localhost" or client.client_ip == "127.0.0.1":
            # Use the short hostname of the UPS master
            hostname = ups_master_host.split(".")[0]

        # Fall back to reverse DNS when NUT did not report a hostname.
        if not hostname:
            hostname = _resolve_hostname(client.client_ip)

        esxi_host = None
        machine_name = None

        if hostname:
            # Check if this is a VM
            esxi_host = _find_vm_esxi_host(hostname)
            if esxi_host:
                # It's a VM - link to the ESXi host (already FQDN usually)
                machine_name = _apply_domain(esxi_host, domain)
            else:
                # It's a physical machine - link to the hostname with domain
                machine_name = _apply_domain(hostname, domain)

        enriched.append(models.UPSClient(
            ups_name=client.ups_name,
            host=client.host,
            client_ip=client.client_ip,
            client_hostname=hostname,
            esxi_host=esxi_host,
            machine_name=machine_name,
            collected_at=client.collected_at,
        ))

    return enriched

1302 

1303 

def collect_ups_data() -> bool:
    """Collect UPS data from configured NUT hosts.

    Returns:
        True if any data was collected, False otherwise
    """
    cfg = my_lib.config.accessor(load_config())

    ups_configs = cfg.get_list("ups")
    if not ups_configs:
        return False

    # Domain used when turning short client hostnames into link targets.
    domain = cfg.get("domain")

    all_ups_info: list[models.UPSInfo] = []
    all_clients: list[models.UPSClient] = []

    for ups_config in ups_configs:
        host = ups_config.get("host")
        if not host:
            continue

        port = ups_config.get("port", 3493)
        logging.info("Collecting UPS data from NUT host %s:%d...", host, port)

        ups_info_list, clients = ups_collector.fetch_all_ups_from_host(
            host, port, ups_config.get("name")
        )

        if ups_info_list:
            all_ups_info.extend(ups_info_list)
            all_clients.extend(clients)
            logging.info(" Found %d UPS(s) with %d client(s)", len(ups_info_list), len(clients))

    if all_ups_info:
        save_ups_info(all_ups_info)
    if all_clients:
        # Enrich clients with hostname resolution, VM info, and domain
        save_ups_clients(_enrich_ups_clients(all_clients, domain))

    # Non-empty accumulator means at least one host yielded data.
    return bool(all_ups_info)

1352 

1353 

def collect_prometheus_mount_data() -> bool:
    """Collect mount point data from Prometheus for configured hosts.

    Automatically collects mount data for machines with mount configuration.
    Each mount config specifies 'label' (Prometheus label to match) and
    optional 'path' (display path, defaults to label).
    The 'type' field determines the metrics source: filesystem, btrfs, or windows.
    If 'type' is not specified and machine os is 'Windows', defaults to 'windows'.

    Returns:
        True if any data was collected, False otherwise
    """
    cfg = my_lib.config.accessor(load_config())

    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    # Only machines that declare a mount configuration are collected.
    targets = [
        (m["name"], m["mount"], m.get("os", ""))
        for m in cfg.get_list("machine")
        if m.get("mount")
    ]
    if not targets:
        return False

    updated = False

    for host, mount_configs, os_type in targets:
        logging.info("Collecting mount data from Prometheus for %s...", host)

        # Windows machines default to windows_exporter metrics unless the
        # mount entry sets an explicit type.
        is_windows = os_type.lower() == "windows"
        processed_configs = [
            {**mc, "type": "windows"} if is_windows and "type" not in mc else dict(mc)
            for mc in mount_configs
        ]

        mounts = fetch_prometheus_mount_info(prometheus_url, processed_configs, host, instance_map)

        if mounts:
            save_mount_info(host, mounts)
            logging.info(" Cached %d mount points for %s", len(mounts), host)
            updated = True

    return updated

1406 

1407 

1408# ============================================================================= 

1409# Database save/get functions 

1410# ============================================================================= 

1411 

1412 

1413# ----------------------------------------------------------------------------- 

1414# Save functions 

1415# ----------------------------------------------------------------------------- 

1416 

1417 

def save_vm_data(esxi_host: str, vms: list[models.VMInfo]):
    """Save VM data to SQLite cache.

    Deletes all existing VMs for the host first, then inserts new data.
    This ensures deleted VMs are removed from the cache.
    """
    collected_at = datetime.now().isoformat()
    rows = [
        (
            vm.esxi_host,
            vm.vm_name,
            vm.cpu_count,
            vm.ram_mb,
            vm.storage_gb,
            vm.power_state,
            vm.cpu_usage_mhz,
            vm.memory_usage_mb,
            collected_at,
        )
        for vm in vms
    ]

    with _get_connection() as conn:
        cursor = conn.cursor()

        # Replace the host's rows wholesale so stale VMs disappear.
        cursor.execute("DELETE FROM vm_info WHERE esxi_host = ?", (esxi_host,))
        cursor.executemany("""
            INSERT INTO vm_info
            (esxi_host, vm_name, cpu_count, ram_mb, storage_gb, power_state,
             cpu_usage_mhz, memory_usage_mb, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, rows)

        conn.commit()

1452 

1453 

def save_host_info(host_info: models.HostInfo):
    """Save host info (uptime + CPU + ESXi version + usage) to SQLite cache."""
    row = (
        host_info.host,
        host_info.boot_time,
        host_info.uptime_seconds,
        host_info.status,
        host_info.cpu_threads,
        host_info.cpu_cores,
        host_info.os_version,
        host_info.cpu_usage_percent,
        host_info.memory_usage_percent,
        host_info.memory_total_bytes,
        host_info.memory_used_bytes,
        datetime.now().isoformat(),
    )

    with _get_connection() as conn:
        conn.execute("""
            INSERT OR REPLACE INTO host_info
            (host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
             cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, row)
        conn.commit()

1482 

1483 

def save_host_info_failed(host: str):
    """Save failed host info status to SQLite cache.

    When ESXi is unreachable, set status to 'unknown' to indicate
    we cannot determine the actual state.
    """
    # Everything except host, status, and timestamp is cleared.
    row = (host, None, None, "unknown", None, None, None, None, None, None, None,
           datetime.now().isoformat())

    with _get_connection() as conn:
        conn.execute("""
            INSERT OR REPLACE INTO host_info
            (host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
             cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
        """, row)
        conn.commit()

1503 

1504 

def update_collection_status(host: str, status: str):
    """Record the result of the latest collection attempt for *host*."""
    with _get_connection() as conn:
        conn.execute("""
            INSERT OR REPLACE INTO collection_status (host, last_fetch, status)
            VALUES (?, ?, ?)
        """, (host, datetime.now().isoformat(), status))
        conn.commit()

1516 

1517 

def get_collection_status(host: str) -> models.CollectionStatus | None:
    """Return the collection status for *host*, or None if never collected."""
    with _get_connection() as conn:
        row = conn.execute("""
            SELECT host, last_fetch, status
            FROM collection_status
            WHERE host = ?
        """, (host,)).fetchone()

    return models.CollectionStatus.parse_row(row) if row else None

1531 

1532 

def is_host_reachable(host: str) -> bool:
    """Check if a host was successfully reached in the last collection."""
    status = get_collection_status(host)
    if status is None:
        return False
    return status.status == "success"

1537 

1538 

def get_all_collection_status() -> dict[str, models.CollectionStatus]:
    """Get all collection statuses in a single DB query.

    Returns:
        Dict mapping host name to CollectionStatus
    """
    with _get_connection() as conn:
        rows = conn.execute("""
            SELECT host, last_fetch, status
            FROM collection_status
        """).fetchall()

    return {row[0]: models.CollectionStatus.parse_row(row) for row in rows}

1554 

1555 

def get_all_vm_info() -> dict[str, list[models.VMInfo]]:
    """Get all VM info grouped by ESXi host in a single DB query.

    Returns:
        Dict mapping ESXi host name to list of VMInfo
    """
    grouped: dict[str, list[models.VMInfo]] = {}

    with _get_connection() as conn:
        cursor = conn.execute("""
            SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state, esxi_host,
                   cpu_usage_mhz, memory_usage_mb, collected_at
            FROM vm_info
        """)

        for row in cursor.fetchall():
            vm_info = models.VMInfo.parse_row_full(row)
            # Rows that fail to parse are skipped.
            if vm_info:
                grouped.setdefault(vm_info.esxi_host, []).append(vm_info)

    return grouped

1579 

1580 

def get_vm_info(vm_name: str, esxi_host: str | None = None) -> models.VMInfo | None:
    """Get VM info from cache.

    When *esxi_host* is given the lookup is restricted to that host;
    otherwise the first row matching *vm_name* is returned.
    """
    with _get_connection() as conn:
        if esxi_host:
            cursor = conn.execute("""
                SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state, esxi_host,
                       cpu_usage_mhz, memory_usage_mb, collected_at
                FROM vm_info
                WHERE vm_name = ? AND esxi_host = ?
            """, (vm_name, esxi_host))
        else:
            cursor = conn.execute("""
                SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state, esxi_host,
                       cpu_usage_mhz, memory_usage_mb, collected_at
                FROM vm_info
                WHERE vm_name = ?
            """, (vm_name,))

        row = cursor.fetchone()

    return models.VMInfo.parse_row_full(row) if row else None

1603 

1604 

def get_all_vm_info_for_host(esxi_host: str) -> list[models.VMInfo]:
    """Return all cached VM info rows for a specific ESXi host."""
    with _get_connection() as conn:
        rows = conn.execute("""
            SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state,
                   cpu_usage_mhz, memory_usage_mb, collected_at
            FROM vm_info
            WHERE esxi_host = ?
        """, (esxi_host,)).fetchall()

    return [models.VMInfo.parse_row(row, esxi_host) for row in rows]

1618 

1619 

def get_host_info(host: str) -> models.HostInfo | None:
    """Return cached uptime info for *host*, or None if not cached."""
    with _get_connection() as conn:
        row = conn.execute("""
            SELECT host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
                   cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at
            FROM host_info
            WHERE host = ?
        """, (host,)).fetchone()

    return models.HostInfo.parse_row(row) if row else None

1634 

1635 

def get_all_host_info() -> dict[str, models.HostInfo]:
    """Return all cached uptime info, keyed by host name."""
    with _get_connection() as conn:
        rows = conn.execute("""
            SELECT host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
                   cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at
            FROM host_info
        """).fetchall()

    return {row[0]: models.HostInfo.parse_row(row) for row in rows}

1648 

1649 

def _collect_esxi_host_data(si, host: str) -> bool:
    """Collect and cache all data from a single connected ESXi host.

    Caches VM data and host info, records the collection status, and always
    releases the ESXi session afterwards.

    Args:
        si: ESXi connection instance (return value of SmartConnect)
        host: Host name

    Returns:
        True on success, False on failure
    """
    try:
        # Collect VM data
        vms = fetch_vm_data(si, host)
        save_vm_data(host, vms)
        logging.info(" Cached %d VMs from %s", len(vms), host)

        # Collect host info (uptime + CPU)
        host_info = fetch_host_info(si, host)
        if host_info:
            save_host_info(host_info)
            logging.info(" Cached host info for %s (CPU threads: %s)", host, host_info.cpu_threads)
        else:
            # No host info could be fetched - mark the host's status unknown.
            save_host_info_failed(host)

        update_collection_status(host, "success")
        return True

    except Exception as e:  # ESXi/pyVmomi operations can raise various exceptions
        logging.warning("Error collecting data from %s: %s", host, e)
        update_collection_status(host, f"error: {e}")
        save_host_info_failed(host)
        return False

    finally:
        # Always release the ESXi session, even when collection failed.
        Disconnect(si)

1685 

1686 

def collect_cpu_benchmark_data() -> bool:
    """Collect CPU benchmark data for all configured machines.

    Checks which CPUs don't have benchmark data yet and fetches them.
    Uses rate limiting to avoid overwhelming cpubenchmark.net.

    Returns:
        True if any new data was collected, False otherwise
    """
    cfg = my_lib.config.accessor(load_config())

    machines = cfg.get_list("machine")
    if not machines:
        return False

    # Unique CPU names declared in the config.
    cpu_names = {machine.get("cpu") for machine in machines if machine.get("cpu")}
    if not cpu_names:
        return False

    # Only fetch benchmarks that are not cached yet.
    missing_cpus = [name for name in cpu_names if not cpu_benchmark.get_benchmark(name)]
    if not missing_cpus:
        logging.debug("All CPU benchmarks are already cached")
        return False

    logging.info("Fetching CPU benchmark data for %d CPU(s)...", len(missing_cpus))

    updated = False
    for cpu_name in missing_cpus:
        if cpu_benchmark.fetch_and_save_benchmark(cpu_name):
            logging.info(" Cached benchmark for: %s", cpu_name)
            updated = True
        else:
            logging.warning(" Could not find benchmark for: %s", cpu_name)

    return updated

1735 

1736 

def collect_all_data():
    """Collect all data from configured ESXi and iLO hosts."""
    esxi_auth = load_secret().get("esxi_auth", {})

    updated = False

    # Collect ESXi data (iterating an empty mapping is a no-op)
    for host, credentials in esxi_auth.items():
        logging.info("Collecting data from %s...", host)

        si = connect_to_esxi(
            host=credentials.get("host", host),
            username=credentials["username"],
            password=credentials["password"],
            port=credentials.get("port", 443),
        )

        if not si:
            update_collection_status(host, "connection_failed")
            save_host_info_failed(host)
            continue

        if _collect_esxi_host_data(si, host):
            updated = True

    # Collect iLO power data (its result does not affect the update flag)
    collect_ilo_power_data()

    # Remaining collectors run unconditionally, in the original order;
    # any success triggers the client notification below.
    for collector in (
        collect_prometheus_uptime_data,   # Prometheus uptime (Linux servers)
        collect_prometheus_zfs_data,      # Prometheus ZFS pool data
        collect_prometheus_mount_data,    # Prometheus mount point data
        collect_ups_data,                 # UPS data from NUT
        collect_cpu_benchmark_data,       # CPU benchmark data
    ):
        if collector():
            updated = True

    if updated:
        my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
        logging.info("Data collection complete, clients notified")

1790 

1791 

def collect_host_data(host: str) -> bool:
    """Collect data from a specific ESXi host.

    Args:
        host: The ESXi host name to collect data from

    Returns:
        True if data was successfully collected, False otherwise
    """
    esxi_auth = load_secret().get("esxi_auth", {})
    if host not in esxi_auth:
        logging.warning("No credentials found for host: %s", host)
        return False

    credentials = esxi_auth[host]
    logging.info("Collecting data from %s (manual refresh)...", host)

    si = connect_to_esxi(
        host=credentials.get("host", host),
        username=credentials["username"],
        password=credentials["password"],
        port=credentials.get("port", 443),
    )

    if not si:
        # Record the failure and still notify clients so the UI refreshes.
        update_collection_status(host, "connection_failed")
        save_host_info_failed(host)
        my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
        return False

    success = _collect_esxi_host_data(si, host)
    my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
    logging.info("Data collection complete for %s, clients notified", host)
    return success

1828 

1829 

def _update_worker():
    """Background worker that collects data periodically.

    Runs one collection immediately, then repeats every UPDATE_INTERVAL_SEC
    seconds until _should_stop is set.
    """

    def run_cycle(error_message: str):
        # Never let a collection failure kill the worker thread.
        try:
            collect_all_data()
        except Exception:
            logging.exception(error_message)

    logging.info("Data collector started (interval: %d sec)", UPDATE_INTERVAL_SEC)

    run_cycle("Error in initial data collection")

    # Event.wait doubles as the sleep and the stop signal.
    while not _should_stop.wait(UPDATE_INTERVAL_SEC):
        run_cycle("Error in periodic data collection")

    logging.info("Data collector stopped")

1847 

1848 

def start_collector():
    """Start the background data collector (no-op when already running)."""
    global _update_thread

    init_db()

    already_running = _update_thread is not None and _update_thread.is_alive()
    if already_running:
        return

    _should_stop.clear()
    worker = threading.Thread(target=_update_worker, daemon=True)
    _update_thread = worker
    worker.start()

1861 

1862 

def stop_collector():
    """Signal the background data collector to stop and wait briefly for it."""
    _should_stop.set()
    thread = _update_thread
    if thread:
        # Bounded join so a stuck collection cannot hang shutdown forever.
        thread.join(timeout=5)

1868 

1869 

if __name__ == "__main__":
    # Script entry point: `--once` runs a single collection pass; otherwise
    # the periodic collector runs until Ctrl-C.
    import sys
    import time  # hoisted: the original re-executed this import every loop iteration

    logging.basicConfig(level=logging.INFO)
    init_db()

    if "--once" in sys.argv:
        collect_all_data()
    else:
        start_collector()
        try:
            # Idle loop; the collector does its work on a daemon thread.
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            stop_collector()