Coverage for src / server_list / spec / data_collector.py: 48%
738 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-31 11:45 +0000
1#!/usr/bin/env python3
2"""
3Unified data collector for server-list.
4Collects ESXi VM data and uptime, caches to SQLite.
5Runs periodically every 5 minutes.
6"""
8import atexit
9import logging
10import sqlite3
11import ssl
12import threading
13from collections.abc import Generator
14from contextlib import contextmanager
15from datetime import datetime
16from typing import Any
18import requests
19from pyVim.connect import Disconnect, SmartConnect
20from pyVmomi import vim
22import my_lib.config
23import my_lib.safe_access
24import my_lib.webapp.event
26import server_list.spec.cpu_benchmark as cpu_benchmark
27import server_list.spec.db as db
28import server_list.spec.db_config as db_config
29import server_list.spec.models as models
30import server_list.spec.ups_collector as ups_collector
32UPDATE_INTERVAL_SEC = 300 # 5 minutes
34_update_thread: threading.Thread | None = None
35_should_stop = threading.Event()
@contextmanager
def _get_connection() -> Generator[sqlite3.Connection, None, None]:
    """Context manager yielding a connection to the server-data SQLite DB.

    No application-level locking is required here: SQLite already provides
    its own file-level locking mechanism.
    """
    db_path = db_config.get_server_data_db_path()
    with db.get_connection(db_path) as conn:
        yield conn
def init_db():
    """Create the SQLite schema for the server-data cache from the schema file."""
    db_path = db_config.get_server_data_db_path()
    db.init_schema_from_file(db_path, db.SQLITE_SCHEMA_PATH)
def load_secret() -> dict:
    """Load secret.yaml containing ESXi credentials.

    The path is built from db.BASE_DIR so tests can mock the base directory.
    Returns an empty dict when the file does not exist.
    """
    secret_path = db.BASE_DIR / "secret.yaml"
    if secret_path.exists():
        return my_lib.config.load(secret_path, db.SECRET_SCHEMA_PATH)
    return {}
def load_config() -> dict:
    """Load config.yaml containing machine definitions.

    The path is built from db.BASE_DIR so tests can mock the base directory.
    Returns an empty dict when the file does not exist.
    """
    config_path = db.BASE_DIR / "config.yaml"
    if config_path.exists():
        return my_lib.config.load(config_path, db.CONFIG_SCHEMA_PATH)
    return {}
def connect_to_esxi(host: str, username: str, password: str, port: int = 443) -> Any | None:
    """Open a pyVmomi ServiceInstance connection to an ESXi host.

    Certificate verification is disabled because ESXi hosts typically use
    self-signed certificates.

    Returns:
        The ServiceInstance, or None on any connection failure.
    """
    ssl_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ssl_ctx.check_hostname = False
    ssl_ctx.verify_mode = ssl.CERT_NONE

    try:
        service_instance: Any = SmartConnect(
            host=host,
            user=username,
            pwd=password,
            port=port,
            sslContext=ssl_ctx,
        )
        # NOTE(review): every successful call adds one atexit Disconnect
        # entry; in a long-running periodic collector these accumulate —
        # consider explicit disconnect handling. Behavior kept as-is.
        atexit.register(Disconnect, service_instance)
        return service_instance
    except Exception as e:  # pyVmomi can raise various unexpected exceptions
        logging.warning("Failed to connect to %s: %s", host, e)
        return None
def get_vm_storage_size(vm) -> float:
    """Return the summed capacity of a VM's virtual disks, in GB.

    Any failure while walking the device list is logged at debug level and
    treated as "no disks seen so far" (partial sums are still returned).
    """
    total = 0
    try:
        for device in vm.config.hardware.device:
            if not isinstance(device, vim.vm.device.VirtualDisk):
                continue
            if device.capacityInBytes is not None:
                total += device.capacityInBytes
    except Exception as e:
        # pyVmomi attribute access can fail for various reasons
        logging.debug("Failed to get storage size for VM: %s", e)
    return total / (1024 ** 3)
def fetch_vm_data(si, esxi_host: str) -> list[models.VMInfo]:
    """Fetch per-VM data (resources, power state, usage) from ESXi.

    Args:
        si: pyVmomi ServiceInstance
        esxi_host: Host name used to tag each VMInfo

    Returns:
        List of VMInfo; VMs whose attributes fail to read are skipped.
    """
    content = si.RetrieveContent()
    container_view = content.viewManager.CreateContainerView(
        content.rootFolder, [vim.VirtualMachine], True
    )

    vms: list[models.VMInfo] = []
    try:
        for vm in container_view.view:
            try:
                # Get CPU and memory usage from quickStats when available.
                cpu_usage_mhz = None
                memory_usage_mb = None
                if vm.summary and vm.summary.quickStats:
                    stats = vm.summary.quickStats
                    cpu_usage_mhz = stats.overallCpuUsage
                    memory_usage_mb = stats.guestMemoryUsage

                vm_info = models.VMInfo(
                    esxi_host=esxi_host,
                    vm_name=vm.name,
                    cpu_count=vm.config.hardware.numCPU if vm.config else None,
                    ram_mb=vm.config.hardware.memoryMB if vm.config else None,
                    storage_gb=get_vm_storage_size(vm) if vm.config else None,
                    power_state=str(vm.runtime.powerState) if vm.runtime else None,
                    cpu_usage_mhz=cpu_usage_mhz,
                    memory_usage_mb=memory_usage_mb,
                )
                vms.append(vm_info)
            except Exception as e:  # pyVmomi attribute access can fail unexpectedly
                logging.warning("Error getting VM info for %s: %s", vm.name, e)
    finally:
        # Always release the server-side view object, even if iteration
        # raises; previously an escaping exception leaked the view.
        container_view.Destroy()

    return vms
def _extract_cpu_info(host_system) -> tuple[int | None, int | None]:
    """Extract CPU information from an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object

    Returns:
        Tuple of (cpu_threads, cpu_cores); entries are None when unavailable.
    """
    cpu = my_lib.safe_access.safe(host_system).hardware.cpuInfo
    return cpu.numCpuThreads.value(), cpu.numCpuCores.value()
def _extract_memory_total(host_system) -> float | None:
    """Extract the total memory of an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object

    Returns:
        Memory size in bytes, or None when unavailable.
    """
    total = my_lib.safe_access.safe(host_system).hardware.memorySize.value()
    return float(total) if total else None
def _extract_usage_from_quickstats(
    host_system, memory_total_bytes: float | None
) -> tuple[float | None, float | None, float | None]:
    """Derive CPU/memory usage percentages from ESXi quickStats.

    Args:
        host_system: pyVmomi HostSystem object
        memory_total_bytes: Total memory in bytes (denominator for memory %)

    Returns:
        Tuple of (cpu_usage_percent, memory_usage_percent, memory_used_bytes);
        entries are None when the underlying data is unavailable.
    """
    cpu_usage_percent: float | None = None
    memory_usage_percent: float | None = None
    memory_used_bytes: float | None = None

    safe_host = my_lib.safe_access.safe(host_system)
    stats = safe_host.summary.quickStats.value()
    if not stats:
        return cpu_usage_percent, memory_usage_percent, memory_used_bytes

    # CPU: overallCpuUsage is reported in MHz; derive total capacity in MHz.
    cpu_info = safe_host.hardware.cpuInfo
    hz = cpu_info.hz.value()
    cores = cpu_info.numCpuCores.value()
    if stats.overallCpuUsage is not None and hz and cores:
        capacity_mhz = (hz / 1_000_000) * cores
        cpu_usage_percent = (stats.overallCpuUsage / capacity_mhz) * 100

    # Memory: overallMemoryUsage is reported in MB.
    if stats.overallMemoryUsage is not None and memory_total_bytes:
        memory_used_bytes = float(stats.overallMemoryUsage * 1024 * 1024)
        memory_usage_percent = (memory_used_bytes / memory_total_bytes) * 100

    return cpu_usage_percent, memory_usage_percent, memory_used_bytes
def _extract_os_version(host_system) -> str | None:
    """Extract the OS version string from an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object

    Returns:
        OS version string (e.g. "VMware ESXi 8.0.0 build-xxx"), or None.
    """
    return my_lib.safe_access.safe(host_system).config.product.fullName.value()
def fetch_host_info(si, host: str) -> models.HostInfo | None:
    """Fetch uptime, CPU/memory usage, and ESXi version from an ESXi host.

    Walks datacenters and clusters and returns a HostInfo for the first
    host system that reports a boot time, or None when nothing usable
    is found or the connection fails.
    """
    try:
        content = si.RetrieveContent()

        for datacenter in content.rootFolder.childEntity:
            if not hasattr(datacenter, "hostFolder"):
                continue

            for cluster in datacenter.hostFolder.childEntity:
                # A compute resource exposes .host directly; folders need
                # one more level of descent through childEntity.
                if hasattr(cluster, "host"):
                    hosts = cluster.host
                elif hasattr(cluster, "childEntity"):
                    hosts = []
                    for child in cluster.childEntity:
                        if hasattr(child, "host"):
                            hosts.extend(child.host)
                else:
                    hosts = []

                for host_system in hosts:
                    try:
                        boot_time = host_system.runtime.bootTime
                        if not boot_time:
                            continue

                        # Gather the individual pieces via helper functions.
                        cpu_threads, cpu_cores = _extract_cpu_info(host_system)
                        memory_total_bytes = _extract_memory_total(host_system)
                        cpu_usage, mem_usage, mem_used = _extract_usage_from_quickstats(
                            host_system, memory_total_bytes
                        )
                        os_version = _extract_os_version(host_system)

                        uptime = (datetime.now(boot_time.tzinfo) - boot_time).total_seconds()
                        return models.HostInfo(
                            host=host,
                            boot_time=boot_time.isoformat(),
                            uptime_seconds=uptime,
                            status="running",
                            cpu_threads=cpu_threads,
                            cpu_cores=cpu_cores,
                            os_version=os_version,
                            cpu_usage_percent=cpu_usage,
                            memory_usage_percent=mem_usage,
                            memory_total_bytes=memory_total_bytes,
                            memory_used_bytes=mem_used,
                        )
                    except Exception as e:  # pyVmomi attribute access can fail unexpectedly
                        logging.warning("Error getting host info: %s", e)
    except Exception as e:  # pyVmomi can raise various unexpected exceptions
        logging.warning("Failed to get host info for %s: %s", host, e)

    return None
287# =============================================================================
288# iLO Power Meter functions (via Redfish API)
289# =============================================================================
def fetch_ilo_power(host: str, username: str, password: str) -> models.PowerInfo | None:
    """Fetch power consumption data from HP iLO via the Redfish API.

    Args:
        host: iLO hostname or IP address
        username: iLO username
        password: iLO password

    Returns:
        PowerInfo, or None on any failure.
    """
    url = f"https://{host}/redfish/v1/Chassis/1/Power"

    try:
        # iLO ships self-signed certificates, so verification is disabled.
        response = requests.get(
            url,
            auth=(username, password),
            verify=False,  # noqa: S501 - iLO uses self-signed certs
            timeout=30,
        )
        if response.status_code != 200:
            logging.warning("iLO API returned status %d for %s", response.status_code, host)
            return None

        payload = response.json()

        power_control = payload.get("PowerControl", [])
        if not power_control:
            logging.warning("No PowerControl data in iLO response for %s", host)
            return None

        # The first PowerControl entry describes main chassis power.
        entry = power_control[0]
        metrics = entry.get("PowerMetrics", {})

        return models.PowerInfo(
            power_watts=entry.get("PowerConsumedWatts"),
            power_average_watts=metrics.get("AverageConsumedWatts"),
            power_max_watts=metrics.get("MaxConsumedWatts"),
            power_min_watts=metrics.get("MinConsumedWatts"),
        )
    except requests.exceptions.Timeout:
        logging.warning("Timeout connecting to iLO at %s", host)
        return None
    except requests.exceptions.ConnectionError as e:
        logging.warning("Connection error to iLO at %s: %s", host, e)
        return None
    except (requests.RequestException, ValueError, KeyError) as e:
        # Covers JSON decoding errors and unexpected response shapes.
        logging.warning("Error fetching power data from iLO at %s: %s", host, e)
        return None
def save_power_info(host: str, power_data: models.PowerInfo):
    """Upsert power consumption readings for *host* into the SQLite cache."""
    now = datetime.now().isoformat()
    row = (
        host,
        power_data.power_watts,
        power_data.power_average_watts,
        power_data.power_max_watts,
        power_data.power_min_watts,
        now,
    )

    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO power_info
            (host, power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            row,
        )
        conn.commit()
def get_power_info(host: str) -> models.PowerInfo | None:
    """Return cached power consumption info for *host*, or None when absent."""
    query = """
        SELECT power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at
        FROM power_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (host,)).fetchone()
    return models.PowerInfo.parse_row(row) if row else None
def get_all_power_info() -> dict[str, models.PowerInfo]:
    """Return cached power consumption info for every host, keyed by host."""
    query = """
        SELECT host, power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at
        FROM power_info
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return dict(models.PowerInfo.parse_row_with_host(r) for r in rows)
def collect_ilo_power_data():
    """Collect and cache power data from all configured iLO hosts."""
    ilo_auth = load_secret().get("ilo_auth", {})
    if not ilo_auth:
        return

    for key, credentials in ilo_auth.items():
        # An explicit "host" entry overrides the mapping key as the address.
        ilo_host = credentials.get("host", key)
        logging.info("Collecting power data from iLO %s...", ilo_host)

        power_data = fetch_ilo_power(
            host=ilo_host,
            username=credentials["username"],
            password=credentials["password"],
        )
        if power_data:
            save_power_info(key, power_data)
            logging.info("  Cached power data for %s: %s W", key, power_data.power_watts)
429# =============================================================================
430# Prometheus common helpers
431# =============================================================================
def _prometheus_request(prometheus_url: str, query: str) -> list[dict]:
    """Issue an instant PromQL query against the Prometheus HTTP API.

    Args:
        prometheus_url: Prometheus server URL
        query: PromQL query

    Returns:
        The "result" list from the response, or [] on any failure.
    """
    try:
        response = requests.get(
            f"{prometheus_url}/api/v1/query",
            params={"query": query},
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
    except requests.RequestException as e:
        logging.warning("Prometheus query failed: %s", e)
        return []

    if payload.get("status") != "success":
        return []
    return payload.get("data", {}).get("result", [])
def _execute_prometheus_query(prometheus_url: str, query: str) -> dict | None:
    """Run a PromQL query and return only the first result entry.

    Args:
        prometheus_url: Prometheus server URL
        query: PromQL query

    Returns:
        First result entry (dict), or None when there are no results.
    """
    results = _prometheus_request(prometheus_url, query)
    if not results:
        return None
    return results[0]
def _fetch_prometheus_metric(prometheus_url: str, query: str) -> float | None:
    """Fetch a single scalar metric value from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        query: PromQL query

    Returns:
        Metric value as float, or None on failure / missing data.
    """
    result = _execute_prometheus_query(prometheus_url, query)
    if not result:
        return None
    try:
        raw = result.get("value", [None, None])[1]
        return None if raw is None else float(raw)
    except (ValueError, TypeError, IndexError) as e:
        logging.debug("Prometheus metric parsing failed: %s", e)
        return None
def _fetch_prometheus_metric_with_timestamp(prometheus_url: str, query: str) -> tuple[float, float] | None:
    """Fetch a metric together with its sample timestamp from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        query: PromQL query

    Returns:
        (timestamp, value) tuple, or None on failure.
    """
    result = _execute_prometheus_query(prometheus_url, query)
    if not result:
        return None
    try:
        pair = result.get("value", [])
        if len(pair) >= 2:
            return float(pair[0]), float(pair[1])
    except (ValueError, TypeError, IndexError) as e:
        logging.debug("Prometheus metric with timestamp parsing failed: %s", e)
    return None
518# =============================================================================
519# Prometheus uptime functions (for Linux servers)
520# =============================================================================
def fetch_prometheus_uptime(
    prometheus_url: str, instance: str, is_windows: bool = False
) -> models.UptimeData | None:
    """Fetch uptime data from Prometheus.

    Shared between Linux and Windows hosts:
      Linux:   node_boot_time_seconds
      Windows: windows_system_system_up_time

    Args:
        prometheus_url: Prometheus server URL (e.g. http://192.168.0.20:9090)
        instance: Prometheus instance label
        is_windows: Use the Windows metric when True

    Returns:
        UptimeData, or None when no data is available.
    """
    metric_name = "windows_system_system_up_time" if is_windows else "node_boot_time_seconds"
    query = f'{metric_name}{{instance=~"{instance}.*"}}'

    result = _fetch_prometheus_metric_with_timestamp(prometheus_url, query)
    if result is None:
        os_name = "Windows" if is_windows else "Linux"
        logging.warning("No Prometheus %s uptime data for instance %s", os_name, instance)
        return None

    sample_time, boot_time = result
    return models.UptimeData(
        boot_time=datetime.fromtimestamp(boot_time).isoformat(),
        uptime_seconds=sample_time - boot_time,
        status="running",
    )
def fetch_prometheus_usage(
    prometheus_url: str, instance: str, is_windows: bool = False
) -> models.UsageMetrics | None:
    """Fetch CPU and memory usage from Prometheus.

    Linux (node_exporter):
        - CPU: 100 - average idle rate over 5 minutes
        - Memory: node_memory_MemTotal_bytes, node_memory_MemAvailable_bytes
    Windows (windows_exporter):
        - CPU: 100 - average idle rate of windows_cpu_time_total
        - Memory: windows_cs_physical_memory_bytes, windows_os_physical_memory_free_bytes

    Args:
        prometheus_url: Prometheus server URL
        instance: Prometheus instance label
        is_windows: Use Windows metrics when True

    Returns:
        UsageMetrics, or None when no data at all was collected.
    """
    # Select OS-specific metric names.
    if is_windows:
        cpu_metric = "windows_cpu_time_total"
        mem_total_metric = "windows_cs_physical_memory_bytes"
        mem_avail_metric = "windows_os_physical_memory_free_bytes"
    else:
        cpu_metric = "node_cpu_seconds_total"
        mem_total_metric = "node_memory_MemTotal_bytes"
        mem_avail_metric = "node_memory_MemAvailable_bytes"

    # CPU usage = 100 - idle percentage.
    cpu_usage_percent = _fetch_prometheus_metric(
        prometheus_url,
        f'100 - (avg by (instance) (rate({cpu_metric}{{instance=~"{instance}.*",mode="idle"}}[5m])) * 100)',
    )

    memory_total_bytes = _fetch_prometheus_metric(
        prometheus_url, f'{mem_total_metric}{{instance=~"{instance}.*"}}'
    )
    mem_avail = _fetch_prometheus_metric(
        prometheus_url, f'{mem_avail_metric}{{instance=~"{instance}.*"}}'
    )

    memory_usage_percent: float | None = None
    memory_used_bytes: float | None = None
    if mem_avail is not None and memory_total_bytes is not None:
        memory_used_bytes = memory_total_bytes - mem_avail
        memory_usage_percent = (memory_used_bytes / memory_total_bytes) * 100

    collected = (cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes)
    if all(v is None for v in collected):
        return None

    return models.UsageMetrics(
        cpu_usage_percent=cpu_usage_percent,
        memory_usage_percent=memory_usage_percent,
        memory_total_bytes=memory_total_bytes,
        memory_used_bytes=memory_used_bytes,
    )
def get_prometheus_instance(host: str, instance_map: dict) -> str:
    """Resolve the Prometheus instance name for a host.

    An explicit entry in *instance_map* wins; otherwise the first FQDN
    component is used (e.g. "tanzania.green-rabbit.net" -> "tanzania").

    Args:
        host: Host name (possibly fully qualified)
        instance_map: Optional mapping of host -> instance

    Returns:
        Prometheus instance name.
    """
    try:
        return instance_map[host]
    except KeyError:
        # str.partition returns the whole string when no dot is present.
        return host.partition(".")[0]
def collect_prometheus_uptime_data() -> bool:
    """Collect uptime and usage data from Prometheus for configured hosts.

    Machines without an ESXi configuration are queried via Prometheus using
    OS-appropriate metrics (node_boot_time_seconds for Linux,
    windows_system_system_up_time for Windows). The instance name is derived
    from the FQDN unless explicitly configured in prometheus.instance_map.

    Returns:
        True if any data was collected, False otherwise.
    """
    cfg = my_lib.config.accessor(load_config())

    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    # Machines without ESXi are the ones needing Prometheus-based uptime.
    targets = [
        (m["name"], m.get("os", ""))
        for m in cfg.get_list("machine")
        if not m.get("esxi")
    ]
    if not targets:
        return False

    updated = False
    for host, os_type in targets:
        instance = get_prometheus_instance(host, instance_map)
        logging.info(
            "Collecting uptime and usage from Prometheus for %s (instance: %s, os: %s)...",
            host, instance, os_type,
        )

        # Pick the fetch variant matching the machine's OS.
        is_windows = os_type.lower() == "windows"
        uptime_data = fetch_prometheus_uptime(prometheus_url, instance, is_windows=is_windows)
        usage_data = fetch_prometheus_usage(prometheus_url, instance, is_windows=is_windows)

        if not uptime_data:
            save_host_info_failed(host)
            continue

        save_host_info(models.HostInfo(
            host=host,
            boot_time=uptime_data.boot_time,
            uptime_seconds=uptime_data.uptime_seconds,
            status=uptime_data.status,
            cpu_threads=None,
            cpu_cores=None,
            os_version=None,
            cpu_usage_percent=usage_data.cpu_usage_percent if usage_data else None,
            memory_usage_percent=usage_data.memory_usage_percent if usage_data else None,
            memory_total_bytes=usage_data.memory_total_bytes if usage_data else None,
            memory_used_bytes=usage_data.memory_used_bytes if usage_data else None,
        ))
        logging.info("  Cached uptime for %s: %.1f days",
                     host, uptime_data.uptime_seconds / 86400)
        updated = True

    return updated
711# =============================================================================
712# Prometheus ZFS pool functions (for Linux servers with ZFS)
713# =============================================================================
def fetch_prometheus_zfs_pools(prometheus_url: str, instance: str) -> list[models.ZfsPoolInfo]:
    """Fetch ZFS pool data from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        instance: Prometheus instance label

    Returns:
        List of ZfsPoolInfo objects (empty list if no data).
    """
    metric_names = (
        "zfs_pool_size_bytes",
        "zfs_pool_allocated_bytes",
        "zfs_pool_free_bytes",
        "zfs_pool_health",
    )
    pools: dict[str, dict[str, float | None]] = {}

    for metric in metric_names:
        results = _prometheus_request(prometheus_url, f'{metric}{{instance=~"{instance}.*"}}')

        for entry in results:
            pool_name = entry.get("metric", {}).get("pool", "unknown")
            raw = entry.get("value", [None, None])[1]
            fields = pools.setdefault(pool_name, {})

            if raw is not None:
                try:
                    # Field name is the metric name without the shared prefix.
                    fields[metric.replace("zfs_pool_", "")] = float(raw)
                except (ValueError, TypeError) as e:
                    logging.debug("ZFS metric value conversion failed: %s", e)

    return [
        models.ZfsPoolInfo(
            pool_name=name,
            size_bytes=fields.get("size_bytes"),
            allocated_bytes=fields.get("allocated_bytes"),
            free_bytes=fields.get("free_bytes"),
            health=fields.get("health"),
        )
        for name, fields in pools.items()
    ]
def save_zfs_pool_info(host: str, pools: list[models.ZfsPoolInfo]):
    """Replace cached ZFS pool rows for *host* with the given list."""
    now = datetime.now().isoformat()

    with _get_connection() as conn:
        cursor = conn.cursor()

        # Replace strategy: drop this host's old rows, then insert fresh ones.
        cursor.execute("DELETE FROM zfs_pool_info WHERE host = ?", (host,))
        cursor.executemany(
            """
            INSERT INTO zfs_pool_info
            (host, pool_name, size_bytes, allocated_bytes, free_bytes, health, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (host, p.pool_name, p.size_bytes, p.allocated_bytes, p.free_bytes, p.health, now)
                for p in pools
            ],
        )
        conn.commit()
def get_zfs_pool_info(host: str) -> list[models.ZfsPoolInfo]:
    """Return cached ZFS pool info for *host* (empty list when none)."""
    query = """
        SELECT pool_name, size_bytes, allocated_bytes, free_bytes, health, collected_at
        FROM zfs_pool_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (host,)).fetchall()
    return [models.ZfsPoolInfo.parse_row(r) for r in rows]
def collect_prometheus_zfs_data() -> bool:
    """Collect ZFS pool data from Prometheus for configured hosts.

    Targets machines whose config lists "zfs" under "filesystem". The
    instance name is derived from the FQDN unless explicitly configured
    in prometheus.instance_map.

    Returns:
        True if any data was collected, False otherwise.
    """
    cfg = my_lib.config.accessor(load_config())

    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    targets = [m["name"] for m in cfg.get_list("machine") if "zfs" in m.get("filesystem", [])]
    if not targets:
        return False

    updated = False
    for host in targets:
        instance = get_prometheus_instance(host, instance_map)
        logging.info("Collecting ZFS pool data from Prometheus for %s (instance: %s)...", host, instance)

        pools = fetch_prometheus_zfs_pools(prometheus_url, instance)
        if pools:
            save_zfs_pool_info(host, pools)
            logging.info("  Cached %d ZFS pools for %s", len(pools), host)
            updated = True

    return updated
848# =============================================================================
849# Prometheus mount point functions (for Linux servers)
850# =============================================================================
def fetch_btrfs_uuid(prometheus_url: str, label: str) -> str | None:
    """Resolve a btrfs filesystem label to its UUID via node_btrfs_info.

    Args:
        prometheus_url: Prometheus server URL
        label: Btrfs filesystem label

    Returns:
        UUID string, or None when not found.
    """
    result = _execute_prometheus_query(prometheus_url, f'node_btrfs_info{{label="{label}"}}')
    if not result:
        return None
    uuid = result.get("metric", {}).get("uuid")
    return None if uuid is None else str(uuid)
def fetch_btrfs_metrics(prometheus_url: str, uuid: str) -> models.StorageMetrics | None:
    """Fetch btrfs size and used bytes from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        uuid: Btrfs filesystem UUID

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes, or None.
    """
    # Total size: sum over all member devices of the filesystem.
    size_bytes = _fetch_prometheus_metric(
        prometheus_url, f'sum(node_btrfs_device_size_bytes{{uuid="{uuid}"}})'
    )
    # Used: sum across all block-group types.
    used_bytes = _fetch_prometheus_metric(
        prometheus_url, f'sum(node_btrfs_used_bytes{{uuid="{uuid}"}})'
    )

    if size_bytes is None or used_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=size_bytes - used_bytes,
        used_bytes=used_bytes,
    )
def fetch_windows_disk_metrics(
    prometheus_url: str, volume: str, instance: str
) -> models.StorageMetrics | None:
    """Fetch Windows logical-disk metrics from Prometheus (windows_exporter).

    Uses:
        - windows_logical_disk_size_bytes{volume="C:"}
        - windows_logical_disk_free_bytes{volume="C:"}

    Args:
        prometheus_url: Prometheus server URL
        volume: Windows volume label (e.g. "C:")
        instance: Prometheus instance name (derived from FQDN)

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes, or None.
    """
    selector = f'volume="{volume}",instance=~"{instance}.*"'
    size_bytes = _fetch_prometheus_metric(
        prometheus_url, f"windows_logical_disk_size_bytes{{{selector}}}"
    )
    avail_bytes = _fetch_prometheus_metric(
        prometheus_url, f"windows_logical_disk_free_bytes{{{selector}}}"
    )

    if size_bytes is None or avail_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=avail_bytes,
        used_bytes=size_bytes - avail_bytes,
    )
def _fetch_filesystem_mount_metrics(prometheus_url: str, label: str, path: str) -> models.StorageMetrics | None:
    """Fetch mount-point metrics from node_filesystem_* metrics.

    Args:
        prometheus_url: Prometheus server URL
        label: Prometheus instance label
        path: Mount-point path

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes, or None.
    """
    selector = f'instance=~"{label}.*",mountpoint="{path}"'
    size_bytes = _fetch_prometheus_metric(prometheus_url, f"node_filesystem_size_bytes{{{selector}}}")
    avail_bytes = _fetch_prometheus_metric(prometheus_url, f"node_filesystem_avail_bytes{{{selector}}}")

    if size_bytes is None or avail_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=avail_bytes,
        used_bytes=size_bytes - avail_bytes,
    )
def _fetch_mount_for_config(
    prometheus_url: str, mount_config: dict, host: str, instance_map: dict
) -> models.MountInfo | None:
    """Fetch mount info for a single mount configuration entry.

    Args:
        prometheus_url: Prometheus server URL
        mount_config: Mount config dict (label, path, type)
        host: Host name (FQDN)
        instance_map: Host -> Prometheus instance mapping

    Returns:
        MountInfo, or None when the entry is invalid or no data was found.
    """
    label = mount_config.get("label", "")
    if not label:
        return None

    path = mount_config.get("path", label)
    mount_type = mount_config.get("type", "filesystem")

    storage: models.StorageMetrics | None
    if mount_type == "btrfs":
        # Btrfs metrics are keyed by UUID, so resolve the label first.
        uuid = fetch_btrfs_uuid(prometheus_url, label)
        storage = fetch_btrfs_metrics(prometheus_url, uuid) if uuid else None
    elif mount_type == "windows":
        instance = get_prometheus_instance(host, instance_map)
        storage = fetch_windows_disk_metrics(prometheus_url, label, instance)
    else:
        storage = _fetch_filesystem_mount_metrics(prometheus_url, label, path)

    if not storage:
        return None

    return models.MountInfo(
        mountpoint=path,
        size_bytes=storage.size_bytes,
        avail_bytes=storage.avail_bytes,
        used_bytes=storage.used_bytes,
    )
def fetch_prometheus_mount_info(
    prometheus_url: str, mount_configs: list[dict], host: str, instance_map: dict
) -> list[models.MountInfo]:
    """Fetch mount point data from Prometheus.

    Supports filesystem (node_filesystem_*), btrfs (node_btrfs_*),
    and windows (windows_logical_disk_*) metrics.

    Args:
        prometheus_url: Prometheus server URL
        mount_configs: List of mount configs with 'label', 'path', and optional 'type'
        host: Host name (FQDN) for deriving Prometheus instance
        instance_map: Optional mapping of host -> instance

    Returns:
        List of MountInfo (empty list if no data collected)
    """
    candidates = (
        _fetch_mount_for_config(prometheus_url, mc, host, instance_map)
        for mc in mount_configs
    )
    # Drop entries whose collection failed (None results).
    return [info for info in candidates if info]
def save_mount_info(host: str, mounts: list[models.MountInfo]):
    """Replace the cached mount info rows for *host* in SQLite."""
    now = datetime.now().isoformat()
    rows = [
        (host, m.mountpoint, m.size_bytes, m.avail_bytes, m.used_bytes, now)
        for m in mounts
    ]

    with _get_connection() as conn:
        cursor = conn.cursor()
        # Drop stale rows so removed mount points disappear from the cache.
        cursor.execute("DELETE FROM mount_info WHERE host = ?", (host,))
        cursor.executemany(
            """
            INSERT INTO mount_info
                (host, mountpoint, size_bytes, avail_bytes, used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            rows,
        )
        conn.commit()
def get_mount_info(host: str) -> list[models.MountInfo]:
    """Return the cached mount info rows for *host*."""
    query = """
        SELECT mountpoint, size_bytes, avail_bytes, used_bytes, collected_at
        FROM mount_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (host,)).fetchall()
    return [models.MountInfo.parse_row(r) for r in rows]
1074# =============================================================================
1075# UPS functions (from NUT - Network UPS Tools)
1076# =============================================================================
def save_ups_info(ups_info_list: list[models.UPSInfo]):
    """Save UPS info to SQLite cache (upsert, one row per UPS device)."""
    now = datetime.now().isoformat()
    rows = [
        (
            ups.ups_name,
            ups.host,
            ups.model,
            ups.battery_charge,
            ups.battery_runtime,
            ups.ups_load,
            ups.ups_status,
            ups.ups_temperature,
            ups.input_voltage,
            ups.output_voltage,
            now,
        )
        for ups in ups_info_list
    ]

    with _get_connection() as conn:
        conn.cursor().executemany(
            """
            INSERT OR REPLACE INTO ups_info
                (ups_name, host, model, battery_charge, battery_runtime,
                 ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            rows,
        )
        conn.commit()
def save_ups_clients(clients: list[models.UPSClient]):
    """Save UPS client info to SQLite cache.

    Replaces the cached clients of every (ups_name, host) pair referenced by
    *clients*, then inserts the new rows with a shared collection timestamp.
    """
    collected_at = datetime.now().isoformat()

    with _get_connection() as conn:
        cursor = conn.cursor()

        # Delete existing clients once per UPS device. Many clients typically
        # share the same UPS, so dedupe the keys instead of re-running the
        # identical DELETE for every client row.
        for ups_name, host in {(c.ups_name, c.host) for c in clients}:
            cursor.execute(
                "DELETE FROM ups_client WHERE ups_name = ? AND host = ?",
                (ups_name, host),
            )

        # Insert new client data in one batched call.
        cursor.executemany(
            """
            INSERT INTO ups_client
                (ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    c.ups_name,
                    c.host,
                    c.client_ip,
                    c.client_hostname,
                    c.esxi_host,
                    c.machine_name,
                    collected_at,
                )
                for c in clients
            ],
        )

        conn.commit()
def get_all_ups_info() -> list[models.UPSInfo]:
    """Return every cached UPS info row."""
    query = """
        SELECT ups_name, host, model, battery_charge, battery_runtime,
               ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at
        FROM ups_info
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return [models.UPSInfo.parse_row(r) for r in rows]
def get_ups_info(ups_name: str, host: str) -> models.UPSInfo | None:
    """Return cached info for one UPS device, or None when not cached."""
    query = """
        SELECT ups_name, host, model, battery_charge, battery_runtime,
               ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at
        FROM ups_info
        WHERE ups_name = ? AND host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (ups_name, host)).fetchone()
    return models.UPSInfo.parse_row(row) if row else None
def get_ups_clients(ups_name: str, host: str) -> list[models.UPSClient]:
    """Return cached clients of one UPS device."""
    query = """
        SELECT ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at
        FROM ups_client
        WHERE ups_name = ? AND host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (ups_name, host)).fetchall()
    return [models.UPSClient.parse_row(r) for r in rows]
def get_all_ups_clients() -> list[models.UPSClient]:
    """Return every cached UPS client row."""
    query = """
        SELECT ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at
        FROM ups_client
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return [models.UPSClient.parse_row(r) for r in rows]
1199def _resolve_hostname(ip: str) -> str | None:
1200 """Resolve IP address to hostname using DNS."""
1201 import socket
1202 try:
1203 hostname, _, _ = socket.gethostbyaddr(ip)
1204 # Remove domain suffix if present (e.g., "server.local" -> "server")
1205 return hostname.split(".")[0]
1206 except (socket.herror, socket.gaierror):
1207 logging.debug("Could not resolve hostname for IP: %s", ip)
1208 return None
def _find_vm_esxi_host(hostname: str) -> str | None:
    """Find the ESXi host running a VM with the given hostname.

    Matches either the exact VM name or a VM name that starts with
    "<hostname>." (FQDN-style VM names).

    Args:
        hostname: The VM name to search for

    Returns:
        ESXi host name if found, None otherwise
    """
    query = """
        SELECT esxi_host FROM vm_info
        WHERE vm_name = ? OR vm_name LIKE ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (hostname, f"{hostname}.%")).fetchone()
    return row[0] if row else None
1230def _apply_domain(hostname: str, domain: str | None) -> str:
1231 """Apply domain to a short hostname if needed.
1233 Args:
1234 hostname: The hostname (may be short or FQDN)
1235 domain: The domain to append (e.g., "green-rabbit.net")
1237 Returns:
1238 FQDN if domain is provided and hostname is short, otherwise hostname as-is
1239 """
1240 if not domain:
1241 return hostname
1242 # If hostname already contains a dot, assume it's already a FQDN
1243 if "." in hostname:
1244 return hostname
1245 return f"{hostname}.{domain}"
def _enrich_ups_clients(
    clients: list[models.UPSClient],
    domain: str | None = None,
) -> list[models.UPSClient]:
    """Enrich UPS clients with hostname resolution and VM linkage.

    For each client:
    1. localhost entries are replaced with the UPS master's short hostname
    2. missing hostnames are reverse-resolved from the client IP
    3. hostnames matching a cached VM are linked to that VM's ESXi host
    4. machine_name is set for linking (domain appended when needed)

    Args:
        clients: List of UPS clients to enrich
        domain: Domain to append to short hostnames for linking
    """
    result: list[models.UPSClient] = []

    for client in clients:
        name = client.client_hostname

        # A client reported as localhost is the UPS master itself.
        if name == "localhost" or client.client_ip == "127.0.0.1":
            name = client.host.split(".")[0]

        if not name:
            name = _resolve_hostname(client.client_ip)

        esxi_host = None
        machine_name = None
        if name:
            esxi_host = _find_vm_esxi_host(name)
            # VM -> link to its ESXi host; physical machine -> link to itself.
            link_target = esxi_host if esxi_host else name
            machine_name = _apply_domain(link_target, domain)

        result.append(models.UPSClient(
            ups_name=client.ups_name,
            host=client.host,
            client_ip=client.client_ip,
            client_hostname=name,
            esxi_host=esxi_host,
            machine_name=machine_name,
            collected_at=client.collected_at,
        ))

    return result
def collect_ups_data() -> bool:
    """Collect UPS data from configured NUT hosts.

    Returns:
        True if any data was collected, False otherwise
    """
    cfg = my_lib.config.accessor(load_config())

    ups_configs = cfg.get_list("ups")
    if not ups_configs:
        return False

    # Domain used when turning client hostnames into linkable machine names.
    domain = cfg.get("domain")

    all_ups_info: list[models.UPSInfo] = []
    all_clients: list[models.UPSClient] = []
    updated = False

    for ups_config in ups_configs:
        host = ups_config.get("host")
        if not host:
            continue

        port = ups_config.get("port", 3493)
        logging.info("Collecting UPS data from NUT host %s:%d...", host, port)

        ups_info_list, clients = ups_collector.fetch_all_ups_from_host(
            host, port, ups_config.get("name")
        )
        if ups_info_list:
            all_ups_info.extend(ups_info_list)
            all_clients.extend(clients)
            logging.info(" Found %d UPS(s) with %d client(s)", len(ups_info_list), len(clients))
            updated = True

    if all_ups_info:
        save_ups_info(all_ups_info)
    if all_clients:
        # Enrich clients with hostname resolution, VM info, and domain.
        save_ups_clients(_enrich_ups_clients(all_clients, domain))

    return updated
def collect_prometheus_mount_data() -> bool:
    """Collect mount point data from Prometheus for configured hosts.

    Automatically collects mount data for machines with mount configuration.
    Each mount config specifies 'label' (Prometheus label to match) and
    optional 'path' (display path, defaults to label).
    The 'type' field determines the metrics source: filesystem, btrfs, or windows.
    If 'type' is not specified and machine os is 'Windows', defaults to 'windows'.

    Returns:
        True if any data was collected, False otherwise
    """
    cfg = my_lib.config.accessor(load_config())

    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    # Machines that declare at least one mount to monitor.
    targets = [
        (m["name"], m["mount"], m.get("os", ""))
        for m in cfg.get_list("machine")
        if m.get("mount")
    ]
    if not targets:
        return False

    updated = False

    for host, mount_configs, os_type in targets:
        logging.info("Collecting mount data from Prometheus for %s...", host)

        # Windows machines default to windows_logical_disk_* metrics
        # unless a type is set explicitly.
        is_windows = os_type.lower() == "windows"
        processed = [
            {**mc, "type": "windows"} if is_windows and "type" not in mc else dict(mc)
            for mc in mount_configs
        ]

        mounts = fetch_prometheus_mount_info(prometheus_url, processed, host, instance_map)
        if mounts:
            save_mount_info(host, mounts)
            logging.info(" Cached %d mount points for %s", len(mounts), host)
            updated = True

    return updated
1408# =============================================================================
1409# Database save/get functions
1410# =============================================================================
1413# -----------------------------------------------------------------------------
1414# Save functions
1415# -----------------------------------------------------------------------------
def save_vm_data(esxi_host: str, vms: list[models.VMInfo]):
    """Save VM data to SQLite cache.

    All cached VMs of the host are deleted first so that VMs removed from
    the ESXi host also disappear from the cache.
    """
    now = datetime.now().isoformat()
    rows = [
        (
            vm.esxi_host,
            vm.vm_name,
            vm.cpu_count,
            vm.ram_mb,
            vm.storage_gb,
            vm.power_state,
            vm.cpu_usage_mhz,
            vm.memory_usage_mb,
            now,
        )
        for vm in vms
    ]

    with _get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("DELETE FROM vm_info WHERE esxi_host = ?", (esxi_host,))
        cursor.executemany(
            """
            INSERT INTO vm_info
                (esxi_host, vm_name, cpu_count, ram_mb, storage_gb, power_state,
                 cpu_usage_mhz, memory_usage_mb, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            rows,
        )
        conn.commit()
def save_host_info(host_info: models.HostInfo):
    """Save host info (uptime + CPU + ESXi version + usage) to SQLite cache."""
    row = (
        host_info.host,
        host_info.boot_time,
        host_info.uptime_seconds,
        host_info.status,
        host_info.cpu_threads,
        host_info.cpu_cores,
        host_info.os_version,
        host_info.cpu_usage_percent,
        host_info.memory_usage_percent,
        host_info.memory_total_bytes,
        host_info.memory_used_bytes,
        datetime.now().isoformat(),
    )

    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO host_info
                (host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
                 cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            row,
        )
        conn.commit()
def save_host_info_failed(host: str):
    """Save failed host info status to SQLite cache.

    When ESXi is unreachable, set status to 'unknown' to indicate
    we cannot determine the actual state. All metric columns are NULL.
    """
    # (host, boot_time, uptime_seconds, status) + 7 NULL metric columns + timestamp
    row = (host, None, None, "unknown") + (None,) * 7 + (datetime.now().isoformat(),)

    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO host_info
                (host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
                 cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            row,
        )
        conn.commit()
def update_collection_status(host: str, status: str):
    """Record the latest collection attempt (timestamp + status) for *host*."""
    row = (host, datetime.now().isoformat(), status)
    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO collection_status (host, last_fetch, status)
            VALUES (?, ?, ?)
            """,
            row,
        )
        conn.commit()
def get_collection_status(host: str) -> models.CollectionStatus | None:
    """Return the last collection status for *host*, or None if never recorded."""
    query = """
        SELECT host, last_fetch, status
        FROM collection_status
        WHERE host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (host,)).fetchone()
    return models.CollectionStatus.parse_row(row) if row else None
def is_host_reachable(host: str) -> bool:
    """Return True when the last collection attempt for *host* succeeded."""
    status = get_collection_status(host)
    if status is None:
        return False
    return status.status == "success"
def get_all_collection_status() -> dict[str, models.CollectionStatus]:
    """Get all collection statuses in a single DB query.

    Returns:
        Dict mapping host name to CollectionStatus
    """
    query = """
        SELECT host, last_fetch, status
        FROM collection_status
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return {row[0]: models.CollectionStatus.parse_row(row) for row in rows}
def get_all_vm_info() -> dict[str, list[models.VMInfo]]:
    """Get all VM info grouped by ESXi host in a single DB query.

    Returns:
        Dict mapping ESXi host name to list of VMInfo
    """
    from collections import defaultdict

    query = """
        SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state, esxi_host,
               cpu_usage_mhz, memory_usage_mb, collected_at
        FROM vm_info
    """
    grouped: dict[str, list[models.VMInfo]] = defaultdict(list)

    with _get_connection() as conn:
        for row in conn.cursor().execute(query).fetchall():
            # Rows that fail to parse are silently skipped.
            vm = models.VMInfo.parse_row_full(row)
            if vm:
                grouped[vm.esxi_host].append(vm)

    return dict(grouped)
def get_vm_info(vm_name: str, esxi_host: str | None = None) -> models.VMInfo | None:
    """Get VM info from cache, optionally restricted to one ESXi host."""
    query = """
        SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state, esxi_host,
               cpu_usage_mhz, memory_usage_mb, collected_at
        FROM vm_info
        WHERE vm_name = ?
    """
    params: tuple = (vm_name,)

    # Narrow the lookup when the caller pins a specific ESXi host.
    if esxi_host:
        query += " AND esxi_host = ?"
        params = (vm_name, esxi_host)

    with _get_connection() as conn:
        row = conn.cursor().execute(query, params).fetchone()
    return models.VMInfo.parse_row_full(row) if row else None
def get_all_vm_info_for_host(esxi_host: str) -> list[models.VMInfo]:
    """Return all cached VM rows belonging to one ESXi host."""
    query = """
        SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state,
               cpu_usage_mhz, memory_usage_mb, collected_at
        FROM vm_info
        WHERE esxi_host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (esxi_host,)).fetchall()
    return [models.VMInfo.parse_row(r, esxi_host) for r in rows]
def get_host_info(host: str) -> models.HostInfo | None:
    """Return cached uptime/usage info for *host*, or None if not cached."""
    query = """
        SELECT host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
               cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at
        FROM host_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (host,)).fetchone()
    return models.HostInfo.parse_row(row) if row else None
def get_all_host_info() -> dict[str, models.HostInfo]:
    """Return all cached host info, keyed by host name."""
    query = """
        SELECT host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
               cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at
        FROM host_info
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return {row[0]: models.HostInfo.parse_row(row) for row in rows}
def _collect_esxi_host_data(si, host: str) -> bool:
    """Shared routine: collect all data from a single ESXi host.

    Args:
        si: ESXi connection instance (return value of SmartConnect)
        host: Host name

    Returns:
        True on success, False on failure.

    Side effects: updates the vm_info, host_info and collection_status tables;
    always closes the connection via Disconnect() in the finally block.
    """
    try:
        # Collect VM data
        vms = fetch_vm_data(si, host)
        save_vm_data(host, vms)
        logging.info(" Cached %d VMs from %s", len(vms), host)

        # Collect host info (uptime + CPU)
        host_info = fetch_host_info(si, host)
        if host_info:
            save_host_info(host_info)
            logging.info(" Cached host info for %s (CPU threads: %s)", host, host_info.cpu_threads)
        else:
            # Host query returned nothing -> record an "unknown" status row.
            save_host_info_failed(host)

        update_collection_status(host, "success")
        return True

    except Exception as e:  # ESXi/pyVmomi operations can raise various exceptions
        logging.warning("Error collecting data from %s: %s", host, e)
        update_collection_status(host, f"error: {e}")
        save_host_info_failed(host)
        return False

    finally:
        # Always release the ESXi session, success or failure.
        Disconnect(si)
def collect_cpu_benchmark_data() -> bool:
    """Collect CPU benchmark data for all configured machines.

    Checks which CPUs don't have benchmark data yet and fetches them.
    Uses rate limiting to avoid overwhelming cpubenchmark.net.

    Returns:
        True if any new data was collected, False otherwise
    """
    config = load_config()
    cfg = my_lib.config.accessor(config)

    machines = cfg.get_list("machine")
    if not machines:
        return False

    # Unique CPU names from config; machines without a "cpu" entry are skipped.
    cpu_names = {cpu for machine in machines if (cpu := machine.get("cpu"))}
    if not cpu_names:
        return False

    # Only fetch benchmarks that are not cached yet.
    missing_cpus = [name for name in cpu_names if not cpu_benchmark.get_benchmark(name)]
    if not missing_cpus:
        logging.debug("All CPU benchmarks are already cached")
        return False

    logging.info("Fetching CPU benchmark data for %d CPU(s)...", len(missing_cpus))

    updated = False
    for cpu_name in missing_cpus:
        if cpu_benchmark.fetch_and_save_benchmark(cpu_name):
            logging.info(" Cached benchmark for: %s", cpu_name)
            updated = True
        else:
            logging.warning(" Could not find benchmark for: %s", cpu_name)

    return updated
def collect_all_data():
    """Collect all data from configured ESXi and iLO hosts."""
    secret = load_secret()
    esxi_auth = secret.get("esxi_auth", {})

    updated = False

    # ESXi hosts (no-op when no credentials are configured).
    for host, credentials in esxi_auth.items():
        logging.info("Collecting data from %s...", host)

        si = connect_to_esxi(
            host=credentials.get("host", host),
            username=credentials["username"],
            password=credentials["password"],
            port=credentials.get("port", 443),
        )

        if not si:
            update_collection_status(host, "connection_failed")
            save_host_info_failed(host)
            continue

        if _collect_esxi_host_data(si, host):
            updated = True

    # iLO power data (its result does not drive the client notification).
    collect_ilo_power_data()

    # Remaining collectors: Prometheus uptime / ZFS pools / mount points,
    # UPS data from NUT, and CPU benchmark data.
    for collector in (
        collect_prometheus_uptime_data,
        collect_prometheus_zfs_data,
        collect_prometheus_mount_data,
        collect_ups_data,
        collect_cpu_benchmark_data,
    ):
        if collector():
            updated = True

    if updated:
        my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
        logging.info("Data collection complete, clients notified")
def collect_host_data(host: str) -> bool:
    """Collect data from a specific ESXi host (manual refresh).

    Args:
        host: The ESXi host name to collect data from

    Returns:
        True if data was successfully collected, False otherwise
    """
    credentials = load_secret().get("esxi_auth", {}).get(host)
    if credentials is None:
        logging.warning("No credentials found for host: %s", host)
        return False

    logging.info("Collecting data from %s (manual refresh)...", host)

    si = connect_to_esxi(
        host=credentials.get("host", host),
        username=credentials["username"],
        password=credentials["password"],
        port=credentials.get("port", 443),
    )

    if not si:
        update_collection_status(host, "connection_failed")
        save_host_info_failed(host)
        my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
        return False

    success = _collect_esxi_host_data(si, host)
    my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
    logging.info("Data collection complete for %s, clients notified", host)
    return success
def _update_worker():
    """Background worker that collects data periodically."""

    def _collect_safely(error_message: str):
        # One collection pass; failures are logged, never propagated,
        # so one bad cycle cannot kill the worker thread.
        try:
            collect_all_data()
        except Exception:
            logging.exception(error_message)

    logging.info("Data collector started (interval: %d sec)", UPDATE_INTERVAL_SEC)

    # Run once immediately, then every UPDATE_INTERVAL_SEC until stop is set.
    _collect_safely("Error in initial data collection")
    while not _should_stop.wait(UPDATE_INTERVAL_SEC):
        _collect_safely("Error in periodic data collection")

    logging.info("Data collector stopped")
def start_collector():
    """Start the background data collector (no-op if already running)."""
    global _update_thread

    init_db()

    already_running = _update_thread is not None and _update_thread.is_alive()
    if already_running:
        return

    _should_stop.clear()
    worker = threading.Thread(target=_update_worker, daemon=True)
    _update_thread = worker
    worker.start()
def stop_collector():
    """Stop the background data collector."""
    _should_stop.set()
    worker = _update_thread
    if worker is not None:
        # Bounded join so shutdown cannot hang on a stuck collection cycle.
        worker.join(timeout=5)
if __name__ == "__main__":
    # CLI entry point: "--once" runs a single collection pass, otherwise the
    # background collector runs until interrupted with Ctrl-C.
    import sys
    import time  # hoisted: was previously re-imported on every loop iteration

    logging.basicConfig(level=logging.INFO)
    init_db()

    if "--once" in sys.argv:
        collect_all_data()
    else:
        start_collector()
        try:
            # Idle loop; the collector runs in its daemon thread.
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            stop_collector()