Coverage for src / server_list / spec / data_collector.py: 48%
738 statements
« prev ^ index » next coverage.py v7.13.1, created at 2026-01-31 11:45 +0000
1#!/usr/bin/env python3
2"""
3Unified data collector for server-list.
4Collects ESXi VM data and uptime, caches to SQLite.
5Runs periodically every 5 minutes.
6"""
8import atexit
9import logging
10import sqlite3
11import ssl
12import threading
13from collections.abc import Generator
14from contextlib import contextmanager
15from datetime import datetime
16from typing import Any
18import requests
19from pyVim.connect import Disconnect, SmartConnect
20from pyVmomi import vim
22import my_lib.config
23import my_lib.safe_access
24import my_lib.webapp.event
26import server_list.spec.cpu_benchmark as cpu_benchmark
27import server_list.spec.db as db
28import server_list.spec.db_config as db_config
29import server_list.spec.models as models
30import server_list.spec.ups_collector as ups_collector
32UPDATE_INTERVAL_SEC = 300 # 5 minutes
34_update_thread: threading.Thread | None = None
35_should_stop = threading.Event()
@contextmanager
def _get_connection() -> Generator[sqlite3.Connection, None, None]:
    """Context manager yielding a connection to the server-data SQLite DB.

    No application-level locking is required here: SQLite already provides
    its own file-level locking mechanism.
    """
    db_path = db_config.get_server_data_db_path()
    with db.get_connection(db_path) as conn:
        yield conn
def init_db():
    """Create the SQLite schema for the server-data cache from the schema file."""
    db_path = db_config.get_server_data_db_path()
    db.init_schema_from_file(db_path, db.SQLITE_SCHEMA_PATH)
def load_secret() -> dict:
    """Load secret.yaml containing ESXi credentials.

    The path is built from db.BASE_DIR so tests can mock the base directory.
    Returns an empty dict when the file does not exist.
    """
    secret_path = db.BASE_DIR / "secret.yaml"
    if secret_path.exists():
        return my_lib.config.load(secret_path, db.SECRET_SCHEMA_PATH)
    return {}
def load_config() -> dict:
    """Load config.yaml containing machine definitions.

    The path is built from db.BASE_DIR so tests can mock the base directory.
    Returns an empty dict when the file does not exist.
    """
    config_path = db.BASE_DIR / "config.yaml"
    if config_path.exists():
        return my_lib.config.load(config_path, db.CONFIG_SCHEMA_PATH)
    return {}
def connect_to_esxi(host: str, username: str, password: str, port: int = 443) -> Any | None:
    """Open a pyVmomi ServiceInstance connection to an ESXi host.

    Certificate verification is disabled because ESXi hosts typically use
    self-signed certificates.

    Returns:
        The ServiceInstance, or None on any connection failure.
    """
    ssl_ctx = ssl.SSLContext(ssl.PROTOCOL_TLS_CLIENT)
    ssl_ctx.check_hostname = False
    ssl_ctx.verify_mode = ssl.CERT_NONE

    try:
        service_instance: Any = SmartConnect(
            host=host,
            user=username,
            pwd=password,
            port=port,
            sslContext=ssl_ctx,
        )
        # NOTE(review): every successful call adds one atexit Disconnect
        # entry; in a long-running periodic collector these accumulate —
        # consider explicit disconnect handling. Behavior kept as-is.
        atexit.register(Disconnect, service_instance)
        return service_instance
    except Exception as e:  # pyVmomi can raise various unexpected exceptions
        logging.warning("Failed to connect to %s: %s", host, e)
        return None
def get_vm_storage_size(vm) -> float:
    """Return the summed capacity of a VM's virtual disks, in GB.

    Any failure while walking the device list is logged at debug level and
    treated as "no disks seen so far" (partial sums are still returned).
    """
    total = 0
    try:
        for device in vm.config.hardware.device:
            if not isinstance(device, vim.vm.device.VirtualDisk):
                continue
            if device.capacityInBytes is not None:
                total += device.capacityInBytes
    except Exception as e:
        # pyVmomi attribute access can fail for various reasons
        logging.debug("Failed to get storage size for VM: %s", e)
    return total / (1024 ** 3)
def fetch_vm_data(si, esxi_host: str) -> list[models.VMInfo]:
    """Fetch per-VM data (resources, power state, usage) from ESXi.

    Args:
        si: pyVmomi ServiceInstance
        esxi_host: Host name used to tag each VMInfo

    Returns:
        List of VMInfo; VMs whose attributes fail to read are skipped.
    """
    content = si.RetrieveContent()
    container_view = content.viewManager.CreateContainerView(
        content.rootFolder, [vim.VirtualMachine], True
    )

    vms: list[models.VMInfo] = []
    try:
        for vm in container_view.view:
            try:
                # Get CPU and memory usage from quickStats when available.
                cpu_usage_mhz = None
                memory_usage_mb = None
                if vm.summary and vm.summary.quickStats:
                    stats = vm.summary.quickStats
                    cpu_usage_mhz = stats.overallCpuUsage
                    memory_usage_mb = stats.guestMemoryUsage

                vm_info = models.VMInfo(
                    esxi_host=esxi_host,
                    vm_name=vm.name,
                    cpu_count=vm.config.hardware.numCPU if vm.config else None,
                    ram_mb=vm.config.hardware.memoryMB if vm.config else None,
                    storage_gb=get_vm_storage_size(vm) if vm.config else None,
                    power_state=str(vm.runtime.powerState) if vm.runtime else None,
                    cpu_usage_mhz=cpu_usage_mhz,
                    memory_usage_mb=memory_usage_mb,
                )
                vms.append(vm_info)
            except Exception as e:  # pyVmomi attribute access can fail unexpectedly
                logging.warning("Error getting VM info for %s: %s", vm.name, e)
    finally:
        # Always release the server-side view object, even if iteration
        # raises; previously an escaping exception leaked the view.
        container_view.Destroy()

    return vms
def _extract_cpu_info(host_system) -> tuple[int | None, int | None]:
    """Extract CPU information from an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object

    Returns:
        Tuple of (cpu_threads, cpu_cores); entries are None when unavailable.
    """
    cpu = my_lib.safe_access.safe(host_system).hardware.cpuInfo
    return cpu.numCpuThreads.value(), cpu.numCpuCores.value()
def _extract_memory_total(host_system) -> float | None:
    """Extract the total memory of an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object

    Returns:
        Memory size in bytes, or None when unavailable.
    """
    total = my_lib.safe_access.safe(host_system).hardware.memorySize.value()
    return float(total) if total else None
def _extract_usage_from_quickstats(
    host_system, memory_total_bytes: float | None
) -> tuple[float | None, float | None, float | None]:
    """Derive CPU/memory usage percentages from ESXi quickStats.

    Args:
        host_system: pyVmomi HostSystem object
        memory_total_bytes: Total memory in bytes (denominator for memory %)

    Returns:
        Tuple of (cpu_usage_percent, memory_usage_percent, memory_used_bytes);
        entries are None when the underlying data is unavailable.
    """
    cpu_usage_percent: float | None = None
    memory_usage_percent: float | None = None
    memory_used_bytes: float | None = None

    safe_host = my_lib.safe_access.safe(host_system)
    stats = safe_host.summary.quickStats.value()
    if not stats:
        return cpu_usage_percent, memory_usage_percent, memory_used_bytes

    # CPU: overallCpuUsage is reported in MHz; derive total capacity in MHz.
    cpu_info = safe_host.hardware.cpuInfo
    hz = cpu_info.hz.value()
    cores = cpu_info.numCpuCores.value()
    if stats.overallCpuUsage is not None and hz and cores:
        capacity_mhz = (hz / 1_000_000) * cores
        cpu_usage_percent = (stats.overallCpuUsage / capacity_mhz) * 100

    # Memory: overallMemoryUsage is reported in MB.
    if stats.overallMemoryUsage is not None and memory_total_bytes:
        memory_used_bytes = float(stats.overallMemoryUsage * 1024 * 1024)
        memory_usage_percent = (memory_used_bytes / memory_total_bytes) * 100

    return cpu_usage_percent, memory_usage_percent, memory_used_bytes
def _extract_os_version(host_system) -> str | None:
    """Extract the OS version string from an ESXi host.

    Args:
        host_system: pyVmomi HostSystem object

    Returns:
        OS version string (e.g. "VMware ESXi 8.0.0 build-xxx"), or None.
    """
    return my_lib.safe_access.safe(host_system).config.product.fullName.value()
def fetch_host_info(si, host: str) -> models.HostInfo | None:
    """Fetch uptime, CPU/memory usage, and ESXi version from an ESXi host.

    Walks datacenters and clusters and returns a HostInfo for the first
    host system that reports a boot time, or None when nothing usable
    is found or the connection fails.
    """
    try:
        content = si.RetrieveContent()

        for datacenter in content.rootFolder.childEntity:
            if not hasattr(datacenter, "hostFolder"):
                continue

            for cluster in datacenter.hostFolder.childEntity:
                # A compute resource exposes .host directly; folders need
                # one more level of descent through childEntity.
                if hasattr(cluster, "host"):
                    hosts = cluster.host
                elif hasattr(cluster, "childEntity"):
                    hosts = []
                    for child in cluster.childEntity:
                        if hasattr(child, "host"):
                            hosts.extend(child.host)
                else:
                    hosts = []

                for host_system in hosts:
                    try:
                        boot_time = host_system.runtime.bootTime
                        if not boot_time:
                            continue

                        # Gather the individual pieces via helper functions.
                        cpu_threads, cpu_cores = _extract_cpu_info(host_system)
                        memory_total_bytes = _extract_memory_total(host_system)
                        cpu_usage, mem_usage, mem_used = _extract_usage_from_quickstats(
                            host_system, memory_total_bytes
                        )
                        os_version = _extract_os_version(host_system)

                        uptime = (datetime.now(boot_time.tzinfo) - boot_time).total_seconds()
                        return models.HostInfo(
                            host=host,
                            boot_time=boot_time.isoformat(),
                            uptime_seconds=uptime,
                            status="running",
                            cpu_threads=cpu_threads,
                            cpu_cores=cpu_cores,
                            os_version=os_version,
                            cpu_usage_percent=cpu_usage,
                            memory_usage_percent=mem_usage,
                            memory_total_bytes=memory_total_bytes,
                            memory_used_bytes=mem_used,
                        )
                    except Exception as e:  # pyVmomi attribute access can fail unexpectedly
                        logging.warning("Error getting host info: %s", e)
    except Exception as e:  # pyVmomi can raise various unexpected exceptions
        logging.warning("Failed to get host info for %s: %s", host, e)

    return None
287# =============================================================================
288# iLO Power Meter functions (via Redfish API)
289# =============================================================================
def fetch_ilo_power(host: str, username: str, password: str) -> models.PowerInfo | None:
    """Fetch power consumption data from HP iLO via the Redfish API.

    Args:
        host: iLO hostname or IP address
        username: iLO username
        password: iLO password

    Returns:
        PowerInfo, or None on any failure.
    """
    url = f"https://{host}/redfish/v1/Chassis/1/Power"

    try:
        # iLO ships self-signed certificates, so verification is disabled.
        response = requests.get(
            url,
            auth=(username, password),
            verify=False,  # noqa: S501 - iLO uses self-signed certs
            timeout=30,
        )
        if response.status_code != 200:
            logging.warning("iLO API returned status %d for %s", response.status_code, host)
            return None

        payload = response.json()

        power_control = payload.get("PowerControl", [])
        if not power_control:
            logging.warning("No PowerControl data in iLO response for %s", host)
            return None

        # The first PowerControl entry describes main chassis power.
        entry = power_control[0]
        metrics = entry.get("PowerMetrics", {})

        return models.PowerInfo(
            power_watts=entry.get("PowerConsumedWatts"),
            power_average_watts=metrics.get("AverageConsumedWatts"),
            power_max_watts=metrics.get("MaxConsumedWatts"),
            power_min_watts=metrics.get("MinConsumedWatts"),
        )
    except requests.exceptions.Timeout:
        logging.warning("Timeout connecting to iLO at %s", host)
        return None
    except requests.exceptions.ConnectionError as e:
        logging.warning("Connection error to iLO at %s: %s", host, e)
        return None
    except (requests.RequestException, ValueError, KeyError) as e:
        # Covers JSON decoding errors and unexpected response shapes.
        logging.warning("Error fetching power data from iLO at %s: %s", host, e)
        return None
def save_power_info(host: str, power_data: models.PowerInfo):
    """Upsert power consumption readings for *host* into the SQLite cache."""
    now = datetime.now().isoformat()
    row = (
        host,
        power_data.power_watts,
        power_data.power_average_watts,
        power_data.power_max_watts,
        power_data.power_min_watts,
        now,
    )

    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO power_info
            (host, power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            row,
        )
        conn.commit()
def get_power_info(host: str) -> models.PowerInfo | None:
    """Return cached power consumption info for *host*, or None when absent."""
    query = """
        SELECT power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at
        FROM power_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (host,)).fetchone()
    return models.PowerInfo.parse_row(row) if row else None
def get_all_power_info() -> dict[str, models.PowerInfo]:
    """Return cached power consumption info for every host, keyed by host."""
    query = """
        SELECT host, power_watts, power_average_watts, power_max_watts, power_min_watts, collected_at
        FROM power_info
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return dict(models.PowerInfo.parse_row_with_host(r) for r in rows)
def collect_ilo_power_data():
    """Collect and cache power data from all configured iLO hosts."""
    ilo_auth = load_secret().get("ilo_auth", {})
    if not ilo_auth:
        return

    for key, credentials in ilo_auth.items():
        # An explicit "host" entry overrides the mapping key as the address.
        ilo_host = credentials.get("host", key)
        logging.info("Collecting power data from iLO %s...", ilo_host)

        power_data = fetch_ilo_power(
            host=ilo_host,
            username=credentials["username"],
            password=credentials["password"],
        )
        if power_data:
            save_power_info(key, power_data)
            logging.info("  Cached power data for %s: %s W", key, power_data.power_watts)
429# =============================================================================
430# Prometheus common helpers
431# =============================================================================
def _prometheus_request(prometheus_url: str, query: str) -> list[dict]:
    """Issue an instant PromQL query against the Prometheus HTTP API.

    Args:
        prometheus_url: Prometheus server URL
        query: PromQL query

    Returns:
        The "result" list from the response, or [] on any failure.
    """
    try:
        response = requests.get(
            f"{prometheus_url}/api/v1/query",
            params={"query": query},
            timeout=30,
        )
        response.raise_for_status()
        payload = response.json()
    except requests.RequestException as e:
        logging.warning("Prometheus query failed: %s", e)
        return []

    if payload.get("status") != "success":
        return []
    return payload.get("data", {}).get("result", [])
def _execute_prometheus_query(prometheus_url: str, query: str) -> dict | None:
    """Run a PromQL query and return only the first result entry.

    Args:
        prometheus_url: Prometheus server URL
        query: PromQL query

    Returns:
        First result entry (dict), or None when there are no results.
    """
    results = _prometheus_request(prometheus_url, query)
    if not results:
        return None
    return results[0]
def _fetch_prometheus_metric(prometheus_url: str, query: str) -> float | None:
    """Fetch a single scalar metric value from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        query: PromQL query

    Returns:
        Metric value as float, or None on failure / missing data.
    """
    result = _execute_prometheus_query(prometheus_url, query)
    if not result:
        return None
    try:
        raw = result.get("value", [None, None])[1]
        return None if raw is None else float(raw)
    except (ValueError, TypeError, IndexError) as e:
        logging.debug("Prometheus metric parsing failed: %s", e)
        return None
def _fetch_prometheus_metric_with_timestamp(prometheus_url: str, query: str) -> tuple[float, float] | None:
    """Fetch a metric together with its sample timestamp from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        query: PromQL query

    Returns:
        (timestamp, value) tuple, or None on failure.
    """
    result = _execute_prometheus_query(prometheus_url, query)
    if not result:
        return None
    try:
        pair = result.get("value", [])
        if len(pair) >= 2:
            return float(pair[0]), float(pair[1])
    except (ValueError, TypeError, IndexError) as e:
        logging.debug("Prometheus metric with timestamp parsing failed: %s", e)
    return None
518# =============================================================================
519# Prometheus uptime functions (for Linux servers)
520# =============================================================================
def fetch_prometheus_uptime(
    prometheus_url: str, instance: str, is_windows: bool = False
) -> models.UptimeData | None:
    """Fetch uptime data from Prometheus.

    Shared between Linux and Windows hosts:
      Linux:   node_boot_time_seconds
      Windows: windows_system_system_up_time

    Args:
        prometheus_url: Prometheus server URL (e.g. http://192.168.0.20:9090)
        instance: Prometheus instance label
        is_windows: Use the Windows metric when True

    Returns:
        UptimeData, or None when no data is available.
    """
    metric_name = "windows_system_system_up_time" if is_windows else "node_boot_time_seconds"
    query = f'{metric_name}{{instance=~"{instance}.*"}}'

    result = _fetch_prometheus_metric_with_timestamp(prometheus_url, query)
    if result is None:
        os_name = "Windows" if is_windows else "Linux"
        logging.warning("No Prometheus %s uptime data for instance %s", os_name, instance)
        return None

    sample_time, boot_time = result
    return models.UptimeData(
        boot_time=datetime.fromtimestamp(boot_time).isoformat(),
        uptime_seconds=sample_time - boot_time,
        status="running",
    )
def fetch_prometheus_usage(
    prometheus_url: str, instance: str, is_windows: bool = False
) -> models.UsageMetrics | None:
    """Fetch CPU and memory usage from Prometheus.

    Linux (node_exporter):
        - CPU: 100 - average idle rate over 5 minutes
        - Memory: node_memory_MemTotal_bytes, node_memory_MemAvailable_bytes
    Windows (windows_exporter):
        - CPU: 100 - average idle rate of windows_cpu_time_total
        - Memory: windows_cs_physical_memory_bytes, windows_os_physical_memory_free_bytes

    Args:
        prometheus_url: Prometheus server URL
        instance: Prometheus instance label
        is_windows: Use Windows metrics when True

    Returns:
        UsageMetrics, or None when no data at all was collected.
    """
    # Select OS-specific metric names.
    if is_windows:
        cpu_metric = "windows_cpu_time_total"
        mem_total_metric = "windows_cs_physical_memory_bytes"
        mem_avail_metric = "windows_os_physical_memory_free_bytes"
    else:
        cpu_metric = "node_cpu_seconds_total"
        mem_total_metric = "node_memory_MemTotal_bytes"
        mem_avail_metric = "node_memory_MemAvailable_bytes"

    # CPU usage = 100 - idle percentage.
    cpu_usage_percent = _fetch_prometheus_metric(
        prometheus_url,
        f'100 - (avg by (instance) (rate({cpu_metric}{{instance=~"{instance}.*",mode="idle"}}[5m])) * 100)',
    )

    memory_total_bytes = _fetch_prometheus_metric(
        prometheus_url, f'{mem_total_metric}{{instance=~"{instance}.*"}}'
    )
    mem_avail = _fetch_prometheus_metric(
        prometheus_url, f'{mem_avail_metric}{{instance=~"{instance}.*"}}'
    )

    memory_usage_percent: float | None = None
    memory_used_bytes: float | None = None
    if mem_avail is not None and memory_total_bytes is not None:
        memory_used_bytes = memory_total_bytes - mem_avail
        memory_usage_percent = (memory_used_bytes / memory_total_bytes) * 100

    collected = (cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes)
    if all(v is None for v in collected):
        return None

    return models.UsageMetrics(
        cpu_usage_percent=cpu_usage_percent,
        memory_usage_percent=memory_usage_percent,
        memory_total_bytes=memory_total_bytes,
        memory_used_bytes=memory_used_bytes,
    )
def get_prometheus_instance(host: str, instance_map: dict) -> str:
    """Resolve the Prometheus instance name for a host.

    An explicit entry in *instance_map* wins; otherwise the first FQDN
    component is used (e.g. "tanzania.green-rabbit.net" -> "tanzania").

    Args:
        host: Host name (possibly fully qualified)
        instance_map: Optional mapping of host -> instance

    Returns:
        Prometheus instance name.
    """
    try:
        return instance_map[host]
    except KeyError:
        # str.partition returns the whole string when no dot is present.
        return host.partition(".")[0]
def collect_prometheus_uptime_data() -> bool:
    """Collect uptime and usage data from Prometheus for configured hosts.

    Machines without an ESXi configuration are queried via Prometheus using
    OS-appropriate metrics (node_boot_time_seconds for Linux,
    windows_system_system_up_time for Windows). The instance name is derived
    from the FQDN unless explicitly configured in prometheus.instance_map.

    Returns:
        True if any data was collected, False otherwise.
    """
    cfg = my_lib.config.accessor(load_config())

    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    # Machines without ESXi are the ones needing Prometheus-based uptime.
    targets = [
        (m["name"], m.get("os", ""))
        for m in cfg.get_list("machine")
        if not m.get("esxi")
    ]
    if not targets:
        return False

    updated = False
    for host, os_type in targets:
        instance = get_prometheus_instance(host, instance_map)
        logging.info(
            "Collecting uptime and usage from Prometheus for %s (instance: %s, os: %s)...",
            host, instance, os_type,
        )

        # Pick the fetch variant matching the machine's OS.
        is_windows = os_type.lower() == "windows"
        uptime_data = fetch_prometheus_uptime(prometheus_url, instance, is_windows=is_windows)
        usage_data = fetch_prometheus_usage(prometheus_url, instance, is_windows=is_windows)

        if not uptime_data:
            save_host_info_failed(host)
            continue

        save_host_info(models.HostInfo(
            host=host,
            boot_time=uptime_data.boot_time,
            uptime_seconds=uptime_data.uptime_seconds,
            status=uptime_data.status,
            cpu_threads=None,
            cpu_cores=None,
            os_version=None,
            cpu_usage_percent=usage_data.cpu_usage_percent if usage_data else None,
            memory_usage_percent=usage_data.memory_usage_percent if usage_data else None,
            memory_total_bytes=usage_data.memory_total_bytes if usage_data else None,
            memory_used_bytes=usage_data.memory_used_bytes if usage_data else None,
        ))
        logging.info("  Cached uptime for %s: %.1f days",
                     host, uptime_data.uptime_seconds / 86400)
        updated = True

    return updated
711# =============================================================================
712# Prometheus ZFS pool functions (for Linux servers with ZFS)
713# =============================================================================
def fetch_prometheus_zfs_pools(prometheus_url: str, instance: str) -> list[models.ZfsPoolInfo]:
    """Fetch ZFS pool data from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        instance: Prometheus instance label

    Returns:
        List of ZfsPoolInfo objects (empty list if no data).
    """
    metric_names = (
        "zfs_pool_size_bytes",
        "zfs_pool_allocated_bytes",
        "zfs_pool_free_bytes",
        "zfs_pool_health",
    )
    pools: dict[str, dict[str, float | None]] = {}

    for metric in metric_names:
        results = _prometheus_request(prometheus_url, f'{metric}{{instance=~"{instance}.*"}}')

        for entry in results:
            pool_name = entry.get("metric", {}).get("pool", "unknown")
            raw = entry.get("value", [None, None])[1]
            fields = pools.setdefault(pool_name, {})

            if raw is not None:
                try:
                    # Field name is the metric name without the shared prefix.
                    fields[metric.replace("zfs_pool_", "")] = float(raw)
                except (ValueError, TypeError) as e:
                    logging.debug("ZFS metric value conversion failed: %s", e)

    return [
        models.ZfsPoolInfo(
            pool_name=name,
            size_bytes=fields.get("size_bytes"),
            allocated_bytes=fields.get("allocated_bytes"),
            free_bytes=fields.get("free_bytes"),
            health=fields.get("health"),
        )
        for name, fields in pools.items()
    ]
def save_zfs_pool_info(host: str, pools: list[models.ZfsPoolInfo]):
    """Replace cached ZFS pool rows for *host* with the given list."""
    now = datetime.now().isoformat()

    with _get_connection() as conn:
        cursor = conn.cursor()

        # Replace strategy: drop this host's old rows, then insert fresh ones.
        cursor.execute("DELETE FROM zfs_pool_info WHERE host = ?", (host,))
        cursor.executemany(
            """
            INSERT INTO zfs_pool_info
            (host, pool_name, size_bytes, allocated_bytes, free_bytes, health, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (host, p.pool_name, p.size_bytes, p.allocated_bytes, p.free_bytes, p.health, now)
                for p in pools
            ],
        )
        conn.commit()
def get_zfs_pool_info(host: str) -> list[models.ZfsPoolInfo]:
    """Return cached ZFS pool info for *host* (empty list when none)."""
    query = """
        SELECT pool_name, size_bytes, allocated_bytes, free_bytes, health, collected_at
        FROM zfs_pool_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (host,)).fetchall()
    return [models.ZfsPoolInfo.parse_row(r) for r in rows]
def collect_prometheus_zfs_data() -> bool:
    """Collect ZFS pool data from Prometheus for configured hosts.

    Targets machines whose config lists "zfs" under "filesystem". The
    instance name is derived from the FQDN unless explicitly configured
    in prometheus.instance_map.

    Returns:
        True if any data was collected, False otherwise.
    """
    cfg = my_lib.config.accessor(load_config())

    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    targets = [m["name"] for m in cfg.get_list("machine") if "zfs" in m.get("filesystem", [])]
    if not targets:
        return False

    updated = False
    for host in targets:
        instance = get_prometheus_instance(host, instance_map)
        logging.info("Collecting ZFS pool data from Prometheus for %s (instance: %s)...", host, instance)

        pools = fetch_prometheus_zfs_pools(prometheus_url, instance)
        if pools:
            save_zfs_pool_info(host, pools)
            logging.info("  Cached %d ZFS pools for %s", len(pools), host)
            updated = True

    return updated
848# =============================================================================
849# Prometheus mount point functions (for Linux servers)
850# =============================================================================
def fetch_btrfs_uuid(prometheus_url: str, label: str) -> str | None:
    """Resolve a btrfs filesystem label to its UUID via node_btrfs_info.

    Args:
        prometheus_url: Prometheus server URL
        label: Btrfs filesystem label

    Returns:
        UUID string, or None when not found.
    """
    result = _execute_prometheus_query(prometheus_url, f'node_btrfs_info{{label="{label}"}}')
    if not result:
        return None
    uuid = result.get("metric", {}).get("uuid")
    return None if uuid is None else str(uuid)
def fetch_btrfs_metrics(prometheus_url: str, uuid: str) -> models.StorageMetrics | None:
    """Fetch btrfs size and used bytes from Prometheus.

    Args:
        prometheus_url: Prometheus server URL
        uuid: Btrfs filesystem UUID

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes, or None.
    """
    # Total size: sum over all member devices of the filesystem.
    size_bytes = _fetch_prometheus_metric(
        prometheus_url, f'sum(node_btrfs_device_size_bytes{{uuid="{uuid}"}})'
    )
    # Used: sum across all block-group types.
    used_bytes = _fetch_prometheus_metric(
        prometheus_url, f'sum(node_btrfs_used_bytes{{uuid="{uuid}"}})'
    )

    if size_bytes is None or used_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=size_bytes - used_bytes,
        used_bytes=used_bytes,
    )
def fetch_windows_disk_metrics(
    prometheus_url: str, volume: str, instance: str
) -> models.StorageMetrics | None:
    """Fetch Windows logical-disk metrics from Prometheus (windows_exporter).

    Uses:
        - windows_logical_disk_size_bytes{volume="C:"}
        - windows_logical_disk_free_bytes{volume="C:"}

    Args:
        prometheus_url: Prometheus server URL
        volume: Windows volume label (e.g. "C:")
        instance: Prometheus instance name (derived from FQDN)

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes, or None.
    """
    selector = f'volume="{volume}",instance=~"{instance}.*"'
    size_bytes = _fetch_prometheus_metric(
        prometheus_url, f"windows_logical_disk_size_bytes{{{selector}}}"
    )
    avail_bytes = _fetch_prometheus_metric(
        prometheus_url, f"windows_logical_disk_free_bytes{{{selector}}}"
    )

    if size_bytes is None or avail_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=avail_bytes,
        used_bytes=size_bytes - avail_bytes,
    )
def _fetch_filesystem_mount_metrics(prometheus_url: str, label: str, path: str) -> models.StorageMetrics | None:
    """Fetch mount-point metrics from node_filesystem_* metrics.

    Args:
        prometheus_url: Prometheus server URL
        label: Prometheus instance label
        path: Mount-point path

    Returns:
        StorageMetrics with size_bytes, avail_bytes, used_bytes, or None.
    """
    selector = f'instance=~"{label}.*",mountpoint="{path}"'
    size_bytes = _fetch_prometheus_metric(prometheus_url, f"node_filesystem_size_bytes{{{selector}}}")
    avail_bytes = _fetch_prometheus_metric(prometheus_url, f"node_filesystem_avail_bytes{{{selector}}}")

    if size_bytes is None or avail_bytes is None:
        return None

    return models.StorageMetrics(
        size_bytes=size_bytes,
        avail_bytes=avail_bytes,
        used_bytes=size_bytes - avail_bytes,
    )
def _fetch_mount_for_config(
    prometheus_url: str, mount_config: dict, host: str, instance_map: dict
) -> models.MountInfo | None:
    """Fetch mount info for a single mount configuration entry.

    Args:
        prometheus_url: Prometheus server URL
        mount_config: Mount config dict (label, path, type)
        host: Host name (FQDN)
        instance_map: Host -> Prometheus instance mapping

    Returns:
        MountInfo, or None when the entry is invalid or no data was found.
    """
    label = mount_config.get("label", "")
    if not label:
        return None

    path = mount_config.get("path", label)
    mount_type = mount_config.get("type", "filesystem")

    storage: models.StorageMetrics | None
    if mount_type == "btrfs":
        # Btrfs metrics are keyed by UUID, so resolve the label first.
        uuid = fetch_btrfs_uuid(prometheus_url, label)
        storage = fetch_btrfs_metrics(prometheus_url, uuid) if uuid else None
    elif mount_type == "windows":
        instance = get_prometheus_instance(host, instance_map)
        storage = fetch_windows_disk_metrics(prometheus_url, label, instance)
    else:
        storage = _fetch_filesystem_mount_metrics(prometheus_url, label, path)

    if not storage:
        return None

    return models.MountInfo(
        mountpoint=path,
        size_bytes=storage.size_bytes,
        avail_bytes=storage.avail_bytes,
        used_bytes=storage.used_bytes,
    )
def fetch_prometheus_mount_info(
    prometheus_url: str, mount_configs: list[dict], host: str, instance_map: dict
) -> list[models.MountInfo]:
    """Fetch mount point data from Prometheus.

    Supports filesystem (node_filesystem_*), btrfs (node_btrfs_*),
    and windows (windows_logical_disk_*) metrics.

    Args:
        prometheus_url: Prometheus server URL
        mount_configs: List of mount configs with 'label', 'path', and optional 'type'
        host: Host name (FQDN) for deriving Prometheus instance
        instance_map: Optional mapping of host -> instance

    Returns:
        List of MountInfo (empty list if no data collected)
    """
    candidates = (
        _fetch_mount_for_config(prometheus_url, mc, host, instance_map)
        for mc in mount_configs
    )
    # Drop entries whose collection failed (None results).
    return [info for info in candidates if info]
def save_mount_info(host: str, mounts: list[models.MountInfo]):
    """Replace the cached mount info rows for *host* in SQLite."""
    now = datetime.now().isoformat()
    rows = [
        (host, m.mountpoint, m.size_bytes, m.avail_bytes, m.used_bytes, now)
        for m in mounts
    ]

    with _get_connection() as conn:
        cursor = conn.cursor()
        # Drop stale rows so removed mount points disappear from the cache.
        cursor.execute("DELETE FROM mount_info WHERE host = ?", (host,))
        cursor.executemany(
            """
            INSERT INTO mount_info
                (host, mountpoint, size_bytes, avail_bytes, used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?)
            """,
            rows,
        )
        conn.commit()
def get_mount_info(host: str) -> list[models.MountInfo]:
    """Return the cached mount info rows for *host*."""
    query = """
        SELECT mountpoint, size_bytes, avail_bytes, used_bytes, collected_at
        FROM mount_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (host,)).fetchall()
    return [models.MountInfo.parse_row(r) for r in rows]
1074# =============================================================================
1075# UPS functions (from NUT - Network UPS Tools)
1076# =============================================================================
def save_ups_info(ups_info_list: list[models.UPSInfo]):
    """Save UPS info to SQLite cache (upsert, one row per UPS device)."""
    now = datetime.now().isoformat()
    rows = [
        (
            ups.ups_name,
            ups.host,
            ups.model,
            ups.battery_charge,
            ups.battery_runtime,
            ups.ups_load,
            ups.ups_status,
            ups.ups_temperature,
            ups.input_voltage,
            ups.output_voltage,
            now,
        )
        for ups in ups_info_list
    ]

    with _get_connection() as conn:
        conn.cursor().executemany(
            """
            INSERT OR REPLACE INTO ups_info
                (ups_name, host, model, battery_charge, battery_runtime,
                 ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            rows,
        )
        conn.commit()
def save_ups_clients(clients: list[models.UPSClient]):
    """Save UPS client info to SQLite cache.

    Replaces the cached clients of every (ups_name, host) pair referenced by
    *clients*, then inserts the new rows with a shared collection timestamp.
    """
    collected_at = datetime.now().isoformat()

    with _get_connection() as conn:
        cursor = conn.cursor()

        # Delete existing clients once per UPS device. Many clients typically
        # share the same UPS, so dedupe the keys instead of re-running the
        # identical DELETE for every client row.
        for ups_name, host in {(c.ups_name, c.host) for c in clients}:
            cursor.execute(
                "DELETE FROM ups_client WHERE ups_name = ? AND host = ?",
                (ups_name, host),
            )

        # Insert new client data in one batched call.
        cursor.executemany(
            """
            INSERT INTO ups_client
                (ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?)
            """,
            [
                (
                    c.ups_name,
                    c.host,
                    c.client_ip,
                    c.client_hostname,
                    c.esxi_host,
                    c.machine_name,
                    collected_at,
                )
                for c in clients
            ],
        )

        conn.commit()
def get_all_ups_info() -> list[models.UPSInfo]:
    """Return every cached UPS info row."""
    query = """
        SELECT ups_name, host, model, battery_charge, battery_runtime,
               ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at
        FROM ups_info
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return [models.UPSInfo.parse_row(r) for r in rows]
def get_ups_info(ups_name: str, host: str) -> models.UPSInfo | None:
    """Return cached info for one UPS device, or None when not cached."""
    query = """
        SELECT ups_name, host, model, battery_charge, battery_runtime,
               ups_load, ups_status, ups_temperature, input_voltage, output_voltage, collected_at
        FROM ups_info
        WHERE ups_name = ? AND host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (ups_name, host)).fetchone()
    return models.UPSInfo.parse_row(row) if row else None
def get_ups_clients(ups_name: str, host: str) -> list[models.UPSClient]:
    """Return cached clients of one UPS device."""
    query = """
        SELECT ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at
        FROM ups_client
        WHERE ups_name = ? AND host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (ups_name, host)).fetchall()
    return [models.UPSClient.parse_row(r) for r in rows]
def get_all_ups_clients() -> list[models.UPSClient]:
    """Return every cached UPS client row."""
    query = """
        SELECT ups_name, host, client_ip, client_hostname, esxi_host, machine_name, collected_at
        FROM ups_client
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return [models.UPSClient.parse_row(r) for r in rows]
1199def _resolve_hostname(ip: str) -> str | None:
1200 """Resolve IP address to hostname using DNS."""
1201 import socket
1202 try:
1203 hostname, _, _ = socket.gethostbyaddr(ip)
1204 # Remove domain suffix if present (e.g., "server.local" -> "server")
1205 return hostname.split(".")[0]
1206 except (socket.herror, socket.gaierror):
1207 logging.debug("Could not resolve hostname for IP: %s", ip)
1208 return None
def _find_vm_esxi_host(hostname: str) -> str | None:
    """Find the ESXi host running a VM with the given hostname.

    Matches either the exact VM name or a VM name that starts with
    "<hostname>." (FQDN-style VM names).

    Args:
        hostname: The VM name to search for

    Returns:
        ESXi host name if found, None otherwise
    """
    query = """
        SELECT esxi_host FROM vm_info
        WHERE vm_name = ? OR vm_name LIKE ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (hostname, f"{hostname}.%")).fetchone()
    return row[0] if row else None
1230def _apply_domain(hostname: str, domain: str | None) -> str:
1231 """Apply domain to a short hostname if needed.
1233 Args:
1234 hostname: The hostname (may be short or FQDN)
1235 domain: The domain to append (e.g., "green-rabbit.net")
1237 Returns:
1238 FQDN if domain is provided and hostname is short, otherwise hostname as-is
1239 """
1240 if not domain:
1241 return hostname
1242 # If hostname already contains a dot, assume it's already a FQDN
1243 if "." in hostname:
1244 return hostname
1245 return f"{hostname}.{domain}"
def _enrich_ups_clients(
    clients: list[models.UPSClient],
    domain: str | None = None,
) -> list[models.UPSClient]:
    """Enrich UPS clients with hostname resolution and VM linkage.

    For each client:
    1. localhost entries are replaced with the UPS master's short hostname
    2. missing hostnames are reverse-resolved from the client IP
    3. hostnames matching a cached VM are linked to that VM's ESXi host
    4. machine_name is set for linking (domain appended when needed)

    Args:
        clients: List of UPS clients to enrich
        domain: Domain to append to short hostnames for linking
    """
    result: list[models.UPSClient] = []

    for client in clients:
        name = client.client_hostname

        # A client reported as localhost is the UPS master itself.
        if name == "localhost" or client.client_ip == "127.0.0.1":
            name = client.host.split(".")[0]

        if not name:
            name = _resolve_hostname(client.client_ip)

        esxi_host = None
        machine_name = None
        if name:
            esxi_host = _find_vm_esxi_host(name)
            # VM -> link to its ESXi host; physical machine -> link to itself.
            link_target = esxi_host if esxi_host else name
            machine_name = _apply_domain(link_target, domain)

        result.append(models.UPSClient(
            ups_name=client.ups_name,
            host=client.host,
            client_ip=client.client_ip,
            client_hostname=name,
            esxi_host=esxi_host,
            machine_name=machine_name,
            collected_at=client.collected_at,
        ))

    return result
def collect_ups_data() -> bool:
    """Collect UPS data from configured NUT hosts.

    Returns:
        True if any data was collected, False otherwise
    """
    cfg = my_lib.config.accessor(load_config())

    ups_configs = cfg.get_list("ups")
    if not ups_configs:
        return False

    # Domain used when turning client hostnames into linkable machine names.
    domain = cfg.get("domain")

    all_ups_info: list[models.UPSInfo] = []
    all_clients: list[models.UPSClient] = []
    updated = False

    for ups_config in ups_configs:
        host = ups_config.get("host")
        if not host:
            continue

        port = ups_config.get("port", 3493)
        logging.info("Collecting UPS data from NUT host %s:%d...", host, port)

        ups_info_list, clients = ups_collector.fetch_all_ups_from_host(
            host, port, ups_config.get("name")
        )
        if ups_info_list:
            all_ups_info.extend(ups_info_list)
            all_clients.extend(clients)
            logging.info(" Found %d UPS(s) with %d client(s)", len(ups_info_list), len(clients))
            updated = True

    if all_ups_info:
        save_ups_info(all_ups_info)
    if all_clients:
        # Enrich clients with hostname resolution, VM info, and domain.
        save_ups_clients(_enrich_ups_clients(all_clients, domain))

    return updated
def collect_prometheus_mount_data() -> bool:
    """Collect mount point data from Prometheus for configured hosts.

    Automatically collects mount data for machines with mount configuration.
    Each mount config specifies 'label' (Prometheus label to match) and
    optional 'path' (display path, defaults to label).
    The 'type' field determines the metrics source: filesystem, btrfs, or windows.
    If 'type' is not specified and machine os is 'Windows', defaults to 'windows'.

    Returns:
        True if any data was collected, False otherwise
    """
    cfg = my_lib.config.accessor(load_config())

    prometheus_url = cfg.get("prometheus", "url")
    if not prometheus_url:
        return False

    instance_map = cfg.get_dict("prometheus", "instance_map")

    # Machines that declare at least one mount to monitor.
    targets = [
        (m["name"], m["mount"], m.get("os", ""))
        for m in cfg.get_list("machine")
        if m.get("mount")
    ]
    if not targets:
        return False

    updated = False

    for host, mount_configs, os_type in targets:
        logging.info("Collecting mount data from Prometheus for %s...", host)

        # Windows machines default to windows_logical_disk_* metrics
        # unless a type is set explicitly.
        is_windows = os_type.lower() == "windows"
        processed = [
            {**mc, "type": "windows"} if is_windows and "type" not in mc else dict(mc)
            for mc in mount_configs
        ]

        mounts = fetch_prometheus_mount_info(prometheus_url, processed, host, instance_map)
        if mounts:
            save_mount_info(host, mounts)
            logging.info(" Cached %d mount points for %s", len(mounts), host)
            updated = True

    return updated
1408# =============================================================================
1409# Database save/get functions
1410# =============================================================================
1413# -----------------------------------------------------------------------------
1414# Save functions
1415# -----------------------------------------------------------------------------
def save_vm_data(esxi_host: str, vms: list[models.VMInfo]):
    """Save VM data to SQLite cache.

    All cached VMs of the host are deleted first so that VMs removed from
    the ESXi host also disappear from the cache.
    """
    now = datetime.now().isoformat()
    rows = [
        (
            vm.esxi_host,
            vm.vm_name,
            vm.cpu_count,
            vm.ram_mb,
            vm.storage_gb,
            vm.power_state,
            vm.cpu_usage_mhz,
            vm.memory_usage_mb,
            now,
        )
        for vm in vms
    ]

    with _get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute("DELETE FROM vm_info WHERE esxi_host = ?", (esxi_host,))
        cursor.executemany(
            """
            INSERT INTO vm_info
                (esxi_host, vm_name, cpu_count, ram_mb, storage_gb, power_state,
                 cpu_usage_mhz, memory_usage_mb, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            rows,
        )
        conn.commit()
def save_host_info(host_info: models.HostInfo):
    """Save host info (uptime + CPU + ESXi version + usage) to SQLite cache."""
    row = (
        host_info.host,
        host_info.boot_time,
        host_info.uptime_seconds,
        host_info.status,
        host_info.cpu_threads,
        host_info.cpu_cores,
        host_info.os_version,
        host_info.cpu_usage_percent,
        host_info.memory_usage_percent,
        host_info.memory_total_bytes,
        host_info.memory_used_bytes,
        datetime.now().isoformat(),
    )

    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO host_info
                (host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
                 cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            row,
        )
        conn.commit()
def save_host_info_failed(host: str):
    """Save failed host info status to SQLite cache.

    When ESXi is unreachable, set status to 'unknown' to indicate
    we cannot determine the actual state. All metric columns are NULL.
    """
    # (host, boot_time, uptime_seconds, status) + 7 NULL metric columns + timestamp
    row = (host, None, None, "unknown") + (None,) * 7 + (datetime.now().isoformat(),)

    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO host_info
                (host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
                 cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
            """,
            row,
        )
        conn.commit()
def update_collection_status(host: str, status: str):
    """Record the latest collection attempt (timestamp + status) for *host*."""
    row = (host, datetime.now().isoformat(), status)
    with _get_connection() as conn:
        conn.cursor().execute(
            """
            INSERT OR REPLACE INTO collection_status (host, last_fetch, status)
            VALUES (?, ?, ?)
            """,
            row,
        )
        conn.commit()
def get_collection_status(host: str) -> models.CollectionStatus | None:
    """Return the last collection status for *host*, or None if never recorded."""
    query = """
        SELECT host, last_fetch, status
        FROM collection_status
        WHERE host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (host,)).fetchone()
    return models.CollectionStatus.parse_row(row) if row else None
def is_host_reachable(host: str) -> bool:
    """Return True when the last collection attempt for *host* succeeded."""
    status = get_collection_status(host)
    if status is None:
        return False
    return status.status == "success"
def get_all_collection_status() -> dict[str, models.CollectionStatus]:
    """Get all collection statuses in a single DB query.

    Returns:
        Dict mapping host name to CollectionStatus
    """
    query = """
        SELECT host, last_fetch, status
        FROM collection_status
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return {row[0]: models.CollectionStatus.parse_row(row) for row in rows}
def get_all_vm_info() -> dict[str, list[models.VMInfo]]:
    """Get all VM info grouped by ESXi host in a single DB query.

    Returns:
        Dict mapping ESXi host name to list of VMInfo
    """
    from collections import defaultdict

    query = """
        SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state, esxi_host,
               cpu_usage_mhz, memory_usage_mb, collected_at
        FROM vm_info
    """
    grouped: dict[str, list[models.VMInfo]] = defaultdict(list)

    with _get_connection() as conn:
        for row in conn.cursor().execute(query).fetchall():
            # Rows that fail to parse are silently skipped.
            vm = models.VMInfo.parse_row_full(row)
            if vm:
                grouped[vm.esxi_host].append(vm)

    return dict(grouped)
def get_vm_info(vm_name: str, esxi_host: str | None = None) -> models.VMInfo | None:
    """Get VM info from cache, optionally restricted to one ESXi host."""
    query = """
        SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state, esxi_host,
               cpu_usage_mhz, memory_usage_mb, collected_at
        FROM vm_info
        WHERE vm_name = ?
    """
    params: tuple = (vm_name,)

    # Narrow the lookup when the caller pins a specific ESXi host.
    if esxi_host:
        query += " AND esxi_host = ?"
        params = (vm_name, esxi_host)

    with _get_connection() as conn:
        row = conn.cursor().execute(query, params).fetchone()
    return models.VMInfo.parse_row_full(row) if row else None
def get_all_vm_info_for_host(esxi_host: str) -> list[models.VMInfo]:
    """Return all cached VM rows belonging to one ESXi host."""
    query = """
        SELECT vm_name, cpu_count, ram_mb, storage_gb, power_state,
               cpu_usage_mhz, memory_usage_mb, collected_at
        FROM vm_info
        WHERE esxi_host = ?
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query, (esxi_host,)).fetchall()
    return [models.VMInfo.parse_row(r, esxi_host) for r in rows]
def get_host_info(host: str) -> models.HostInfo | None:
    """Return cached uptime/usage info for *host*, or None if not cached."""
    query = """
        SELECT host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
               cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at
        FROM host_info
        WHERE host = ?
    """
    with _get_connection() as conn:
        row = conn.cursor().execute(query, (host,)).fetchone()
    return models.HostInfo.parse_row(row) if row else None
def get_all_host_info() -> dict[str, models.HostInfo]:
    """Return all cached host info, keyed by host name."""
    query = """
        SELECT host, boot_time, uptime_seconds, status, cpu_threads, cpu_cores, os_version,
               cpu_usage_percent, memory_usage_percent, memory_total_bytes, memory_used_bytes, collected_at
        FROM host_info
    """
    with _get_connection() as conn:
        rows = conn.cursor().execute(query).fetchall()
    return {row[0]: models.HostInfo.parse_row(row) for row in rows}
def _collect_esxi_host_data(si, host: str) -> bool:
    """Shared routine: collect all data from a single ESXi host.

    Args:
        si: ESXi connection instance (return value of SmartConnect)
        host: Host name

    Returns:
        True on success, False on failure.

    Side effects: updates the vm_info, host_info and collection_status tables;
    always closes the connection via Disconnect() in the finally block.
    """
    try:
        # Collect VM data
        vms = fetch_vm_data(si, host)
        save_vm_data(host, vms)
        logging.info(" Cached %d VMs from %s", len(vms), host)

        # Collect host info (uptime + CPU)
        host_info = fetch_host_info(si, host)
        if host_info:
            save_host_info(host_info)
            logging.info(" Cached host info for %s (CPU threads: %s)", host, host_info.cpu_threads)
        else:
            # Host query returned nothing -> record an "unknown" status row.
            save_host_info_failed(host)

        update_collection_status(host, "success")
        return True

    except Exception as e:  # ESXi/pyVmomi operations can raise various exceptions
        logging.warning("Error collecting data from %s: %s", host, e)
        update_collection_status(host, f"error: {e}")
        save_host_info_failed(host)
        return False

    finally:
        # Always release the ESXi session, success or failure.
        Disconnect(si)
def collect_cpu_benchmark_data() -> bool:
    """Collect CPU benchmark data for all configured machines.

    Checks which CPUs don't have benchmark data yet and fetches them.
    Uses rate limiting to avoid overwhelming cpubenchmark.net.

    Returns:
        True if any new data was collected, False otherwise
    """
    config = load_config()
    cfg = my_lib.config.accessor(config)

    machines = cfg.get_list("machine")
    if not machines:
        return False

    # Unique CPU names from config; machines without a "cpu" entry are skipped.
    cpu_names = {cpu for machine in machines if (cpu := machine.get("cpu"))}
    if not cpu_names:
        return False

    # Only fetch benchmarks that are not cached yet.
    missing_cpus = [name for name in cpu_names if not cpu_benchmark.get_benchmark(name)]
    if not missing_cpus:
        logging.debug("All CPU benchmarks are already cached")
        return False

    logging.info("Fetching CPU benchmark data for %d CPU(s)...", len(missing_cpus))

    updated = False
    for cpu_name in missing_cpus:
        if cpu_benchmark.fetch_and_save_benchmark(cpu_name):
            logging.info(" Cached benchmark for: %s", cpu_name)
            updated = True
        else:
            logging.warning(" Could not find benchmark for: %s", cpu_name)

    return updated
def collect_all_data():
    """Collect all data from configured ESXi and iLO hosts."""
    secret = load_secret()
    esxi_auth = secret.get("esxi_auth", {})

    updated = False

    # ESXi hosts (no-op when no credentials are configured).
    for host, credentials in esxi_auth.items():
        logging.info("Collecting data from %s...", host)

        si = connect_to_esxi(
            host=credentials.get("host", host),
            username=credentials["username"],
            password=credentials["password"],
            port=credentials.get("port", 443),
        )

        if not si:
            update_collection_status(host, "connection_failed")
            save_host_info_failed(host)
            continue

        if _collect_esxi_host_data(si, host):
            updated = True

    # iLO power data (its result does not drive the client notification).
    collect_ilo_power_data()

    # Remaining collectors: Prometheus uptime / ZFS pools / mount points,
    # UPS data from NUT, and CPU benchmark data.
    for collector in (
        collect_prometheus_uptime_data,
        collect_prometheus_zfs_data,
        collect_prometheus_mount_data,
        collect_ups_data,
        collect_cpu_benchmark_data,
    ):
        if collector():
            updated = True

    if updated:
        my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
        logging.info("Data collection complete, clients notified")
def collect_host_data(host: str) -> bool:
    """Collect data from a specific ESXi host (manual refresh).

    Args:
        host: The ESXi host name to collect data from

    Returns:
        True if data was successfully collected, False otherwise
    """
    credentials = load_secret().get("esxi_auth", {}).get(host)
    if credentials is None:
        logging.warning("No credentials found for host: %s", host)
        return False

    logging.info("Collecting data from %s (manual refresh)...", host)

    si = connect_to_esxi(
        host=credentials.get("host", host),
        username=credentials["username"],
        password=credentials["password"],
        port=credentials.get("port", 443),
    )

    if not si:
        update_collection_status(host, "connection_failed")
        save_host_info_failed(host)
        my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
        return False

    success = _collect_esxi_host_data(si, host)
    my_lib.webapp.event.notify_event(my_lib.webapp.event.EVENT_TYPE.CONTENT)
    logging.info("Data collection complete for %s, clients notified", host)
    return success
def _update_worker():
    """Background worker that collects data periodically."""

    def _collect_safely(error_message: str):
        # One collection pass; failures are logged, never propagated,
        # so one bad cycle cannot kill the worker thread.
        try:
            collect_all_data()
        except Exception:
            logging.exception(error_message)

    logging.info("Data collector started (interval: %d sec)", UPDATE_INTERVAL_SEC)

    # Run once immediately, then every UPDATE_INTERVAL_SEC until stop is set.
    _collect_safely("Error in initial data collection")
    while not _should_stop.wait(UPDATE_INTERVAL_SEC):
        _collect_safely("Error in periodic data collection")

    logging.info("Data collector stopped")
def start_collector():
    """Start the background data collector (no-op if already running)."""
    global _update_thread

    init_db()

    already_running = _update_thread is not None and _update_thread.is_alive()
    if already_running:
        return

    _should_stop.clear()
    worker = threading.Thread(target=_update_worker, daemon=True)
    _update_thread = worker
    worker.start()
def stop_collector():
    """Stop the background data collector."""
    _should_stop.set()
    worker = _update_thread
    if worker is not None:
        # Bounded join so shutdown cannot hang on a stuck collection cycle.
        worker.join(timeout=5)
if __name__ == "__main__":
    # CLI entry point: "--once" runs a single collection pass, otherwise the
    # background collector runs until interrupted with Ctrl-C.
    import sys
    import time  # hoisted: was previously re-imported on every loop iteration

    logging.basicConfig(level=logging.INFO)
    init_db()

    if "--once" in sys.argv:
        collect_all_data()
    else:
        start_collector()
        try:
            # Idle loop; the collector runs in its daemon thread.
            while True:
                time.sleep(1)
        except KeyboardInterrupt:
            stop_collector()