118 lines
3.6 KiB
Python
118 lines
3.6 KiB
Python
"""
|
|
Utilities to handle container orchestration environments and improve
|
|
resilience in Kubernetes-like environments.
|
|
"""
|
|
|
|
import logging
import os
import socket
import time
from typing import Any, Dict, Optional, Tuple
|
|
|
|
# Module-level logger, named after this module.
logger: logging.Logger = logging.getLogger(__name__)

# Number of host-resolution attempts made before giving up.
MAX_HOST_RESOLVE_ATTEMPTS: int = 5

# Pause between consecutive resolution attempts, in seconds.
HOST_RESOLVE_DELAY: int = 2
|
|
|
|
|
|
def get_host_info() -> Dict[str, str]:
    """
    Collect identifying details about the host running this process.

    The hostname and IP address are obtained through the ``socket`` module;
    the pod name and namespace are read from the ``HOSTNAME`` and
    ``POD_NAMESPACE`` environment variables. Any value that cannot be
    determined is reported as ``"unknown"``. Resolution failures are logged
    as warnings and never raised to the caller.

    Returns:
        Dict[str, str]: Mapping with the keys ``hostname``, ``ip_address``,
        ``pod_name`` and ``namespace``.
    """
    env = os.environ
    details = dict(
        hostname="unknown",
        ip_address="unknown",
        pod_name=env.get("HOSTNAME", "unknown"),
        namespace=env.get("POD_NAMESPACE", "unknown"),
    )

    try:
        resolved_name = socket.gethostname()
        # Store the hostname before resolving it, so it is still reported
        # when only the IP lookup fails.
        details["hostname"] = resolved_name
        details["ip_address"] = socket.gethostbyname(resolved_name)
    except Exception as exc:
        logger.warning(f"Could not resolve host information: {str(exc)}")

    return details
|
|
|
|
|
|
def check_host_connectivity(timeout: float = 5) -> Tuple[bool, Optional[str]]:
    """
    Check network connectivity by resolving a well-known external domain.

    The check performs a DNS lookup of ``google.com`` only; no connection is
    opened. ``socket.getaddrinfo`` has no timeout argument of its own, so the
    lookup runs in a daemon thread and this function waits at most
    ``timeout`` seconds for it to finish.

    Args:
        timeout: Maximum time in seconds to wait for the lookup.

    Returns:
        Tuple[bool, Optional[str]]: (Success, Error message if any)
    """
    # Function-scope import keeps this block self-contained.
    import threading

    outcome: Dict[str, BaseException] = {}

    def _resolve() -> None:
        # Runs in the worker thread; records any failure for the caller.
        try:
            socket.getaddrinfo("google.com", 80, proto=socket.IPPROTO_TCP)
        except Exception as exc:
            outcome["error"] = exc

    # Daemon thread: a lookup that never returns must not block interpreter exit.
    worker = threading.Thread(target=_resolve, name="dns-connectivity-probe", daemon=True)
    try:
        worker.start()
    except RuntimeError as exc:
        # Thread could not be started; report it instead of raising.
        return False, f"Unknown connection error: {exc}"

    worker.join(timeout)
    if worker.is_alive():
        # The lookup is still pending after `timeout` seconds.
        return False, "Connection timed out"

    error = outcome.get("error")
    if error is None:
        return True, None
    if isinstance(error, socket.gaierror):
        return False, f"DNS resolution error: {error}"
    if isinstance(error, socket.timeout):
        return False, "Connection timed out"
    return False, f"Unknown connection error: {error}"
|
|
|
|
|
|
def wait_for_host_assignment(max_attempts: int = MAX_HOST_RESOLVE_ATTEMPTS,
                             delay: int = HOST_RESOLVE_DELAY) -> Tuple[bool, Optional[str]]:
    """
    Wait for the host to be assigned in container orchestration environments.

    Repeatedly resolves the local hostname to an IP address. Each
    ``socket.gaierror`` is logged as a warning and counts as one failed
    attempt; other exceptions propagate to the caller. The function sleeps
    only between attempts, so the failure result is returned immediately
    after the last attempt.

    Args:
        max_attempts: Maximum number of attempts to resolve the host
        delay: Delay between attempts in seconds

    Returns:
        Tuple[bool, Optional[str]]: (Success, Error message if any)
    """
    attempt = 0

    while attempt < max_attempts:
        try:
            hostname = socket.gethostname()
            ip_address = socket.gethostbyname(hostname)
        except socket.gaierror as e:
            attempt += 1
            logger.warning(f"Host not resolved yet (attempt {attempt}/{max_attempts}): {str(e)}")
            # Sleeping after the final attempt would only delay the failure result.
            if attempt < max_attempts:
                time.sleep(delay)
        else:
            logger.info(f"Host assigned: {hostname} ({ip_address})")
            return True, None

    return False, "Maximum attempts reached, host still not assigned"
|
|
|
|
|
|
def get_orchestration_status() -> Dict[str, Any]:
    """
    Get comprehensive status information about the container orchestration environment.

    Combines the results of ``get_host_info()`` and
    ``check_host_connectivity()`` with a few environment variables.

    Returns:
        Dict[str, Any]: Status mapping with the keys:
            - ``host``: result of ``get_host_info()``
            - ``ready``: True only when ``APP_READY`` is exactly ``"true"``
            - ``connectivity``: ``status`` ("connected"/"disconnected") and
              ``error`` (message or None)
            - ``startup_error``: value of ``APP_STARTUP_ERROR`` or None
            - ``environment``: "kubernetes" when ``KUBERNETES_SERVICE_HOST``
              is set, otherwise "unknown"
    """
    host_info = get_host_info()
    connectivity_status, connectivity_error = check_host_connectivity()
    startup_error = os.environ.get("APP_STARTUP_ERROR")

    return {
        "host": host_info,
        "ready": os.environ.get("APP_READY", "false") == "true",
        "connectivity": {
            "status": "connected" if connectivity_status else "disconnected",
            "error": connectivity_error,
        },
        "startup_error": startup_error,
        "environment": "kubernetes" if "KUBERNETES_SERVICE_HOST" in os.environ else "unknown",
    }