Enhance FastAPI app for Kubernetes resilience and pod host assignment issues

This commit is contained in:
Automated Action 2025-05-16 22:46:12 +00:00
parent 4077f5d027
commit 4d00fd5473
3 changed files with 245 additions and 4 deletions

View File

@ -1,15 +1,70 @@
from fastapi import APIRouter, Depends
from typing import Any, Dict
from fastapi import APIRouter, Depends, Query
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import Session
from app.core.orchestration import get_orchestration_status
from app.db.session import get_db
router = APIRouter()
@router.get("")
def health_check(db: Session = Depends(get_db)):
def health_check(
db: Session = Depends(get_db),
detailed: bool = Query(False, description="Include detailed diagnostics information")
) -> Dict[str, Any]:
"""
Health check endpoint for the API
This endpoint checks the health of various components of the application:
- Database connectivity
- Host and pod status
- Application readiness
Args:
detailed: Whether to include detailed diagnostics information
Returns:
Dict[str, Any]: Health status of the application
"""
# Simple health check that ensures the database connection is working
return {"status": "healthy", "database": "connected"}
# Basic health response
response = {
"status": "healthy"
}
# Check database connectivity
try:
# Simple query to verify database connectivity
db.execute("SELECT 1")
database_status = "connected"
database_error = None
except SQLAlchemyError as e:
database_status = "error"
database_error = str(e)
except Exception as e:
database_status = "error"
database_error = str(e)
response["database"] = {
"status": database_status
}
# Add orchestration status if detailed information is requested
if detailed:
response["orchestration"] = get_orchestration_status()
# Add more detailed database information if there was an error
if database_error:
response["database"]["error"] = database_error
elif database_status != "connected":
# Always include error information in the basic response
response["status"] = "unhealthy"
response["database"]["error"] = database_error
# Set overall status based on component statuses
if database_status != "connected":
response["status"] = "unhealthy"
return response

118
app/core/orchestration.py Normal file
View File

@ -0,0 +1,118 @@
"""
Utilities to handle container orchestration environments and improve
resilience in Kubernetes-like environments.
"""
import logging
import os
import socket
import time
from typing import Dict, Optional, Tuple
logger = logging.getLogger(__name__)
# Maximum attempts to resolve host
MAX_HOST_RESOLVE_ATTEMPTS = 5
# Delay between attempts in seconds
HOST_RESOLVE_DELAY = 2
def get_host_info() -> Dict[str, str]:
"""
Get information about the host where the application is running.
This helps diagnose container and pod issues in orchestration environments.
Returns:
Dict[str, str]: Dictionary with host information
"""
info = {
"hostname": "unknown",
"ip_address": "unknown",
"pod_name": os.environ.get("HOSTNAME", "unknown"),
"namespace": os.environ.get("POD_NAMESPACE", "unknown"),
}
try:
info["hostname"] = socket.gethostname()
info["ip_address"] = socket.gethostbyname(info["hostname"])
except Exception as e:
logger.warning(f"Could not resolve host information: {str(e)}")
return info
def check_host_connectivity(timeout: int = 5) -> Tuple[bool, Optional[str]]:
"""
Check if the host has network connectivity.
Args:
timeout: Timeout in seconds
Returns:
Tuple[bool, Optional[str]]: (Success, Error message if any)
"""
try:
# Try to resolve a common external domain
socket.getaddrinfo("google.com", 80, proto=socket.IPPROTO_TCP)
return True, None
except socket.gaierror as e:
return False, f"DNS resolution error: {str(e)}"
except socket.timeout:
return False, "Connection timed out"
except Exception as e:
return False, f"Unknown connection error: {str(e)}"
def wait_for_host_assignment(max_attempts: int = MAX_HOST_RESOLVE_ATTEMPTS,
delay: int = HOST_RESOLVE_DELAY) -> Tuple[bool, Optional[str]]:
"""
Wait for the host to be assigned in container orchestration environments.
This helps to handle situations where the pod is scheduled but the host
assignment is delayed.
Args:
max_attempts: Maximum number of attempts to resolve the host
delay: Delay between attempts in seconds
Returns:
Tuple[bool, Optional[str]]: (Success, Error message if any)
"""
attempt = 0
while attempt < max_attempts:
try:
hostname = socket.gethostname()
ip_address = socket.gethostbyname(hostname)
logger.info(f"Host assigned: {hostname} ({ip_address})")
return True, None
except socket.gaierror as e:
logger.warning(f"Host not resolved yet (attempt {attempt+1}/{max_attempts}): {str(e)}")
attempt += 1
time.sleep(delay)
return False, "Maximum attempts reached, host still not assigned"
def get_orchestration_status() -> Dict[str, any]:
"""
Get comprehensive status information about the container orchestration environment.
Returns:
Dict: Orchestration status information
"""
host_info = get_host_info()
connectivity_status, connectivity_error = check_host_connectivity()
startup_error = os.environ.get("APP_STARTUP_ERROR", None)
return {
"host": host_info,
"ready": os.environ.get("APP_READY", "false") == "true",
"connectivity": {
"status": "connected" if connectivity_status else "disconnected",
"error": connectivity_error
},
"startup_error": startup_error,
"environment": "kubernetes" if "KUBERNETES_SERVICE_HOST" in os.environ else "unknown"
}

68
main.py
View File

@ -1,3 +1,9 @@
import logging
import os
import socket
import sys
from contextlib import asynccontextmanager
import uvicorn
from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware
@ -5,11 +11,73 @@ from starlette.middleware.cors import CORSMiddleware
from app.api.api import api_router
from app.core.config import settings
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""
Lifespan events for the FastAPI application
Handles startup and shutdown gracefully
"""
# Startup: Check for host availability and other system requirements
try:
hostname = socket.gethostname()
ip_address = socket.gethostbyname(hostname)
logger.info(f"Starting application on host: {hostname}, IP: {ip_address}")
# Check required directories exist
from pathlib import Path
db_dir = Path("/app") / "storage" / "db"
db_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Database directory confirmed at: {db_dir}")
# Set environment variable to indicate we're ready
os.environ["APP_READY"] = "true"
logger.info("Application startup completed successfully")
except Exception as e:
logger.error(f"Application startup error: {str(e)}")
# Critical errors during startup should be propagated, but still allow the app to run
os.environ["APP_READY"] = "false"
os.environ["APP_STARTUP_ERROR"] = str(e)
yield # This is where the app runs
# Shutdown: Perform cleanup operations
try:
logger.info("Application shutting down gracefully")
# Any cleanup operations would go here
except Exception as e:
logger.error(f"Error during application shutdown: {str(e)}")
app = FastAPI(
title=settings.PROJECT_NAME,
openapi_url=f"{settings.API_V1_STR}/openapi.json",
description="Generic REST API Service with FastAPI and SQLite",
version="0.1.0",
lifespan=lifespan,
)
# Add middleware for exception handling
@app.middleware("http")
async def add_process_time_header(request, call_next):
try:
response = await call_next(request)
return response
except Exception as e:
logger.error(f"Request error: {str(e)}")
# Return error to the client
from fastapi.responses import JSONResponse
return JSONResponse(
status_code=500,
content={"detail": "Internal server error", "type": "server_error"}
)
# Set all CORS enabled origins