Enhance FastAPI app for Kubernetes resilience and pod host assignment issues

This commit is contained in:
Automated Action 2025-05-16 22:46:12 +00:00
parent 4077f5d027
commit 4d00fd5473
3 changed files with 245 additions and 4 deletions

View File

@ -1,15 +1,70 @@
from fastapi import APIRouter, Depends from typing import Any, Dict
from fastapi import APIRouter, Depends, Query
from sqlalchemy.exc import SQLAlchemyError
from sqlalchemy.orm import Session from sqlalchemy.orm import Session
from app.core.orchestration import get_orchestration_status
from app.db.session import get_db from app.db.session import get_db
router = APIRouter() router = APIRouter()
@router.get("") @router.get("")
def health_check(db: Session = Depends(get_db)): def health_check(
db: Session = Depends(get_db),
detailed: bool = Query(False, description="Include detailed diagnostics information")
) -> Dict[str, Any]:
""" """
Health check endpoint for the API Health check endpoint for the API
This endpoint checks the health of various components of the application:
- Database connectivity
- Host and pod status
- Application readiness
Args:
detailed: Whether to include detailed diagnostics information
Returns:
Dict[str, Any]: Health status of the application
""" """
# Simple health check that ensures the database connection is working # Basic health response
return {"status": "healthy", "database": "connected"} response = {
"status": "healthy"
}
# Check database connectivity
try:
# Simple query to verify database connectivity
db.execute("SELECT 1")
database_status = "connected"
database_error = None
except SQLAlchemyError as e:
database_status = "error"
database_error = str(e)
except Exception as e:
database_status = "error"
database_error = str(e)
response["database"] = {
"status": database_status
}
# Add orchestration status if detailed information is requested
if detailed:
response["orchestration"] = get_orchestration_status()
# Add more detailed database information if there was an error
if database_error:
response["database"]["error"] = database_error
elif database_status != "connected":
# Always include error information in the basic response
response["status"] = "unhealthy"
response["database"]["error"] = database_error
# Set overall status based on component statuses
if database_status != "connected":
response["status"] = "unhealthy"
return response

118
app/core/orchestration.py Normal file
View File

@ -0,0 +1,118 @@
"""
Utilities to handle container orchestration environments and improve
resilience in Kubernetes-like environments.
"""
import logging
import os
import socket
import time
from typing import Dict, Optional, Tuple
logger = logging.getLogger(__name__)
# Maximum attempts to resolve host
MAX_HOST_RESOLVE_ATTEMPTS = 5
# Delay between attempts in seconds
HOST_RESOLVE_DELAY = 2
def get_host_info() -> Dict[str, str]:
"""
Get information about the host where the application is running.
This helps diagnose container and pod issues in orchestration environments.
Returns:
Dict[str, str]: Dictionary with host information
"""
info = {
"hostname": "unknown",
"ip_address": "unknown",
"pod_name": os.environ.get("HOSTNAME", "unknown"),
"namespace": os.environ.get("POD_NAMESPACE", "unknown"),
}
try:
info["hostname"] = socket.gethostname()
info["ip_address"] = socket.gethostbyname(info["hostname"])
except Exception as e:
logger.warning(f"Could not resolve host information: {str(e)}")
return info
def check_host_connectivity(timeout: int = 5) -> Tuple[bool, Optional[str]]:
"""
Check if the host has network connectivity.
Args:
timeout: Timeout in seconds
Returns:
Tuple[bool, Optional[str]]: (Success, Error message if any)
"""
try:
# Try to resolve a common external domain
socket.getaddrinfo("google.com", 80, proto=socket.IPPROTO_TCP)
return True, None
except socket.gaierror as e:
return False, f"DNS resolution error: {str(e)}"
except socket.timeout:
return False, "Connection timed out"
except Exception as e:
return False, f"Unknown connection error: {str(e)}"
def wait_for_host_assignment(max_attempts: int = MAX_HOST_RESOLVE_ATTEMPTS,
delay: int = HOST_RESOLVE_DELAY) -> Tuple[bool, Optional[str]]:
"""
Wait for the host to be assigned in container orchestration environments.
This helps to handle situations where the pod is scheduled but the host
assignment is delayed.
Args:
max_attempts: Maximum number of attempts to resolve the host
delay: Delay between attempts in seconds
Returns:
Tuple[bool, Optional[str]]: (Success, Error message if any)
"""
attempt = 0
while attempt < max_attempts:
try:
hostname = socket.gethostname()
ip_address = socket.gethostbyname(hostname)
logger.info(f"Host assigned: {hostname} ({ip_address})")
return True, None
except socket.gaierror as e:
logger.warning(f"Host not resolved yet (attempt {attempt+1}/{max_attempts}): {str(e)}")
attempt += 1
time.sleep(delay)
return False, "Maximum attempts reached, host still not assigned"
def get_orchestration_status() -> Dict[str, any]:
"""
Get comprehensive status information about the container orchestration environment.
Returns:
Dict: Orchestration status information
"""
host_info = get_host_info()
connectivity_status, connectivity_error = check_host_connectivity()
startup_error = os.environ.get("APP_STARTUP_ERROR", None)
return {
"host": host_info,
"ready": os.environ.get("APP_READY", "false") == "true",
"connectivity": {
"status": "connected" if connectivity_status else "disconnected",
"error": connectivity_error
},
"startup_error": startup_error,
"environment": "kubernetes" if "KUBERNETES_SERVICE_HOST" in os.environ else "unknown"
}

68
main.py
View File

@ -1,3 +1,9 @@
import logging
import os
import socket
import sys
from contextlib import asynccontextmanager
import uvicorn import uvicorn
from fastapi import FastAPI from fastapi import FastAPI
from starlette.middleware.cors import CORSMiddleware from starlette.middleware.cors import CORSMiddleware
@ -5,13 +11,75 @@ from starlette.middleware.cors import CORSMiddleware
from app.api.api import api_router from app.api.api import api_router
from app.core.config import settings from app.core.config import settings
# Configure logging
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
handlers=[logging.StreamHandler(sys.stdout)]
)
logger = logging.getLogger(__name__)
@asynccontextmanager
async def lifespan(app: FastAPI):
"""
Lifespan events for the FastAPI application
Handles startup and shutdown gracefully
"""
# Startup: Check for host availability and other system requirements
try:
hostname = socket.gethostname()
ip_address = socket.gethostbyname(hostname)
logger.info(f"Starting application on host: {hostname}, IP: {ip_address}")
# Check required directories exist
from pathlib import Path
db_dir = Path("/app") / "storage" / "db"
db_dir.mkdir(parents=True, exist_ok=True)
logger.info(f"Database directory confirmed at: {db_dir}")
# Set environment variable to indicate we're ready
os.environ["APP_READY"] = "true"
logger.info("Application startup completed successfully")
except Exception as e:
logger.error(f"Application startup error: {str(e)}")
# Critical errors during startup should be propagated, but still allow the app to run
os.environ["APP_READY"] = "false"
os.environ["APP_STARTUP_ERROR"] = str(e)
yield # This is where the app runs
# Shutdown: Perform cleanup operations
try:
logger.info("Application shutting down gracefully")
# Any cleanup operations would go here
except Exception as e:
logger.error(f"Error during application shutdown: {str(e)}")
app = FastAPI( app = FastAPI(
title=settings.PROJECT_NAME, title=settings.PROJECT_NAME,
openapi_url=f"{settings.API_V1_STR}/openapi.json", openapi_url=f"{settings.API_V1_STR}/openapi.json",
description="Generic REST API Service with FastAPI and SQLite", description="Generic REST API Service with FastAPI and SQLite",
version="0.1.0", version="0.1.0",
lifespan=lifespan,
) )
# Add middleware for exception handling
@app.middleware("http")
async def add_process_time_header(request, call_next):
try:
response = await call_next(request)
return response
except Exception as e:
logger.error(f"Request error: {str(e)}")
# Return error to the client
from fastapi.responses import JSONResponse
return JSONResponse(
status_code=500,
content={"detail": "Internal server error", "type": "server_error"}
)
# Set all CORS enabled origins # Set all CORS enabled origins
if settings.BACKEND_CORS_ORIGINS: if settings.BACKEND_CORS_ORIGINS:
app.add_middleware( app.add_middleware(