Enhance FastAPI app for Kubernetes resilience and pod host assignment issues
This commit is contained in:
parent
4077f5d027
commit
4d00fd5473
@ -1,15 +1,70 @@
|
|||||||
from fastapi import APIRouter, Depends
|
from typing import Any, Dict
|
||||||
|
|
||||||
|
from fastapi import APIRouter, Depends, Query
|
||||||
|
from sqlalchemy.exc import SQLAlchemyError
|
||||||
from sqlalchemy.orm import Session
|
from sqlalchemy.orm import Session
|
||||||
|
|
||||||
|
from app.core.orchestration import get_orchestration_status
|
||||||
from app.db.session import get_db
|
from app.db.session import get_db
|
||||||
|
|
||||||
router = APIRouter()
|
router = APIRouter()
|
||||||
|
|
||||||
|
|
||||||
@router.get("")
|
@router.get("")
|
||||||
def health_check(db: Session = Depends(get_db)):
|
def health_check(
|
||||||
|
db: Session = Depends(get_db),
|
||||||
|
detailed: bool = Query(False, description="Include detailed diagnostics information")
|
||||||
|
) -> Dict[str, Any]:
|
||||||
"""
|
"""
|
||||||
Health check endpoint for the API
|
Health check endpoint for the API
|
||||||
|
|
||||||
|
This endpoint checks the health of various components of the application:
|
||||||
|
- Database connectivity
|
||||||
|
- Host and pod status
|
||||||
|
- Application readiness
|
||||||
|
|
||||||
|
Args:
|
||||||
|
detailed: Whether to include detailed diagnostics information
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, Any]: Health status of the application
|
||||||
"""
|
"""
|
||||||
# Simple health check that ensures the database connection is working
|
# Basic health response
|
||||||
return {"status": "healthy", "database": "connected"}
|
response = {
|
||||||
|
"status": "healthy"
|
||||||
|
}
|
||||||
|
|
||||||
|
# Check database connectivity
|
||||||
|
try:
|
||||||
|
# Simple query to verify database connectivity
|
||||||
|
db.execute("SELECT 1")
|
||||||
|
database_status = "connected"
|
||||||
|
database_error = None
|
||||||
|
except SQLAlchemyError as e:
|
||||||
|
database_status = "error"
|
||||||
|
database_error = str(e)
|
||||||
|
except Exception as e:
|
||||||
|
database_status = "error"
|
||||||
|
database_error = str(e)
|
||||||
|
|
||||||
|
response["database"] = {
|
||||||
|
"status": database_status
|
||||||
|
}
|
||||||
|
|
||||||
|
# Add orchestration status if detailed information is requested
|
||||||
|
if detailed:
|
||||||
|
response["orchestration"] = get_orchestration_status()
|
||||||
|
|
||||||
|
# Add more detailed database information if there was an error
|
||||||
|
if database_error:
|
||||||
|
response["database"]["error"] = database_error
|
||||||
|
elif database_status != "connected":
|
||||||
|
# Always include error information in the basic response
|
||||||
|
response["status"] = "unhealthy"
|
||||||
|
response["database"]["error"] = database_error
|
||||||
|
|
||||||
|
# Set overall status based on component statuses
|
||||||
|
if database_status != "connected":
|
||||||
|
response["status"] = "unhealthy"
|
||||||
|
|
||||||
|
return response
|
118
app/core/orchestration.py
Normal file
118
app/core/orchestration.py
Normal file
@ -0,0 +1,118 @@
|
|||||||
|
"""
|
||||||
|
Utilities to handle container orchestration environments and improve
|
||||||
|
resilience in Kubernetes-like environments.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import time
|
||||||
|
from typing import Dict, Optional, Tuple
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Maximum attempts to resolve host
|
||||||
|
MAX_HOST_RESOLVE_ATTEMPTS = 5
|
||||||
|
# Delay between attempts in seconds
|
||||||
|
HOST_RESOLVE_DELAY = 2
|
||||||
|
|
||||||
|
|
||||||
|
def get_host_info() -> Dict[str, str]:
|
||||||
|
"""
|
||||||
|
Get information about the host where the application is running.
|
||||||
|
|
||||||
|
This helps diagnose container and pod issues in orchestration environments.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict[str, str]: Dictionary with host information
|
||||||
|
"""
|
||||||
|
info = {
|
||||||
|
"hostname": "unknown",
|
||||||
|
"ip_address": "unknown",
|
||||||
|
"pod_name": os.environ.get("HOSTNAME", "unknown"),
|
||||||
|
"namespace": os.environ.get("POD_NAMESPACE", "unknown"),
|
||||||
|
}
|
||||||
|
|
||||||
|
try:
|
||||||
|
info["hostname"] = socket.gethostname()
|
||||||
|
info["ip_address"] = socket.gethostbyname(info["hostname"])
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning(f"Could not resolve host information: {str(e)}")
|
||||||
|
|
||||||
|
return info
|
||||||
|
|
||||||
|
|
||||||
|
def check_host_connectivity(timeout: int = 5) -> Tuple[bool, Optional[str]]:
|
||||||
|
"""
|
||||||
|
Check if the host has network connectivity.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
timeout: Timeout in seconds
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[bool, Optional[str]]: (Success, Error message if any)
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
# Try to resolve a common external domain
|
||||||
|
socket.getaddrinfo("google.com", 80, proto=socket.IPPROTO_TCP)
|
||||||
|
return True, None
|
||||||
|
except socket.gaierror as e:
|
||||||
|
return False, f"DNS resolution error: {str(e)}"
|
||||||
|
except socket.timeout:
|
||||||
|
return False, "Connection timed out"
|
||||||
|
except Exception as e:
|
||||||
|
return False, f"Unknown connection error: {str(e)}"
|
||||||
|
|
||||||
|
|
||||||
|
def wait_for_host_assignment(max_attempts: int = MAX_HOST_RESOLVE_ATTEMPTS,
|
||||||
|
delay: int = HOST_RESOLVE_DELAY) -> Tuple[bool, Optional[str]]:
|
||||||
|
"""
|
||||||
|
Wait for the host to be assigned in container orchestration environments.
|
||||||
|
|
||||||
|
This helps to handle situations where the pod is scheduled but the host
|
||||||
|
assignment is delayed.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
max_attempts: Maximum number of attempts to resolve the host
|
||||||
|
delay: Delay between attempts in seconds
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Tuple[bool, Optional[str]]: (Success, Error message if any)
|
||||||
|
"""
|
||||||
|
attempt = 0
|
||||||
|
|
||||||
|
while attempt < max_attempts:
|
||||||
|
try:
|
||||||
|
hostname = socket.gethostname()
|
||||||
|
ip_address = socket.gethostbyname(hostname)
|
||||||
|
logger.info(f"Host assigned: {hostname} ({ip_address})")
|
||||||
|
return True, None
|
||||||
|
except socket.gaierror as e:
|
||||||
|
logger.warning(f"Host not resolved yet (attempt {attempt+1}/{max_attempts}): {str(e)}")
|
||||||
|
attempt += 1
|
||||||
|
time.sleep(delay)
|
||||||
|
|
||||||
|
return False, "Maximum attempts reached, host still not assigned"
|
||||||
|
|
||||||
|
|
||||||
|
def get_orchestration_status() -> Dict[str, any]:
|
||||||
|
"""
|
||||||
|
Get comprehensive status information about the container orchestration environment.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict: Orchestration status information
|
||||||
|
"""
|
||||||
|
host_info = get_host_info()
|
||||||
|
connectivity_status, connectivity_error = check_host_connectivity()
|
||||||
|
startup_error = os.environ.get("APP_STARTUP_ERROR", None)
|
||||||
|
|
||||||
|
return {
|
||||||
|
"host": host_info,
|
||||||
|
"ready": os.environ.get("APP_READY", "false") == "true",
|
||||||
|
"connectivity": {
|
||||||
|
"status": "connected" if connectivity_status else "disconnected",
|
||||||
|
"error": connectivity_error
|
||||||
|
},
|
||||||
|
"startup_error": startup_error,
|
||||||
|
"environment": "kubernetes" if "KUBERNETES_SERVICE_HOST" in os.environ else "unknown"
|
||||||
|
}
|
68
main.py
68
main.py
@ -1,3 +1,9 @@
|
|||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import socket
|
||||||
|
import sys
|
||||||
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
import uvicorn
|
import uvicorn
|
||||||
from fastapi import FastAPI
|
from fastapi import FastAPI
|
||||||
from starlette.middleware.cors import CORSMiddleware
|
from starlette.middleware.cors import CORSMiddleware
|
||||||
@ -5,13 +11,75 @@ from starlette.middleware.cors import CORSMiddleware
|
|||||||
from app.api.api import api_router
|
from app.api.api import api_router
|
||||||
from app.core.config import settings
|
from app.core.config import settings
|
||||||
|
|
||||||
|
# Configure logging
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||||
|
handlers=[logging.StreamHandler(sys.stdout)]
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
||||||
|
@asynccontextmanager
|
||||||
|
async def lifespan(app: FastAPI):
|
||||||
|
"""
|
||||||
|
Lifespan events for the FastAPI application
|
||||||
|
Handles startup and shutdown gracefully
|
||||||
|
"""
|
||||||
|
# Startup: Check for host availability and other system requirements
|
||||||
|
try:
|
||||||
|
hostname = socket.gethostname()
|
||||||
|
ip_address = socket.gethostbyname(hostname)
|
||||||
|
logger.info(f"Starting application on host: {hostname}, IP: {ip_address}")
|
||||||
|
|
||||||
|
# Check required directories exist
|
||||||
|
from pathlib import Path
|
||||||
|
db_dir = Path("/app") / "storage" / "db"
|
||||||
|
db_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
logger.info(f"Database directory confirmed at: {db_dir}")
|
||||||
|
|
||||||
|
# Set environment variable to indicate we're ready
|
||||||
|
os.environ["APP_READY"] = "true"
|
||||||
|
logger.info("Application startup completed successfully")
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Application startup error: {str(e)}")
|
||||||
|
# Critical errors during startup should be propagated, but still allow the app to run
|
||||||
|
os.environ["APP_READY"] = "false"
|
||||||
|
os.environ["APP_STARTUP_ERROR"] = str(e)
|
||||||
|
|
||||||
|
yield # This is where the app runs
|
||||||
|
|
||||||
|
# Shutdown: Perform cleanup operations
|
||||||
|
try:
|
||||||
|
logger.info("Application shutting down gracefully")
|
||||||
|
# Any cleanup operations would go here
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Error during application shutdown: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title=settings.PROJECT_NAME,
|
title=settings.PROJECT_NAME,
|
||||||
openapi_url=f"{settings.API_V1_STR}/openapi.json",
|
openapi_url=f"{settings.API_V1_STR}/openapi.json",
|
||||||
description="Generic REST API Service with FastAPI and SQLite",
|
description="Generic REST API Service with FastAPI and SQLite",
|
||||||
version="0.1.0",
|
version="0.1.0",
|
||||||
|
lifespan=lifespan,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Add middleware for exception handling
|
||||||
|
@app.middleware("http")
|
||||||
|
async def add_process_time_header(request, call_next):
|
||||||
|
try:
|
||||||
|
response = await call_next(request)
|
||||||
|
return response
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f"Request error: {str(e)}")
|
||||||
|
# Return error to the client
|
||||||
|
from fastapi.responses import JSONResponse
|
||||||
|
return JSONResponse(
|
||||||
|
status_code=500,
|
||||||
|
content={"detail": "Internal server error", "type": "server_error"}
|
||||||
|
)
|
||||||
|
|
||||||
# Set all CORS enabled origins
|
# Set all CORS enabled origins
|
||||||
if settings.BACKEND_CORS_ORIGINS:
|
if settings.BACKEND_CORS_ORIGINS:
|
||||||
app.add_middleware(
|
app.add_middleware(
|
||||||
|
Loading…
x
Reference in New Issue
Block a user