feat: implement LLM service using litellm for actual inference

Backend IM Bot 2025-04-29 17:12:55 +00:00
parent 97b6d4ec21
commit f1bf91258a
3 changed files with 80 additions and 50 deletions

View File

@@ -1,72 +1,56 @@
 from fastapi import APIRouter, HTTPException, status
-from typing import Dict, Any, Optional
+from typing import Dict
 from pydantic import BaseModel
-from helpers.generic_helpers import (
-    create_generic_item,
-    log_error,
-    safe_json_serialize
-)
+from helpers.generic_helpers import process_llm_request, validate_data, log_error
 
 router = APIRouter()
 
 class LLMRequest(BaseModel):
+    model: str
     prompt: str
-    model: Optional[str] = "gpt-3.5-turbo"
-    max_tokens: Optional[int] = 1000
-    temperature: Optional[float] = 0.7
-    options: Optional[Dict[str, Any]] = None
+    temperature: float = 0.7
+    max_tokens: int = 1000
 
 class LLMResponse(BaseModel):
     id: str
-    created_at: str
-    updated_at: str
-    prompt: str
+    content: str
     model: str
-    response: str
-    tokens_used: Optional[int] = None
-    metadata: Optional[Dict[str, Any]] = None
+    created: int
+    usage: Dict[str, int]
 
-@router.post("/llm", status_code=status.HTTP_201_CREATED, response_model=LLMResponse)
-async def process_llm_request(request: LLMRequest):
+@router.post("/llm", status_code=status.HTTP_200_OK, response_model=LLMResponse)
+async def generate_llm_response(request: LLMRequest):
     """
-    Process a request to generate text using an LLM model.
-    This endpoint accepts a prompt and optional parameters, then returns the generated response.
+    Generate a response from an LLM model using the litellm library.
+    This endpoint accepts a model name, prompt, and optional parameters,
+    then calls the actual LLM service to generate a response.
     """
     try:
         # Validate required fields
-        if not request.prompt:
+        required_fields = ["model", "prompt"]
+        if not validate_data(request.dict(), required_fields):
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
-                detail="Prompt is required"
+                detail="Missing required fields: model and prompt are required"
             )
 
-        # Prepare data for storage
-        llm_data = {
-            "prompt": request.prompt,
-            "model": request.model,
-            "response": f"Generated response for: {request.prompt}",  # Mock response
-            "tokens_used": len(request.prompt.split()) * 2,  # Mock token count
-            "metadata": {
-                "max_tokens": request.max_tokens,
-                "temperature": request.temperature,
-                "options": request.options or {}
-            }
-        }
+        # Process the LLM request using the helper function
+        result = process_llm_request(
+            model=request.model,
+            prompt=request.prompt,
+            temperature=request.temperature,
+            max_tokens=request.max_tokens
+        )
 
-        # Create item in storage
-        result = create_generic_item(llm_data)
-        # Return serialized result
-        return safe_json_serialize(result)
+        return result
     except HTTPException:
-        # Re-raise HTTP exceptions
+        # Re-raise HTTP exceptions as they already have status codes
         raise
     except Exception as e:
         # Log unexpected errors
-        log_error("Unexpected error processing LLM request", e)
+        log_error("Unexpected error in LLM endpoint", e)
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail="An error occurred while processing your request"
+            detail=f"Failed to process LLM request: {str(e)}"
         )
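
For reference, calling the reworked endpoint now looks roughly like the sketch below. The base URL, port, and router prefix are assumptions (the commit does not show how this router is mounted); the payload fields mirror LLMRequest and the printed keys mirror the new LLMResponse schema.

import requests  # assumes the requests package is installed

payload = {
    "model": "gpt-3.5-turbo",   # any model identifier litellm can route
    "prompt": "Summarize this commit in one sentence.",
    "temperature": 0.7,
    "max_tokens": 200,
}

# Assumption: the app mounts this router at the root and runs on localhost:8000.
resp = requests.post("http://localhost:8000/llm", json=payload, timeout=60)
resp.raise_for_status()

data = resp.json()  # shaped like LLMResponse: id, content, model, created, usage
print(data["content"])
print(data["usage"]["total_tokens"])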

View File

@@ -6,16 +6,14 @@ import traceback
 import time
 import hashlib
 from fastapi import HTTPException
-# Since we don't have specific entity information and no model/schema code,
-# we'll create generic utility helper functions that don't rely on database access
-# In-memory data store as fallback
-_generic_store: List[Dict[str, Any]] = []
+import litellm
 
 # Configure logging
 logger = logging.getLogger(__name__)
 
+# In-memory data store as fallback
+_generic_store: List[Dict[str, Any]] = []
 
 def generate_unique_id() -> str:
     """
     Generates a unique identifier.
@@ -287,4 +285,51 @@ def handle_http_error(status_code: int, detail: str) -> None:
     Raises:
         HTTPException: With the specified status code and detail
     """
-    raise HTTPException(status_code=status_code, detail=detail)
+    raise HTTPException(status_code=status_code, detail=detail)
+
+
+def process_llm_request(model: str, prompt: str, temperature: float = 0.7, max_tokens: int = 1000) -> Dict[str, Any]:
+    """
+    Processes an LLM request using litellm to handle the actual inference.
+
+    Args:
+        model (str): The LLM model to use for inference
+        prompt (str): The prompt text to send to the LLM
+        temperature (float): Controls randomness in the output (0-1)
+        max_tokens (int): Maximum number of tokens to generate
+
+    Returns:
+        Dict[str, Any]: The LLM response with content and metadata
+    """
+    try:
+        logger.info(f"Sending request to LLM model: {model}")
+
+        # Make the actual LLM call using litellm
+        response = litellm.completion(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=temperature,
+            max_tokens=max_tokens
+        )
+
+        # Process and return the response
+        result = {
+            "id": response.id,
+            "content": response.choices[0].message.content,
+            "model": response.model,
+            "created": response.created,
+            "usage": {
+                "prompt_tokens": response.usage.prompt_tokens,
+                "completion_tokens": response.usage.completion_tokens,
+                "total_tokens": response.usage.total_tokens
+            }
+        }
+
+        logger.info(f"LLM request completed successfully. Used {result['usage']['total_tokens']} tokens.")
+        return result
+    except Exception as e:
+        log_error("Error processing LLM request", e)
+        raise HTTPException(
+            status_code=500,
+            detail=f"LLM processing error: {str(e)}"
+        )
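
For context, process_llm_request delegates directly to litellm.completion, so it can also be exercised outside the FastAPI route. A rough sketch, assuming the provider credentials litellm expects (for example OPENAI_API_KEY for OpenAI-hosted models) are already set in the environment, and with placeholder model and prompt values:

from helpers.generic_helpers import process_llm_request

# Placeholder arguments; swap in whatever model litellm is configured to reach.
result = process_llm_request(
    model="gpt-3.5-turbo",
    prompt="Write a haiku about unit tests.",
    temperature=0.2,
    max_tokens=64,
)

print(result["content"])                # generated text
print(result["usage"]["total_tokens"])  # token counts reported by the provider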

View File

@@ -10,3 +10,4 @@ alembic>=1.13.1
 jose
 passlib
 pydantic
+litellm