feat: implement LLM service using litellm for actual inference
parent 97b6d4ec21
commit f1bf91258a
@@ -1,72 +1,56 @@
 from fastapi import APIRouter, HTTPException, status
-from typing import Dict, Any, Optional
+from typing import Dict
 from pydantic import BaseModel
-from helpers.generic_helpers import (
-    create_generic_item,
-    log_error,
-    safe_json_serialize
-)
+from helpers.generic_helpers import process_llm_request, validate_data, log_error
 
 router = APIRouter()
 
 class LLMRequest(BaseModel):
+    model: str
     prompt: str
-    model: Optional[str] = "gpt-3.5-turbo"
-    max_tokens: Optional[int] = 1000
-    temperature: Optional[float] = 0.7
-    options: Optional[Dict[str, Any]] = None
+    temperature: float = 0.7
+    max_tokens: int = 1000
 
 class LLMResponse(BaseModel):
     id: str
-    created_at: str
-    updated_at: str
-    prompt: str
+    content: str
     model: str
-    response: str
-    tokens_used: Optional[int] = None
-    metadata: Optional[Dict[str, Any]] = None
+    created: int
+    usage: Dict[str, int]
 
-@router.post("/llm", status_code=status.HTTP_201_CREATED, response_model=LLMResponse)
-async def process_llm_request(request: LLMRequest):
+@router.post("/llm", status_code=status.HTTP_200_OK, response_model=LLMResponse)
+async def generate_llm_response(request: LLMRequest):
     """
-    Process a request to generate text using an LLM model.
+    Generate a response from an LLM model using the litellm library.
 
-    This endpoint accepts a prompt and optional parameters, then returns the generated response.
+    This endpoint accepts a model name, prompt, and optional parameters,
+    then calls the actual LLM service to generate a response.
     """
     try:
         # Validate required fields
-        if not request.prompt:
+        required_fields = ["model", "prompt"]
+        if not validate_data(request.dict(), required_fields):
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
-                detail="Prompt is required"
+                detail="Missing required fields: model and prompt are required"
             )
 
-        # Prepare data for storage
-        llm_data = {
-            "prompt": request.prompt,
-            "model": request.model,
-            "response": f"Generated response for: {request.prompt}",  # Mock response
-            "tokens_used": len(request.prompt.split()) * 2,  # Mock token count
-            "metadata": {
-                "max_tokens": request.max_tokens,
-                "temperature": request.temperature,
-                "options": request.options or {}
-            }
-        }
-
-        # Create item in storage
-        result = create_generic_item(llm_data)
-
-        # Return serialized result
-        return safe_json_serialize(result)
+        # Process the LLM request using the helper function
+        result = process_llm_request(
+            model=request.model,
+            prompt=request.prompt,
+            temperature=request.temperature,
+            max_tokens=request.max_tokens
+        )
+
+        return result
 
     except HTTPException:
-        # Re-raise HTTP exceptions
+        # Re-raise HTTP exceptions as they already have status codes
        raise
     except Exception as e:
-        # Log unexpected errors
-        log_error("Unexpected error processing LLM request", e)
+        log_error("Unexpected error in LLM endpoint", e)
         raise HTTPException(
             status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
-            detail="An error occurred while processing your request"
+            detail=f"Failed to process LLM request: {str(e)}"
         )
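For reference, a client call against the updated endpoint might look like the sketch below. The base URL and the absence of a router prefix are assumptions (this diff does not show how the router is mounted); the request and response fields follow the new LLMRequest and LLMResponse models.

    # Hypothetical client-side check, not part of this commit.
    import requests

    payload = {
        "model": "gpt-3.5-turbo",  # any model name litellm can route
        "prompt": "Summarize FastAPI in one sentence.",
        "temperature": 0.2,
        "max_tokens": 100,
    }
    # Assumed host/port; adjust to wherever the FastAPI app is served.
    resp = requests.post("http://localhost:8000/llm", json=payload, timeout=60)
    resp.raise_for_status()
    data = resp.json()
    print(data["content"], data["usage"]["total_tokens"])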
@@ -6,16 +6,14 @@ import traceback
 import time
 import hashlib
 from fastapi import HTTPException
-
-# Since we don't have specific entity information and no model/schema code,
-# we'll create generic utility helper functions that don't rely on database access
-
-# In-memory data store as fallback
-_generic_store: List[Dict[str, Any]] = []
+import litellm
 
 # Configure logging
 logger = logging.getLogger(__name__)
 
+# In-memory data store as fallback
+_generic_store: List[Dict[str, Any]] = []
+
 def generate_unique_id() -> str:
     """
     Generates a unique identifier.
@@ -288,3 +286,50 @@ def handle_http_error(status_code: int, detail: str) -> None:
         HTTPException: With the specified status code and detail
     """
     raise HTTPException(status_code=status_code, detail=detail)
+
+def process_llm_request(model: str, prompt: str, temperature: float = 0.7, max_tokens: int = 1000) -> Dict[str, Any]:
+    """
+    Processes an LLM request using litellm to handle the actual inference.
+
+    Args:
+        model (str): The LLM model to use for inference
+        prompt (str): The prompt text to send to the LLM
+        temperature (float): Controls randomness in the output (0-1)
+        max_tokens (int): Maximum number of tokens to generate
+
+    Returns:
+        Dict[str, Any]: The LLM response with content and metadata
+    """
+    try:
+        logger.info(f"Sending request to LLM model: {model}")
+
+        # Make the actual LLM call using litellm
+        response = litellm.completion(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=temperature,
+            max_tokens=max_tokens
+        )
+
+        # Process and return the response
+        result = {
+            "id": response.id,
+            "content": response.choices[0].message.content,
+            "model": response.model,
+            "created": response.created,
+            "usage": {
+                "prompt_tokens": response.usage.prompt_tokens,
+                "completion_tokens": response.usage.completion_tokens,
+                "total_tokens": response.usage.total_tokens
+            }
+        }
+
+        logger.info(f"LLM request completed successfully. Used {result['usage']['total_tokens']} tokens.")
+        return result
+
+    except Exception as e:
+        log_error("Error processing LLM request", e)
+        raise HTTPException(
+            status_code=500,
+            detail=f"LLM processing error: {str(e)}"
+        )
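The new helper can also be exercised directly as a quick sanity check; litellm infers the provider from the model name and reads the matching API key from the environment (for example, OPENAI_API_KEY for OpenAI models). The model name below is an illustrative assumption, not part of this commit.

    # Minimal sketch; assumes a provider API key is already exported.
    import os
    from helpers.generic_helpers import process_llm_request

    assert os.environ.get("OPENAI_API_KEY"), "export a provider API key first"

    result = process_llm_request(
        model="gpt-3.5-turbo",
        prompt="Say hello in five words.",
        temperature=0.0,
        max_tokens=20,
    )
    print(result["content"], result["usage"]["total_tokens"])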
@@ -10,3 +10,4 @@ alembic>=1.13.1
 jose
 passlib
 pydantic
+litellm