
from typing import Any, Optional

from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks, status
from sqlalchemy.orm import Session

from app.api.deps import get_db
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper
from app.crud import scrape_job, scrape_result
from app.schemas.scrape_job import (
    ScrapeJob,
    ScrapeJobCreate,
    ScrapeJobUpdate,
    ScrapeJobList,
)
from app.schemas.scrape_result import ScrapeResult

router = APIRouter()


@router.post("/", response_model=ScrapeJob, status_code=status.HTTP_201_CREATED)
def create_scrape_job(
*,
db: Session = Depends(get_db),
job_in: ScrapeJobCreate,
background_tasks: BackgroundTasks,
) -> Any:
"""
Create a new scrape job.
"""
job = scrape_job.create(db=db, obj_in=job_in)
# Run job in background
background_tasks.add_task(run_scrape_job, job_id=job.id)
return job
@router.get("/", response_model=ScrapeJobList)
def list_scrape_jobs(
*,
db: Session = Depends(get_db),
skip: int = 0,
limit: int = 100,
status: Optional[JobStatus] = None,
) -> Any:
"""
List scrape jobs.
"""
if status:
jobs = scrape_job.get_by_status(db=db, status=status, skip=skip, limit=limit)
total = scrape_job.count_by_status(db=db, status=status)
else:
jobs = scrape_job.get_multi(db=db, skip=skip, limit=limit)
total = scrape_job.count(db=db)
return {"jobs": jobs, "total": total}
@router.get("/{job_id}", response_model=ScrapeJob)
def get_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
) -> Any:
"""
Get a scrape job by ID.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
return job
@router.put("/{job_id}", response_model=ScrapeJob)
def update_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
job_in: ScrapeJobUpdate,
) -> Any:
"""
Update a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
job = scrape_job.update(db=db, db_obj=job, obj_in=job_in)
return job
@router.delete("/{job_id}", status_code=status.HTTP_204_NO_CONTENT, response_model=None)
def delete_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
) -> None:
"""
Delete a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
scrape_job.remove(db=db, id=job_id)
@router.post("/{job_id}/run", response_model=ScrapeJob)
def run_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
background_tasks: Optional[BackgroundTasks] = None,
) -> Any:
"""
Run a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
if job.status == JobStatus.IN_PROGRESS:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Scrape job with ID {job_id} is already in progress",
)
# If called with background_tasks, run in background
if background_tasks:
background_tasks.add_task(_run_job, job_id=job_id)
# Update job status to pending
job = scrape_job.update(db=db, db_obj=job, obj_in={"status": JobStatus.PENDING})
return job
# Otherwise, run synchronously
return _run_job(job_id=job_id)
@router.get("/{job_id}/results", response_model=ScrapeResult)
def get_scrape_results(
*,
db: Session = Depends(get_db),
job_id: int,
) -> Any:
"""
Get the latest result for a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
result = scrape_result.get_latest_by_job_id(db=db, job_id=job_id)
if not result:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"No results found for scrape job with ID {job_id}",
)
return result
def _run_job(job_id: int) -> ScrapeJob:
    """
    Internal helper that runs a scrape job outside the request cycle.

    Opens its own database session, since background tasks execute after the
    request-scoped session has been closed.
    """
    db_gen = get_db()
    db = next(db_gen)
    scraper = Scraper(db=db)
    try:
        # Run the job
        return scraper.run_job(job_id=job_id)
    except Exception as e:
        # Make sure the job is marked as failed before propagating the error
        job = scrape_job.get(db=db, id=job_id)
        if job and job.status != JobStatus.FAILED:
            scrape_job.update(
                db=db,
                db_obj=job,
                obj_in={"status": JobStatus.FAILED, "error": str(e)},
            )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error running scrape job: {str(e)}",
        )
    finally:
        # Release the session by closing the dependency generator
        db_gen.close()
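

# Example wiring (illustrative sketch, not part of this module): this router is
# typically included under a prefix in the application factory. The module path
# "app.api.endpoints.scrape_jobs" and the "/scrape-jobs" prefix below are
# assumptions about the surrounding project layout, shown only for context:
#
#   from fastapi import FastAPI
#   from app.api.endpoints import scrape_jobs
#
#   app = FastAPI()
#   app.include_router(scrape_jobs.router, prefix="/scrape-jobs", tags=["scrape-jobs"])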