from typing import Any, Optional

from fastapi import APIRouter, BackgroundTasks, Depends, HTTPException, status
from sqlalchemy.orm import Session

from app.api.deps import get_db
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper
from app.crud import scrape_job, scrape_result
from app.schemas.scrape_job import (
    ScrapeJob,
    ScrapeJobCreate,
    ScrapeJobUpdate,
    ScrapeJobList,
)
from app.schemas.scrape_result import ScrapeResult

router = APIRouter()


@router.post("/", response_model=ScrapeJob, status_code=status.HTTP_201_CREATED)
def create_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_in: ScrapeJobCreate,
    background_tasks: BackgroundTasks,
) -> Any:
    """
    Create a new scrape job and start it in the background.
    """
    job = scrape_job.create(db=db, obj_in=job_in)
    # Schedule the run via _run_job, which opens its own session: BackgroundTasks
    # does not resolve Depends() parameters, so the endpoint function itself
    # cannot be used as the task target.
    background_tasks.add_task(_run_job, job_id=job.id)
    return job


@router.get("/", response_model=ScrapeJobList)
def list_scrape_jobs(
    *,
    db: Session = Depends(get_db),
    skip: int = 0,
    limit: int = 100,
    status: Optional[JobStatus] = None,
) -> Any:
    """
    List scrape jobs, optionally filtered by status.
    """
    # The `status` query parameter shadows fastapi.status, but only inside this
    # function, where the status-code constants are not needed.
    if status is not None:
        jobs = scrape_job.get_by_status(db=db, status=status, skip=skip, limit=limit)
        total = scrape_job.count_by_status(db=db, status=status)
    else:
        jobs = scrape_job.get_multi(db=db, skip=skip, limit=limit)
        total = scrape_job.count(db=db)
    return {"jobs": jobs, "total": total}


@router.get("/{job_id}", response_model=ScrapeJob)
def get_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> Any:
    """
    Get a scrape job by ID.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    return job


@router.put("/{job_id}", response_model=ScrapeJob)
def update_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
    job_in: ScrapeJobUpdate,
) -> Any:
    """
    Update a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    job = scrape_job.update(db=db, db_obj=job, obj_in=job_in)
    return job


@router.delete("/{job_id}", status_code=status.HTTP_204_NO_CONTENT, response_model=None)
def delete_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> None:
    """
    Delete a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    scrape_job.remove(db=db, id=job_id)


@router.post("/{job_id}/run", response_model=ScrapeJob)
def run_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
    background_tasks: BackgroundTasks,
) -> Any:
    """
    Run a scrape job in the background.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    if job.status == JobStatus.IN_PROGRESS:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Scrape job with ID {job_id} is already in progress",
        )
    # FastAPI always injects BackgroundTasks, so the job runs after the
    # response is sent; mark it pending in the meantime.
    background_tasks.add_task(_run_job, job_id=job_id)
    job = scrape_job.update(db=db, db_obj=job, obj_in={"status": JobStatus.PENDING})
    return job


@router.get("/{job_id}/results", response_model=ScrapeResult)
def get_scrape_results(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> Any:
    """
    Get the latest result for a scrape job.
""" job = scrape_job.get(db=db, id=job_id) if not job: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"Scrape job with ID {job_id} not found", ) result = scrape_result.get_latest_by_job_id(db=db, job_id=job_id) if not result: raise HTTPException( status_code=status.HTTP_404_NOT_FOUND, detail=f"No results found for scrape job with ID {job_id}", ) return result def _run_job(job_id: int) -> ScrapeJob: """ Internal function to run a scrape job. """ # Create a new session and scraper db = next(get_db()) scraper = Scraper(db=db) try: # Run the job job = scraper.run_job(job_id=job_id) return job except Exception as e: # Make sure the job is marked as failed job = scrape_job.get(db=db, id=job_id) if job and job.status != JobStatus.FAILED: scrape_job.update( db=db, db_obj=job, obj_in={"status": JobStatus.FAILED, "error": str(e)}, ) raise HTTPException( status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, detail=f"Error running scrape job: {str(e)}", )