Fix dependency installation issue by adding Dockerfile and docker-compose.yml

This commit is contained in:
Automated Action 2025-05-29 17:20:46 +00:00
parent b7132c82ed
commit a4511b3137
38 changed files with 1772 additions and 2 deletions

27
Dockerfile Normal file

@ -0,0 +1,27 @@
FROM python:3.11-slim
WORKDIR /app
# Copy requirements file
COPY requirements.txt .
# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application
COPY . .
# Create the database directory
RUN mkdir -p /app/storage/db
# Set environment variables
ENV PYTHONPATH=/app
ENV HOST=0.0.0.0
ENV PORT=8000
ENV DEBUG=True
# Expose the port
EXPOSE 8000
# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

151
README.md

@ -1,3 +1,150 @@
# FastAPI Application
# Web Scraper CLI
This is a FastAPI application bootstrapped by BackendIM, the AI-powered backend generation platform.
A FastAPI-based web scraper with CLI interface.
## Features
- REST API for web scraping management
- CLI tool for scraping websites
- Extract metadata, links, and specific content using CSS selectors
- Store scraping results in SQLite database
- Background job processing
- Rate limiting to avoid overloading target websites
## Installation
### Local Installation
1. Clone the repository:
```bash
git clone https://github.com/yourusername/webscrapercli.git
cd webscrapercli
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Run the database migrations:
```bash
alembic upgrade head
```
### Docker Installation
1. Clone the repository:
```bash
git clone https://github.com/yourusername/webscrapercli.git
cd webscrapercli
```
2. Build and run using Docker Compose:
```bash
docker-compose up --build
```
This will:
- Build the Docker image with all dependencies
- Start the FastAPI server on port 8000
- Mount the app and storage directories as volumes for live code reloading
## Usage
### API Server
Start the API server:
```bash
# Development mode
uvicorn main:app --reload
# Production mode
uvicorn main:app --host 0.0.0.0 --port 8000
```
Access the API documentation at: http://localhost:8000/docs
### CLI Usage
The CLI provides several commands for scraping websites:
```bash
# Scrape a URL
python cli.py scrape https://example.com
# Scrape a URL with a specific selector
python cli.py scrape https://example.com --selector "div.content"
# Save the results to a file
python cli.py scrape https://example.com --output results.json
# List all scrape jobs
python cli.py list
# List scrape jobs with a specific status
python cli.py list --status completed
# Show details of a specific job
python cli.py show 1
# Run a specific job
python cli.py run 1
```
## API Endpoints
- `GET /health`: Health check endpoint
- `POST /api/v1/scrape-jobs/`: Create a new scrape job
- `GET /api/v1/scrape-jobs/`: List scrape jobs
- `GET /api/v1/scrape-jobs/{job_id}`: Get a specific scrape job
- `PUT /api/v1/scrape-jobs/{job_id}`: Update a scrape job
- `DELETE /api/v1/scrape-jobs/{job_id}`: Delete a scrape job
- `POST /api/v1/scrape-jobs/{job_id}/run`: Run a scrape job
- `GET /api/v1/scrape-jobs/{job_id}/results`: Get the results of a scrape job
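For example, a job can be created and polled from Python with the `requests` library (already in `requirements.txt`); the sketch below assumes the server is running locally on port 8000 and uses `https://example.com` as a stand-in target:
```python
import time

import requests

BASE_URL = "http://localhost:8000/api/v1"

# Create a scrape job; it is queued and executed in the background
job = requests.post(f"{BASE_URL}/scrape-jobs/", json={"url": "https://example.com"}).json()
job_id = job["id"]

# Poll until the job leaves the pending/in_progress states
while job["status"] in ("pending", "in_progress"):
    time.sleep(1)
    job = requests.get(f"{BASE_URL}/scrape-jobs/{job_id}").json()

# Fetch the stored result once the job has completed
if job["status"] == "completed":
    result = requests.get(f"{BASE_URL}/scrape-jobs/{job_id}/results").json()
    print(result["extracted_data"]["title"])
```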
## Development
### Project Structure
```
webscrapercli/
├── alembic.ini # Alembic configuration
├── app/ # Application package
│ ├── api/ # API endpoints
│ ├── cli/ # CLI implementation
│ ├── core/ # Core functionality
│ ├── crud/ # CRUD operations
│ ├── db/ # Database configuration
│ ├── models/ # SQLAlchemy models
│ ├── schemas/ # Pydantic schemas
│ ├── services/ # Business logic
│ └── utils/ # Utility functions
├── cli.py # CLI entry point
├── docker-compose.yml # Docker Compose configuration
├── Dockerfile # Docker configuration
├── main.py # API entry point
├── migrations/ # Alembic migrations
│ ├── env.py # Alembic environment
│ ├── script.py.mako # Alembic script template
│ └── versions/ # Migration scripts
├── requirements.txt # Dependencies
└── storage/ # Storage directory for database and other files
└── db/ # Database directory
```
### Running Tests
```bash
# Run tests
pytest
```
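No tests ship with this commit yet. A minimal sketch of one, assuming `pytest` is installed (it is not pinned in `requirements.txt`) and that it runs inside the container (the settings create the SQLite directory under `/app/storage` at import time), could check the health endpoint with FastAPI's `TestClient`:
```python
# tests/test_health.py -- hypothetical example test
from fastapi.testclient import TestClient

from main import app

client = TestClient(app)


def test_health_check():
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
```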
## License
This project is open source.

84
alembic.ini Normal file

@ -0,0 +1,84 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = migrations
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; this defaults
# to migrations/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat migrations/versions
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = sqlite:////app/storage/db/db.sqlite
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks=black
# black.type=console_scripts
# black.entrypoint=black
# black.options=-l 79
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

0
app/__init__.py Normal file

0
app/api/__init__.py Normal file

9
app/api/api.py Normal file

@ -0,0 +1,9 @@
from fastapi import APIRouter
from app.api.endpoints import scrape_jobs
api_router = APIRouter()
api_router.include_router(
scrape_jobs.router, prefix="/scrape-jobs", tags=["scrape-jobs"]
)

15
app/api/deps.py Normal file

@ -0,0 +1,15 @@
from typing import Generator
from app.db.session import SessionLocal
def get_db() -> Generator:
"""
Get a database session.
"""
db = SessionLocal()
try:
yield db
finally:
db.close()

1
app/api/endpoints/__init__.py Normal file

@ -0,0 +1 @@
# This file is intentionally empty to make the directory a Python package

201
app/api/endpoints/scrape_jobs.py Normal file

@ -0,0 +1,201 @@
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks, status
from sqlalchemy.orm import Session
from app.api.deps import get_db
from app.db.session import SessionLocal
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper
from app.crud import scrape_job, scrape_result
from app.schemas.scrape_job import (
ScrapeJob,
ScrapeJobCreate,
ScrapeJobUpdate,
ScrapeJobList,
)
from app.schemas.scrape_result import ScrapeResult
router = APIRouter()
@router.post("/", response_model=ScrapeJob, status_code=status.HTTP_201_CREATED)
def create_scrape_job(
*,
db: Session = Depends(get_db),
job_in: ScrapeJobCreate,
background_tasks: BackgroundTasks,
) -> Any:
"""
Create a new scrape job.
"""
job = scrape_job.create(db=db, obj_in=job_in)
    # Run the job in the background with the module-level helper, which opens
    # its own database session (calling the endpoint function directly would
    # leave its `db` parameter as an unresolved Depends default)
    background_tasks.add_task(_run_job, job_id=job.id)
return job
@router.get("/", response_model=ScrapeJobList)
def list_scrape_jobs(
*,
db: Session = Depends(get_db),
skip: int = 0,
limit: int = 100,
status: Optional[JobStatus] = None,
) -> Any:
"""
List scrape jobs.
"""
if status:
jobs = scrape_job.get_by_status(db=db, status=status, skip=skip, limit=limit)
total = scrape_job.count_by_status(db=db, status=status)
else:
jobs = scrape_job.get_multi(db=db, skip=skip, limit=limit)
total = scrape_job.count(db=db)
return {"jobs": jobs, "total": total}
@router.get("/{job_id}", response_model=ScrapeJob)
def get_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
) -> Any:
"""
Get a scrape job by ID.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
return job
@router.put("/{job_id}", response_model=ScrapeJob)
def update_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
job_in: ScrapeJobUpdate,
) -> Any:
"""
Update a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
job = scrape_job.update(db=db, db_obj=job, obj_in=job_in)
return job
@router.delete("/{job_id}", status_code=status.HTTP_204_NO_CONTENT, response_model=None)
def delete_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
) -> None:
"""
Delete a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
scrape_job.remove(db=db, id=job_id)
@router.post("/{job_id}/run", response_model=ScrapeJob)
def run_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
background_tasks: Optional[BackgroundTasks] = None,
) -> Any:
"""
Run a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
if job.status == JobStatus.IN_PROGRESS:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Scrape job with ID {job_id} is already in progress",
)
# If called with background_tasks, run in background
if background_tasks:
background_tasks.add_task(_run_job, job_id=job_id)
# Update job status to pending
job = scrape_job.update(db=db, db_obj=job, obj_in={"status": JobStatus.PENDING})
return job
# Otherwise, run synchronously
return _run_job(job_id=job_id)
@router.get("/{job_id}/results", response_model=ScrapeResult)
def get_scrape_results(
*,
db: Session = Depends(get_db),
job_id: int,
) -> Any:
"""
Get the latest result for a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
result = scrape_result.get_latest_by_job_id(db=db, job_id=job_id)
if not result:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"No results found for scrape job with ID {job_id}",
)
return result
def _run_job(job_id: int) -> ScrapeJob:
"""
Internal function to run a scrape job.
"""
    # Create a dedicated session for this task; advancing the get_db()
    # generator by hand would never run its cleanup, so use SessionLocal
    # directly and close it in the finally block below
    db = SessionLocal()
    scraper = Scraper(db=db)
try:
# Run the job
job = scraper.run_job(job_id=job_id)
return job
except Exception as e:
# Make sure the job is marked as failed
job = scrape_job.get(db=db, id=job_id)
if job and job.status != JobStatus.FAILED:
scrape_job.update(
db=db,
db_obj=job,
obj_in={"status": JobStatus.FAILED, "error": str(e)},
)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error running scrape job: {str(e)}",
        )
    finally:
        db.close()

0
app/cli/__init__.py Normal file

277
app/cli/cli.py Normal file

@ -0,0 +1,277 @@
import json
from typing import Optional
import typer
from rich.console import Console
from rich.table import Table
from sqlalchemy.orm import Session
from app.db.session import SessionLocal
from app.crud import scrape_job, scrape_result
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper
app = typer.Typer(help="Web Scraper CLI")
console = Console()
def get_db() -> Session:
"""
Get a database session.
"""
return SessionLocal()
@app.command("scrape")
def scrape_url(
url: str = typer.Argument(..., help="URL to scrape"),
selector: Optional[str] = typer.Option(
None, help="CSS selector to extract content"
),
user_agent: Optional[str] = typer.Option(
None, help="User agent to use for request"
),
timeout: Optional[int] = typer.Option(None, help="Timeout for request in seconds"),
output: Optional[str] = typer.Option(
None, help="Output file path for results (JSON)"
),
):
"""
Scrape a URL and extract content.
"""
console.print(f"Scraping [bold]{url}[/bold]...")
db = get_db()
try:
# Create a new scrape job
job_data = {
"url": url,
"selector": selector,
"user_agent": user_agent,
"timeout": timeout,
}
job_in = {k: v for k, v in job_data.items() if v is not None}
# Create and run the job
job = scrape_job.create(db=db, obj_in=job_in)
console.print(f"Created scrape job with ID [bold]{job.id}[/bold]")
# Run the job
scraper = Scraper(db=db, user_agent=user_agent, timeout=timeout)
job = scraper.run_job(job_id=job.id)
if job.status == JobStatus.COMPLETED:
console.print("[bold green]Scraping completed successfully![/bold green]")
# Get the result
result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)
# Print basic info
console.print("\n[bold]Basic Information:[/bold]")
table = Table(show_header=True, header_style="bold")
table.add_column("Attribute")
table.add_column("Value")
if result and result.extracted_data:
data = result.extracted_data
# Add rows to table
if "title" in data:
table.add_row("Title", data["title"] or "")
if "meta_description" in data:
table.add_row("Description", data["meta_description"] or "")
if "h1" in data:
table.add_row(
"H1 Tags", ", ".join(data["h1"]) if data["h1"] else ""
)
if "links" in data:
link_count = len(data["links"]) if data["links"] else 0
table.add_row("Links", str(link_count))
if selector and "selected_content" in data:
content_count = (
len(data["selected_content"]) if data["selected_content"] else 0
)
table.add_row(f"Selected Content ({selector})", str(content_count))
console.print(table)
# Write results to file if specified
if output:
with open(output, "w") as f:
json.dump(data, f, indent=2)
console.print(f"\nResults saved to [bold]{output}[/bold]")
# Ask if user wants to see more details
if typer.confirm("\nDo you want to see the full extracted data?"):
console.print_json(json.dumps(data))
else:
console.print("[yellow]No data extracted.[/yellow]")
else:
console.print(f"[bold red]Scraping failed:[/bold red] {job.error}")
except Exception as e:
console.print(f"[bold red]Error:[/bold red] {str(e)}")
finally:
db.close()
@app.command("list")
def list_jobs(
status: Optional[str] = typer.Option(
None, help="Filter by status (pending, in_progress, completed, failed)"
),
limit: int = typer.Option(10, help="Limit number of jobs"),
):
"""
List scrape jobs.
"""
db = get_db()
try:
# Get jobs based on status
if status:
try:
job_status = JobStatus(status)
jobs = scrape_job.get_by_status(db=db, status=job_status, limit=limit)
total = scrape_job.count_by_status(db=db, status=job_status)
console.print(
f"Found [bold]{total}[/bold] jobs with status [bold]{status}[/bold]"
)
except ValueError:
console.print(f"[bold red]Invalid status:[/bold red] {status}")
return
else:
jobs = scrape_job.get_multi(db=db, limit=limit)
total = scrape_job.count(db=db)
console.print(f"Found [bold]{total}[/bold] jobs")
if not jobs:
console.print("[yellow]No jobs found.[/yellow]")
return
# Create table
table = Table(show_header=True, header_style="bold")
table.add_column("ID")
table.add_column("URL")
table.add_column("Status")
table.add_column("Created")
table.add_column("Updated")
# Add rows
for job in jobs:
table.add_row(
str(job.id),
job.url,
job.status.value,
job.created_at.strftime("%Y-%m-%d %H:%M:%S"),
job.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
)
console.print(table)
except Exception as e:
console.print(f"[bold red]Error:[/bold red] {str(e)}")
finally:
db.close()
@app.command("show")
def show_job(
job_id: int = typer.Argument(..., help="ID of the job to show"),
):
"""
Show details of a scrape job.
"""
db = get_db()
try:
# Get job
job = scrape_job.get(db=db, id=job_id)
if not job:
console.print(f"[bold red]Job not found:[/bold red] {job_id}")
return
# Print job details
console.print(f"\n[bold]Job {job_id}[/bold]")
console.print(f"URL: [bold]{job.url}[/bold]")
console.print(f"Status: [bold]{job.status.value}[/bold]")
console.print(f"Created: [bold]{job.created_at}[/bold]")
console.print(f"Updated: [bold]{job.updated_at}[/bold]")
if job.started_at:
console.print(f"Started: [bold]{job.started_at}[/bold]")
if job.completed_at:
console.print(f"Completed: [bold]{job.completed_at}[/bold]")
if job.selector:
console.print(f"Selector: [bold]{job.selector}[/bold]")
if job.error:
console.print(f"Error: [bold red]{job.error}[/bold red]")
# Get results if job is completed
if job.status == JobStatus.COMPLETED:
result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)
if result and result.extracted_data:
console.print("\n[bold]Extracted Data:[/bold]")
# Ask if user wants to see the data
if typer.confirm("Do you want to see the extracted data?"):
console.print_json(json.dumps(result.extracted_data))
else:
console.print("[yellow]No data extracted.[/yellow]")
except Exception as e:
console.print(f"[bold red]Error:[/bold red] {str(e)}")
finally:
db.close()
@app.command("run")
def run_job(
job_id: int = typer.Argument(..., help="ID of the job to run"),
):
"""
Run a scrape job.
"""
db = get_db()
try:
# Get job
job = scrape_job.get(db=db, id=job_id)
if not job:
console.print(f"[bold red]Job not found:[/bold red] {job_id}")
return
console.print(f"Running job [bold]{job_id}[/bold]...")
# Run the job
scraper = Scraper(db=db)
job = scraper.run_job(job_id=job.id)
if job.status == JobStatus.COMPLETED:
console.print("[bold green]Job completed successfully![/bold green]")
else:
console.print(f"[bold red]Job failed:[/bold red] {job.error}")
except Exception as e:
console.print(f"[bold red]Error:[/bold red] {str(e)}")
finally:
db.close()
if __name__ == "__main__":
app()

0
app/core/__init__.py Normal file

50
app/core/config.py Normal file

@ -0,0 +1,50 @@
from pathlib import Path
from typing import Any, Dict, Optional
from pydantic import BaseSettings, validator
class Settings(BaseSettings):
# Base settings
PROJECT_NAME: str = "Web Scraper CLI"
PROJECT_DESCRIPTION: str = "A FastAPI-based web scraper with CLI interface"
VERSION: str = "0.1.0"
API_V1_STR: str = "/api/v1"
# Server settings
HOST: str = "0.0.0.0"
PORT: int = 8000
DEBUG: bool = True
# Database settings
DB_DIR: Path = Path("/app") / "storage" / "db"
SQLALCHEMY_DATABASE_URL: str = f"sqlite:///{DB_DIR}/db.sqlite"
@validator("SQLALCHEMY_DATABASE_URL", pre=True)
def validate_db_url(cls, v: Optional[str], values: Dict[str, Any]) -> str:
"""
Ensure the database directory exists.
"""
if isinstance(v, str) and v.startswith("sqlite"):
db_dir = values.get("DB_DIR")
if db_dir:
db_dir.mkdir(parents=True, exist_ok=True)
return v
return v
# Scraper settings
DEFAULT_USER_AGENT: str = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
DEFAULT_TIMEOUT: int = 30 # seconds
    # Rate limit settings
DEFAULT_RATE_LIMIT: float = 1.0 # requests per second
class Config:
env_file = ".env"
case_sensitive = True
settings = Settings()

4
app/crud/__init__.py Normal file

@ -0,0 +1,4 @@
from app.crud.scrape_job import scrape_job
from app.crud.scrape_result import scrape_result
__all__ = ["scrape_job", "scrape_result"]

89
app/crud/base.py Normal file

@ -0,0 +1,89 @@
from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union
from fastapi.encoders import jsonable_encoder
from pydantic import BaseModel
from sqlalchemy.orm import Session
from app.db.session import Base
ModelType = TypeVar("ModelType", bound=Base)
CreateSchemaType = TypeVar("CreateSchemaType", bound=BaseModel)
UpdateSchemaType = TypeVar("UpdateSchemaType", bound=BaseModel)
class CRUDBase(Generic[ModelType, CreateSchemaType, UpdateSchemaType]):
"""
CRUD operations base class.
"""
def __init__(self, model: Type[ModelType]):
"""
CRUD object with default methods to Create, Read, Update, Delete (CRUD).
**Parameters**
* `model`: A SQLAlchemy model class
* `schema`: A Pydantic model (schema) class
"""
self.model = model
def get(self, db: Session, id: Any) -> Optional[ModelType]:
"""
Get a record by ID.
"""
return db.query(self.model).filter(self.model.id == id).first()
def get_multi(
self, db: Session, *, skip: int = 0, limit: int = 100
) -> List[ModelType]:
"""
Get multiple records.
"""
return db.query(self.model).offset(skip).limit(limit).all()
def count(self, db: Session) -> int:
"""
Count total records.
"""
return db.query(self.model).count()
def create(self, db: Session, *, obj_in: CreateSchemaType) -> ModelType:
"""
Create a new record.
"""
obj_in_data = jsonable_encoder(obj_in)
db_obj = self.model(**obj_in_data)
db.add(db_obj)
db.commit()
db.refresh(db_obj)
return db_obj
def update(
self,
db: Session,
*,
db_obj: ModelType,
obj_in: Union[UpdateSchemaType, Dict[str, Any]],
) -> ModelType:
"""
Update a record.
"""
obj_data = jsonable_encoder(db_obj)
if isinstance(obj_in, dict):
update_data = obj_in
else:
update_data = obj_in.dict(exclude_unset=True)
for field in obj_data:
if field in update_data:
setattr(db_obj, field, update_data[field])
db.add(db_obj)
db.commit()
db.refresh(db_obj)
return db_obj
def remove(self, db: Session, *, id: int) -> ModelType:
"""
Remove a record.
"""
        obj = db.get(self.model, id)
db.delete(obj)
db.commit()
return obj

47
app/crud/scrape_job.py Normal file

@ -0,0 +1,47 @@
from typing import List
from sqlalchemy.orm import Session
from app.models.scrape_job import ScrapeJob, JobStatus
from app.schemas.scrape_job import ScrapeJobCreate, ScrapeJobUpdate
from app.crud.base import CRUDBase
class CRUDScrapeJob(CRUDBase[ScrapeJob, ScrapeJobCreate, ScrapeJobUpdate]):
"""
CRUD operations for ScrapeJob model.
"""
def get_by_status(
self, db: Session, *, status: JobStatus, skip: int = 0, limit: int = 100
) -> List[ScrapeJob]:
"""
Get jobs by status.
"""
return (
db.query(self.model)
.filter(self.model.status == status)
.offset(skip)
.limit(limit)
.all()
)
def count_by_status(self, db: Session, *, status: JobStatus) -> int:
"""
Count jobs by status.
"""
return db.query(self.model).filter(self.model.status == status).count()
def get_pending_jobs(self, db: Session, *, limit: int = 10) -> List[ScrapeJob]:
"""
Get pending jobs.
"""
return (
db.query(self.model)
.filter(self.model.status == JobStatus.PENDING)
.limit(limit)
.all()
)
scrape_job = CRUDScrapeJob(ScrapeJob)

35
app/crud/scrape_result.py Normal file

@ -0,0 +1,35 @@
from typing import List, Optional
from sqlalchemy.orm import Session
from app.models.scrape_result import ScrapeResult
from app.schemas.scrape_result import ScrapeResultCreate, ScrapeResultUpdate
from app.crud.base import CRUDBase
class CRUDScrapeResult(CRUDBase[ScrapeResult, ScrapeResultCreate, ScrapeResultUpdate]):
"""
CRUD operations for ScrapeResult model.
"""
def get_by_job_id(self, db: Session, *, job_id: int) -> List[ScrapeResult]:
"""
Get results by job ID.
"""
return db.query(self.model).filter(self.model.job_id == job_id).all()
def get_latest_by_job_id(
self, db: Session, *, job_id: int
) -> Optional[ScrapeResult]:
"""
Get the latest result by job ID.
"""
return (
db.query(self.model)
.filter(self.model.job_id == job_id)
.order_by(self.model.created_at.desc())
.first()
)
scrape_result = CRUDScrapeResult(ScrapeResult)

0
app/db/__init__.py Normal file

32
app/db/session.py Normal file

@ -0,0 +1,32 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base, sessionmaker
from app.core.config import settings
# Create database directory if it doesn't exist
settings.DB_DIR.mkdir(parents=True, exist_ok=True)
# Create SQLAlchemy engine
engine = create_engine(
settings.SQLALCHEMY_DATABASE_URL,
connect_args={"check_same_thread": False}, # Only for SQLite
)
# Create sessionmaker
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Create base class for SQLAlchemy models
Base = declarative_base()
# Database session dependency
def get_db():
"""
Dependency for getting a database session.
"""
db = SessionLocal()
try:
yield db
finally:
db.close()

4
app/models/__init__.py Normal file

@ -0,0 +1,4 @@
from app.models.scrape_job import ScrapeJob, JobStatus
from app.models.scrape_result import ScrapeResult
__all__ = ["ScrapeJob", "JobStatus", "ScrapeResult"]

41
app/models/scrape_job.py Normal file

@ -0,0 +1,41 @@
import enum
from sqlalchemy import Column, String, Integer, DateTime, Enum, Text, JSON
from sqlalchemy.sql import func
from app.db.session import Base
class JobStatus(str, enum.Enum):
PENDING = "pending"
IN_PROGRESS = "in_progress"
COMPLETED = "completed"
FAILED = "failed"
class ScrapeJob(Base):
"""
Model for a web scraping job.
"""
__tablename__ = "scrape_jobs"
id = Column(Integer, primary_key=True, index=True)
url = Column(String(2048), nullable=False, index=True)
status = Column(Enum(JobStatus), default=JobStatus.PENDING, nullable=False)
created_at = Column(DateTime, default=func.now(), nullable=False)
updated_at = Column(
DateTime, default=func.now(), onupdate=func.now(), nullable=False
)
started_at = Column(DateTime, nullable=True)
completed_at = Column(DateTime, nullable=True)
selector = Column(String(255), nullable=True)
error = Column(Text, nullable=True)
result = Column(JSON, nullable=True)
user_agent = Column(String(255), nullable=True)
timeout = Column(Integer, nullable=True)
def __repr__(self):
return (
f"<ScrapeJob(id={self.id}, url='{self.url}', status='{self.status.value}')>"
)

29
app/models/scrape_result.py Normal file

@ -0,0 +1,29 @@
from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, JSON
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.db.session import Base
class ScrapeResult(Base):
"""
Model for storing scraping results.
"""
__tablename__ = "scrape_results"
id = Column(Integer, primary_key=True, index=True)
job_id = Column(
Integer, ForeignKey("scrape_jobs.id", ondelete="CASCADE"), nullable=False
)
created_at = Column(DateTime, default=func.now(), nullable=False)
content_type = Column(String(100), nullable=True)
headers = Column(JSON, nullable=True)
html_content = Column(Text, nullable=True)
extracted_data = Column(JSON, nullable=True)
# Relationship
job = relationship("ScrapeJob", backref="results")
def __repr__(self):
return f"<ScrapeResult(id={self.id}, job_id={self.job_id})>"

27
app/schemas/__init__.py Normal file

@ -0,0 +1,27 @@
from app.schemas.scrape_job import (
ScrapeJobBase,
ScrapeJobCreate,
ScrapeJobUpdate,
ScrapeJob,
ScrapeJobList,
)
from app.schemas.scrape_result import (
ScrapeResultBase,
ScrapeResultCreate,
ScrapeResultUpdate,
ScrapeResult,
ScrapeResultList,
)
__all__ = [
"ScrapeJobBase",
"ScrapeJobCreate",
"ScrapeJobUpdate",
"ScrapeJob",
"ScrapeJobList",
"ScrapeResultBase",
"ScrapeResultCreate",
"ScrapeResultUpdate",
"ScrapeResult",
"ScrapeResultList",
]

74
app/schemas/scrape_job.py Normal file

@ -0,0 +1,74 @@
from datetime import datetime
from typing import Optional, Dict, Any, List
from pydantic import BaseModel, HttpUrl
from app.models.scrape_job import JobStatus
class ScrapeJobBase(BaseModel):
"""
Base schema for scrape job data.
"""
url: HttpUrl
selector: Optional[str] = None
user_agent: Optional[str] = None
timeout: Optional[int] = None
class ScrapeJobCreate(ScrapeJobBase):
"""
Schema for creating a new scrape job.
"""
pass
class ScrapeJobUpdate(BaseModel):
"""
Schema for updating a scrape job.
"""
url: Optional[HttpUrl] = None
status: Optional[JobStatus] = None
selector: Optional[str] = None
error: Optional[str] = None
result: Optional[Dict[str, Any]] = None
user_agent: Optional[str] = None
timeout: Optional[int] = None
class ScrapeJobInDBBase(ScrapeJobBase):
"""
Base schema for scrape job in database.
"""
id: int
status: JobStatus
created_at: datetime
updated_at: datetime
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
error: Optional[str] = None
result: Optional[Dict[str, Any]] = None
class Config:
orm_mode = True
class ScrapeJob(ScrapeJobInDBBase):
"""
Schema for returned scrape job.
"""
pass
class ScrapeJobList(BaseModel):
"""
Schema for a list of scrape jobs.
"""
jobs: List[ScrapeJob]
total: int

64
app/schemas/scrape_result.py Normal file

@ -0,0 +1,64 @@
from datetime import datetime
from typing import Optional, Dict, Any, List
from pydantic import BaseModel
class ScrapeResultBase(BaseModel):
"""
Base schema for scrape result data.
"""
job_id: int
content_type: Optional[str] = None
headers: Optional[Dict[str, Any]] = None
extracted_data: Optional[Dict[str, Any]] = None
class ScrapeResultCreate(ScrapeResultBase):
"""
Schema for creating a new scrape result.
"""
html_content: Optional[str] = None
class ScrapeResultUpdate(BaseModel):
"""
Schema for updating a scrape result.
"""
content_type: Optional[str] = None
headers: Optional[Dict[str, Any]] = None
html_content: Optional[str] = None
extracted_data: Optional[Dict[str, Any]] = None
class ScrapeResultInDBBase(ScrapeResultBase):
"""
Base schema for scrape result in database.
"""
id: int
created_at: datetime
html_content: Optional[str] = None
class Config:
orm_mode = True
class ScrapeResult(ScrapeResultInDBBase):
"""
Schema for returned scrape result.
"""
pass
class ScrapeResultList(BaseModel):
"""
Schema for a list of scrape results.
"""
results: List[ScrapeResult]
total: int

0
app/services/__init__.py Normal file

150
app/services/scraper.py Normal file

@ -0,0 +1,150 @@
import time
from datetime import datetime
from typing import Dict, Any, Optional
import requests
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from app.core.config import settings
from app.models.scrape_job import ScrapeJob, JobStatus
from app.models.scrape_result import ScrapeResult
class Scraper:
"""
Service for web scraping.
"""
def __init__(
self,
db: Session,
user_agent: Optional[str] = None,
timeout: Optional[int] = None,
rate_limit: Optional[float] = None,
):
self.db = db
self.user_agent = user_agent or settings.DEFAULT_USER_AGENT
self.timeout = timeout or settings.DEFAULT_TIMEOUT
self.rate_limit = rate_limit or settings.DEFAULT_RATE_LIMIT
self._last_request_time = 0
def _respect_rate_limit(self) -> None:
"""
Respect rate limit by sleeping if necessary.
"""
current_time = time.time()
time_since_last_request = current_time - self._last_request_time
if time_since_last_request < (1.0 / self.rate_limit):
sleep_time = (1.0 / self.rate_limit) - time_since_last_request
time.sleep(sleep_time)
self._last_request_time = time.time()
def fetch_url(self, url: str) -> requests.Response:
"""
Fetch URL respecting rate limits.
"""
self._respect_rate_limit()
headers = {
"User-Agent": self.user_agent,
}
response = requests.get(
url,
headers=headers,
timeout=self.timeout,
)
response.raise_for_status()
return response
def parse_html(self, html: str, selector: Optional[str] = None) -> Dict[str, Any]:
"""
Parse HTML content.
"""
soup = BeautifulSoup(html, "lxml")
result = {
"title": soup.title.text if soup.title else None,
"meta_description": None,
"h1": [h1.text.strip() for h1 in soup.find_all("h1")],
"links": [
{"href": a.get("href"), "text": a.text.strip()}
for a in soup.find_all("a")
if a.get("href")
],
}
# Extract meta description
meta_desc = soup.find("meta", attrs={"name": "description"})
if meta_desc:
result["meta_description"] = meta_desc.get("content")
# If a selector is provided, extract content matching the selector
if selector:
selected_elements = soup.select(selector)
result["selected_content"] = [
element.text.strip() for element in selected_elements
]
result["selected_html"] = [str(element) for element in selected_elements]
return result
def run_job(self, job_id: int) -> ScrapeJob:
"""
Run a scraping job.
"""
# Get job from DB
job = self.db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise ValueError(f"Job with ID {job_id} not found")
# Update job status
job.status = JobStatus.IN_PROGRESS
job.started_at = datetime.now()
self.db.commit()
self.db.refresh(job)
try:
# Fetch URL
response = self.fetch_url(job.url)
# Create ScrapeResult
result = ScrapeResult(
job_id=job.id,
content_type=response.headers.get("Content-Type"),
headers=dict(response.headers),
html_content=response.text,
)
self.db.add(result)
self.db.commit()
self.db.refresh(result)
# Parse HTML
extracted_data = self.parse_html(response.text, job.selector)
# Update ScrapeResult with extracted data
result.extracted_data = extracted_data
self.db.commit()
self.db.refresh(result)
# Update job status
job.status = JobStatus.COMPLETED
job.completed_at = datetime.now()
job.result = {"result_id": result.id}
self.db.commit()
self.db.refresh(job)
return job
except Exception as e:
# Update job with error
job.status = JobStatus.FAILED
job.completed_at = datetime.now()
job.error = str(e)
self.db.commit()
self.db.refresh(job)
raise e

0
app/utils/__init__.py Normal file

83
app/utils/html.py Normal file

@ -0,0 +1,83 @@
from typing import List, Dict, Any
from bs4 import BeautifulSoup
def extract_metadata(html: str) -> Dict[str, Any]:
"""
Extract metadata from HTML.
"""
soup = BeautifulSoup(html, "lxml")
result = {
"title": None,
"description": None,
"keywords": None,
"og_title": None,
"og_description": None,
"og_image": None,
}
# Title
if soup.title:
result["title"] = soup.title.text.strip()
# Meta tags
for meta in soup.find_all("meta"):
name = meta.get("name", "").lower()
property = meta.get("property", "").lower()
content = meta.get("content", "")
if name == "description":
result["description"] = content
elif name == "keywords":
result["keywords"] = content
elif property == "og:title":
result["og_title"] = content
elif property == "og:description":
result["og_description"] = content
elif property == "og:image":
result["og_image"] = content
return result
def extract_links(html: str) -> List[Dict[str, str]]:
"""
Extract links from HTML.
"""
soup = BeautifulSoup(html, "lxml")
links = []
for a in soup.find_all("a"):
href = a.get("href")
if href:
links.append(
{
"href": href,
"text": a.text.strip(),
"title": a.get("title", ""),
"rel": a.get("rel", ""),
}
)
return links
def extract_images(html: str) -> List[Dict[str, str]]:
"""
Extract images from HTML.
"""
soup = BeautifulSoup(html, "lxml")
images = []
for img in soup.find_all("img"):
src = img.get("src")
if src:
images.append(
{
"src": src,
"alt": img.get("alt", ""),
"title": img.get("title", ""),
}
)
return images
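
A hypothetical usage sketch for these helpers (not part of the commit): they accept raw HTML and return plain dictionaries and lists, independent of the scraper service.
```python
# Hypothetical usage of app/utils/html.py
from app.utils.html import extract_links, extract_metadata

html = "<html><head><title>Example</title></head><body><a href='/docs'>Docs</a></body></html>"

print(extract_metadata(html)["title"])  # -> "Example"
print(extract_links(html))              # -> [{"href": "/docs", "text": "Docs", ...}]
```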

25
app/utils/url.py Normal file

@ -0,0 +1,25 @@
from urllib.parse import urlparse, parse_qs
from typing import Dict, Any
def parse_url(url: str) -> Dict[str, Any]:
"""
Parse a URL into its components.
"""
parsed = urlparse(url)
return {
"scheme": parsed.scheme,
"netloc": parsed.netloc,
"path": parsed.path,
"params": parsed.params,
"query": parse_qs(parsed.query),
"fragment": parsed.fragment,
}
def is_valid_url(url: str) -> bool:
"""
Check if a URL is valid.
"""
parsed = urlparse(url)
return bool(parsed.scheme and parsed.netloc)

9
cli.py Normal file

@ -0,0 +1,9 @@
#!/usr/bin/env python
"""
Web Scraper CLI
"""
from app.cli.cli import app
if __name__ == "__main__":
app()

15
docker-compose.yml Normal file

@ -0,0 +1,15 @@
version: '3.8'
services:
app:
build:
context: .
dockerfile: Dockerfile
ports:
- "8000:8000"
volumes:
- ./app:/app/app
- ./storage:/app/storage
environment:
- DEBUG=True
    # Apply database migrations before starting the server
    command: sh -c "alembic upgrade head && uvicorn main:app --host 0.0.0.0 --port 8000 --reload"

29
main.py Normal file

@ -0,0 +1,29 @@
import uvicorn
from fastapi import FastAPI
from app.api.api import api_router
from app.core.config import settings
app = FastAPI(
title=settings.PROJECT_NAME,
description=settings.PROJECT_DESCRIPTION,
version=settings.VERSION,
)
# Mount the API under the versioned prefix documented in the README (/api/v1)
app.include_router(api_router, prefix=settings.API_V1_STR)
@app.get("/health", tags=["Health"])
async def health_check():
"""
Health check endpoint.
"""
return {"status": "ok"}
if __name__ == "__main__":
uvicorn.run(
"main:app",
host=settings.HOST,
port=settings.PORT,
reload=settings.DEBUG,
)

88
migrations/env.py Normal file

@ -0,0 +1,88 @@
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context
# Import models to register them with the metadata
from app.db.session import Base
# These imports are needed to register models with SQLAlchemy metadata
# even though they appear unused to static analyzers
import app.models.scrape_job # noqa: F401
import app.models.scrape_result # noqa: F401
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
target_metadata = Base.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline():
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online():
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = engine_from_config(
config.get_section(config.config_ini_section),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
# Check if we're using SQLite
is_sqlite = connection.dialect.name == "sqlite"
context.configure(
connection=connection,
target_metadata=target_metadata,
render_as_batch=is_sqlite, # Critical for SQLite to handle alter table operations
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()

24
migrations/script.py.mako Normal file

@ -0,0 +1,24 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}


@ -0,0 +1,75 @@
"""Initial migration
Revision ID: 0001
Revises:
Create Date: 2023-06-25
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite
# revision identifiers, used by Alembic.
revision = "0001"
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
# Create scrape_jobs table
op.create_table(
"scrape_jobs",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("url", sa.String(length=2048), nullable=False),
sa.Column(
"status",
sa.Enum("pending", "in_progress", "completed", "failed", name="jobstatus"),
nullable=False,
default="pending",
),
sa.Column(
"created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
),
sa.Column(
"updated_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
),
sa.Column("started_at", sa.DateTime(), nullable=True),
sa.Column("completed_at", sa.DateTime(), nullable=True),
sa.Column("selector", sa.String(length=255), nullable=True),
sa.Column("error", sa.Text(), nullable=True),
sa.Column("result", sqlite.JSON(), nullable=True),
sa.Column("user_agent", sa.String(length=255), nullable=True),
sa.Column("timeout", sa.Integer(), nullable=True),
sa.PrimaryKeyConstraint("id"),
)
op.create_index(op.f("ix_scrape_jobs_id"), "scrape_jobs", ["id"], unique=False)
op.create_index(op.f("ix_scrape_jobs_url"), "scrape_jobs", ["url"], unique=False)
# Create scrape_results table
op.create_table(
"scrape_results",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("job_id", sa.Integer(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
),
sa.Column("content_type", sa.String(length=100), nullable=True),
sa.Column("headers", sqlite.JSON(), nullable=True),
sa.Column("html_content", sa.Text(), nullable=True),
sa.Column("extracted_data", sqlite.JSON(), nullable=True),
sa.ForeignKeyConstraint(["job_id"], ["scrape_jobs.id"], ondelete="CASCADE"),
sa.PrimaryKeyConstraint("id"),
)
op.create_index(
op.f("ix_scrape_results_id"), "scrape_results", ["id"], unique=False
)
def downgrade():
op.drop_index(op.f("ix_scrape_results_id"), table_name="scrape_results")
op.drop_table("scrape_results")
op.drop_index(op.f("ix_scrape_jobs_url"), table_name="scrape_jobs")
op.drop_index(op.f("ix_scrape_jobs_id"), table_name="scrape_jobs")
op.drop_table("scrape_jobs")

1
migrations/versions/__init__.py Normal file

@ -0,0 +1 @@
# This file is intentionally empty to make the directory a Python package

14
requirements.txt Normal file

@ -0,0 +1,14 @@
fastapi==0.110.0
uvicorn==0.27.1
sqlalchemy==2.0.27
alembic==1.13.1
pydantic==1.10.14  # pinned to v1: config and schemas use BaseSettings, @validator, and orm_mode
python-dotenv==1.0.1
beautifulsoup4==4.12.2
requests==2.31.0
typer==0.9.0
rich==13.7.0
httpx==0.26.0
lxml==4.9.3
aiohttp==3.9.3
ruff==0.2.2