Fix dependency installation issue by adding Dockerfile and docker-compose.yml

This commit is contained in:
Automated Action 2025-05-29 17:20:46 +00:00
parent b7132c82ed
commit a4511b3137
38 changed files with 1772 additions and 2 deletions

27
Dockerfile Normal file

@ -0,0 +1,27 @@
FROM python:3.11-slim
WORKDIR /app
# Copy requirements file
COPY requirements.txt .
# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt
# Copy the rest of the application
COPY . .
# Create the database directory
RUN mkdir -p /app/storage/db
# Set environment variables
ENV PYTHONPATH=/app
ENV HOST=0.0.0.0
ENV PORT=8000
ENV DEBUG=True
# Expose the port
EXPOSE 8000
# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]

151
README.md

@ -1,3 +1,150 @@
# FastAPI Application
# Web Scraper CLI
This is a FastAPI application bootstrapped by BackendIM, the AI-powered backend generation platform.
A FastAPI-based web scraper with CLI interface.
## Features
- REST API for web scraping management
- CLI tool for scraping websites
- Extract metadata, links, and specific content using CSS selectors
- Store scraping results in SQLite database
- Background job processing
- Rate limiting to avoid overloading target websites
## Installation
### Local Installation
1. Clone the repository:
```bash
git clone https://github.com/yourusername/webscrapercli.git
cd webscrapercli
```
2. Install dependencies:
```bash
pip install -r requirements.txt
```
3. Run the database migrations:
```bash
alembic upgrade head
```
### Docker Installation
1. Clone the repository:
```bash
git clone https://github.com/yourusername/webscrapercli.git
cd webscrapercli
```
2. Build and run using Docker Compose:
```bash
docker-compose up --build
```
This will:
- Build the Docker image with all dependencies
- Start the FastAPI server on port 8000
- Mount the app and storage directories as volumes for live code reloading
## Usage
### API Server
Start the API server:
```bash
# Development mode
uvicorn main:app --reload
# Production mode
uvicorn main:app --host 0.0.0.0 --port 8000
```
Access the API documentation at: http://localhost:8000/docs
### CLI Usage
The CLI provides several commands for scraping websites:
```bash
# Scrape a URL
python cli.py scrape https://example.com
# Scrape a URL with a specific selector
python cli.py scrape https://example.com --selector "div.content"
# Save the results to a file
python cli.py scrape https://example.com --output results.json
# List all scrape jobs
python cli.py list
# List scrape jobs with a specific status
python cli.py list --status completed
# Show details of a specific job
python cli.py show 1
# Run a specific job
python cli.py run 1
```
## API Endpoints
- `GET /health`: Health check endpoint
- `POST /api/v1/scrape-jobs/`: Create a new scrape job
- `GET /api/v1/scrape-jobs/`: List scrape jobs
- `GET /api/v1/scrape-jobs/{job_id}`: Get a specific scrape job
- `PUT /api/v1/scrape-jobs/{job_id}`: Update a scrape job
- `DELETE /api/v1/scrape-jobs/{job_id}`: Delete a scrape job
- `POST /api/v1/scrape-jobs/{job_id}/run`: Run a scrape job
- `GET /api/v1/scrape-jobs/{job_id}/results`: Get the results of a scrape job
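For example, a job can be created and polled from Python with the `requests` library (already in `requirements.txt`); the sketch below assumes the server is running locally on port 8000 and uses `https://example.com` as a stand-in target:
```python
import time

import requests

BASE_URL = "http://localhost:8000/api/v1"

# Create a scrape job; it is queued and executed in the background
job = requests.post(f"{BASE_URL}/scrape-jobs/", json={"url": "https://example.com"}).json()
job_id = job["id"]

# Poll until the job leaves the pending/in_progress states
while job["status"] in ("pending", "in_progress"):
    time.sleep(1)
    job = requests.get(f"{BASE_URL}/scrape-jobs/{job_id}").json()

# Fetch the stored result once the job has completed
if job["status"] == "completed":
    result = requests.get(f"{BASE_URL}/scrape-jobs/{job_id}/results").json()
    print(result["extracted_data"]["title"])
```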
## Development
### Project Structure
```
webscrapercli/
├── alembic.ini # Alembic configuration
├── app/ # Application package
│ ├── api/ # API endpoints
│ ├── cli/ # CLI implementation
│ ├── core/ # Core functionality
│ ├── crud/ # CRUD operations
│ ├── db/ # Database configuration
│ ├── models/ # SQLAlchemy models
│ ├── schemas/ # Pydantic schemas
│ ├── services/ # Business logic
│ └── utils/ # Utility functions
├── cli.py # CLI entry point
├── docker-compose.yml # Docker Compose configuration
├── Dockerfile # Docker configuration
├── main.py # API entry point
├── migrations/ # Alembic migrations
│ ├── env.py # Alembic environment
│ ├── script.py.mako # Alembic script template
│ └── versions/ # Migration scripts
├── requirements.txt # Dependencies
└── storage/ # Storage directory for database and other files
└── db/ # Database directory
```
### Running Tests
```bash
# Run tests
pytest
```
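No tests ship with this commit yet. A minimal sketch of one, assuming `pytest` is installed (it is not pinned in `requirements.txt`) and that it runs inside the container (the settings create the SQLite directory under `/app/storage` at import time), could check the health endpoint with FastAPI's `TestClient`:
```python
# tests/test_health.py -- hypothetical example test
from fastapi.testclient import TestClient

from main import app

client = TestClient(app)


def test_health_check():
    response = client.get("/health")
    assert response.status_code == 200
    assert response.json() == {"status": "ok"}
```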
## License
This project is open source.

84
alembic.ini Normal file

@ -0,0 +1,84 @@
# A generic, single database configuration.
[alembic]
# path to migration scripts
script_location = migrations
# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s
# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =
# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40
# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false
# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false
# version location specification; this defaults
# to migrations/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat migrations/versions
# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8
sqlalchemy.url = sqlite:////app/storage/db/db.sqlite
[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples
# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks=black
# black.type=console_scripts
# black.entrypoint=black
# black.options=-l 79
# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic
[handlers]
keys = console
[formatters]
keys = generic
[logger_root]
level = WARN
handlers = console
qualname =
[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine
[logger_alembic]
level = INFO
handlers =
qualname = alembic
[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic
[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S

0
app/__init__.py Normal file

0
app/api/__init__.py Normal file

9
app/api/api.py Normal file

@ -0,0 +1,9 @@
from fastapi import APIRouter
from app.api.endpoints import scrape_jobs
api_router = APIRouter()
api_router.include_router(
scrape_jobs.router, prefix="/scrape-jobs", tags=["scrape-jobs"]
)

15
app/api/deps.py Normal file

@ -0,0 +1,15 @@
from typing import Generator
from app.db.session import SessionLocal
def get_db() -> Generator:
"""
Get a database session.
"""
db = SessionLocal()
try:
yield db
finally:
db.close()

1
app/api/endpoints/__init__.py Normal file

@ -0,0 +1 @@
# This file is intentionally empty to make the directory a Python package

201
app/api/endpoints/scrape_jobs.py Normal file

@ -0,0 +1,201 @@
from typing import Any, Optional
from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks, status
from sqlalchemy.orm import Session
from app.api.deps import get_db
from app.db.session import SessionLocal
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper
from app.crud import scrape_job, scrape_result
from app.schemas.scrape_job import (
ScrapeJob,
ScrapeJobCreate,
ScrapeJobUpdate,
ScrapeJobList,
)
from app.schemas.scrape_result import ScrapeResult
router = APIRouter()
@router.post("/", response_model=ScrapeJob, status_code=status.HTTP_201_CREATED)
def create_scrape_job(
*,
db: Session = Depends(get_db),
job_in: ScrapeJobCreate,
background_tasks: BackgroundTasks,
) -> Any:
"""
Create a new scrape job.
"""
job = scrape_job.create(db=db, obj_in=job_in)
    # Run the job in the background with the module-level helper, which opens
    # its own database session (calling the endpoint function directly would
    # leave its `db` parameter as an unresolved Depends default)
    background_tasks.add_task(_run_job, job_id=job.id)
return job
@router.get("/", response_model=ScrapeJobList)
def list_scrape_jobs(
*,
db: Session = Depends(get_db),
skip: int = 0,
limit: int = 100,
status: Optional[JobStatus] = None,
) -> Any:
"""
List scrape jobs.
"""
if status:
jobs = scrape_job.get_by_status(db=db, status=status, skip=skip, limit=limit)
total = scrape_job.count_by_status(db=db, status=status)
else:
jobs = scrape_job.get_multi(db=db, skip=skip, limit=limit)
total = scrape_job.count(db=db)
return {"jobs": jobs, "total": total}
@router.get("/{job_id}", response_model=ScrapeJob)
def get_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
) -> Any:
"""
Get a scrape job by ID.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
return job
@router.put("/{job_id}", response_model=ScrapeJob)
def update_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
job_in: ScrapeJobUpdate,
) -> Any:
"""
Update a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
job = scrape_job.update(db=db, db_obj=job, obj_in=job_in)
return job
@router.delete("/{job_id}", status_code=status.HTTP_204_NO_CONTENT, response_model=None)
def delete_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
) -> None:
"""
Delete a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
scrape_job.remove(db=db, id=job_id)
@router.post("/{job_id}/run", response_model=ScrapeJob)
def run_scrape_job(
*,
db: Session = Depends(get_db),
job_id: int,
background_tasks: Optional[BackgroundTasks] = None,
) -> Any:
"""
Run a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
if job.status == JobStatus.IN_PROGRESS:
raise HTTPException(
status_code=status.HTTP_400_BAD_REQUEST,
detail=f"Scrape job with ID {job_id} is already in progress",
)
# If called with background_tasks, run in background
if background_tasks:
background_tasks.add_task(_run_job, job_id=job_id)
# Update job status to pending
job = scrape_job.update(db=db, db_obj=job, obj_in={"status": JobStatus.PENDING})
return job
# Otherwise, run synchronously
return _run_job(job_id=job_id)
@router.get("/{job_id}/results", response_model=ScrapeResult)
def get_scrape_results(
*,
db: Session = Depends(get_db),
job_id: int,
) -> Any:
"""
Get the latest result for a scrape job.
"""
job = scrape_job.get(db=db, id=job_id)
if not job:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"Scrape job with ID {job_id} not found",
)
result = scrape_result.get_latest_by_job_id(db=db, job_id=job_id)
if not result:
raise HTTPException(
status_code=status.HTTP_404_NOT_FOUND,
detail=f"No results found for scrape job with ID {job_id}",
)
return result
def _run_job(job_id: int) -> ScrapeJob:
"""
Internal function to run a scrape job.
"""
    # Create a dedicated session for this task; advancing the get_db()
    # generator by hand would never run its cleanup, so use SessionLocal
    # directly and close it in the finally block below
    db = SessionLocal()
    scraper = Scraper(db=db)
try:
# Run the job
job = scraper.run_job(job_id=job_id)
return job
except Exception as e:
# Make sure the job is marked as failed
job = scrape_job.get(db=db, id=job_id)
if job and job.status != JobStatus.FAILED:
scrape_job.update(
db=db,
db_obj=job,
obj_in={"status": JobStatus.FAILED, "error": str(e)},
)
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error running scrape job: {str(e)}",
        )
    finally:
        db.close()

0
app/cli/__init__.py Normal file

277
app/cli/cli.py Normal file

@ -0,0 +1,277 @@
import json
from typing import Optional
import typer
from rich.console import Console
from rich.table import Table
from sqlalchemy.orm import Session
from app.db.session import SessionLocal
from app.crud import scrape_job, scrape_result
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper
app = typer.Typer(help="Web Scraper CLI")
console = Console()
def get_db() -> Session:
"""
Get a database session.
"""
return SessionLocal()
@app.command("scrape")
def scrape_url(
url: str = typer.Argument(..., help="URL to scrape"),
selector: Optional[str] = typer.Option(
None, help="CSS selector to extract content"
),
user_agent: Optional[str] = typer.Option(
None, help="User agent to use for request"
),
timeout: Optional[int] = typer.Option(None, help="Timeout for request in seconds"),
output: Optional[str] = typer.Option(
None, help="Output file path for results (JSON)"
),
):
"""
Scrape a URL and extract content.
"""
console.print(f"Scraping [bold]{url}[/bold]...")
db = get_db()
try:
# Create a new scrape job
job_data = {
"url": url,
"selector": selector,
"user_agent": user_agent,
"timeout": timeout,
}
job_in = {k: v for k, v in job_data.items() if v is not None}
# Create and run the job
job = scrape_job.create(db=db, obj_in=job_in)
console.print(f"Created scrape job with ID [bold]{job.id}[/bold]")
# Run the job
scraper = Scraper(db=db, user_agent=user_agent, timeout=timeout)
job = scraper.run_job(job_id=job.id)
if job.status == JobStatus.COMPLETED:
console.print("[bold green]Scraping completed successfully![/bold green]")
# Get the result
result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)
# Print basic info
console.print("\n[bold]Basic Information:[/bold]")
table = Table(show_header=True, header_style="bold")
table.add_column("Attribute")
table.add_column("Value")
if result and result.extracted_data:
data = result.extracted_data
# Add rows to table
if "title" in data:
table.add_row("Title", data["title"] or "")
if "meta_description" in data:
table.add_row("Description", data["meta_description"] or "")
if "h1" in data:
table.add_row(
"H1 Tags", ", ".join(data["h1"]) if data["h1"] else ""
)
if "links" in data:
link_count = len(data["links"]) if data["links"] else 0
table.add_row("Links", str(link_count))
if selector and "selected_content" in data:
content_count = (
len(data["selected_content"]) if data["selected_content"] else 0
)
table.add_row(f"Selected Content ({selector})", str(content_count))
console.print(table)
# Write results to file if specified
if output:
with open(output, "w") as f:
json.dump(data, f, indent=2)
console.print(f"\nResults saved to [bold]{output}[/bold]")
# Ask if user wants to see more details
if typer.confirm("\nDo you want to see the full extracted data?"):
console.print_json(json.dumps(data))
else:
console.print("[yellow]No data extracted.[/yellow]")
else:
console.print(f"[bold red]Scraping failed:[/bold red] {job.error}")
except Exception as e:
console.print(f"[bold red]Error:[/bold red] {str(e)}")
finally:
db.close()
@app.command("list")
def list_jobs(
status: Optional[str] = typer.Option(
None, help="Filter by status (pending, in_progress, completed, failed)"
),
limit: int = typer.Option(10, help="Limit number of jobs"),
):
"""
List scrape jobs.
"""
db = get_db()
try:
# Get jobs based on status
if status:
try:
job_status = JobStatus(status)
jobs = scrape_job.get_by_status(db=db, status=job_status, limit=limit)
total = scrape_job.count_by_status(db=db, status=job_status)
console.print(
f"Found [bold]{total}[/bold] jobs with status [bold]{status}[/bold]"
)
except ValueError:
console.print(f"[bold red]Invalid status:[/bold red] {status}")
return
else:
jobs = scrape_job.get_multi(db=db, limit=limit)
total = scrape_job.count(db=db)
console.print(f"Found [bold]{total}[/bold] jobs")
if not jobs:
console.print("[yellow]No jobs found.[/yellow]")
return
# Create table
table = Table(show_header=True, header_style="bold")
table.add_column("ID")
table.add_column("URL")
table.add_column("Status")
table.add_column("Created")
table.add_column("Updated")
# Add rows
for job in jobs:
table.add_row(
str(job.id),
job.url,
job.status.value,
job.created_at.strftime("%Y-%m-%d %H:%M:%S"),
job.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
)
console.print(table)
except Exception as e:
console.print(f"[bold red]Error:[/bold red] {str(e)}")
finally:
db.close()
@app.command("show")
def show_job(
job_id: int = typer.Argument(..., help="ID of the job to show"),
):
"""
Show details of a scrape job.
"""
db = get_db()
try:
# Get job
job = scrape_job.get(db=db, id=job_id)
if not job:
console.print(f"[bold red]Job not found:[/bold red] {job_id}")
return
# Print job details
console.print(f"\n[bold]Job {job_id}[/bold]")
console.print(f"URL: [bold]{job.url}[/bold]")
console.print(f"Status: [bold]{job.status.value}[/bold]")
console.print(f"Created: [bold]{job.created_at}[/bold]")
console.print(f"Updated: [bold]{job.updated_at}[/bold]")
if job.started_at:
console.print(f"Started: [bold]{job.started_at}[/bold]")
if job.completed_at:
console.print(f"Completed: [bold]{job.completed_at}[/bold]")
if job.selector:
console.print(f"Selector: [bold]{job.selector}[/bold]")
if job.error:
console.print(f"Error: [bold red]{job.error}[/bold red]")
# Get results if job is completed
if job.status == JobStatus.COMPLETED:
result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)
if result and result.extracted_data:
console.print("\n[bold]Extracted Data:[/bold]")
# Ask if user wants to see the data
if typer.confirm("Do you want to see the extracted data?"):
console.print_json(json.dumps(result.extracted_data))
else:
console.print("[yellow]No data extracted.[/yellow]")
except Exception as e:
console.print(f"[bold red]Error:[/bold red] {str(e)}")
finally:
db.close()
@app.command("run")
def run_job(
job_id: int = typer.Argument(..., help="ID of the job to run"),
):
"""
Run a scrape job.
"""
db = get_db()
try:
# Get job
job = scrape_job.get(db=db, id=job_id)
if not job:
console.print(f"[bold red]Job not found:[/bold red] {job_id}")
return
console.print(f"Running job [bold]{job_id}[/bold]...")
# Run the job
scraper = Scraper(db=db)
job = scraper.run_job(job_id=job.id)
if job.status == JobStatus.COMPLETED:
console.print("[bold green]Job completed successfully![/bold green]")
else:
console.print(f"[bold red]Job failed:[/bold red] {job.error}")
except Exception as e:
console.print(f"[bold red]Error:[/bold red] {str(e)}")
finally:
db.close()
if __name__ == "__main__":
app()

0
app/core/__init__.py Normal file

50
app/core/config.py Normal file

@ -0,0 +1,50 @@
from pathlib import Path
from typing import Any, Dict, Optional
from pydantic import BaseSettings, validator
class Settings(BaseSettings):
# Base settings
PROJECT_NAME: str = "Web Scraper CLI"
PROJECT_DESCRIPTION: str = "A FastAPI-based web scraper with CLI interface"
VERSION: str = "0.1.0"
API_V1_STR: str = "/api/v1"
# Server settings
HOST: str = "0.0.0.0"
PORT: int = 8000
DEBUG: bool = True
# Database settings
DB_DIR: Path = Path("/app") / "storage" / "db"
SQLALCHEMY_DATABASE_URL: str = f"sqlite:///{DB_DIR}/db.sqlite"
@validator("SQLALCHEMY_DATABASE_URL", pre=True)
def validate_db_url(cls, v: Optional[str], values: Dict[str, Any]) -> str:
"""
Ensure the database directory exists.
"""
if isinstance(v, str) and v.startswith("sqlite"):
db_dir = values.get("DB_DIR")
if db_dir:
db_dir.mkdir(parents=True, exist_ok=True)
return v
return v
# Scraper settings
DEFAULT_USER_AGENT: str = (
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
)
DEFAULT_TIMEOUT: int = 30 # seconds
    # Rate limit settings
DEFAULT_RATE_LIMIT: float = 1.0 # requests per second
class Config:
env_file = ".env"
case_sensitive = True
settings = Settings()

4
app/crud/__init__.py Normal file

@ -0,0 +1,4 @@
from app.crud.scrape_job import scrape_job
from app.crud.scrape_result import scrape_result
__all__ = ["scrape_job", "scrape_result"]

89
app/crud/base.py Normal file

@ -0,0 +1,89 @@
from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union
from fastapi.encoders import jsonable_encoder
from pydantic import BaseModel
from sqlalchemy.orm import Session
from app.db.session import Base
ModelType = TypeVar("ModelType", bound=Base)
CreateSchemaType = TypeVar("CreateSchemaType", bound=BaseModel)
UpdateSchemaType = TypeVar("UpdateSchemaType", bound=BaseModel)
class CRUDBase(Generic[ModelType, CreateSchemaType, UpdateSchemaType]):
"""
CRUD operations base class.
"""
def __init__(self, model: Type[ModelType]):
"""
CRUD object with default methods to Create, Read, Update, Delete (CRUD).
**Parameters**
* `model`: A SQLAlchemy model class
* `schema`: A Pydantic model (schema) class
"""
self.model = model
def get(self, db: Session, id: Any) -> Optional[ModelType]:
"""
Get a record by ID.
"""
return db.query(self.model).filter(self.model.id == id).first()
def get_multi(
self, db: Session, *, skip: int = 0, limit: int = 100
) -> List[ModelType]:
"""
Get multiple records.
"""
return db.query(self.model).offset(skip).limit(limit).all()
def count(self, db: Session) -> int:
"""
Count total records.
"""
return db.query(self.model).count()
def create(self, db: Session, *, obj_in: CreateSchemaType) -> ModelType:
"""
Create a new record.
"""
obj_in_data = jsonable_encoder(obj_in)
db_obj = self.model(**obj_in_data)
db.add(db_obj)
db.commit()
db.refresh(db_obj)
return db_obj
def update(
self,
db: Session,
*,
db_obj: ModelType,
obj_in: Union[UpdateSchemaType, Dict[str, Any]],
) -> ModelType:
"""
Update a record.
"""
obj_data = jsonable_encoder(db_obj)
if isinstance(obj_in, dict):
update_data = obj_in
else:
update_data = obj_in.dict(exclude_unset=True)
for field in obj_data:
if field in update_data:
setattr(db_obj, field, update_data[field])
db.add(db_obj)
db.commit()
db.refresh(db_obj)
return db_obj
def remove(self, db: Session, *, id: int) -> ModelType:
"""
Remove a record.
"""
        obj = db.get(self.model, id)
db.delete(obj)
db.commit()
return obj

47
app/crud/scrape_job.py Normal file

@ -0,0 +1,47 @@
from typing import List
from sqlalchemy.orm import Session
from app.models.scrape_job import ScrapeJob, JobStatus
from app.schemas.scrape_job import ScrapeJobCreate, ScrapeJobUpdate
from app.crud.base import CRUDBase
class CRUDScrapeJob(CRUDBase[ScrapeJob, ScrapeJobCreate, ScrapeJobUpdate]):
"""
CRUD operations for ScrapeJob model.
"""
def get_by_status(
self, db: Session, *, status: JobStatus, skip: int = 0, limit: int = 100
) -> List[ScrapeJob]:
"""
Get jobs by status.
"""
return (
db.query(self.model)
.filter(self.model.status == status)
.offset(skip)
.limit(limit)
.all()
)
def count_by_status(self, db: Session, *, status: JobStatus) -> int:
"""
Count jobs by status.
"""
return db.query(self.model).filter(self.model.status == status).count()
def get_pending_jobs(self, db: Session, *, limit: int = 10) -> List[ScrapeJob]:
"""
Get pending jobs.
"""
return (
db.query(self.model)
.filter(self.model.status == JobStatus.PENDING)
.limit(limit)
.all()
)
scrape_job = CRUDScrapeJob(ScrapeJob)

35
app/crud/scrape_result.py Normal file

@ -0,0 +1,35 @@
from typing import List, Optional
from sqlalchemy.orm import Session
from app.models.scrape_result import ScrapeResult
from app.schemas.scrape_result import ScrapeResultCreate, ScrapeResultUpdate
from app.crud.base import CRUDBase
class CRUDScrapeResult(CRUDBase[ScrapeResult, ScrapeResultCreate, ScrapeResultUpdate]):
"""
CRUD operations for ScrapeResult model.
"""
def get_by_job_id(self, db: Session, *, job_id: int) -> List[ScrapeResult]:
"""
Get results by job ID.
"""
return db.query(self.model).filter(self.model.job_id == job_id).all()
def get_latest_by_job_id(
self, db: Session, *, job_id: int
) -> Optional[ScrapeResult]:
"""
Get the latest result by job ID.
"""
return (
db.query(self.model)
.filter(self.model.job_id == job_id)
.order_by(self.model.created_at.desc())
.first()
)
scrape_result = CRUDScrapeResult(ScrapeResult)

0
app/db/__init__.py Normal file

32
app/db/session.py Normal file

@ -0,0 +1,32 @@
from sqlalchemy import create_engine
from sqlalchemy.orm import declarative_base, sessionmaker
from app.core.config import settings
# Create database directory if it doesn't exist
settings.DB_DIR.mkdir(parents=True, exist_ok=True)
# Create SQLAlchemy engine
engine = create_engine(
settings.SQLALCHEMY_DATABASE_URL,
connect_args={"check_same_thread": False}, # Only for SQLite
)
# Create sessionmaker
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
# Create base class for SQLAlchemy models
Base = declarative_base()
# Database session dependency
def get_db():
"""
Dependency for getting a database session.
"""
db = SessionLocal()
try:
yield db
finally:
db.close()

4
app/models/__init__.py Normal file

@ -0,0 +1,4 @@
from app.models.scrape_job import ScrapeJob, JobStatus
from app.models.scrape_result import ScrapeResult
__all__ = ["ScrapeJob", "JobStatus", "ScrapeResult"]

41
app/models/scrape_job.py Normal file

@ -0,0 +1,41 @@
import enum
from sqlalchemy import Column, String, Integer, DateTime, Enum, Text, JSON
from sqlalchemy.sql import func
from app.db.session import Base
class JobStatus(str, enum.Enum):
PENDING = "pending"
IN_PROGRESS = "in_progress"
COMPLETED = "completed"
FAILED = "failed"
class ScrapeJob(Base):
"""
Model for a web scraping job.
"""
__tablename__ = "scrape_jobs"
id = Column(Integer, primary_key=True, index=True)
url = Column(String(2048), nullable=False, index=True)
status = Column(Enum(JobStatus), default=JobStatus.PENDING, nullable=False)
created_at = Column(DateTime, default=func.now(), nullable=False)
updated_at = Column(
DateTime, default=func.now(), onupdate=func.now(), nullable=False
)
started_at = Column(DateTime, nullable=True)
completed_at = Column(DateTime, nullable=True)
selector = Column(String(255), nullable=True)
error = Column(Text, nullable=True)
result = Column(JSON, nullable=True)
user_agent = Column(String(255), nullable=True)
timeout = Column(Integer, nullable=True)
def __repr__(self):
return (
f"<ScrapeJob(id={self.id}, url='{self.url}', status='{self.status.value}')>"
)

29
app/models/scrape_result.py Normal file

@ -0,0 +1,29 @@
from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, JSON
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship
from app.db.session import Base
class ScrapeResult(Base):
"""
Model for storing scraping results.
"""
__tablename__ = "scrape_results"
id = Column(Integer, primary_key=True, index=True)
job_id = Column(
Integer, ForeignKey("scrape_jobs.id", ondelete="CASCADE"), nullable=False
)
created_at = Column(DateTime, default=func.now(), nullable=False)
content_type = Column(String(100), nullable=True)
headers = Column(JSON, nullable=True)
html_content = Column(Text, nullable=True)
extracted_data = Column(JSON, nullable=True)
# Relationship
job = relationship("ScrapeJob", backref="results")
def __repr__(self):
return f"<ScrapeResult(id={self.id}, job_id={self.job_id})>"

27
app/schemas/__init__.py Normal file

@ -0,0 +1,27 @@
from app.schemas.scrape_job import (
ScrapeJobBase,
ScrapeJobCreate,
ScrapeJobUpdate,
ScrapeJob,
ScrapeJobList,
)
from app.schemas.scrape_result import (
ScrapeResultBase,
ScrapeResultCreate,
ScrapeResultUpdate,
ScrapeResult,
ScrapeResultList,
)
__all__ = [
"ScrapeJobBase",
"ScrapeJobCreate",
"ScrapeJobUpdate",
"ScrapeJob",
"ScrapeJobList",
"ScrapeResultBase",
"ScrapeResultCreate",
"ScrapeResultUpdate",
"ScrapeResult",
"ScrapeResultList",
]

74
app/schemas/scrape_job.py Normal file

@ -0,0 +1,74 @@
from datetime import datetime
from typing import Optional, Dict, Any, List
from pydantic import BaseModel, HttpUrl
from app.models.scrape_job import JobStatus
class ScrapeJobBase(BaseModel):
"""
Base schema for scrape job data.
"""
url: HttpUrl
selector: Optional[str] = None
user_agent: Optional[str] = None
timeout: Optional[int] = None
class ScrapeJobCreate(ScrapeJobBase):
"""
Schema for creating a new scrape job.
"""
pass
class ScrapeJobUpdate(BaseModel):
"""
Schema for updating a scrape job.
"""
url: Optional[HttpUrl] = None
status: Optional[JobStatus] = None
selector: Optional[str] = None
error: Optional[str] = None
result: Optional[Dict[str, Any]] = None
user_agent: Optional[str] = None
timeout: Optional[int] = None
class ScrapeJobInDBBase(ScrapeJobBase):
"""
Base schema for scrape job in database.
"""
id: int
status: JobStatus
created_at: datetime
updated_at: datetime
started_at: Optional[datetime] = None
completed_at: Optional[datetime] = None
error: Optional[str] = None
result: Optional[Dict[str, Any]] = None
class Config:
orm_mode = True
class ScrapeJob(ScrapeJobInDBBase):
"""
Schema for returned scrape job.
"""
pass
class ScrapeJobList(BaseModel):
"""
Schema for a list of scrape jobs.
"""
jobs: List[ScrapeJob]
total: int

64
app/schemas/scrape_result.py Normal file

@ -0,0 +1,64 @@
from datetime import datetime
from typing import Optional, Dict, Any, List
from pydantic import BaseModel
class ScrapeResultBase(BaseModel):
"""
Base schema for scrape result data.
"""
job_id: int
content_type: Optional[str] = None
headers: Optional[Dict[str, Any]] = None
extracted_data: Optional[Dict[str, Any]] = None
class ScrapeResultCreate(ScrapeResultBase):
"""
Schema for creating a new scrape result.
"""
html_content: Optional[str] = None
class ScrapeResultUpdate(BaseModel):
"""
Schema for updating a scrape result.
"""
content_type: Optional[str] = None
headers: Optional[Dict[str, Any]] = None
html_content: Optional[str] = None
extracted_data: Optional[Dict[str, Any]] = None
class ScrapeResultInDBBase(ScrapeResultBase):
"""
Base schema for scrape result in database.
"""
id: int
created_at: datetime
html_content: Optional[str] = None
class Config:
orm_mode = True
class ScrapeResult(ScrapeResultInDBBase):
"""
Schema for returned scrape result.
"""
pass
class ScrapeResultList(BaseModel):
"""
Schema for a list of scrape results.
"""
results: List[ScrapeResult]
total: int

0
app/services/__init__.py Normal file

150
app/services/scraper.py Normal file

@ -0,0 +1,150 @@
import time
from datetime import datetime
from typing import Dict, Any, Optional
import requests
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session
from app.core.config import settings
from app.models.scrape_job import ScrapeJob, JobStatus
from app.models.scrape_result import ScrapeResult
class Scraper:
"""
Service for web scraping.
"""
def __init__(
self,
db: Session,
user_agent: Optional[str] = None,
timeout: Optional[int] = None,
rate_limit: Optional[float] = None,
):
self.db = db
self.user_agent = user_agent or settings.DEFAULT_USER_AGENT
self.timeout = timeout or settings.DEFAULT_TIMEOUT
self.rate_limit = rate_limit or settings.DEFAULT_RATE_LIMIT
self._last_request_time = 0
def _respect_rate_limit(self) -> None:
"""
Respect rate limit by sleeping if necessary.
"""
current_time = time.time()
time_since_last_request = current_time - self._last_request_time
if time_since_last_request < (1.0 / self.rate_limit):
sleep_time = (1.0 / self.rate_limit) - time_since_last_request
time.sleep(sleep_time)
self._last_request_time = time.time()
def fetch_url(self, url: str) -> requests.Response:
"""
Fetch URL respecting rate limits.
"""
self._respect_rate_limit()
headers = {
"User-Agent": self.user_agent,
}
response = requests.get(
url,
headers=headers,
timeout=self.timeout,
)
response.raise_for_status()
return response
def parse_html(self, html: str, selector: Optional[str] = None) -> Dict[str, Any]:
"""
Parse HTML content.
"""
soup = BeautifulSoup(html, "lxml")
result = {
"title": soup.title.text if soup.title else None,
"meta_description": None,
"h1": [h1.text.strip() for h1 in soup.find_all("h1")],
"links": [
{"href": a.get("href"), "text": a.text.strip()}
for a in soup.find_all("a")
if a.get("href")
],
}
# Extract meta description
meta_desc = soup.find("meta", attrs={"name": "description"})
if meta_desc:
result["meta_description"] = meta_desc.get("content")
# If a selector is provided, extract content matching the selector
if selector:
selected_elements = soup.select(selector)
result["selected_content"] = [
element.text.strip() for element in selected_elements
]
result["selected_html"] = [str(element) for element in selected_elements]
return result
def run_job(self, job_id: int) -> ScrapeJob:
"""
Run a scraping job.
"""
# Get job from DB
job = self.db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
if not job:
raise ValueError(f"Job with ID {job_id} not found")
# Update job status
job.status = JobStatus.IN_PROGRESS
job.started_at = datetime.now()
self.db.commit()
self.db.refresh(job)
try:
# Fetch URL
response = self.fetch_url(job.url)
# Create ScrapeResult
result = ScrapeResult(
job_id=job.id,
content_type=response.headers.get("Content-Type"),
headers=dict(response.headers),
html_content=response.text,
)
self.db.add(result)
self.db.commit()
self.db.refresh(result)
# Parse HTML
extracted_data = self.parse_html(response.text, job.selector)
# Update ScrapeResult with extracted data
result.extracted_data = extracted_data
self.db.commit()
self.db.refresh(result)
# Update job status
job.status = JobStatus.COMPLETED
job.completed_at = datetime.now()
job.result = {"result_id": result.id}
self.db.commit()
self.db.refresh(job)
return job
except Exception as e:
# Update job with error
job.status = JobStatus.FAILED
job.completed_at = datetime.now()
job.error = str(e)
self.db.commit()
self.db.refresh(job)
raise e

0
app/utils/__init__.py Normal file

83
app/utils/html.py Normal file

@ -0,0 +1,83 @@
from typing import List, Dict, Any
from bs4 import BeautifulSoup
def extract_metadata(html: str) -> Dict[str, Any]:
"""
Extract metadata from HTML.
"""
soup = BeautifulSoup(html, "lxml")
result = {
"title": None,
"description": None,
"keywords": None,
"og_title": None,
"og_description": None,
"og_image": None,
}
# Title
if soup.title:
result["title"] = soup.title.text.strip()
# Meta tags
for meta in soup.find_all("meta"):
name = meta.get("name", "").lower()
property = meta.get("property", "").lower()
content = meta.get("content", "")
if name == "description":
result["description"] = content
elif name == "keywords":
result["keywords"] = content
elif property == "og:title":
result["og_title"] = content
elif property == "og:description":
result["og_description"] = content
elif property == "og:image":
result["og_image"] = content
return result
def extract_links(html: str) -> List[Dict[str, str]]:
"""
Extract links from HTML.
"""
soup = BeautifulSoup(html, "lxml")
links = []
for a in soup.find_all("a"):
href = a.get("href")
if href:
links.append(
{
"href": href,
"text": a.text.strip(),
"title": a.get("title", ""),
"rel": a.get("rel", ""),
}
)
return links
def extract_images(html: str) -> List[Dict[str, str]]:
"""
Extract images from HTML.
"""
soup = BeautifulSoup(html, "lxml")
images = []
for img in soup.find_all("img"):
src = img.get("src")
if src:
images.append(
{
"src": src,
"alt": img.get("alt", ""),
"title": img.get("title", ""),
}
)
return images
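
A hypothetical usage sketch for these helpers (not part of the commit): they accept raw HTML and return plain dictionaries and lists, independent of the scraper service.
```python
# Hypothetical usage of app/utils/html.py
from app.utils.html import extract_links, extract_metadata

html = "<html><head><title>Example</title></head><body><a href='/docs'>Docs</a></body></html>"

print(extract_metadata(html)["title"])  # -> "Example"
print(extract_links(html))              # -> [{"href": "/docs", "text": "Docs", ...}]
```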

25
app/utils/url.py Normal file

@ -0,0 +1,25 @@
from urllib.parse import urlparse, parse_qs
from typing import Dict, Any
def parse_url(url: str) -> Dict[str, Any]:
"""
Parse a URL into its components.
"""
parsed = urlparse(url)
return {
"scheme": parsed.scheme,
"netloc": parsed.netloc,
"path": parsed.path,
"params": parsed.params,
"query": parse_qs(parsed.query),
"fragment": parsed.fragment,
}
def is_valid_url(url: str) -> bool:
"""
Check if a URL is valid.
"""
parsed = urlparse(url)
return bool(parsed.scheme and parsed.netloc)

9
cli.py Normal file

@ -0,0 +1,9 @@
#!/usr/bin/env python
"""
Web Scraper CLI
"""
from app.cli.cli import app
if __name__ == "__main__":
app()

15
docker-compose.yml Normal file

@ -0,0 +1,15 @@
version: '3.8'
services:
app:
build:
context: .
dockerfile: Dockerfile
ports:
- "8000:8000"
volumes:
- ./app:/app/app
- ./storage:/app/storage
environment:
- DEBUG=True
    # Apply database migrations before starting the server
    command: sh -c "alembic upgrade head && uvicorn main:app --host 0.0.0.0 --port 8000 --reload"

29
main.py Normal file

@ -0,0 +1,29 @@
import uvicorn
from fastapi import FastAPI
from app.api.api import api_router
from app.core.config import settings
app = FastAPI(
title=settings.PROJECT_NAME,
description=settings.PROJECT_DESCRIPTION,
version=settings.VERSION,
)
# Mount the API under the versioned prefix documented in the README (/api/v1)
app.include_router(api_router, prefix=settings.API_V1_STR)
@app.get("/health", tags=["Health"])
async def health_check():
"""
Health check endpoint.
"""
return {"status": "ok"}
if __name__ == "__main__":
uvicorn.run(
"main:app",
host=settings.HOST,
port=settings.PORT,
reload=settings.DEBUG,
)

88
migrations/env.py Normal file

@ -0,0 +1,88 @@
from logging.config import fileConfig
from sqlalchemy import engine_from_config
from sqlalchemy import pool
from alembic import context
# Import models to register them with the metadata
from app.db.session import Base
# These imports are needed to register models with SQLAlchemy metadata
# even though they appear unused to static analyzers
import app.models.scrape_job # noqa: F401
import app.models.scrape_result # noqa: F401
# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config
# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)
# add your model's MetaData object here
# for 'autogenerate' support
target_metadata = Base.metadata
# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.
def run_migrations_offline():
"""Run migrations in 'offline' mode.
This configures the context with just a URL
and not an Engine, though an Engine is acceptable
here as well. By skipping the Engine creation
we don't even need a DBAPI to be available.
Calls to context.execute() here emit the given string to the
script output.
"""
url = config.get_main_option("sqlalchemy.url")
context.configure(
url=url,
target_metadata=target_metadata,
literal_binds=True,
dialect_opts={"paramstyle": "named"},
)
with context.begin_transaction():
context.run_migrations()
def run_migrations_online():
"""Run migrations in 'online' mode.
In this scenario we need to create an Engine
and associate a connection with the context.
"""
connectable = engine_from_config(
config.get_section(config.config_ini_section),
prefix="sqlalchemy.",
poolclass=pool.NullPool,
)
with connectable.connect() as connection:
# Check if we're using SQLite
is_sqlite = connection.dialect.name == "sqlite"
context.configure(
connection=connection,
target_metadata=target_metadata,
render_as_batch=is_sqlite, # Critical for SQLite to handle alter table operations
)
with context.begin_transaction():
context.run_migrations()
if context.is_offline_mode():
run_migrations_offline()
else:
run_migrations_online()

24
migrations/script.py.mako Normal file

@ -0,0 +1,24 @@
"""${message}
Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}
"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}
# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}
def upgrade():
${upgrades if upgrades else "pass"}
def downgrade():
${downgrades if downgrades else "pass"}


@ -0,0 +1,75 @@
"""Initial migration
Revision ID: 0001
Revises:
Create Date: 2023-06-25
"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite
# revision identifiers, used by Alembic.
revision = "0001"
down_revision = None
branch_labels = None
depends_on = None
def upgrade():
# Create scrape_jobs table
op.create_table(
"scrape_jobs",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("url", sa.String(length=2048), nullable=False),
sa.Column(
"status",
sa.Enum("pending", "in_progress", "completed", "failed", name="jobstatus"),
nullable=False,
default="pending",
),
sa.Column(
"created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
),
sa.Column(
"updated_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
),
sa.Column("started_at", sa.DateTime(), nullable=True),
sa.Column("completed_at", sa.DateTime(), nullable=True),
sa.Column("selector", sa.String(length=255), nullable=True),
sa.Column("error", sa.Text(), nullable=True),
sa.Column("result", sqlite.JSON(), nullable=True),
sa.Column("user_agent", sa.String(length=255), nullable=True),
sa.Column("timeout", sa.Integer(), nullable=True),
sa.PrimaryKeyConstraint("id"),
)
op.create_index(op.f("ix_scrape_jobs_id"), "scrape_jobs", ["id"], unique=False)
op.create_index(op.f("ix_scrape_jobs_url"), "scrape_jobs", ["url"], unique=False)
# Create scrape_results table
op.create_table(
"scrape_results",
sa.Column("id", sa.Integer(), nullable=False),
sa.Column("job_id", sa.Integer(), nullable=False),
sa.Column(
"created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
),
sa.Column("content_type", sa.String(length=100), nullable=True),
sa.Column("headers", sqlite.JSON(), nullable=True),
sa.Column("html_content", sa.Text(), nullable=True),
sa.Column("extracted_data", sqlite.JSON(), nullable=True),
sa.ForeignKeyConstraint(["job_id"], ["scrape_jobs.id"], ondelete="CASCADE"),
sa.PrimaryKeyConstraint("id"),
)
op.create_index(
op.f("ix_scrape_results_id"), "scrape_results", ["id"], unique=False
)
def downgrade():
op.drop_index(op.f("ix_scrape_results_id"), table_name="scrape_results")
op.drop_table("scrape_results")
op.drop_index(op.f("ix_scrape_jobs_url"), table_name="scrape_jobs")
op.drop_index(op.f("ix_scrape_jobs_id"), table_name="scrape_jobs")
op.drop_table("scrape_jobs")

1
migrations/versions/__init__.py Normal file

@ -0,0 +1 @@
# This file is intentionally empty to make the directory a Python package

14
requirements.txt Normal file

@ -0,0 +1,14 @@
fastapi==0.110.0
uvicorn==0.27.1
sqlalchemy==2.0.27
alembic==1.13.1
pydantic==1.10.14  # pinned to v1: config and schemas use BaseSettings, @validator, and orm_mode
python-dotenv==1.0.1
beautifulsoup4==4.12.2
requests==2.31.0
typer==0.9.0
rich==13.7.0
httpx==0.26.0
lxml==4.9.3
aiohttp==3.9.3
ruff==0.2.2