From a4511b313728ee52ec844a003acf24b52305b36e Mon Sep 17 00:00:00 2001 From: Automated Action Date: Thu, 29 May 2025 17:20:46 +0000 Subject: [PATCH] Fix dependency installation issue by adding Dockerfile and docker-compose.yml --- Dockerfile | 27 ++ README.md | 151 +++++++++- alembic.ini | 84 ++++++ app/__init__.py | 0 app/api/__init__.py | 0 app/api/api.py | 9 + app/api/deps.py | 15 + app/api/endpoints/__init__.py | 1 + app/api/endpoints/scrape_jobs.py | 201 +++++++++++++ app/cli/__init__.py | 0 app/cli/cli.py | 277 ++++++++++++++++++ app/core/__init__.py | 0 app/core/config.py | 50 ++++ app/crud/__init__.py | 4 + app/crud/base.py | 89 ++++++ app/crud/scrape_job.py | 47 +++ app/crud/scrape_result.py | 35 +++ app/db/__init__.py | 0 app/db/session.py | 32 ++ app/models/__init__.py | 4 + app/models/scrape_job.py | 41 +++ app/models/scrape_result.py | 29 ++ app/schemas/__init__.py | 27 ++ app/schemas/scrape_job.py | 74 +++++ app/schemas/scrape_result.py | 64 ++++ app/services/__init__.py | 0 app/services/scraper.py | 150 ++++++++++ app/utils/__init__.py | 0 app/utils/html.py | 83 ++++++ app/utils/url.py | 25 ++ cli.py | 9 + docker-compose.yml | 15 + main.py | 29 ++ migrations/env.py | 88 ++++++ migrations/script.py.mako | 24 ++ migrations/versions/0001_initial_migration.py | 75 +++++ migrations/versions/__init__.py | 1 + requirements.txt | 14 + 38 files changed, 1772 insertions(+), 2 deletions(-) create mode 100644 Dockerfile create mode 100644 alembic.ini create mode 100644 app/__init__.py create mode 100644 app/api/__init__.py create mode 100644 app/api/api.py create mode 100644 app/api/deps.py create mode 100644 app/api/endpoints/__init__.py create mode 100644 app/api/endpoints/scrape_jobs.py create mode 100644 app/cli/__init__.py create mode 100644 app/cli/cli.py create mode 100644 app/core/__init__.py create mode 100644 app/core/config.py create mode 100644 app/crud/__init__.py create mode 100644 app/crud/base.py create mode 100644 app/crud/scrape_job.py create mode 100644 app/crud/scrape_result.py create mode 100644 app/db/__init__.py create mode 100644 app/db/session.py create mode 100644 app/models/__init__.py create mode 100644 app/models/scrape_job.py create mode 100644 app/models/scrape_result.py create mode 100644 app/schemas/__init__.py create mode 100644 app/schemas/scrape_job.py create mode 100644 app/schemas/scrape_result.py create mode 100644 app/services/__init__.py create mode 100644 app/services/scraper.py create mode 100644 app/utils/__init__.py create mode 100644 app/utils/html.py create mode 100644 app/utils/url.py create mode 100644 cli.py create mode 100644 docker-compose.yml create mode 100644 main.py create mode 100644 migrations/env.py create mode 100644 migrations/script.py.mako create mode 100644 migrations/versions/0001_initial_migration.py create mode 100644 migrations/versions/__init__.py create mode 100644 requirements.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..349b6be --- /dev/null +++ b/Dockerfile @@ -0,0 +1,27 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Copy requirements file +COPY requirements.txt . + +# Install dependencies +RUN pip install --no-cache-dir -r requirements.txt + +# Copy the rest of the application +COPY . . 
+ +# Create the database directory +RUN mkdir -p /app/storage/db + +# Set environment variables +ENV PYTHONPATH=/app +ENV HOST=0.0.0.0 +ENV PORT=8000 +ENV DEBUG=True + +# Expose the port +EXPOSE 8000 + +# Run the application +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] \ No newline at end of file diff --git a/README.md b/README.md index e8acfba..0d5f5d2 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,150 @@ -# FastAPI Application +# Web Scraper CLI -This is a FastAPI application bootstrapped by BackendIM, the AI-powered backend generation platform. +A FastAPI-based web scraper with CLI interface. + +## Features + +- REST API for web scraping management +- CLI tool for scraping websites +- Extract metadata, links, and specific content using CSS selectors +- Store scraping results in SQLite database +- Background job processing +- Rate limiting to avoid overloading target websites + +## Installation + +### Local Installation + +1. Clone the repository: + +```bash +git clone https://github.com/yourusername/webscrapercli.git +cd webscrapercli +``` + +2. Install dependencies: + +```bash +pip install -r requirements.txt +``` + +3. Run the database migrations: + +```bash +alembic upgrade head +``` + +### Docker Installation + +1. Clone the repository: + +```bash +git clone https://github.com/yourusername/webscrapercli.git +cd webscrapercli +``` + +2. Build and run using Docker Compose: + +```bash +docker-compose up --build +``` + +This will: +- Build the Docker image with all dependencies +- Start the FastAPI server on port 8000 +- Mount the app and storage directories as volumes for live code reloading + +## Usage + +### API Server + +Start the API server: + +```bash +# Development mode +uvicorn main:app --reload + +# Production mode +uvicorn main:app --host 0.0.0.0 --port 8000 +``` + +Access the API documentation at: http://localhost:8000/docs + +### CLI Usage + +The CLI provides several commands for scraping websites: + +```bash +# Scrape a URL +python cli.py scrape https://example.com + +# Scrape a URL with a specific selector +python cli.py scrape https://example.com --selector "div.content" + +# Save the results to a file +python cli.py scrape https://example.com --output results.json + +# List all scrape jobs +python cli.py list + +# List scrape jobs with a specific status +python cli.py list --status completed + +# Show details of a specific job +python cli.py show 1 + +# Run a specific job +python cli.py run 1 +``` + +## API Endpoints + +- `GET /health`: Health check endpoint +- `POST /api/v1/scrape-jobs/`: Create a new scrape job +- `GET /api/v1/scrape-jobs/`: List scrape jobs +- `GET /api/v1/scrape-jobs/{job_id}`: Get a specific scrape job +- `PUT /api/v1/scrape-jobs/{job_id}`: Update a scrape job +- `DELETE /api/v1/scrape-jobs/{job_id}`: Delete a scrape job +- `POST /api/v1/scrape-jobs/{job_id}/run`: Run a scrape job +- `GET /api/v1/scrape-jobs/{job_id}/results`: Get the results of a scrape job + +## Development + +### Project Structure + +``` +webscrapercli/ +├── alembic.ini # Alembic configuration +├── app/ # Application package +│ ├── api/ # API endpoints +│ ├── cli/ # CLI implementation +│ ├── core/ # Core functionality +│ ├── crud/ # CRUD operations +│ ├── db/ # Database configuration +│ ├── models/ # SQLAlchemy models +│ ├── schemas/ # Pydantic schemas +│ ├── services/ # Business logic +│ └── utils/ # Utility functions +├── cli.py # CLI entry point +├── docker-compose.yml # Docker Compose configuration +├── Dockerfile # Docker configuration +├── main.py 
# API entry point +├── migrations/ # Alembic migrations +│ ├── env.py # Alembic environment +│ ├── script.py.mako # Alembic script template +│ └── versions/ # Migration scripts +├── requirements.txt # Dependencies +└── storage/ # Storage directory for database and other files + └── db/ # Database directory +``` + +### Running Tests + +```bash +# Run tests +pytest +``` + +## License + +This project is open source. \ No newline at end of file diff --git a/alembic.ini b/alembic.ini new file mode 100644 index 0000000..75005f9 --- /dev/null +++ b/alembic.ini @@ -0,0 +1,84 @@ +# A generic, single database configuration. + +[alembic] +# path to migration scripts +script_location = migrations + +# template used to generate migration files +# file_template = %%(rev)s_%%(slug)s + +# timezone to use when rendering the date +# within the migration file as well as the filename. +# string value is passed to dateutil.tz.gettz() +# leave blank for localtime +# timezone = + +# max length of characters to apply to the +# "slug" field +# truncate_slug_length = 40 + +# set to 'true' to run the environment during +# the 'revision' command, regardless of autogenerate +# revision_environment = false + +# set to 'true' to allow .pyc and .pyo files without +# a source .py file to be detected as revisions in the +# versions/ directory +# sourceless = false + +# version location specification; this defaults +# to migrations/versions. When using multiple version +# directories, initial revisions must be specified with --version-path +# version_locations = %(here)s/bar %(here)s/bat migrations/versions + +# the output encoding used when revision files +# are written from script.py.mako +# output_encoding = utf-8 + +sqlalchemy.url = sqlite:////app/storage/db/db.sqlite + +[post_write_hooks] +# post_write_hooks defines scripts or Python functions that are run +# on newly generated revision scripts. See the documentation for further +# detail and examples + +# format using "black" - use the console_scripts runner, against the "black" entrypoint +# hooks=black +# black.type=console_scripts +# black.entrypoint=black +# black.options=-l 79 + +# Logging configuration +[loggers] +keys = root,sqlalchemy,alembic + +[handlers] +keys = console + +[formatters] +keys = generic + +[logger_root] +level = WARN +handlers = console +qualname = + +[logger_sqlalchemy] +level = WARN +handlers = +qualname = sqlalchemy.engine + +[logger_alembic] +level = INFO +handlers = +qualname = alembic + +[handler_console] +class = StreamHandler +args = (sys.stderr,) +level = NOTSET +formatter = generic + +[formatter_generic] +format = %(levelname)-5.5s [%(name)s] %(message)s +datefmt = %H:%M:%S \ No newline at end of file diff --git a/app/__init__.py b/app/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/__init__.py b/app/api/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/api/api.py b/app/api/api.py new file mode 100644 index 0000000..64a0de0 --- /dev/null +++ b/app/api/api.py @@ -0,0 +1,9 @@ +from fastapi import APIRouter + +from app.api.endpoints import scrape_jobs + +api_router = APIRouter() + +api_router.include_router( + scrape_jobs.router, prefix="/scrape-jobs", tags=["scrape-jobs"] +) diff --git a/app/api/deps.py b/app/api/deps.py new file mode 100644 index 0000000..bf65bd8 --- /dev/null +++ b/app/api/deps.py @@ -0,0 +1,15 @@ +from typing import Generator + + +from app.db.session import SessionLocal + + +def get_db() -> Generator: + """ + Get a database session. 
+ """ + db = SessionLocal() + try: + yield db + finally: + db.close() diff --git a/app/api/endpoints/__init__.py b/app/api/endpoints/__init__.py new file mode 100644 index 0000000..011100a --- /dev/null +++ b/app/api/endpoints/__init__.py @@ -0,0 +1 @@ +# This file is intentionally empty to make the directory a Python package diff --git a/app/api/endpoints/scrape_jobs.py b/app/api/endpoints/scrape_jobs.py new file mode 100644 index 0000000..b81f2bc --- /dev/null +++ b/app/api/endpoints/scrape_jobs.py @@ -0,0 +1,201 @@ +from typing import Any, Optional + +from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks, status +from sqlalchemy.orm import Session + +from app.api.deps import get_db +from app.models.scrape_job import JobStatus +from app.services.scraper import Scraper +from app.crud import scrape_job, scrape_result +from app.schemas.scrape_job import ( + ScrapeJob, + ScrapeJobCreate, + ScrapeJobUpdate, + ScrapeJobList, +) +from app.schemas.scrape_result import ScrapeResult + + +router = APIRouter() + + +@router.post("/", response_model=ScrapeJob, status_code=status.HTTP_201_CREATED) +def create_scrape_job( + *, + db: Session = Depends(get_db), + job_in: ScrapeJobCreate, + background_tasks: BackgroundTasks, +) -> Any: + """ + Create a new scrape job. + """ + job = scrape_job.create(db=db, obj_in=job_in) + + # Run job in background + background_tasks.add_task(run_scrape_job, job_id=job.id) + + return job + + +@router.get("/", response_model=ScrapeJobList) +def list_scrape_jobs( + *, + db: Session = Depends(get_db), + skip: int = 0, + limit: int = 100, + status: Optional[JobStatus] = None, +) -> Any: + """ + List scrape jobs. + """ + if status: + jobs = scrape_job.get_by_status(db=db, status=status, skip=skip, limit=limit) + total = scrape_job.count_by_status(db=db, status=status) + else: + jobs = scrape_job.get_multi(db=db, skip=skip, limit=limit) + total = scrape_job.count(db=db) + + return {"jobs": jobs, "total": total} + + +@router.get("/{job_id}", response_model=ScrapeJob) +def get_scrape_job( + *, + db: Session = Depends(get_db), + job_id: int, +) -> Any: + """ + Get a scrape job by ID. + """ + job = scrape_job.get(db=db, id=job_id) + if not job: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Scrape job with ID {job_id} not found", + ) + return job + + +@router.put("/{job_id}", response_model=ScrapeJob) +def update_scrape_job( + *, + db: Session = Depends(get_db), + job_id: int, + job_in: ScrapeJobUpdate, +) -> Any: + """ + Update a scrape job. + """ + job = scrape_job.get(db=db, id=job_id) + if not job: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Scrape job with ID {job_id} not found", + ) + job = scrape_job.update(db=db, db_obj=job, obj_in=job_in) + return job + + +@router.delete("/{job_id}", status_code=status.HTTP_204_NO_CONTENT, response_model=None) +def delete_scrape_job( + *, + db: Session = Depends(get_db), + job_id: int, +) -> None: + """ + Delete a scrape job. + """ + job = scrape_job.get(db=db, id=job_id) + if not job: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Scrape job with ID {job_id} not found", + ) + scrape_job.remove(db=db, id=job_id) + + +@router.post("/{job_id}/run", response_model=ScrapeJob) +def run_scrape_job( + *, + db: Session = Depends(get_db), + job_id: int, + background_tasks: Optional[BackgroundTasks] = None, +) -> Any: + """ + Run a scrape job. 
+ """ + job = scrape_job.get(db=db, id=job_id) + if not job: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Scrape job with ID {job_id} not found", + ) + + if job.status == JobStatus.IN_PROGRESS: + raise HTTPException( + status_code=status.HTTP_400_BAD_REQUEST, + detail=f"Scrape job with ID {job_id} is already in progress", + ) + + # If called with background_tasks, run in background + if background_tasks: + background_tasks.add_task(_run_job, job_id=job_id) + # Update job status to pending + job = scrape_job.update(db=db, db_obj=job, obj_in={"status": JobStatus.PENDING}) + return job + + # Otherwise, run synchronously + return _run_job(job_id=job_id) + + +@router.get("/{job_id}/results", response_model=ScrapeResult) +def get_scrape_results( + *, + db: Session = Depends(get_db), + job_id: int, +) -> Any: + """ + Get the latest result for a scrape job. + """ + job = scrape_job.get(db=db, id=job_id) + if not job: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"Scrape job with ID {job_id} not found", + ) + + result = scrape_result.get_latest_by_job_id(db=db, job_id=job_id) + if not result: + raise HTTPException( + status_code=status.HTTP_404_NOT_FOUND, + detail=f"No results found for scrape job with ID {job_id}", + ) + + return result + + +def _run_job(job_id: int) -> ScrapeJob: + """ + Internal function to run a scrape job. + """ + # Create a new session and scraper + db = next(get_db()) + scraper = Scraper(db=db) + + try: + # Run the job + job = scraper.run_job(job_id=job_id) + return job + except Exception as e: + # Make sure the job is marked as failed + job = scrape_job.get(db=db, id=job_id) + if job and job.status != JobStatus.FAILED: + scrape_job.update( + db=db, + db_obj=job, + obj_in={"status": JobStatus.FAILED, "error": str(e)}, + ) + raise HTTPException( + status_code=status.HTTP_500_INTERNAL_SERVER_ERROR, + detail=f"Error running scrape job: {str(e)}", + ) diff --git a/app/cli/__init__.py b/app/cli/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/cli/cli.py b/app/cli/cli.py new file mode 100644 index 0000000..ac46d0d --- /dev/null +++ b/app/cli/cli.py @@ -0,0 +1,277 @@ +import json +from typing import Optional + +import typer +from rich.console import Console +from rich.table import Table +from sqlalchemy.orm import Session + +from app.db.session import SessionLocal +from app.crud import scrape_job, scrape_result +from app.models.scrape_job import JobStatus +from app.services.scraper import Scraper + +app = typer.Typer(help="Web Scraper CLI") +console = Console() + + +def get_db() -> Session: + """ + Get a database session. + """ + return SessionLocal() + + +@app.command("scrape") +def scrape_url( + url: str = typer.Argument(..., help="URL to scrape"), + selector: Optional[str] = typer.Option( + None, help="CSS selector to extract content" + ), + user_agent: Optional[str] = typer.Option( + None, help="User agent to use for request" + ), + timeout: Optional[int] = typer.Option(None, help="Timeout for request in seconds"), + output: Optional[str] = typer.Option( + None, help="Output file path for results (JSON)" + ), +): + """ + Scrape a URL and extract content. 
+ """ + console.print(f"Scraping [bold]{url}[/bold]...") + + db = get_db() + + try: + # Create a new scrape job + job_data = { + "url": url, + "selector": selector, + "user_agent": user_agent, + "timeout": timeout, + } + job_in = {k: v for k, v in job_data.items() if v is not None} + + # Create and run the job + job = scrape_job.create(db=db, obj_in=job_in) + console.print(f"Created scrape job with ID [bold]{job.id}[/bold]") + + # Run the job + scraper = Scraper(db=db, user_agent=user_agent, timeout=timeout) + job = scraper.run_job(job_id=job.id) + + if job.status == JobStatus.COMPLETED: + console.print("[bold green]Scraping completed successfully![/bold green]") + + # Get the result + result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id) + + # Print basic info + console.print("\n[bold]Basic Information:[/bold]") + table = Table(show_header=True, header_style="bold") + table.add_column("Attribute") + table.add_column("Value") + + if result and result.extracted_data: + data = result.extracted_data + + # Add rows to table + if "title" in data: + table.add_row("Title", data["title"] or "") + + if "meta_description" in data: + table.add_row("Description", data["meta_description"] or "") + + if "h1" in data: + table.add_row( + "H1 Tags", ", ".join(data["h1"]) if data["h1"] else "" + ) + + if "links" in data: + link_count = len(data["links"]) if data["links"] else 0 + table.add_row("Links", str(link_count)) + + if selector and "selected_content" in data: + content_count = ( + len(data["selected_content"]) if data["selected_content"] else 0 + ) + table.add_row(f"Selected Content ({selector})", str(content_count)) + + console.print(table) + + # Write results to file if specified + if output: + with open(output, "w") as f: + json.dump(data, f, indent=2) + console.print(f"\nResults saved to [bold]{output}[/bold]") + + # Ask if user wants to see more details + if typer.confirm("\nDo you want to see the full extracted data?"): + console.print_json(json.dumps(data)) + else: + console.print("[yellow]No data extracted.[/yellow]") + else: + console.print(f"[bold red]Scraping failed:[/bold red] {job.error}") + + except Exception as e: + console.print(f"[bold red]Error:[/bold red] {str(e)}") + + finally: + db.close() + + +@app.command("list") +def list_jobs( + status: Optional[str] = typer.Option( + None, help="Filter by status (pending, in_progress, completed, failed)" + ), + limit: int = typer.Option(10, help="Limit number of jobs"), +): + """ + List scrape jobs. 
+ """ + db = get_db() + + try: + # Get jobs based on status + if status: + try: + job_status = JobStatus(status) + jobs = scrape_job.get_by_status(db=db, status=job_status, limit=limit) + total = scrape_job.count_by_status(db=db, status=job_status) + console.print( + f"Found [bold]{total}[/bold] jobs with status [bold]{status}[/bold]" + ) + except ValueError: + console.print(f"[bold red]Invalid status:[/bold red] {status}") + return + else: + jobs = scrape_job.get_multi(db=db, limit=limit) + total = scrape_job.count(db=db) + console.print(f"Found [bold]{total}[/bold] jobs") + + if not jobs: + console.print("[yellow]No jobs found.[/yellow]") + return + + # Create table + table = Table(show_header=True, header_style="bold") + table.add_column("ID") + table.add_column("URL") + table.add_column("Status") + table.add_column("Created") + table.add_column("Updated") + + # Add rows + for job in jobs: + table.add_row( + str(job.id), + job.url, + job.status.value, + job.created_at.strftime("%Y-%m-%d %H:%M:%S"), + job.updated_at.strftime("%Y-%m-%d %H:%M:%S"), + ) + + console.print(table) + + except Exception as e: + console.print(f"[bold red]Error:[/bold red] {str(e)}") + + finally: + db.close() + + +@app.command("show") +def show_job( + job_id: int = typer.Argument(..., help="ID of the job to show"), +): + """ + Show details of a scrape job. + """ + db = get_db() + + try: + # Get job + job = scrape_job.get(db=db, id=job_id) + + if not job: + console.print(f"[bold red]Job not found:[/bold red] {job_id}") + return + + # Print job details + console.print(f"\n[bold]Job {job_id}[/bold]") + console.print(f"URL: [bold]{job.url}[/bold]") + console.print(f"Status: [bold]{job.status.value}[/bold]") + console.print(f"Created: [bold]{job.created_at}[/bold]") + console.print(f"Updated: [bold]{job.updated_at}[/bold]") + + if job.started_at: + console.print(f"Started: [bold]{job.started_at}[/bold]") + + if job.completed_at: + console.print(f"Completed: [bold]{job.completed_at}[/bold]") + + if job.selector: + console.print(f"Selector: [bold]{job.selector}[/bold]") + + if job.error: + console.print(f"Error: [bold red]{job.error}[/bold red]") + + # Get results if job is completed + if job.status == JobStatus.COMPLETED: + result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id) + + if result and result.extracted_data: + console.print("\n[bold]Extracted Data:[/bold]") + + # Ask if user wants to see the data + if typer.confirm("Do you want to see the extracted data?"): + console.print_json(json.dumps(result.extracted_data)) + else: + console.print("[yellow]No data extracted.[/yellow]") + + except Exception as e: + console.print(f"[bold red]Error:[/bold red] {str(e)}") + + finally: + db.close() + + +@app.command("run") +def run_job( + job_id: int = typer.Argument(..., help="ID of the job to run"), +): + """ + Run a scrape job. 
+ """ + db = get_db() + + try: + # Get job + job = scrape_job.get(db=db, id=job_id) + + if not job: + console.print(f"[bold red]Job not found:[/bold red] {job_id}") + return + + console.print(f"Running job [bold]{job_id}[/bold]...") + + # Run the job + scraper = Scraper(db=db) + job = scraper.run_job(job_id=job.id) + + if job.status == JobStatus.COMPLETED: + console.print("[bold green]Job completed successfully![/bold green]") + else: + console.print(f"[bold red]Job failed:[/bold red] {job.error}") + + except Exception as e: + console.print(f"[bold red]Error:[/bold red] {str(e)}") + + finally: + db.close() + + +if __name__ == "__main__": + app() diff --git a/app/core/__init__.py b/app/core/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/core/config.py b/app/core/config.py new file mode 100644 index 0000000..212d697 --- /dev/null +++ b/app/core/config.py @@ -0,0 +1,50 @@ +from pathlib import Path +from typing import Any, Dict, Optional + +from pydantic import BaseSettings, validator + + +class Settings(BaseSettings): + # Base settings + PROJECT_NAME: str = "Web Scraper CLI" + PROJECT_DESCRIPTION: str = "A FastAPI-based web scraper with CLI interface" + VERSION: str = "0.1.0" + API_V1_STR: str = "/api/v1" + + # Server settings + HOST: str = "0.0.0.0" + PORT: int = 8000 + DEBUG: bool = True + + # Database settings + DB_DIR: Path = Path("/app") / "storage" / "db" + SQLALCHEMY_DATABASE_URL: str = f"sqlite:///{DB_DIR}/db.sqlite" + + @validator("SQLALCHEMY_DATABASE_URL", pre=True) + def validate_db_url(cls, v: Optional[str], values: Dict[str, Any]) -> str: + """ + Ensure the database directory exists. + """ + if isinstance(v, str) and v.startswith("sqlite"): + db_dir = values.get("DB_DIR") + if db_dir: + db_dir.mkdir(parents=True, exist_ok=True) + return v + return v + + # Scraper settings + DEFAULT_USER_AGENT: str = ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " + "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + ) + DEFAULT_TIMEOUT: int = 30 # seconds + + # Ratelimit settings + DEFAULT_RATE_LIMIT: float = 1.0 # requests per second + + class Config: + env_file = ".env" + case_sensitive = True + + +settings = Settings() diff --git a/app/crud/__init__.py b/app/crud/__init__.py new file mode 100644 index 0000000..5feee49 --- /dev/null +++ b/app/crud/__init__.py @@ -0,0 +1,4 @@ +from app.crud.scrape_job import scrape_job +from app.crud.scrape_result import scrape_result + +__all__ = ["scrape_job", "scrape_result"] diff --git a/app/crud/base.py b/app/crud/base.py new file mode 100644 index 0000000..6825ec9 --- /dev/null +++ b/app/crud/base.py @@ -0,0 +1,89 @@ +from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union + +from fastapi.encoders import jsonable_encoder +from pydantic import BaseModel +from sqlalchemy.orm import Session + +from app.db.session import Base + +ModelType = TypeVar("ModelType", bound=Base) +CreateSchemaType = TypeVar("CreateSchemaType", bound=BaseModel) +UpdateSchemaType = TypeVar("UpdateSchemaType", bound=BaseModel) + + +class CRUDBase(Generic[ModelType, CreateSchemaType, UpdateSchemaType]): + """ + CRUD operations base class. + """ + + def __init__(self, model: Type[ModelType]): + """ + CRUD object with default methods to Create, Read, Update, Delete (CRUD). + **Parameters** + * `model`: A SQLAlchemy model class + * `schema`: A Pydantic model (schema) class + """ + self.model = model + + def get(self, db: Session, id: Any) -> Optional[ModelType]: + """ + Get a record by ID. 
+ """ + return db.query(self.model).filter(self.model.id == id).first() + + def get_multi( + self, db: Session, *, skip: int = 0, limit: int = 100 + ) -> List[ModelType]: + """ + Get multiple records. + """ + return db.query(self.model).offset(skip).limit(limit).all() + + def count(self, db: Session) -> int: + """ + Count total records. + """ + return db.query(self.model).count() + + def create(self, db: Session, *, obj_in: CreateSchemaType) -> ModelType: + """ + Create a new record. + """ + obj_in_data = jsonable_encoder(obj_in) + db_obj = self.model(**obj_in_data) + db.add(db_obj) + db.commit() + db.refresh(db_obj) + return db_obj + + def update( + self, + db: Session, + *, + db_obj: ModelType, + obj_in: Union[UpdateSchemaType, Dict[str, Any]], + ) -> ModelType: + """ + Update a record. + """ + obj_data = jsonable_encoder(db_obj) + if isinstance(obj_in, dict): + update_data = obj_in + else: + update_data = obj_in.dict(exclude_unset=True) + for field in obj_data: + if field in update_data: + setattr(db_obj, field, update_data[field]) + db.add(db_obj) + db.commit() + db.refresh(db_obj) + return db_obj + + def remove(self, db: Session, *, id: int) -> ModelType: + """ + Remove a record. + """ + obj = db.query(self.model).get(id) + db.delete(obj) + db.commit() + return obj diff --git a/app/crud/scrape_job.py b/app/crud/scrape_job.py new file mode 100644 index 0000000..81ea897 --- /dev/null +++ b/app/crud/scrape_job.py @@ -0,0 +1,47 @@ +from typing import List + +from sqlalchemy.orm import Session + +from app.models.scrape_job import ScrapeJob, JobStatus +from app.schemas.scrape_job import ScrapeJobCreate, ScrapeJobUpdate +from app.crud.base import CRUDBase + + +class CRUDScrapeJob(CRUDBase[ScrapeJob, ScrapeJobCreate, ScrapeJobUpdate]): + """ + CRUD operations for ScrapeJob model. + """ + + def get_by_status( + self, db: Session, *, status: JobStatus, skip: int = 0, limit: int = 100 + ) -> List[ScrapeJob]: + """ + Get jobs by status. + """ + return ( + db.query(self.model) + .filter(self.model.status == status) + .offset(skip) + .limit(limit) + .all() + ) + + def count_by_status(self, db: Session, *, status: JobStatus) -> int: + """ + Count jobs by status. + """ + return db.query(self.model).filter(self.model.status == status).count() + + def get_pending_jobs(self, db: Session, *, limit: int = 10) -> List[ScrapeJob]: + """ + Get pending jobs. + """ + return ( + db.query(self.model) + .filter(self.model.status == JobStatus.PENDING) + .limit(limit) + .all() + ) + + +scrape_job = CRUDScrapeJob(ScrapeJob) diff --git a/app/crud/scrape_result.py b/app/crud/scrape_result.py new file mode 100644 index 0000000..49a59e6 --- /dev/null +++ b/app/crud/scrape_result.py @@ -0,0 +1,35 @@ +from typing import List, Optional + +from sqlalchemy.orm import Session + +from app.models.scrape_result import ScrapeResult +from app.schemas.scrape_result import ScrapeResultCreate, ScrapeResultUpdate +from app.crud.base import CRUDBase + + +class CRUDScrapeResult(CRUDBase[ScrapeResult, ScrapeResultCreate, ScrapeResultUpdate]): + """ + CRUD operations for ScrapeResult model. + """ + + def get_by_job_id(self, db: Session, *, job_id: int) -> List[ScrapeResult]: + """ + Get results by job ID. + """ + return db.query(self.model).filter(self.model.job_id == job_id).all() + + def get_latest_by_job_id( + self, db: Session, *, job_id: int + ) -> Optional[ScrapeResult]: + """ + Get the latest result by job ID. 
+ """ + return ( + db.query(self.model) + .filter(self.model.job_id == job_id) + .order_by(self.model.created_at.desc()) + .first() + ) + + +scrape_result = CRUDScrapeResult(ScrapeResult) diff --git a/app/db/__init__.py b/app/db/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/db/session.py b/app/db/session.py new file mode 100644 index 0000000..62f3fa9 --- /dev/null +++ b/app/db/session.py @@ -0,0 +1,32 @@ +from sqlalchemy import create_engine +from sqlalchemy.ext.declarative import declarative_base +from sqlalchemy.orm import sessionmaker + +from app.core.config import settings + +# Create database directory if it doesn't exist +settings.DB_DIR.mkdir(parents=True, exist_ok=True) + +# Create SQLAlchemy engine +engine = create_engine( + settings.SQLALCHEMY_DATABASE_URL, + connect_args={"check_same_thread": False}, # Only for SQLite +) + +# Create sessionmaker +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + +# Create base class for SQLAlchemy models +Base = declarative_base() + + +# Database session dependency +def get_db(): + """ + Dependency for getting a database session. + """ + db = SessionLocal() + try: + yield db + finally: + db.close() diff --git a/app/models/__init__.py b/app/models/__init__.py new file mode 100644 index 0000000..099fc35 --- /dev/null +++ b/app/models/__init__.py @@ -0,0 +1,4 @@ +from app.models.scrape_job import ScrapeJob, JobStatus +from app.models.scrape_result import ScrapeResult + +__all__ = ["ScrapeJob", "JobStatus", "ScrapeResult"] diff --git a/app/models/scrape_job.py b/app/models/scrape_job.py new file mode 100644 index 0000000..a335871 --- /dev/null +++ b/app/models/scrape_job.py @@ -0,0 +1,41 @@ +import enum + +from sqlalchemy import Column, String, Integer, DateTime, Enum, Text, JSON +from sqlalchemy.sql import func + +from app.db.session import Base + + +class JobStatus(str, enum.Enum): + PENDING = "pending" + IN_PROGRESS = "in_progress" + COMPLETED = "completed" + FAILED = "failed" + + +class ScrapeJob(Base): + """ + Model for a web scraping job. + """ + + __tablename__ = "scrape_jobs" + + id = Column(Integer, primary_key=True, index=True) + url = Column(String(2048), nullable=False, index=True) + status = Column(Enum(JobStatus), default=JobStatus.PENDING, nullable=False) + created_at = Column(DateTime, default=func.now(), nullable=False) + updated_at = Column( + DateTime, default=func.now(), onupdate=func.now(), nullable=False + ) + started_at = Column(DateTime, nullable=True) + completed_at = Column(DateTime, nullable=True) + selector = Column(String(255), nullable=True) + error = Column(Text, nullable=True) + result = Column(JSON, nullable=True) + user_agent = Column(String(255), nullable=True) + timeout = Column(Integer, nullable=True) + + def __repr__(self): + return ( + f"" + ) diff --git a/app/models/scrape_result.py b/app/models/scrape_result.py new file mode 100644 index 0000000..c2d1c66 --- /dev/null +++ b/app/models/scrape_result.py @@ -0,0 +1,29 @@ +from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, JSON +from sqlalchemy.sql import func +from sqlalchemy.orm import relationship + +from app.db.session import Base + + +class ScrapeResult(Base): + """ + Model for storing scraping results. 
+ """ + + __tablename__ = "scrape_results" + + id = Column(Integer, primary_key=True, index=True) + job_id = Column( + Integer, ForeignKey("scrape_jobs.id", ondelete="CASCADE"), nullable=False + ) + created_at = Column(DateTime, default=func.now(), nullable=False) + content_type = Column(String(100), nullable=True) + headers = Column(JSON, nullable=True) + html_content = Column(Text, nullable=True) + extracted_data = Column(JSON, nullable=True) + + # Relationship + job = relationship("ScrapeJob", backref="results") + + def __repr__(self): + return f"" diff --git a/app/schemas/__init__.py b/app/schemas/__init__.py new file mode 100644 index 0000000..17bbe69 --- /dev/null +++ b/app/schemas/__init__.py @@ -0,0 +1,27 @@ +from app.schemas.scrape_job import ( + ScrapeJobBase, + ScrapeJobCreate, + ScrapeJobUpdate, + ScrapeJob, + ScrapeJobList, +) +from app.schemas.scrape_result import ( + ScrapeResultBase, + ScrapeResultCreate, + ScrapeResultUpdate, + ScrapeResult, + ScrapeResultList, +) + +__all__ = [ + "ScrapeJobBase", + "ScrapeJobCreate", + "ScrapeJobUpdate", + "ScrapeJob", + "ScrapeJobList", + "ScrapeResultBase", + "ScrapeResultCreate", + "ScrapeResultUpdate", + "ScrapeResult", + "ScrapeResultList", +] diff --git a/app/schemas/scrape_job.py b/app/schemas/scrape_job.py new file mode 100644 index 0000000..5bd2487 --- /dev/null +++ b/app/schemas/scrape_job.py @@ -0,0 +1,74 @@ +from datetime import datetime +from typing import Optional, Dict, Any, List + +from pydantic import BaseModel, HttpUrl + +from app.models.scrape_job import JobStatus + + +class ScrapeJobBase(BaseModel): + """ + Base schema for scrape job data. + """ + + url: HttpUrl + selector: Optional[str] = None + user_agent: Optional[str] = None + timeout: Optional[int] = None + + +class ScrapeJobCreate(ScrapeJobBase): + """ + Schema for creating a new scrape job. + """ + + pass + + +class ScrapeJobUpdate(BaseModel): + """ + Schema for updating a scrape job. + """ + + url: Optional[HttpUrl] = None + status: Optional[JobStatus] = None + selector: Optional[str] = None + error: Optional[str] = None + result: Optional[Dict[str, Any]] = None + user_agent: Optional[str] = None + timeout: Optional[int] = None + + +class ScrapeJobInDBBase(ScrapeJobBase): + """ + Base schema for scrape job in database. + """ + + id: int + status: JobStatus + created_at: datetime + updated_at: datetime + started_at: Optional[datetime] = None + completed_at: Optional[datetime] = None + error: Optional[str] = None + result: Optional[Dict[str, Any]] = None + + class Config: + orm_mode = True + + +class ScrapeJob(ScrapeJobInDBBase): + """ + Schema for returned scrape job. + """ + + pass + + +class ScrapeJobList(BaseModel): + """ + Schema for a list of scrape jobs. + """ + + jobs: List[ScrapeJob] + total: int diff --git a/app/schemas/scrape_result.py b/app/schemas/scrape_result.py new file mode 100644 index 0000000..d62c0b7 --- /dev/null +++ b/app/schemas/scrape_result.py @@ -0,0 +1,64 @@ +from datetime import datetime +from typing import Optional, Dict, Any, List + +from pydantic import BaseModel + + +class ScrapeResultBase(BaseModel): + """ + Base schema for scrape result data. + """ + + job_id: int + content_type: Optional[str] = None + headers: Optional[Dict[str, Any]] = None + extracted_data: Optional[Dict[str, Any]] = None + + +class ScrapeResultCreate(ScrapeResultBase): + """ + Schema for creating a new scrape result. + """ + + html_content: Optional[str] = None + + +class ScrapeResultUpdate(BaseModel): + """ + Schema for updating a scrape result. 
+ """ + + content_type: Optional[str] = None + headers: Optional[Dict[str, Any]] = None + html_content: Optional[str] = None + extracted_data: Optional[Dict[str, Any]] = None + + +class ScrapeResultInDBBase(ScrapeResultBase): + """ + Base schema for scrape result in database. + """ + + id: int + created_at: datetime + html_content: Optional[str] = None + + class Config: + orm_mode = True + + +class ScrapeResult(ScrapeResultInDBBase): + """ + Schema for returned scrape result. + """ + + pass + + +class ScrapeResultList(BaseModel): + """ + Schema for a list of scrape results. + """ + + results: List[ScrapeResult] + total: int diff --git a/app/services/__init__.py b/app/services/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/services/scraper.py b/app/services/scraper.py new file mode 100644 index 0000000..aff069c --- /dev/null +++ b/app/services/scraper.py @@ -0,0 +1,150 @@ +import time +from datetime import datetime +from typing import Dict, Any, Optional + +import requests +from bs4 import BeautifulSoup +from sqlalchemy.orm import Session + +from app.core.config import settings +from app.models.scrape_job import ScrapeJob, JobStatus +from app.models.scrape_result import ScrapeResult + + +class Scraper: + """ + Service for web scraping. + """ + + def __init__( + self, + db: Session, + user_agent: Optional[str] = None, + timeout: Optional[int] = None, + rate_limit: Optional[float] = None, + ): + self.db = db + self.user_agent = user_agent or settings.DEFAULT_USER_AGENT + self.timeout = timeout or settings.DEFAULT_TIMEOUT + self.rate_limit = rate_limit or settings.DEFAULT_RATE_LIMIT + self._last_request_time = 0 + + def _respect_rate_limit(self) -> None: + """ + Respect rate limit by sleeping if necessary. + """ + current_time = time.time() + time_since_last_request = current_time - self._last_request_time + + if time_since_last_request < (1.0 / self.rate_limit): + sleep_time = (1.0 / self.rate_limit) - time_since_last_request + time.sleep(sleep_time) + + self._last_request_time = time.time() + + def fetch_url(self, url: str) -> requests.Response: + """ + Fetch URL respecting rate limits. + """ + self._respect_rate_limit() + + headers = { + "User-Agent": self.user_agent, + } + + response = requests.get( + url, + headers=headers, + timeout=self.timeout, + ) + response.raise_for_status() + + return response + + def parse_html(self, html: str, selector: Optional[str] = None) -> Dict[str, Any]: + """ + Parse HTML content. + """ + soup = BeautifulSoup(html, "lxml") + result = { + "title": soup.title.text if soup.title else None, + "meta_description": None, + "h1": [h1.text.strip() for h1 in soup.find_all("h1")], + "links": [ + {"href": a.get("href"), "text": a.text.strip()} + for a in soup.find_all("a") + if a.get("href") + ], + } + + # Extract meta description + meta_desc = soup.find("meta", attrs={"name": "description"}) + if meta_desc: + result["meta_description"] = meta_desc.get("content") + + # If a selector is provided, extract content matching the selector + if selector: + selected_elements = soup.select(selector) + result["selected_content"] = [ + element.text.strip() for element in selected_elements + ] + result["selected_html"] = [str(element) for element in selected_elements] + + return result + + def run_job(self, job_id: int) -> ScrapeJob: + """ + Run a scraping job. 
+ """ + # Get job from DB + job = self.db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first() + if not job: + raise ValueError(f"Job with ID {job_id} not found") + + # Update job status + job.status = JobStatus.IN_PROGRESS + job.started_at = datetime.now() + self.db.commit() + self.db.refresh(job) + + try: + # Fetch URL + response = self.fetch_url(job.url) + + # Create ScrapeResult + result = ScrapeResult( + job_id=job.id, + content_type=response.headers.get("Content-Type"), + headers=dict(response.headers), + html_content=response.text, + ) + self.db.add(result) + self.db.commit() + self.db.refresh(result) + + # Parse HTML + extracted_data = self.parse_html(response.text, job.selector) + + # Update ScrapeResult with extracted data + result.extracted_data = extracted_data + self.db.commit() + self.db.refresh(result) + + # Update job status + job.status = JobStatus.COMPLETED + job.completed_at = datetime.now() + job.result = {"result_id": result.id} + self.db.commit() + self.db.refresh(job) + + return job + + except Exception as e: + # Update job with error + job.status = JobStatus.FAILED + job.completed_at = datetime.now() + job.error = str(e) + self.db.commit() + self.db.refresh(job) + + raise e diff --git a/app/utils/__init__.py b/app/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/utils/html.py b/app/utils/html.py new file mode 100644 index 0000000..2b7c50a --- /dev/null +++ b/app/utils/html.py @@ -0,0 +1,83 @@ +from typing import List, Dict, Any +from bs4 import BeautifulSoup + + +def extract_metadata(html: str) -> Dict[str, Any]: + """ + Extract metadata from HTML. + """ + soup = BeautifulSoup(html, "lxml") + result = { + "title": None, + "description": None, + "keywords": None, + "og_title": None, + "og_description": None, + "og_image": None, + } + + # Title + if soup.title: + result["title"] = soup.title.text.strip() + + # Meta tags + for meta in soup.find_all("meta"): + name = meta.get("name", "").lower() + property = meta.get("property", "").lower() + content = meta.get("content", "") + + if name == "description": + result["description"] = content + elif name == "keywords": + result["keywords"] = content + elif property == "og:title": + result["og_title"] = content + elif property == "og:description": + result["og_description"] = content + elif property == "og:image": + result["og_image"] = content + + return result + + +def extract_links(html: str) -> List[Dict[str, str]]: + """ + Extract links from HTML. + """ + soup = BeautifulSoup(html, "lxml") + links = [] + + for a in soup.find_all("a"): + href = a.get("href") + if href: + links.append( + { + "href": href, + "text": a.text.strip(), + "title": a.get("title", ""), + "rel": a.get("rel", ""), + } + ) + + return links + + +def extract_images(html: str) -> List[Dict[str, str]]: + """ + Extract images from HTML. + """ + soup = BeautifulSoup(html, "lxml") + images = [] + + for img in soup.find_all("img"): + src = img.get("src") + if src: + images.append( + { + "src": src, + "alt": img.get("alt", ""), + "title": img.get("title", ""), + } + ) + + return images diff --git a/app/utils/url.py b/app/utils/url.py new file mode 100644 index 0000000..3f2e9d1 --- /dev/null +++ b/app/utils/url.py @@ -0,0 +1,25 @@ +from urllib.parse import urlparse, parse_qs +from typing import Dict, Any + + +def parse_url(url: str) -> Dict[str, Any]: + """ + Parse a URL into its components. 
+ """ + parsed = urlparse(url) + return { + "scheme": parsed.scheme, + "netloc": parsed.netloc, + "path": parsed.path, + "params": parsed.params, + "query": parse_qs(parsed.query), + "fragment": parsed.fragment, + } + + +def is_valid_url(url: str) -> bool: + """ + Check if a URL is valid. + """ + parsed = urlparse(url) + return bool(parsed.scheme and parsed.netloc) diff --git a/cli.py b/cli.py new file mode 100644 index 0000000..52569d5 --- /dev/null +++ b/cli.py @@ -0,0 +1,9 @@ +#!/usr/bin/env python +""" +Web Scraper CLI +""" + +from app.cli.cli import app + +if __name__ == "__main__": + app() diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 0000000..b41ecc5 --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,15 @@ +version: '3.8' + +services: + app: + build: + context: . + dockerfile: Dockerfile + ports: + - "8000:8000" + volumes: + - ./app:/app/app + - ./storage:/app/storage + environment: + - DEBUG=True + command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload \ No newline at end of file diff --git a/main.py b/main.py new file mode 100644 index 0000000..ef5e2a4 --- /dev/null +++ b/main.py @@ -0,0 +1,29 @@ +import uvicorn +from fastapi import FastAPI +from app.api.api import api_router +from app.core.config import settings + +app = FastAPI( + title=settings.PROJECT_NAME, + description=settings.PROJECT_DESCRIPTION, + version=settings.VERSION, +) + +app.include_router(api_router) + + +@app.get("/health", tags=["Health"]) +async def health_check(): + """ + Health check endpoint. + """ + return {"status": "ok"} + + +if __name__ == "__main__": + uvicorn.run( + "main:app", + host=settings.HOST, + port=settings.PORT, + reload=settings.DEBUG, + ) diff --git a/migrations/env.py b/migrations/env.py new file mode 100644 index 0000000..ced9708 --- /dev/null +++ b/migrations/env.py @@ -0,0 +1,88 @@ +from logging.config import fileConfig + +from sqlalchemy import engine_from_config +from sqlalchemy import pool + +from alembic import context + +# Import models to register them with the metadata +from app.db.session import Base + +# These imports are needed to register models with SQLAlchemy metadata +# even though they appear unused to static analyzers +import app.models.scrape_job # noqa: F401 +import app.models.scrape_result # noqa: F401 + +# this is the Alembic Config object, which provides +# access to the values within the .ini file in use. +config = context.config + +# Interpret the config file for Python logging. +# This line sets up loggers basically. +fileConfig(config.config_file_name) + +# add your model's MetaData object here +# for 'autogenerate' support +target_metadata = Base.metadata + +# other values from the config, defined by the needs of env.py, +# can be acquired: +# my_important_option = config.get_main_option("my_important_option") +# ... etc. + + +def run_migrations_offline(): + """Run migrations in 'offline' mode. + + This configures the context with just a URL + and not an Engine, though an Engine is acceptable + here as well. By skipping the Engine creation + we don't even need a DBAPI to be available. + + Calls to context.execute() here emit the given string to the + script output. + + """ + url = config.get_main_option("sqlalchemy.url") + context.configure( + url=url, + target_metadata=target_metadata, + literal_binds=True, + dialect_opts={"paramstyle": "named"}, + ) + + with context.begin_transaction(): + context.run_migrations() + + +def run_migrations_online(): + """Run migrations in 'online' mode. 
+ + In this scenario we need to create an Engine + and associate a connection with the context. + + """ + connectable = engine_from_config( + config.get_section(config.config_ini_section), + prefix="sqlalchemy.", + poolclass=pool.NullPool, + ) + + with connectable.connect() as connection: + # Check if we're using SQLite + is_sqlite = connection.dialect.name == "sqlite" + + context.configure( + connection=connection, + target_metadata=target_metadata, + render_as_batch=is_sqlite, # Critical for SQLite to handle alter table operations + ) + + with context.begin_transaction(): + context.run_migrations() + + +if context.is_offline_mode(): + run_migrations_offline() +else: + run_migrations_online() diff --git a/migrations/script.py.mako b/migrations/script.py.mako new file mode 100644 index 0000000..1e4564e --- /dev/null +++ b/migrations/script.py.mako @@ -0,0 +1,24 @@ +"""${message} + +Revision ID: ${up_revision} +Revises: ${down_revision | comma,n} +Create Date: ${create_date} + +""" +from alembic import op +import sqlalchemy as sa +${imports if imports else ""} + +# revision identifiers, used by Alembic. +revision = ${repr(up_revision)} +down_revision = ${repr(down_revision)} +branch_labels = ${repr(branch_labels)} +depends_on = ${repr(depends_on)} + + +def upgrade(): + ${upgrades if upgrades else "pass"} + + +def downgrade(): + ${downgrades if downgrades else "pass"} \ No newline at end of file diff --git a/migrations/versions/0001_initial_migration.py b/migrations/versions/0001_initial_migration.py new file mode 100644 index 0000000..0180524 --- /dev/null +++ b/migrations/versions/0001_initial_migration.py @@ -0,0 +1,75 @@ +"""Initial migration + +Revision ID: 0001 +Revises: +Create Date: 2023-06-25 + +""" + +from alembic import op +import sqlalchemy as sa +from sqlalchemy.dialects import sqlite + +# revision identifiers, used by Alembic. 
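The migration environment above can also be invoked programmatically, which is sometimes handy in a container entrypoint; a sketch equivalent to running `alembic upgrade head` from the project root:

```python
from alembic import command
from alembic.config import Config

# Uses the alembic.ini and migrations/ directory added in this patch; run from
# /app (or wherever alembic.ini lives) so script_location resolves.
alembic_cfg = Config("alembic.ini")
command.upgrade(alembic_cfg, "head")
```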
+revision = "0001" +down_revision = None +branch_labels = None +depends_on = None + + +def upgrade(): + # Create scrape_jobs table + op.create_table( + "scrape_jobs", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("url", sa.String(length=2048), nullable=False), + sa.Column( + "status", + sa.Enum("pending", "in_progress", "completed", "failed", name="jobstatus"), + nullable=False, + default="pending", + ), + sa.Column( + "created_at", sa.DateTime(), nullable=False, server_default=sa.func.now() + ), + sa.Column( + "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.now() + ), + sa.Column("started_at", sa.DateTime(), nullable=True), + sa.Column("completed_at", sa.DateTime(), nullable=True), + sa.Column("selector", sa.String(length=255), nullable=True), + sa.Column("error", sa.Text(), nullable=True), + sa.Column("result", sqlite.JSON(), nullable=True), + sa.Column("user_agent", sa.String(length=255), nullable=True), + sa.Column("timeout", sa.Integer(), nullable=True), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index(op.f("ix_scrape_jobs_id"), "scrape_jobs", ["id"], unique=False) + op.create_index(op.f("ix_scrape_jobs_url"), "scrape_jobs", ["url"], unique=False) + + # Create scrape_results table + op.create_table( + "scrape_results", + sa.Column("id", sa.Integer(), nullable=False), + sa.Column("job_id", sa.Integer(), nullable=False), + sa.Column( + "created_at", sa.DateTime(), nullable=False, server_default=sa.func.now() + ), + sa.Column("content_type", sa.String(length=100), nullable=True), + sa.Column("headers", sqlite.JSON(), nullable=True), + sa.Column("html_content", sa.Text(), nullable=True), + sa.Column("extracted_data", sqlite.JSON(), nullable=True), + sa.ForeignKeyConstraint(["job_id"], ["scrape_jobs.id"], ondelete="CASCADE"), + sa.PrimaryKeyConstraint("id"), + ) + op.create_index( + op.f("ix_scrape_results_id"), "scrape_results", ["id"], unique=False + ) + + +def downgrade(): + op.drop_index(op.f("ix_scrape_results_id"), table_name="scrape_results") + op.drop_table("scrape_results") + op.drop_index(op.f("ix_scrape_jobs_url"), table_name="scrape_jobs") + op.drop_index(op.f("ix_scrape_jobs_id"), table_name="scrape_jobs") + op.drop_table("scrape_jobs") diff --git a/migrations/versions/__init__.py b/migrations/versions/__init__.py new file mode 100644 index 0000000..011100a --- /dev/null +++ b/migrations/versions/__init__.py @@ -0,0 +1 @@ +# This file is intentionally empty to make the directory a Python package diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c03e5b5 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,14 @@ +fastapi==0.110.0 +uvicorn==0.27.1 +sqlalchemy==2.0.27 +alembic==1.13.1 +pydantic==2.6.1 +python-dotenv==1.0.1 +beautifulsoup4==4.12.2 +requests==2.31.0 +typer==0.9.0 +rich==13.7.0 +httpx==0.26.0 +lxml==4.9.3 +aiohttp==3.9.3 +ruff==0.2.2 \ No newline at end of file