Fix dependency installation issue by adding Dockerfile and docker-compose.yml
parent b7132c82ed
commit a4511b3137
27  Dockerfile  Normal file
@@ -0,0 +1,27 @@
FROM python:3.11-slim

WORKDIR /app

# Copy requirements file
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# Create the database directory
RUN mkdir -p /app/storage/db

# Set environment variables
ENV PYTHONPATH=/app
ENV HOST=0.0.0.0
ENV PORT=8000
ENV DEBUG=True

# Expose the port
EXPOSE 8000

# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
151  README.md
@@ -1,3 +1,150 @@
-# FastAPI Application
-This is a FastAPI application bootstrapped by BackendIM, the AI-powered backend generation platform.
# Web Scraper CLI

A FastAPI-based web scraper with CLI interface.

## Features

- REST API for web scraping management
- CLI tool for scraping websites
- Extract metadata, links, and specific content using CSS selectors
- Store scraping results in SQLite database
- Background job processing
- Rate limiting to avoid overloading target websites

## Installation

### Local Installation

1. Clone the repository:

```bash
git clone https://github.com/yourusername/webscrapercli.git
cd webscrapercli
```

2. Install dependencies:

```bash
pip install -r requirements.txt
```

3. Run the database migrations:

```bash
alembic upgrade head
```

### Docker Installation

1. Clone the repository:

```bash
git clone https://github.com/yourusername/webscrapercli.git
cd webscrapercli
```

2. Build and run using Docker Compose:

```bash
docker-compose up --build
```

This will:
- Build the Docker image with all dependencies
- Start the FastAPI server on port 8000
- Mount the app and storage directories as volumes for live code reloading
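
The image can also be built and run directly with Docker instead of Compose; a minimal sketch (the `webscrapercli` image tag is just an example name):

```bash
# Build the image from the Dockerfile in the repository root
docker build -t webscrapercli .

# Run it, publishing the API port and persisting the SQLite database on the host
docker run -p 8000:8000 -v "$(pwd)/storage:/app/storage" webscrapercli
```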

## Usage

### API Server

Start the API server:

```bash
# Development mode
uvicorn main:app --reload

# Production mode
uvicorn main:app --host 0.0.0.0 --port 8000
```

Access the API documentation at: http://localhost:8000/docs

### CLI Usage

The CLI provides several commands for scraping websites:

```bash
# Scrape a URL
python cli.py scrape https://example.com

# Scrape a URL with a specific selector
python cli.py scrape https://example.com --selector "div.content"

# Save the results to a file
python cli.py scrape https://example.com --output results.json

# List all scrape jobs
python cli.py list

# List scrape jobs with a specific status
python cli.py list --status completed

# Show details of a specific job
python cli.py show 1

# Run a specific job
python cli.py run 1
```

## API Endpoints

- `GET /health`: Health check endpoint
- `POST /api/v1/scrape-jobs/`: Create a new scrape job
- `GET /api/v1/scrape-jobs/`: List scrape jobs
- `GET /api/v1/scrape-jobs/{job_id}`: Get a specific scrape job
- `PUT /api/v1/scrape-jobs/{job_id}`: Update a scrape job
- `DELETE /api/v1/scrape-jobs/{job_id}`: Delete a scrape job
- `POST /api/v1/scrape-jobs/{job_id}/run`: Run a scrape job
- `GET /api/v1/scrape-jobs/{job_id}/results`: Get the results of a scrape job
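
For example, a job can be created and run over HTTP with any client; a minimal `curl` sketch (assuming the server is running locally on port 8000, and using job ID 1 as a placeholder):

```bash
# Create a job (fields follow the ScrapeJobCreate schema)
curl -X POST http://localhost:8000/api/v1/scrape-jobs/ \
  -H "Content-Type: application/json" \
  -d '{"url": "https://example.com", "selector": "div.content"}'

# Run the job, then fetch its latest result
curl -X POST http://localhost:8000/api/v1/scrape-jobs/1/run
curl http://localhost:8000/api/v1/scrape-jobs/1/results
```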

## Development

### Project Structure

```
webscrapercli/
├── alembic.ini           # Alembic configuration
├── app/                  # Application package
│   ├── api/              # API endpoints
│   ├── cli/              # CLI implementation
│   ├── core/             # Core functionality
│   ├── crud/             # CRUD operations
│   ├── db/               # Database configuration
│   ├── models/           # SQLAlchemy models
│   ├── schemas/          # Pydantic schemas
│   ├── services/         # Business logic
│   └── utils/            # Utility functions
├── cli.py                # CLI entry point
├── docker-compose.yml    # Docker Compose configuration
├── Dockerfile            # Docker configuration
├── main.py               # API entry point
├── migrations/           # Alembic migrations
│   ├── env.py            # Alembic environment
│   ├── script.py.mako    # Alembic script template
│   └── versions/         # Migration scripts
├── requirements.txt      # Dependencies
└── storage/              # Storage directory for database and other files
    └── db/               # Database directory
```

### Running Tests

```bash
# Run tests
pytest
```

## License

This project is open source.
84  alembic.ini  Normal file
@@ -0,0 +1,84 @@
# A generic, single database configuration.

[alembic]
# path to migration scripts
script_location = migrations

# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s

# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =

# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40

# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false

# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false

# version location specification; this defaults
# to migrations/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat migrations/versions

# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8

sqlalchemy.url = sqlite:////app/storage/db/db.sqlite

[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples

# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks=black
# black.type=console_scripts
# black.entrypoint=black
# black.options=-l 79

# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic

[handlers]
keys = console

[formatters]
keys = generic

[logger_root]
level = WARN
handlers = console
qualname =

[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine

[logger_alembic]
level = INFO
handlers =
qualname = alembic

[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic

[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
0  app/__init__.py  Normal file
0  app/api/__init__.py  Normal file
9  app/api/api.py  Normal file
@@ -0,0 +1,9 @@
from fastapi import APIRouter

from app.api.endpoints import scrape_jobs

api_router = APIRouter()

api_router.include_router(
    scrape_jobs.router, prefix="/scrape-jobs", tags=["scrape-jobs"]
)
15  app/api/deps.py  Normal file
@@ -0,0 +1,15 @@
from typing import Generator


from app.db.session import SessionLocal


def get_db() -> Generator:
    """
    Get a database session.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
1  app/api/endpoints/__init__.py  Normal file
@@ -0,0 +1 @@
# This file is intentionally empty to make the directory a Python package
201  app/api/endpoints/scrape_jobs.py  Normal file
@@ -0,0 +1,201 @@
from typing import Any, Optional

from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks, status
from sqlalchemy.orm import Session

from app.api.deps import get_db
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper
from app.crud import scrape_job, scrape_result
from app.schemas.scrape_job import (
    ScrapeJob,
    ScrapeJobCreate,
    ScrapeJobUpdate,
    ScrapeJobList,
)
from app.schemas.scrape_result import ScrapeResult


router = APIRouter()


@router.post("/", response_model=ScrapeJob, status_code=status.HTTP_201_CREATED)
def create_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_in: ScrapeJobCreate,
    background_tasks: BackgroundTasks,
) -> Any:
    """
    Create a new scrape job.
    """
    job = scrape_job.create(db=db, obj_in=job_in)

    # Run job in background via the module-level helper, which opens its own session
    background_tasks.add_task(_run_job, job_id=job.id)

    return job


@router.get("/", response_model=ScrapeJobList)
def list_scrape_jobs(
    *,
    db: Session = Depends(get_db),
    skip: int = 0,
    limit: int = 100,
    status: Optional[JobStatus] = None,
) -> Any:
    """
    List scrape jobs.
    """
    if status:
        jobs = scrape_job.get_by_status(db=db, status=status, skip=skip, limit=limit)
        total = scrape_job.count_by_status(db=db, status=status)
    else:
        jobs = scrape_job.get_multi(db=db, skip=skip, limit=limit)
        total = scrape_job.count(db=db)

    return {"jobs": jobs, "total": total}


@router.get("/{job_id}", response_model=ScrapeJob)
def get_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> Any:
    """
    Get a scrape job by ID.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    return job


@router.put("/{job_id}", response_model=ScrapeJob)
def update_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
    job_in: ScrapeJobUpdate,
) -> Any:
    """
    Update a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    job = scrape_job.update(db=db, db_obj=job, obj_in=job_in)
    return job


@router.delete("/{job_id}", status_code=status.HTTP_204_NO_CONTENT, response_model=None)
def delete_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> None:
    """
    Delete a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    scrape_job.remove(db=db, id=job_id)


@router.post("/{job_id}/run", response_model=ScrapeJob)
def run_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
    background_tasks: Optional[BackgroundTasks] = None,
) -> Any:
    """
    Run a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )

    if job.status == JobStatus.IN_PROGRESS:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Scrape job with ID {job_id} is already in progress",
        )

    # If called with background_tasks, run in background
    if background_tasks:
        background_tasks.add_task(_run_job, job_id=job_id)
        # Update job status to pending
        job = scrape_job.update(db=db, db_obj=job, obj_in={"status": JobStatus.PENDING})
        return job

    # Otherwise, run synchronously
    return _run_job(job_id=job_id)


@router.get("/{job_id}/results", response_model=ScrapeResult)
def get_scrape_results(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> Any:
    """
    Get the latest result for a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )

    result = scrape_result.get_latest_by_job_id(db=db, job_id=job_id)
    if not result:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"No results found for scrape job with ID {job_id}",
        )

    return result


def _run_job(job_id: int) -> ScrapeJob:
    """
    Internal function to run a scrape job.
    """
    # Create a new session and scraper
    db = next(get_db())
    scraper = Scraper(db=db)

    try:
        # Run the job
        job = scraper.run_job(job_id=job_id)
        return job
    except Exception as e:
        # Make sure the job is marked as failed
        job = scrape_job.get(db=db, id=job_id)
        if job and job.status != JobStatus.FAILED:
            scrape_job.update(
                db=db,
                db_obj=job,
                obj_in={"status": JobStatus.FAILED, "error": str(e)},
            )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error running scrape job: {str(e)}",
        )
0  app/cli/__init__.py  Normal file
277  app/cli/cli.py  Normal file
@@ -0,0 +1,277 @@
import json
from typing import Optional

import typer
from rich.console import Console
from rich.table import Table
from sqlalchemy.orm import Session

from app.db.session import SessionLocal
from app.crud import scrape_job, scrape_result
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper

app = typer.Typer(help="Web Scraper CLI")
console = Console()


def get_db() -> Session:
    """
    Get a database session.
    """
    return SessionLocal()


@app.command("scrape")
def scrape_url(
    url: str = typer.Argument(..., help="URL to scrape"),
    selector: Optional[str] = typer.Option(
        None, help="CSS selector to extract content"
    ),
    user_agent: Optional[str] = typer.Option(
        None, help="User agent to use for request"
    ),
    timeout: Optional[int] = typer.Option(None, help="Timeout for request in seconds"),
    output: Optional[str] = typer.Option(
        None, help="Output file path for results (JSON)"
    ),
):
    """
    Scrape a URL and extract content.
    """
    console.print(f"Scraping [bold]{url}[/bold]...")

    db = get_db()

    try:
        # Create a new scrape job
        job_data = {
            "url": url,
            "selector": selector,
            "user_agent": user_agent,
            "timeout": timeout,
        }
        job_in = {k: v for k, v in job_data.items() if v is not None}

        # Create and run the job
        job = scrape_job.create(db=db, obj_in=job_in)
        console.print(f"Created scrape job with ID [bold]{job.id}[/bold]")

        # Run the job
        scraper = Scraper(db=db, user_agent=user_agent, timeout=timeout)
        job = scraper.run_job(job_id=job.id)

        if job.status == JobStatus.COMPLETED:
            console.print("[bold green]Scraping completed successfully![/bold green]")

            # Get the result
            result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)

            # Print basic info
            console.print("\n[bold]Basic Information:[/bold]")
            table = Table(show_header=True, header_style="bold")
            table.add_column("Attribute")
            table.add_column("Value")

            if result and result.extracted_data:
                data = result.extracted_data

                # Add rows to table
                if "title" in data:
                    table.add_row("Title", data["title"] or "")

                if "meta_description" in data:
                    table.add_row("Description", data["meta_description"] or "")

                if "h1" in data:
                    table.add_row(
                        "H1 Tags", ", ".join(data["h1"]) if data["h1"] else ""
                    )

                if "links" in data:
                    link_count = len(data["links"]) if data["links"] else 0
                    table.add_row("Links", str(link_count))

                if selector and "selected_content" in data:
                    content_count = (
                        len(data["selected_content"]) if data["selected_content"] else 0
                    )
                    table.add_row(f"Selected Content ({selector})", str(content_count))

                console.print(table)

                # Write results to file if specified
                if output:
                    with open(output, "w") as f:
                        json.dump(data, f, indent=2)
                    console.print(f"\nResults saved to [bold]{output}[/bold]")

                # Ask if user wants to see more details
                if typer.confirm("\nDo you want to see the full extracted data?"):
                    console.print_json(json.dumps(data))
            else:
                console.print("[yellow]No data extracted.[/yellow]")
        else:
            console.print(f"[bold red]Scraping failed:[/bold red] {job.error}")

    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {str(e)}")

    finally:
        db.close()


@app.command("list")
def list_jobs(
    status: Optional[str] = typer.Option(
        None, help="Filter by status (pending, in_progress, completed, failed)"
    ),
    limit: int = typer.Option(10, help="Limit number of jobs"),
):
    """
    List scrape jobs.
    """
    db = get_db()

    try:
        # Get jobs based on status
        if status:
            try:
                job_status = JobStatus(status)
                jobs = scrape_job.get_by_status(db=db, status=job_status, limit=limit)
                total = scrape_job.count_by_status(db=db, status=job_status)
                console.print(
                    f"Found [bold]{total}[/bold] jobs with status [bold]{status}[/bold]"
                )
            except ValueError:
                console.print(f"[bold red]Invalid status:[/bold red] {status}")
                return
        else:
            jobs = scrape_job.get_multi(db=db, limit=limit)
            total = scrape_job.count(db=db)
            console.print(f"Found [bold]{total}[/bold] jobs")

        if not jobs:
            console.print("[yellow]No jobs found.[/yellow]")
            return

        # Create table
        table = Table(show_header=True, header_style="bold")
        table.add_column("ID")
        table.add_column("URL")
        table.add_column("Status")
        table.add_column("Created")
        table.add_column("Updated")

        # Add rows
        for job in jobs:
            table.add_row(
                str(job.id),
                job.url,
                job.status.value,
                job.created_at.strftime("%Y-%m-%d %H:%M:%S"),
                job.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
            )

        console.print(table)

    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {str(e)}")

    finally:
        db.close()


@app.command("show")
def show_job(
    job_id: int = typer.Argument(..., help="ID of the job to show"),
):
    """
    Show details of a scrape job.
    """
    db = get_db()

    try:
        # Get job
        job = scrape_job.get(db=db, id=job_id)

        if not job:
            console.print(f"[bold red]Job not found:[/bold red] {job_id}")
            return

        # Print job details
        console.print(f"\n[bold]Job {job_id}[/bold]")
        console.print(f"URL: [bold]{job.url}[/bold]")
        console.print(f"Status: [bold]{job.status.value}[/bold]")
        console.print(f"Created: [bold]{job.created_at}[/bold]")
        console.print(f"Updated: [bold]{job.updated_at}[/bold]")

        if job.started_at:
            console.print(f"Started: [bold]{job.started_at}[/bold]")

        if job.completed_at:
            console.print(f"Completed: [bold]{job.completed_at}[/bold]")

        if job.selector:
            console.print(f"Selector: [bold]{job.selector}[/bold]")

        if job.error:
            console.print(f"Error: [bold red]{job.error}[/bold red]")

        # Get results if job is completed
        if job.status == JobStatus.COMPLETED:
            result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)

            if result and result.extracted_data:
                console.print("\n[bold]Extracted Data:[/bold]")

                # Ask if user wants to see the data
                if typer.confirm("Do you want to see the extracted data?"):
                    console.print_json(json.dumps(result.extracted_data))
            else:
                console.print("[yellow]No data extracted.[/yellow]")

    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {str(e)}")

    finally:
        db.close()


@app.command("run")
def run_job(
    job_id: int = typer.Argument(..., help="ID of the job to run"),
):
    """
    Run a scrape job.
    """
    db = get_db()

    try:
        # Get job
        job = scrape_job.get(db=db, id=job_id)

        if not job:
            console.print(f"[bold red]Job not found:[/bold red] {job_id}")
            return

        console.print(f"Running job [bold]{job_id}[/bold]...")

        # Run the job
        scraper = Scraper(db=db)
        job = scraper.run_job(job_id=job.id)

        if job.status == JobStatus.COMPLETED:
            console.print("[bold green]Job completed successfully![/bold green]")
        else:
            console.print(f"[bold red]Job failed:[/bold red] {job.error}")

    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {str(e)}")

    finally:
        db.close()


if __name__ == "__main__":
    app()
0  app/core/__init__.py  Normal file
50  app/core/config.py  Normal file
@@ -0,0 +1,50 @@
from pathlib import Path
from typing import Any, Dict, Optional

from pydantic import BaseSettings, validator


class Settings(BaseSettings):
    # Base settings
    PROJECT_NAME: str = "Web Scraper CLI"
    PROJECT_DESCRIPTION: str = "A FastAPI-based web scraper with CLI interface"
    VERSION: str = "0.1.0"
    API_V1_STR: str = "/api/v1"

    # Server settings
    HOST: str = "0.0.0.0"
    PORT: int = 8000
    DEBUG: bool = True

    # Database settings
    DB_DIR: Path = Path("/app") / "storage" / "db"
    SQLALCHEMY_DATABASE_URL: str = f"sqlite:///{DB_DIR}/db.sqlite"

    @validator("SQLALCHEMY_DATABASE_URL", pre=True)
    def validate_db_url(cls, v: Optional[str], values: Dict[str, Any]) -> str:
        """
        Ensure the database directory exists.
        """
        if isinstance(v, str) and v.startswith("sqlite"):
            db_dir = values.get("DB_DIR")
            if db_dir:
                db_dir.mkdir(parents=True, exist_ok=True)
            return v
        return v

    # Scraper settings
    DEFAULT_USER_AGENT: str = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    DEFAULT_TIMEOUT: int = 30  # seconds

    # Ratelimit settings
    DEFAULT_RATE_LIMIT: float = 1.0  # requests per second

    class Config:
        env_file = ".env"
        case_sensitive = True


settings = Settings()
4  app/crud/__init__.py  Normal file
@@ -0,0 +1,4 @@
from app.crud.scrape_job import scrape_job
from app.crud.scrape_result import scrape_result

__all__ = ["scrape_job", "scrape_result"]
89  app/crud/base.py  Normal file
@@ -0,0 +1,89 @@
from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union

from fastapi.encoders import jsonable_encoder
from pydantic import BaseModel
from sqlalchemy.orm import Session

from app.db.session import Base

ModelType = TypeVar("ModelType", bound=Base)
CreateSchemaType = TypeVar("CreateSchemaType", bound=BaseModel)
UpdateSchemaType = TypeVar("UpdateSchemaType", bound=BaseModel)


class CRUDBase(Generic[ModelType, CreateSchemaType, UpdateSchemaType]):
    """
    CRUD operations base class.
    """

    def __init__(self, model: Type[ModelType]):
        """
        CRUD object with default methods to Create, Read, Update, Delete (CRUD).
        **Parameters**
        * `model`: A SQLAlchemy model class
        * `schema`: A Pydantic model (schema) class
        """
        self.model = model

    def get(self, db: Session, id: Any) -> Optional[ModelType]:
        """
        Get a record by ID.
        """
        return db.query(self.model).filter(self.model.id == id).first()

    def get_multi(
        self, db: Session, *, skip: int = 0, limit: int = 100
    ) -> List[ModelType]:
        """
        Get multiple records.
        """
        return db.query(self.model).offset(skip).limit(limit).all()

    def count(self, db: Session) -> int:
        """
        Count total records.
        """
        return db.query(self.model).count()

    def create(self, db: Session, *, obj_in: CreateSchemaType) -> ModelType:
        """
        Create a new record.
        """
        obj_in_data = jsonable_encoder(obj_in)
        db_obj = self.model(**obj_in_data)
        db.add(db_obj)
        db.commit()
        db.refresh(db_obj)
        return db_obj

    def update(
        self,
        db: Session,
        *,
        db_obj: ModelType,
        obj_in: Union[UpdateSchemaType, Dict[str, Any]],
    ) -> ModelType:
        """
        Update a record.
        """
        obj_data = jsonable_encoder(db_obj)
        if isinstance(obj_in, dict):
            update_data = obj_in
        else:
            update_data = obj_in.dict(exclude_unset=True)
        for field in obj_data:
            if field in update_data:
                setattr(db_obj, field, update_data[field])
        db.add(db_obj)
        db.commit()
        db.refresh(db_obj)
        return db_obj

    def remove(self, db: Session, *, id: int) -> ModelType:
        """
        Remove a record.
        """
        obj = db.query(self.model).get(id)
        db.delete(obj)
        db.commit()
        return obj
47  app/crud/scrape_job.py  Normal file
@@ -0,0 +1,47 @@
from typing import List

from sqlalchemy.orm import Session

from app.models.scrape_job import ScrapeJob, JobStatus
from app.schemas.scrape_job import ScrapeJobCreate, ScrapeJobUpdate
from app.crud.base import CRUDBase


class CRUDScrapeJob(CRUDBase[ScrapeJob, ScrapeJobCreate, ScrapeJobUpdate]):
    """
    CRUD operations for ScrapeJob model.
    """

    def get_by_status(
        self, db: Session, *, status: JobStatus, skip: int = 0, limit: int = 100
    ) -> List[ScrapeJob]:
        """
        Get jobs by status.
        """
        return (
            db.query(self.model)
            .filter(self.model.status == status)
            .offset(skip)
            .limit(limit)
            .all()
        )

    def count_by_status(self, db: Session, *, status: JobStatus) -> int:
        """
        Count jobs by status.
        """
        return db.query(self.model).filter(self.model.status == status).count()

    def get_pending_jobs(self, db: Session, *, limit: int = 10) -> List[ScrapeJob]:
        """
        Get pending jobs.
        """
        return (
            db.query(self.model)
            .filter(self.model.status == JobStatus.PENDING)
            .limit(limit)
            .all()
        )


scrape_job = CRUDScrapeJob(ScrapeJob)
35  app/crud/scrape_result.py  Normal file
@@ -0,0 +1,35 @@
from typing import List, Optional

from sqlalchemy.orm import Session

from app.models.scrape_result import ScrapeResult
from app.schemas.scrape_result import ScrapeResultCreate, ScrapeResultUpdate
from app.crud.base import CRUDBase


class CRUDScrapeResult(CRUDBase[ScrapeResult, ScrapeResultCreate, ScrapeResultUpdate]):
    """
    CRUD operations for ScrapeResult model.
    """

    def get_by_job_id(self, db: Session, *, job_id: int) -> List[ScrapeResult]:
        """
        Get results by job ID.
        """
        return db.query(self.model).filter(self.model.job_id == job_id).all()

    def get_latest_by_job_id(
        self, db: Session, *, job_id: int
    ) -> Optional[ScrapeResult]:
        """
        Get the latest result by job ID.
        """
        return (
            db.query(self.model)
            .filter(self.model.job_id == job_id)
            .order_by(self.model.created_at.desc())
            .first()
        )


scrape_result = CRUDScrapeResult(ScrapeResult)
0  app/db/__init__.py  Normal file
32  app/db/session.py  Normal file
@@ -0,0 +1,32 @@
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from app.core.config import settings

# Create database directory if it doesn't exist
settings.DB_DIR.mkdir(parents=True, exist_ok=True)

# Create SQLAlchemy engine
engine = create_engine(
    settings.SQLALCHEMY_DATABASE_URL,
    connect_args={"check_same_thread": False},  # Only for SQLite
)

# Create sessionmaker
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Create base class for SQLAlchemy models
Base = declarative_base()


# Database session dependency
def get_db():
    """
    Dependency for getting a database session.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()
4  app/models/__init__.py  Normal file
@@ -0,0 +1,4 @@
from app.models.scrape_job import ScrapeJob, JobStatus
from app.models.scrape_result import ScrapeResult

__all__ = ["ScrapeJob", "JobStatus", "ScrapeResult"]
41  app/models/scrape_job.py  Normal file
@@ -0,0 +1,41 @@
import enum

from sqlalchemy import Column, String, Integer, DateTime, Enum, Text, JSON
from sqlalchemy.sql import func

from app.db.session import Base


class JobStatus(str, enum.Enum):
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"


class ScrapeJob(Base):
    """
    Model for a web scraping job.
    """

    __tablename__ = "scrape_jobs"

    id = Column(Integer, primary_key=True, index=True)
    url = Column(String(2048), nullable=False, index=True)
    status = Column(Enum(JobStatus), default=JobStatus.PENDING, nullable=False)
    created_at = Column(DateTime, default=func.now(), nullable=False)
    updated_at = Column(
        DateTime, default=func.now(), onupdate=func.now(), nullable=False
    )
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    selector = Column(String(255), nullable=True)
    error = Column(Text, nullable=True)
    result = Column(JSON, nullable=True)
    user_agent = Column(String(255), nullable=True)
    timeout = Column(Integer, nullable=True)

    def __repr__(self):
        return (
            f"<ScrapeJob(id={self.id}, url='{self.url}', status='{self.status.value}')>"
        )
29  app/models/scrape_result.py  Normal file
@@ -0,0 +1,29 @@
from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, JSON
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship

from app.db.session import Base


class ScrapeResult(Base):
    """
    Model for storing scraping results.
    """

    __tablename__ = "scrape_results"

    id = Column(Integer, primary_key=True, index=True)
    job_id = Column(
        Integer, ForeignKey("scrape_jobs.id", ondelete="CASCADE"), nullable=False
    )
    created_at = Column(DateTime, default=func.now(), nullable=False)
    content_type = Column(String(100), nullable=True)
    headers = Column(JSON, nullable=True)
    html_content = Column(Text, nullable=True)
    extracted_data = Column(JSON, nullable=True)

    # Relationship
    job = relationship("ScrapeJob", backref="results")

    def __repr__(self):
        return f"<ScrapeResult(id={self.id}, job_id={self.job_id})>"
27  app/schemas/__init__.py  Normal file
@@ -0,0 +1,27 @@
from app.schemas.scrape_job import (
    ScrapeJobBase,
    ScrapeJobCreate,
    ScrapeJobUpdate,
    ScrapeJob,
    ScrapeJobList,
)
from app.schemas.scrape_result import (
    ScrapeResultBase,
    ScrapeResultCreate,
    ScrapeResultUpdate,
    ScrapeResult,
    ScrapeResultList,
)

__all__ = [
    "ScrapeJobBase",
    "ScrapeJobCreate",
    "ScrapeJobUpdate",
    "ScrapeJob",
    "ScrapeJobList",
    "ScrapeResultBase",
    "ScrapeResultCreate",
    "ScrapeResultUpdate",
    "ScrapeResult",
    "ScrapeResultList",
]
74  app/schemas/scrape_job.py  Normal file
@@ -0,0 +1,74 @@
from datetime import datetime
from typing import Optional, Dict, Any, List

from pydantic import BaseModel, HttpUrl

from app.models.scrape_job import JobStatus


class ScrapeJobBase(BaseModel):
    """
    Base schema for scrape job data.
    """

    url: HttpUrl
    selector: Optional[str] = None
    user_agent: Optional[str] = None
    timeout: Optional[int] = None


class ScrapeJobCreate(ScrapeJobBase):
    """
    Schema for creating a new scrape job.
    """

    pass


class ScrapeJobUpdate(BaseModel):
    """
    Schema for updating a scrape job.
    """

    url: Optional[HttpUrl] = None
    status: Optional[JobStatus] = None
    selector: Optional[str] = None
    error: Optional[str] = None
    result: Optional[Dict[str, Any]] = None
    user_agent: Optional[str] = None
    timeout: Optional[int] = None


class ScrapeJobInDBBase(ScrapeJobBase):
    """
    Base schema for scrape job in database.
    """

    id: int
    status: JobStatus
    created_at: datetime
    updated_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    error: Optional[str] = None
    result: Optional[Dict[str, Any]] = None

    class Config:
        orm_mode = True


class ScrapeJob(ScrapeJobInDBBase):
    """
    Schema for returned scrape job.
    """

    pass


class ScrapeJobList(BaseModel):
    """
    Schema for a list of scrape jobs.
    """

    jobs: List[ScrapeJob]
    total: int
64  app/schemas/scrape_result.py  Normal file
@@ -0,0 +1,64 @@
from datetime import datetime
from typing import Optional, Dict, Any, List

from pydantic import BaseModel


class ScrapeResultBase(BaseModel):
    """
    Base schema for scrape result data.
    """

    job_id: int
    content_type: Optional[str] = None
    headers: Optional[Dict[str, Any]] = None
    extracted_data: Optional[Dict[str, Any]] = None


class ScrapeResultCreate(ScrapeResultBase):
    """
    Schema for creating a new scrape result.
    """

    html_content: Optional[str] = None


class ScrapeResultUpdate(BaseModel):
    """
    Schema for updating a scrape result.
    """

    content_type: Optional[str] = None
    headers: Optional[Dict[str, Any]] = None
    html_content: Optional[str] = None
    extracted_data: Optional[Dict[str, Any]] = None


class ScrapeResultInDBBase(ScrapeResultBase):
    """
    Base schema for scrape result in database.
    """

    id: int
    created_at: datetime
    html_content: Optional[str] = None

    class Config:
        orm_mode = True


class ScrapeResult(ScrapeResultInDBBase):
    """
    Schema for returned scrape result.
    """

    pass


class ScrapeResultList(BaseModel):
    """
    Schema for a list of scrape results.
    """

    results: List[ScrapeResult]
    total: int
0  app/services/__init__.py  Normal file
150  app/services/scraper.py  Normal file
@@ -0,0 +1,150 @@
import time
from datetime import datetime
from typing import Dict, Any, Optional

import requests
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session

from app.core.config import settings
from app.models.scrape_job import ScrapeJob, JobStatus
from app.models.scrape_result import ScrapeResult


class Scraper:
    """
    Service for web scraping.
    """

    def __init__(
        self,
        db: Session,
        user_agent: Optional[str] = None,
        timeout: Optional[int] = None,
        rate_limit: Optional[float] = None,
    ):
        self.db = db
        self.user_agent = user_agent or settings.DEFAULT_USER_AGENT
        self.timeout = timeout or settings.DEFAULT_TIMEOUT
        self.rate_limit = rate_limit or settings.DEFAULT_RATE_LIMIT
        self._last_request_time = 0

    def _respect_rate_limit(self) -> None:
        """
        Respect rate limit by sleeping if necessary.
        """
        current_time = time.time()
        time_since_last_request = current_time - self._last_request_time

        if time_since_last_request < (1.0 / self.rate_limit):
            sleep_time = (1.0 / self.rate_limit) - time_since_last_request
            time.sleep(sleep_time)

        self._last_request_time = time.time()

    def fetch_url(self, url: str) -> requests.Response:
        """
        Fetch URL respecting rate limits.
        """
        self._respect_rate_limit()

        headers = {
            "User-Agent": self.user_agent,
        }

        response = requests.get(
            url,
            headers=headers,
            timeout=self.timeout,
        )
        response.raise_for_status()

        return response

    def parse_html(self, html: str, selector: Optional[str] = None) -> Dict[str, Any]:
        """
        Parse HTML content.
        """
        soup = BeautifulSoup(html, "lxml")
        result = {
            "title": soup.title.text if soup.title else None,
            "meta_description": None,
            "h1": [h1.text.strip() for h1 in soup.find_all("h1")],
            "links": [
                {"href": a.get("href"), "text": a.text.strip()}
                for a in soup.find_all("a")
                if a.get("href")
            ],
        }

        # Extract meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            result["meta_description"] = meta_desc.get("content")

        # If a selector is provided, extract content matching the selector
        if selector:
            selected_elements = soup.select(selector)
            result["selected_content"] = [
                element.text.strip() for element in selected_elements
            ]
            result["selected_html"] = [str(element) for element in selected_elements]

        return result

    def run_job(self, job_id: int) -> ScrapeJob:
        """
        Run a scraping job.
        """
        # Get job from DB
        job = self.db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
        if not job:
            raise ValueError(f"Job with ID {job_id} not found")

        # Update job status
        job.status = JobStatus.IN_PROGRESS
        job.started_at = datetime.now()
        self.db.commit()
        self.db.refresh(job)

        try:
            # Fetch URL
            response = self.fetch_url(job.url)

            # Create ScrapeResult
            result = ScrapeResult(
                job_id=job.id,
                content_type=response.headers.get("Content-Type"),
                headers=dict(response.headers),
                html_content=response.text,
            )
            self.db.add(result)
            self.db.commit()
            self.db.refresh(result)

            # Parse HTML
            extracted_data = self.parse_html(response.text, job.selector)

            # Update ScrapeResult with extracted data
            result.extracted_data = extracted_data
            self.db.commit()
            self.db.refresh(result)

            # Update job status
            job.status = JobStatus.COMPLETED
            job.completed_at = datetime.now()
            job.result = {"result_id": result.id}
            self.db.commit()
            self.db.refresh(job)

            return job

        except Exception as e:
            # Update job with error
            job.status = JobStatus.FAILED
            job.completed_at = datetime.now()
            job.error = str(e)
            self.db.commit()
            self.db.refresh(job)

            raise e
0  app/utils/__init__.py  Normal file
83  app/utils/html.py  Normal file
@@ -0,0 +1,83 @@
from typing import List, Dict, Any
from bs4 import BeautifulSoup


def extract_metadata(html: str) -> Dict[str, Any]:
    """
    Extract metadata from HTML.
    """
    soup = BeautifulSoup(html, "lxml")
    result = {
        "title": None,
        "description": None,
        "keywords": None,
        "og_title": None,
        "og_description": None,
        "og_image": None,
    }

    # Title
    if soup.title:
        result["title"] = soup.title.text.strip()

    # Meta tags
    for meta in soup.find_all("meta"):
        name = meta.get("name", "").lower()
        property = meta.get("property", "").lower()
        content = meta.get("content", "")

        if name == "description":
            result["description"] = content
        elif name == "keywords":
            result["keywords"] = content
        elif property == "og:title":
            result["og_title"] = content
        elif property == "og:description":
            result["og_description"] = content
        elif property == "og:image":
            result["og_image"] = content

    return result


def extract_links(html: str) -> List[Dict[str, str]]:
    """
    Extract links from HTML.
    """
    soup = BeautifulSoup(html, "lxml")
    links = []

    for a in soup.find_all("a"):
        href = a.get("href")
        if href:
            links.append(
                {
                    "href": href,
                    "text": a.text.strip(),
                    "title": a.get("title", ""),
                    "rel": a.get("rel", ""),
                }
            )

    return links


def extract_images(html: str) -> List[Dict[str, str]]:
    """
    Extract images from HTML.
    """
    soup = BeautifulSoup(html, "lxml")
    images = []

    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            images.append(
                {
                    "src": src,
                    "alt": img.get("alt", ""),
                    "title": img.get("title", ""),
                }
            )

    return images
25  app/utils/url.py  Normal file
@@ -0,0 +1,25 @@
from urllib.parse import urlparse, parse_qs
from typing import Dict, Any


def parse_url(url: str) -> Dict[str, Any]:
    """
    Parse a URL into its components.
    """
    parsed = urlparse(url)
    return {
        "scheme": parsed.scheme,
        "netloc": parsed.netloc,
        "path": parsed.path,
        "params": parsed.params,
        "query": parse_qs(parsed.query),
        "fragment": parsed.fragment,
    }


def is_valid_url(url: str) -> bool:
    """
    Check if a URL is valid.
    """
    parsed = urlparse(url)
    return bool(parsed.scheme and parsed.netloc)
9  cli.py  Normal file
@@ -0,0 +1,9 @@
#!/usr/bin/env python
"""
Web Scraper CLI
"""

from app.cli.cli import app

if __name__ == "__main__":
    app()
15  docker-compose.yml  Normal file
@@ -0,0 +1,15 @@
version: '3.8'

services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "8000:8000"
    volumes:
      - ./app:/app/app
      - ./storage:/app/storage
    environment:
      - DEBUG=True
    command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
29  main.py  Normal file
@@ -0,0 +1,29 @@
import uvicorn
from fastapi import FastAPI
from app.api.api import api_router
from app.core.config import settings

app = FastAPI(
    title=settings.PROJECT_NAME,
    description=settings.PROJECT_DESCRIPTION,
    version=settings.VERSION,
)

app.include_router(api_router, prefix=settings.API_V1_STR)  # serve under /api/v1 as documented in the README


@app.get("/health", tags=["Health"])
async def health_check():
    """
    Health check endpoint.
    """
    return {"status": "ok"}


if __name__ == "__main__":
    uvicorn.run(
        "main:app",
        host=settings.HOST,
        port=settings.PORT,
        reload=settings.DEBUG,
    )
88  migrations/env.py  Normal file
@@ -0,0 +1,88 @@
from logging.config import fileConfig

from sqlalchemy import engine_from_config
from sqlalchemy import pool

from alembic import context

# Import models to register them with the metadata
from app.db.session import Base

# These imports are needed to register models with SQLAlchemy metadata
# even though they appear unused to static analyzers
import app.models.scrape_job  # noqa: F401
import app.models.scrape_result  # noqa: F401

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
target_metadata = Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def run_migrations_offline():
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online():
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.

    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        # Check if we're using SQLite
        is_sqlite = connection.dialect.name == "sqlite"

        context.configure(
            connection=connection,
            target_metadata=target_metadata,
            render_as_batch=is_sqlite,  # Critical for SQLite to handle alter table operations
        )

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()
24  migrations/script.py.mako  Normal file
@@ -0,0 +1,24 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}


def upgrade():
    ${upgrades if upgrades else "pass"}


def downgrade():
    ${downgrades if downgrades else "pass"}
75  migrations/versions/0001_initial_migration.py  Normal file
@@ -0,0 +1,75 @@
"""Initial migration

Revision ID: 0001
Revises:
Create Date: 2023-06-25

"""

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

# revision identifiers, used by Alembic.
revision = "0001"
down_revision = None
branch_labels = None
depends_on = None


def upgrade():
    # Create scrape_jobs table
    op.create_table(
        "scrape_jobs",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("url", sa.String(length=2048), nullable=False),
        sa.Column(
            "status",
            sa.Enum("pending", "in_progress", "completed", "failed", name="jobstatus"),
            nullable=False,
            default="pending",
        ),
        sa.Column(
            "created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
        ),
        sa.Column(
            "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
        ),
        sa.Column("started_at", sa.DateTime(), nullable=True),
        sa.Column("completed_at", sa.DateTime(), nullable=True),
        sa.Column("selector", sa.String(length=255), nullable=True),
        sa.Column("error", sa.Text(), nullable=True),
        sa.Column("result", sqlite.JSON(), nullable=True),
        sa.Column("user_agent", sa.String(length=255), nullable=True),
        sa.Column("timeout", sa.Integer(), nullable=True),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index(op.f("ix_scrape_jobs_id"), "scrape_jobs", ["id"], unique=False)
    op.create_index(op.f("ix_scrape_jobs_url"), "scrape_jobs", ["url"], unique=False)

    # Create scrape_results table
    op.create_table(
        "scrape_results",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("job_id", sa.Integer(), nullable=False),
        sa.Column(
            "created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
        ),
        sa.Column("content_type", sa.String(length=100), nullable=True),
        sa.Column("headers", sqlite.JSON(), nullable=True),
        sa.Column("html_content", sa.Text(), nullable=True),
        sa.Column("extracted_data", sqlite.JSON(), nullable=True),
        sa.ForeignKeyConstraint(["job_id"], ["scrape_jobs.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index(
        op.f("ix_scrape_results_id"), "scrape_results", ["id"], unique=False
    )


def downgrade():
    op.drop_index(op.f("ix_scrape_results_id"), table_name="scrape_results")
    op.drop_table("scrape_results")
    op.drop_index(op.f("ix_scrape_jobs_url"), table_name="scrape_jobs")
    op.drop_index(op.f("ix_scrape_jobs_id"), table_name="scrape_jobs")
    op.drop_table("scrape_jobs")
1  migrations/versions/__init__.py  Normal file
@@ -0,0 +1 @@
# This file is intentionally empty to make the directory a Python package
14  requirements.txt  Normal file
@@ -0,0 +1,14 @@
fastapi==0.110.0
uvicorn==0.27.1
sqlalchemy==2.0.27
alembic==1.13.1
pydantic==2.6.1
python-dotenv==1.0.1
beautifulsoup4==4.12.2
requests==2.31.0
typer==0.9.0
rich==13.7.0
httpx==0.26.0
lxml==4.9.3
aiohttp==3.9.3
ruff==0.2.2