Fix dependency installation issue by adding Dockerfile and docker-compose.yml

parent b7132c82ed
commit a4511b3137

Dockerfile (new file, 27 lines)
@@ -0,0 +1,27 @@
FROM python:3.11-slim

WORKDIR /app

# Copy requirements file
COPY requirements.txt .

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the application
COPY . .

# Create the database directory
RUN mkdir -p /app/storage/db

# Set environment variables
ENV PYTHONPATH=/app
ENV HOST=0.0.0.0
ENV PORT=8000
ENV DEBUG=True

# Expose the port
EXPOSE 8000

# Run the application
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"]
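
If you want to use this image without Docker Compose, building and running it directly should also work. A minimal sketch (the `webscrapercli` tag is only an illustrative name, not something defined by this commit):

```bash
# Build the image from the Dockerfile above and run it on the exposed port
docker build -t webscrapercli .
docker run --rm -p 8000:8000 webscrapercli
```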

README.md (151 lines changed)
@@ -1,3 +1,150 @@
-# FastAPI Application
-This is a FastAPI application bootstrapped by BackendIM, the AI-powered backend generation platform.
+# Web Scraper CLI
+
+A FastAPI-based web scraper with a CLI interface.

## Features

- REST API for web scraping management
- CLI tool for scraping websites
- Extract metadata, links, and specific content using CSS selectors
- Store scraping results in an SQLite database
- Background job processing
- Rate limiting to avoid overloading target websites

## Installation

### Local Installation

1. Clone the repository:

```bash
git clone https://github.com/yourusername/webscrapercli.git
cd webscrapercli
```

2. Install dependencies:

```bash
pip install -r requirements.txt
```

3. Run the database migrations:

```bash
alembic upgrade head
```

### Docker Installation

1. Clone the repository:

```bash
git clone https://github.com/yourusername/webscrapercli.git
cd webscrapercli
```

2. Build and run using Docker Compose:

```bash
docker-compose up --build
```

This will:
- Build the Docker image with all dependencies
- Start the FastAPI server on port 8000
- Mount the app and storage directories as volumes for live code reloading
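
If the database tables have not been created yet, the Alembic migrations can be applied inside the running container, for example:

```bash
# Apply migrations inside the "app" service container from docker-compose.yml
docker-compose exec app alembic upgrade head
```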

## Usage

### API Server

Start the API server:

```bash
# Development mode
uvicorn main:app --reload

# Production mode
uvicorn main:app --host 0.0.0.0 --port 8000
```

Access the API documentation at: http://localhost:8000/docs

### CLI Usage

The CLI provides several commands for scraping websites:

```bash
# Scrape a URL
python cli.py scrape https://example.com

# Scrape a URL with a specific selector
python cli.py scrape https://example.com --selector "div.content"

# Save the results to a file
python cli.py scrape https://example.com --output results.json

# List all scrape jobs
python cli.py list

# List scrape jobs with a specific status
python cli.py list --status completed

# Show details of a specific job
python cli.py show 1

# Run a specific job
python cli.py run 1
```

## API Endpoints

- `GET /health`: Health check endpoint
- `POST /api/v1/scrape-jobs/`: Create a new scrape job
- `GET /api/v1/scrape-jobs/`: List scrape jobs
- `GET /api/v1/scrape-jobs/{job_id}`: Get a specific scrape job
- `PUT /api/v1/scrape-jobs/{job_id}`: Update a scrape job
- `DELETE /api/v1/scrape-jobs/{job_id}`: Delete a scrape job
- `POST /api/v1/scrape-jobs/{job_id}/run`: Run a scrape job
- `GET /api/v1/scrape-jobs/{job_id}/results`: Get the results of a scrape job
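
For quick manual testing, the endpoints can be exercised with `curl`. A minimal sketch, assuming the server is running on localhost:8000 and the created job gets ID 1:

```bash
# Create a new scrape job
curl -X POST http://localhost:8000/api/v1/scrape-jobs/ \
  -H "Content-Type: application/json" \
  -d '{"url": "https://example.com", "selector": "div.content"}'

# Check the job and fetch its latest result
curl http://localhost:8000/api/v1/scrape-jobs/1
curl http://localhost:8000/api/v1/scrape-jobs/1/results
```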

## Development

### Project Structure

```
webscrapercli/
├── alembic.ini           # Alembic configuration
├── app/                  # Application package
│   ├── api/              # API endpoints
│   ├── cli/              # CLI implementation
│   ├── core/             # Core functionality
│   ├── crud/             # CRUD operations
│   ├── db/               # Database configuration
│   ├── models/           # SQLAlchemy models
│   ├── schemas/          # Pydantic schemas
│   ├── services/         # Business logic
│   └── utils/            # Utility functions
├── cli.py                # CLI entry point
├── docker-compose.yml    # Docker Compose configuration
├── Dockerfile            # Docker configuration
├── main.py               # API entry point
├── migrations/           # Alembic migrations
│   ├── env.py            # Alembic environment
│   ├── script.py.mako    # Alembic script template
│   └── versions/         # Migration scripts
├── requirements.txt      # Dependencies
└── storage/              # Storage directory for database and other files
    └── db/               # Database directory
```

### Running Tests

```bash
# Run tests
pytest
```

## License

This project is open source.

alembic.ini (new file, 84 lines)
@@ -0,0 +1,84 @@
# A generic, single database configuration.

[alembic]
# path to migration scripts
script_location = migrations

# template used to generate migration files
# file_template = %%(rev)s_%%(slug)s

# timezone to use when rendering the date
# within the migration file as well as the filename.
# string value is passed to dateutil.tz.gettz()
# leave blank for localtime
# timezone =

# max length of characters to apply to the
# "slug" field
# truncate_slug_length = 40

# set to 'true' to run the environment during
# the 'revision' command, regardless of autogenerate
# revision_environment = false

# set to 'true' to allow .pyc and .pyo files without
# a source .py file to be detected as revisions in the
# versions/ directory
# sourceless = false

# version location specification; this defaults
# to migrations/versions. When using multiple version
# directories, initial revisions must be specified with --version-path
# version_locations = %(here)s/bar %(here)s/bat migrations/versions

# the output encoding used when revision files
# are written from script.py.mako
# output_encoding = utf-8

sqlalchemy.url = sqlite:////app/storage/db/db.sqlite

[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
# on newly generated revision scripts. See the documentation for further
# detail and examples

# format using "black" - use the console_scripts runner, against the "black" entrypoint
# hooks=black
# black.type=console_scripts
# black.entrypoint=black
# black.options=-l 79

# Logging configuration
[loggers]
keys = root,sqlalchemy,alembic

[handlers]
keys = console

[formatters]
keys = generic

[logger_root]
level = WARN
handlers = console
qualname =

[logger_sqlalchemy]
level = WARN
handlers =
qualname = sqlalchemy.engine

[logger_alembic]
level = INFO
handlers =
qualname = alembic

[handler_console]
class = StreamHandler
args = (sys.stderr,)
level = NOTSET
formatter = generic

[formatter_generic]
format = %(levelname)-5.5s [%(name)s] %(message)s
datefmt = %H:%M:%S
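
With this configuration and the model metadata wired up in migrations/env.py, new revisions can be generated and applied with the standard Alembic commands. A sketch (the revision message is just an example):

```bash
# Autogenerate a revision from model changes, then apply it
alembic revision --autogenerate -m "describe your change"
alembic upgrade head
```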

app/__init__.py (new file, empty)
app/api/__init__.py (new file, empty)

app/api/api.py (new file, 9 lines)
@@ -0,0 +1,9 @@
from fastapi import APIRouter

from app.api.endpoints import scrape_jobs

api_router = APIRouter()

api_router.include_router(
    scrape_jobs.router, prefix="/scrape-jobs", tags=["scrape-jobs"]
)

app/api/deps.py (new file, 15 lines)
@@ -0,0 +1,15 @@
from typing import Generator

from app.db.session import SessionLocal


def get_db() -> Generator:
    """
    Get a database session.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()

app/api/endpoints/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
# This file is intentionally empty to make the directory a Python package

app/api/endpoints/scrape_jobs.py (new file, 201 lines)
@@ -0,0 +1,201 @@
from typing import Any, Optional

from fastapi import APIRouter, Depends, HTTPException, BackgroundTasks, status
from sqlalchemy.orm import Session

from app.api.deps import get_db
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper
from app.crud import scrape_job, scrape_result
from app.schemas.scrape_job import (
    ScrapeJob,
    ScrapeJobCreate,
    ScrapeJobUpdate,
    ScrapeJobList,
)
from app.schemas.scrape_result import ScrapeResult


router = APIRouter()


@router.post("/", response_model=ScrapeJob, status_code=status.HTTP_201_CREATED)
def create_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_in: ScrapeJobCreate,
    background_tasks: BackgroundTasks,
) -> Any:
    """
    Create a new scrape job.
    """
    job = scrape_job.create(db=db, obj_in=job_in)

    # Run job in background via the internal helper so the task gets its own DB session
    background_tasks.add_task(_run_job, job_id=job.id)

    return job


@router.get("/", response_model=ScrapeJobList)
def list_scrape_jobs(
    *,
    db: Session = Depends(get_db),
    skip: int = 0,
    limit: int = 100,
    status: Optional[JobStatus] = None,
) -> Any:
    """
    List scrape jobs.
    """
    if status:
        jobs = scrape_job.get_by_status(db=db, status=status, skip=skip, limit=limit)
        total = scrape_job.count_by_status(db=db, status=status)
    else:
        jobs = scrape_job.get_multi(db=db, skip=skip, limit=limit)
        total = scrape_job.count(db=db)

    return {"jobs": jobs, "total": total}


@router.get("/{job_id}", response_model=ScrapeJob)
def get_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> Any:
    """
    Get a scrape job by ID.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    return job


@router.put("/{job_id}", response_model=ScrapeJob)
def update_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
    job_in: ScrapeJobUpdate,
) -> Any:
    """
    Update a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    job = scrape_job.update(db=db, db_obj=job, obj_in=job_in)
    return job


@router.delete("/{job_id}", status_code=status.HTTP_204_NO_CONTENT, response_model=None)
def delete_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> None:
    """
    Delete a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )
    scrape_job.remove(db=db, id=job_id)


@router.post("/{job_id}/run", response_model=ScrapeJob)
def run_scrape_job(
    *,
    db: Session = Depends(get_db),
    job_id: int,
    background_tasks: Optional[BackgroundTasks] = None,
) -> Any:
    """
    Run a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )

    if job.status == JobStatus.IN_PROGRESS:
        raise HTTPException(
            status_code=status.HTTP_400_BAD_REQUEST,
            detail=f"Scrape job with ID {job_id} is already in progress",
        )

    # If called with background_tasks, run in background
    if background_tasks:
        background_tasks.add_task(_run_job, job_id=job_id)
        # Update job status to pending
        job = scrape_job.update(db=db, db_obj=job, obj_in={"status": JobStatus.PENDING})
        return job

    # Otherwise, run synchronously
    return _run_job(job_id=job_id)


@router.get("/{job_id}/results", response_model=ScrapeResult)
def get_scrape_results(
    *,
    db: Session = Depends(get_db),
    job_id: int,
) -> Any:
    """
    Get the latest result for a scrape job.
    """
    job = scrape_job.get(db=db, id=job_id)
    if not job:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"Scrape job with ID {job_id} not found",
        )

    result = scrape_result.get_latest_by_job_id(db=db, job_id=job_id)
    if not result:
        raise HTTPException(
            status_code=status.HTTP_404_NOT_FOUND,
            detail=f"No results found for scrape job with ID {job_id}",
        )

    return result


def _run_job(job_id: int) -> ScrapeJob:
    """
    Internal function to run a scrape job.
    """
    # Create a new session and scraper
    db = next(get_db())
    scraper = Scraper(db=db)

    try:
        # Run the job
        job = scraper.run_job(job_id=job_id)
        return job
    except Exception as e:
        # Make sure the job is marked as failed
        job = scrape_job.get(db=db, id=job_id)
        if job and job.status != JobStatus.FAILED:
            scrape_job.update(
                db=db,
                db_obj=job,
                obj_in={"status": JobStatus.FAILED, "error": str(e)},
            )
        raise HTTPException(
            status_code=status.HTTP_500_INTERNAL_SERVER_ERROR,
            detail=f"Error running scrape job: {str(e)}",
        )

app/cli/__init__.py (new file, empty)

app/cli/cli.py (new file, 277 lines)
@@ -0,0 +1,277 @@
import json
from typing import Optional

import typer
from rich.console import Console
from rich.table import Table
from sqlalchemy.orm import Session

from app.db.session import SessionLocal
from app.crud import scrape_job, scrape_result
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper

app = typer.Typer(help="Web Scraper CLI")
console = Console()


def get_db() -> Session:
    """
    Get a database session.
    """
    return SessionLocal()


@app.command("scrape")
def scrape_url(
    url: str = typer.Argument(..., help="URL to scrape"),
    selector: Optional[str] = typer.Option(
        None, help="CSS selector to extract content"
    ),
    user_agent: Optional[str] = typer.Option(
        None, help="User agent to use for request"
    ),
    timeout: Optional[int] = typer.Option(None, help="Timeout for request in seconds"),
    output: Optional[str] = typer.Option(
        None, help="Output file path for results (JSON)"
    ),
):
    """
    Scrape a URL and extract content.
    """
    console.print(f"Scraping [bold]{url}[/bold]...")

    db = get_db()

    try:
        # Create a new scrape job
        job_data = {
            "url": url,
            "selector": selector,
            "user_agent": user_agent,
            "timeout": timeout,
        }
        job_in = {k: v for k, v in job_data.items() if v is not None}

        # Create and run the job
        job = scrape_job.create(db=db, obj_in=job_in)
        console.print(f"Created scrape job with ID [bold]{job.id}[/bold]")

        # Run the job
        scraper = Scraper(db=db, user_agent=user_agent, timeout=timeout)
        job = scraper.run_job(job_id=job.id)

        if job.status == JobStatus.COMPLETED:
            console.print("[bold green]Scraping completed successfully![/bold green]")

            # Get the result
            result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)

            # Print basic info
            console.print("\n[bold]Basic Information:[/bold]")
            table = Table(show_header=True, header_style="bold")
            table.add_column("Attribute")
            table.add_column("Value")

            if result and result.extracted_data:
                data = result.extracted_data

                # Add rows to table
                if "title" in data:
                    table.add_row("Title", data["title"] or "")

                if "meta_description" in data:
                    table.add_row("Description", data["meta_description"] or "")

                if "h1" in data:
                    table.add_row(
                        "H1 Tags", ", ".join(data["h1"]) if data["h1"] else ""
                    )

                if "links" in data:
                    link_count = len(data["links"]) if data["links"] else 0
                    table.add_row("Links", str(link_count))

                if selector and "selected_content" in data:
                    content_count = (
                        len(data["selected_content"]) if data["selected_content"] else 0
                    )
                    table.add_row(f"Selected Content ({selector})", str(content_count))

                console.print(table)

                # Write results to file if specified
                if output:
                    with open(output, "w") as f:
                        json.dump(data, f, indent=2)
                    console.print(f"\nResults saved to [bold]{output}[/bold]")

                # Ask if user wants to see more details
                if typer.confirm("\nDo you want to see the full extracted data?"):
                    console.print_json(json.dumps(data))
            else:
                console.print("[yellow]No data extracted.[/yellow]")
        else:
            console.print(f"[bold red]Scraping failed:[/bold red] {job.error}")

    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {str(e)}")

    finally:
        db.close()


@app.command("list")
def list_jobs(
    status: Optional[str] = typer.Option(
        None, help="Filter by status (pending, in_progress, completed, failed)"
    ),
    limit: int = typer.Option(10, help="Limit number of jobs"),
):
    """
    List scrape jobs.
    """
    db = get_db()

    try:
        # Get jobs based on status
        if status:
            try:
                job_status = JobStatus(status)
                jobs = scrape_job.get_by_status(db=db, status=job_status, limit=limit)
                total = scrape_job.count_by_status(db=db, status=job_status)
                console.print(
                    f"Found [bold]{total}[/bold] jobs with status [bold]{status}[/bold]"
                )
            except ValueError:
                console.print(f"[bold red]Invalid status:[/bold red] {status}")
                return
        else:
            jobs = scrape_job.get_multi(db=db, limit=limit)
            total = scrape_job.count(db=db)
            console.print(f"Found [bold]{total}[/bold] jobs")

        if not jobs:
            console.print("[yellow]No jobs found.[/yellow]")
            return

        # Create table
        table = Table(show_header=True, header_style="bold")
        table.add_column("ID")
        table.add_column("URL")
        table.add_column("Status")
        table.add_column("Created")
        table.add_column("Updated")

        # Add rows
        for job in jobs:
            table.add_row(
                str(job.id),
                job.url,
                job.status.value,
                job.created_at.strftime("%Y-%m-%d %H:%M:%S"),
                job.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
            )

        console.print(table)

    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {str(e)}")

    finally:
        db.close()


@app.command("show")
def show_job(
    job_id: int = typer.Argument(..., help="ID of the job to show"),
):
    """
    Show details of a scrape job.
    """
    db = get_db()

    try:
        # Get job
        job = scrape_job.get(db=db, id=job_id)

        if not job:
            console.print(f"[bold red]Job not found:[/bold red] {job_id}")
            return

        # Print job details
        console.print(f"\n[bold]Job {job_id}[/bold]")
        console.print(f"URL: [bold]{job.url}[/bold]")
        console.print(f"Status: [bold]{job.status.value}[/bold]")
        console.print(f"Created: [bold]{job.created_at}[/bold]")
        console.print(f"Updated: [bold]{job.updated_at}[/bold]")

        if job.started_at:
            console.print(f"Started: [bold]{job.started_at}[/bold]")

        if job.completed_at:
            console.print(f"Completed: [bold]{job.completed_at}[/bold]")

        if job.selector:
            console.print(f"Selector: [bold]{job.selector}[/bold]")

        if job.error:
            console.print(f"Error: [bold red]{job.error}[/bold red]")

        # Get results if job is completed
        if job.status == JobStatus.COMPLETED:
            result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)

            if result and result.extracted_data:
                console.print("\n[bold]Extracted Data:[/bold]")

                # Ask if user wants to see the data
                if typer.confirm("Do you want to see the extracted data?"):
                    console.print_json(json.dumps(result.extracted_data))
            else:
                console.print("[yellow]No data extracted.[/yellow]")

    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {str(e)}")

    finally:
        db.close()


@app.command("run")
def run_job(
    job_id: int = typer.Argument(..., help="ID of the job to run"),
):
    """
    Run a scrape job.
    """
    db = get_db()

    try:
        # Get job
        job = scrape_job.get(db=db, id=job_id)

        if not job:
            console.print(f"[bold red]Job not found:[/bold red] {job_id}")
            return

        console.print(f"Running job [bold]{job_id}[/bold]...")

        # Run the job
        scraper = Scraper(db=db)
        job = scraper.run_job(job_id=job.id)

        if job.status == JobStatus.COMPLETED:
            console.print("[bold green]Job completed successfully![/bold green]")
        else:
            console.print(f"[bold red]Job failed:[/bold red] {job.error}")

    except Exception as e:
        console.print(f"[bold red]Error:[/bold red] {str(e)}")

    finally:
        db.close()


if __name__ == "__main__":
    app()

app/core/__init__.py (new file, empty)

app/core/config.py (new file, 50 lines)
@@ -0,0 +1,50 @@
from pathlib import Path
from typing import Any, Dict, Optional

from pydantic import BaseSettings, validator


class Settings(BaseSettings):
    # Base settings
    PROJECT_NAME: str = "Web Scraper CLI"
    PROJECT_DESCRIPTION: str = "A FastAPI-based web scraper with CLI interface"
    VERSION: str = "0.1.0"
    API_V1_STR: str = "/api/v1"

    # Server settings
    HOST: str = "0.0.0.0"
    PORT: int = 8000
    DEBUG: bool = True

    # Database settings
    DB_DIR: Path = Path("/app") / "storage" / "db"
    SQLALCHEMY_DATABASE_URL: str = f"sqlite:///{DB_DIR}/db.sqlite"

    @validator("SQLALCHEMY_DATABASE_URL", pre=True)
    def validate_db_url(cls, v: Optional[str], values: Dict[str, Any]) -> str:
        """
        Ensure the database directory exists.
        """
        if isinstance(v, str) and v.startswith("sqlite"):
            db_dir = values.get("DB_DIR")
            if db_dir:
                db_dir.mkdir(parents=True, exist_ok=True)
            return v
        return v

    # Scraper settings
    DEFAULT_USER_AGENT: str = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
    )
    DEFAULT_TIMEOUT: int = 30  # seconds

    # Rate limit settings
    DEFAULT_RATE_LIMIT: float = 1.0  # requests per second

    class Config:
        env_file = ".env"
        case_sensitive = True


settings = Settings()

app/crud/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from app.crud.scrape_job import scrape_job
from app.crud.scrape_result import scrape_result

__all__ = ["scrape_job", "scrape_result"]

app/crud/base.py (new file, 89 lines)
@@ -0,0 +1,89 @@
from typing import Any, Dict, Generic, List, Optional, Type, TypeVar, Union

from fastapi.encoders import jsonable_encoder
from pydantic import BaseModel
from sqlalchemy.orm import Session

from app.db.session import Base

ModelType = TypeVar("ModelType", bound=Base)
CreateSchemaType = TypeVar("CreateSchemaType", bound=BaseModel)
UpdateSchemaType = TypeVar("UpdateSchemaType", bound=BaseModel)


class CRUDBase(Generic[ModelType, CreateSchemaType, UpdateSchemaType]):
    """
    CRUD operations base class.
    """

    def __init__(self, model: Type[ModelType]):
        """
        CRUD object with default methods to Create, Read, Update, Delete (CRUD).
        **Parameters**
        * `model`: A SQLAlchemy model class
        * `schema`: A Pydantic model (schema) class
        """
        self.model = model

    def get(self, db: Session, id: Any) -> Optional[ModelType]:
        """
        Get a record by ID.
        """
        return db.query(self.model).filter(self.model.id == id).first()

    def get_multi(
        self, db: Session, *, skip: int = 0, limit: int = 100
    ) -> List[ModelType]:
        """
        Get multiple records.
        """
        return db.query(self.model).offset(skip).limit(limit).all()

    def count(self, db: Session) -> int:
        """
        Count total records.
        """
        return db.query(self.model).count()

    def create(self, db: Session, *, obj_in: CreateSchemaType) -> ModelType:
        """
        Create a new record.
        """
        obj_in_data = jsonable_encoder(obj_in)
        db_obj = self.model(**obj_in_data)
        db.add(db_obj)
        db.commit()
        db.refresh(db_obj)
        return db_obj

    def update(
        self,
        db: Session,
        *,
        db_obj: ModelType,
        obj_in: Union[UpdateSchemaType, Dict[str, Any]],
    ) -> ModelType:
        """
        Update a record.
        """
        obj_data = jsonable_encoder(db_obj)
        if isinstance(obj_in, dict):
            update_data = obj_in
        else:
            update_data = obj_in.dict(exclude_unset=True)
        for field in obj_data:
            if field in update_data:
                setattr(db_obj, field, update_data[field])
        db.add(db_obj)
        db.commit()
        db.refresh(db_obj)
        return db_obj

    def remove(self, db: Session, *, id: int) -> ModelType:
        """
        Remove a record.
        """
        obj = db.query(self.model).get(id)
        db.delete(obj)
        db.commit()
        return obj

app/crud/scrape_job.py (new file, 47 lines)
@@ -0,0 +1,47 @@
from typing import List

from sqlalchemy.orm import Session

from app.models.scrape_job import ScrapeJob, JobStatus
from app.schemas.scrape_job import ScrapeJobCreate, ScrapeJobUpdate
from app.crud.base import CRUDBase


class CRUDScrapeJob(CRUDBase[ScrapeJob, ScrapeJobCreate, ScrapeJobUpdate]):
    """
    CRUD operations for ScrapeJob model.
    """

    def get_by_status(
        self, db: Session, *, status: JobStatus, skip: int = 0, limit: int = 100
    ) -> List[ScrapeJob]:
        """
        Get jobs by status.
        """
        return (
            db.query(self.model)
            .filter(self.model.status == status)
            .offset(skip)
            .limit(limit)
            .all()
        )

    def count_by_status(self, db: Session, *, status: JobStatus) -> int:
        """
        Count jobs by status.
        """
        return db.query(self.model).filter(self.model.status == status).count()

    def get_pending_jobs(self, db: Session, *, limit: int = 10) -> List[ScrapeJob]:
        """
        Get pending jobs.
        """
        return (
            db.query(self.model)
            .filter(self.model.status == JobStatus.PENDING)
            .limit(limit)
            .all()
        )


scrape_job = CRUDScrapeJob(ScrapeJob)

app/crud/scrape_result.py (new file, 35 lines)
@@ -0,0 +1,35 @@
from typing import List, Optional

from sqlalchemy.orm import Session

from app.models.scrape_result import ScrapeResult
from app.schemas.scrape_result import ScrapeResultCreate, ScrapeResultUpdate
from app.crud.base import CRUDBase


class CRUDScrapeResult(CRUDBase[ScrapeResult, ScrapeResultCreate, ScrapeResultUpdate]):
    """
    CRUD operations for ScrapeResult model.
    """

    def get_by_job_id(self, db: Session, *, job_id: int) -> List[ScrapeResult]:
        """
        Get results by job ID.
        """
        return db.query(self.model).filter(self.model.job_id == job_id).all()

    def get_latest_by_job_id(
        self, db: Session, *, job_id: int
    ) -> Optional[ScrapeResult]:
        """
        Get the latest result by job ID.
        """
        return (
            db.query(self.model)
            .filter(self.model.job_id == job_id)
            .order_by(self.model.created_at.desc())
            .first()
        )


scrape_result = CRUDScrapeResult(ScrapeResult)

app/db/__init__.py (new file, empty)

app/db/session.py (new file, 32 lines)
@@ -0,0 +1,32 @@
from sqlalchemy import create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

from app.core.config import settings

# Create database directory if it doesn't exist
settings.DB_DIR.mkdir(parents=True, exist_ok=True)

# Create SQLAlchemy engine
engine = create_engine(
    settings.SQLALCHEMY_DATABASE_URL,
    connect_args={"check_same_thread": False},  # Only for SQLite
)

# Create sessionmaker
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Create base class for SQLAlchemy models
Base = declarative_base()


# Database session dependency
def get_db():
    """
    Dependency for getting a database session.
    """
    db = SessionLocal()
    try:
        yield db
    finally:
        db.close()

app/models/__init__.py (new file, 4 lines)
@@ -0,0 +1,4 @@
from app.models.scrape_job import ScrapeJob, JobStatus
from app.models.scrape_result import ScrapeResult

__all__ = ["ScrapeJob", "JobStatus", "ScrapeResult"]

app/models/scrape_job.py (new file, 41 lines)
@@ -0,0 +1,41 @@
import enum

from sqlalchemy import Column, String, Integer, DateTime, Enum, Text, JSON
from sqlalchemy.sql import func

from app.db.session import Base


class JobStatus(str, enum.Enum):
    PENDING = "pending"
    IN_PROGRESS = "in_progress"
    COMPLETED = "completed"
    FAILED = "failed"


class ScrapeJob(Base):
    """
    Model for a web scraping job.
    """

    __tablename__ = "scrape_jobs"

    id = Column(Integer, primary_key=True, index=True)
    url = Column(String(2048), nullable=False, index=True)
    status = Column(Enum(JobStatus), default=JobStatus.PENDING, nullable=False)
    created_at = Column(DateTime, default=func.now(), nullable=False)
    updated_at = Column(
        DateTime, default=func.now(), onupdate=func.now(), nullable=False
    )
    started_at = Column(DateTime, nullable=True)
    completed_at = Column(DateTime, nullable=True)
    selector = Column(String(255), nullable=True)
    error = Column(Text, nullable=True)
    result = Column(JSON, nullable=True)
    user_agent = Column(String(255), nullable=True)
    timeout = Column(Integer, nullable=True)

    def __repr__(self):
        return (
            f"<ScrapeJob(id={self.id}, url='{self.url}', status='{self.status.value}')>"
        )

app/models/scrape_result.py (new file, 29 lines)
@@ -0,0 +1,29 @@
from sqlalchemy import Column, String, Integer, DateTime, ForeignKey, Text, JSON
from sqlalchemy.sql import func
from sqlalchemy.orm import relationship

from app.db.session import Base


class ScrapeResult(Base):
    """
    Model for storing scraping results.
    """

    __tablename__ = "scrape_results"

    id = Column(Integer, primary_key=True, index=True)
    job_id = Column(
        Integer, ForeignKey("scrape_jobs.id", ondelete="CASCADE"), nullable=False
    )
    created_at = Column(DateTime, default=func.now(), nullable=False)
    content_type = Column(String(100), nullable=True)
    headers = Column(JSON, nullable=True)
    html_content = Column(Text, nullable=True)
    extracted_data = Column(JSON, nullable=True)

    # Relationship
    job = relationship("ScrapeJob", backref="results")

    def __repr__(self):
        return f"<ScrapeResult(id={self.id}, job_id={self.job_id})>"

app/schemas/__init__.py (new file, 27 lines)
@@ -0,0 +1,27 @@
from app.schemas.scrape_job import (
    ScrapeJobBase,
    ScrapeJobCreate,
    ScrapeJobUpdate,
    ScrapeJob,
    ScrapeJobList,
)
from app.schemas.scrape_result import (
    ScrapeResultBase,
    ScrapeResultCreate,
    ScrapeResultUpdate,
    ScrapeResult,
    ScrapeResultList,
)

__all__ = [
    "ScrapeJobBase",
    "ScrapeJobCreate",
    "ScrapeJobUpdate",
    "ScrapeJob",
    "ScrapeJobList",
    "ScrapeResultBase",
    "ScrapeResultCreate",
    "ScrapeResultUpdate",
    "ScrapeResult",
    "ScrapeResultList",
]

app/schemas/scrape_job.py (new file, 74 lines)
@@ -0,0 +1,74 @@
from datetime import datetime
from typing import Optional, Dict, Any, List

from pydantic import BaseModel, HttpUrl

from app.models.scrape_job import JobStatus


class ScrapeJobBase(BaseModel):
    """
    Base schema for scrape job data.
    """

    url: HttpUrl
    selector: Optional[str] = None
    user_agent: Optional[str] = None
    timeout: Optional[int] = None


class ScrapeJobCreate(ScrapeJobBase):
    """
    Schema for creating a new scrape job.
    """

    pass


class ScrapeJobUpdate(BaseModel):
    """
    Schema for updating a scrape job.
    """

    url: Optional[HttpUrl] = None
    status: Optional[JobStatus] = None
    selector: Optional[str] = None
    error: Optional[str] = None
    result: Optional[Dict[str, Any]] = None
    user_agent: Optional[str] = None
    timeout: Optional[int] = None


class ScrapeJobInDBBase(ScrapeJobBase):
    """
    Base schema for scrape job in database.
    """

    id: int
    status: JobStatus
    created_at: datetime
    updated_at: datetime
    started_at: Optional[datetime] = None
    completed_at: Optional[datetime] = None
    error: Optional[str] = None
    result: Optional[Dict[str, Any]] = None

    class Config:
        orm_mode = True


class ScrapeJob(ScrapeJobInDBBase):
    """
    Schema for returned scrape job.
    """

    pass


class ScrapeJobList(BaseModel):
    """
    Schema for a list of scrape jobs.
    """

    jobs: List[ScrapeJob]
    total: int

app/schemas/scrape_result.py (new file, 64 lines)
@@ -0,0 +1,64 @@
from datetime import datetime
from typing import Optional, Dict, Any, List

from pydantic import BaseModel


class ScrapeResultBase(BaseModel):
    """
    Base schema for scrape result data.
    """

    job_id: int
    content_type: Optional[str] = None
    headers: Optional[Dict[str, Any]] = None
    extracted_data: Optional[Dict[str, Any]] = None


class ScrapeResultCreate(ScrapeResultBase):
    """
    Schema for creating a new scrape result.
    """

    html_content: Optional[str] = None


class ScrapeResultUpdate(BaseModel):
    """
    Schema for updating a scrape result.
    """

    content_type: Optional[str] = None
    headers: Optional[Dict[str, Any]] = None
    html_content: Optional[str] = None
    extracted_data: Optional[Dict[str, Any]] = None


class ScrapeResultInDBBase(ScrapeResultBase):
    """
    Base schema for scrape result in database.
    """

    id: int
    created_at: datetime
    html_content: Optional[str] = None

    class Config:
        orm_mode = True


class ScrapeResult(ScrapeResultInDBBase):
    """
    Schema for returned scrape result.
    """

    pass


class ScrapeResultList(BaseModel):
    """
    Schema for a list of scrape results.
    """

    results: List[ScrapeResult]
    total: int

app/services/__init__.py (new file, empty)

app/services/scraper.py (new file, 150 lines)
@@ -0,0 +1,150 @@
import time
from datetime import datetime
from typing import Dict, Any, Optional

import requests
from bs4 import BeautifulSoup
from sqlalchemy.orm import Session

from app.core.config import settings
from app.models.scrape_job import ScrapeJob, JobStatus
from app.models.scrape_result import ScrapeResult


class Scraper:
    """
    Service for web scraping.
    """

    def __init__(
        self,
        db: Session,
        user_agent: Optional[str] = None,
        timeout: Optional[int] = None,
        rate_limit: Optional[float] = None,
    ):
        self.db = db
        self.user_agent = user_agent or settings.DEFAULT_USER_AGENT
        self.timeout = timeout or settings.DEFAULT_TIMEOUT
        self.rate_limit = rate_limit or settings.DEFAULT_RATE_LIMIT
        self._last_request_time = 0

    def _respect_rate_limit(self) -> None:
        """
        Respect rate limit by sleeping if necessary.
        """
        current_time = time.time()
        time_since_last_request = current_time - self._last_request_time

        if time_since_last_request < (1.0 / self.rate_limit):
            sleep_time = (1.0 / self.rate_limit) - time_since_last_request
            time.sleep(sleep_time)

        self._last_request_time = time.time()

    def fetch_url(self, url: str) -> requests.Response:
        """
        Fetch URL respecting rate limits.
        """
        self._respect_rate_limit()

        headers = {
            "User-Agent": self.user_agent,
        }

        response = requests.get(
            url,
            headers=headers,
            timeout=self.timeout,
        )
        response.raise_for_status()

        return response

    def parse_html(self, html: str, selector: Optional[str] = None) -> Dict[str, Any]:
        """
        Parse HTML content.
        """
        soup = BeautifulSoup(html, "lxml")
        result = {
            "title": soup.title.text if soup.title else None,
            "meta_description": None,
            "h1": [h1.text.strip() for h1 in soup.find_all("h1")],
            "links": [
                {"href": a.get("href"), "text": a.text.strip()}
                for a in soup.find_all("a")
                if a.get("href")
            ],
        }

        # Extract meta description
        meta_desc = soup.find("meta", attrs={"name": "description"})
        if meta_desc:
            result["meta_description"] = meta_desc.get("content")

        # If a selector is provided, extract content matching the selector
        if selector:
            selected_elements = soup.select(selector)
            result["selected_content"] = [
                element.text.strip() for element in selected_elements
            ]
            result["selected_html"] = [str(element) for element in selected_elements]

        return result

    def run_job(self, job_id: int) -> ScrapeJob:
        """
        Run a scraping job.
        """
        # Get job from DB
        job = self.db.query(ScrapeJob).filter(ScrapeJob.id == job_id).first()
        if not job:
            raise ValueError(f"Job with ID {job_id} not found")

        # Update job status
        job.status = JobStatus.IN_PROGRESS
        job.started_at = datetime.now()
        self.db.commit()
        self.db.refresh(job)

        try:
            # Fetch URL
            response = self.fetch_url(job.url)

            # Create ScrapeResult
            result = ScrapeResult(
                job_id=job.id,
                content_type=response.headers.get("Content-Type"),
                headers=dict(response.headers),
                html_content=response.text,
            )
            self.db.add(result)
            self.db.commit()
            self.db.refresh(result)

            # Parse HTML
            extracted_data = self.parse_html(response.text, job.selector)

            # Update ScrapeResult with extracted data
            result.extracted_data = extracted_data
            self.db.commit()
            self.db.refresh(result)

            # Update job status
            job.status = JobStatus.COMPLETED
            job.completed_at = datetime.now()
            job.result = {"result_id": result.id}
            self.db.commit()
            self.db.refresh(job)

            return job

        except Exception as e:
            # Update job with error
            job.status = JobStatus.FAILED
            job.completed_at = datetime.now()
            job.error = str(e)
            self.db.commit()
            self.db.refresh(job)

            raise e

app/utils/__init__.py (new file, empty)

app/utils/html.py (new file, 83 lines)
@@ -0,0 +1,83 @@
from typing import List, Dict, Any
from bs4 import BeautifulSoup


def extract_metadata(html: str) -> Dict[str, Any]:
    """
    Extract metadata from HTML.
    """
    soup = BeautifulSoup(html, "lxml")
    result = {
        "title": None,
        "description": None,
        "keywords": None,
        "og_title": None,
        "og_description": None,
        "og_image": None,
    }

    # Title
    if soup.title:
        result["title"] = soup.title.text.strip()

    # Meta tags
    for meta in soup.find_all("meta"):
        name = meta.get("name", "").lower()
        property = meta.get("property", "").lower()
        content = meta.get("content", "")

        if name == "description":
            result["description"] = content
        elif name == "keywords":
            result["keywords"] = content
        elif property == "og:title":
            result["og_title"] = content
        elif property == "og:description":
            result["og_description"] = content
        elif property == "og:image":
            result["og_image"] = content

    return result


def extract_links(html: str) -> List[Dict[str, str]]:
    """
    Extract links from HTML.
    """
    soup = BeautifulSoup(html, "lxml")
    links = []

    for a in soup.find_all("a"):
        href = a.get("href")
        if href:
            links.append(
                {
                    "href": href,
                    "text": a.text.strip(),
                    "title": a.get("title", ""),
                    "rel": a.get("rel", ""),
                }
            )

    return links


def extract_images(html: str) -> List[Dict[str, str]]:
    """
    Extract images from HTML.
    """
    soup = BeautifulSoup(html, "lxml")
    images = []

    for img in soup.find_all("img"):
        src = img.get("src")
        if src:
            images.append(
                {
                    "src": src,
                    "alt": img.get("alt", ""),
                    "title": img.get("title", ""),
                }
            )

    return images

app/utils/url.py (new file, 25 lines)
@@ -0,0 +1,25 @@
from urllib.parse import urlparse, parse_qs
from typing import Dict, Any


def parse_url(url: str) -> Dict[str, Any]:
    """
    Parse a URL into its components.
    """
    parsed = urlparse(url)
    return {
        "scheme": parsed.scheme,
        "netloc": parsed.netloc,
        "path": parsed.path,
        "params": parsed.params,
        "query": parse_qs(parsed.query),
        "fragment": parsed.fragment,
    }


def is_valid_url(url: str) -> bool:
    """
    Check if a URL is valid.
    """
    parsed = urlparse(url)
    return bool(parsed.scheme and parsed.netloc)

cli.py (new file, 9 lines)
@@ -0,0 +1,9 @@
#!/usr/bin/env python
"""
Web Scraper CLI
"""

from app.cli.cli import app

if __name__ == "__main__":
    app()

docker-compose.yml (new file, 15 lines)
@@ -0,0 +1,15 @@
version: '3.8'

services:
  app:
    build:
      context: .
      dockerfile: Dockerfile
    ports:
      - "8000:8000"
    volumes:
      - ./app:/app/app
      - ./storage:/app/storage
    environment:
      - DEBUG=True
    command: uvicorn main:app --host 0.0.0.0 --port 8000 --reload
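
Besides the long-running API service, one-off CLI commands can be run against the same image. A sketch using the `app` service defined above:

```bash
# Run a single scrape inside a throwaway container (shares the ./storage volume)
docker-compose run --rm app python cli.py scrape https://example.com
```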

main.py (new file, 29 lines)
@@ -0,0 +1,29 @@
import uvicorn
from fastapi import FastAPI
from app.api.api import api_router
from app.core.config import settings

app = FastAPI(
    title=settings.PROJECT_NAME,
    description=settings.PROJECT_DESCRIPTION,
    version=settings.VERSION,
)

app.include_router(api_router)


@app.get("/health", tags=["Health"])
async def health_check():
    """
    Health check endpoint.
    """
    return {"status": "ok"}


if __name__ == "__main__":
    uvicorn.run(
        "main:app",
        host=settings.HOST,
        port=settings.PORT,
        reload=settings.DEBUG,
    )
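
Once the server is up, the health endpoint gives a quick smoke test, for example:

```bash
# Should return {"status": "ok"}
curl http://localhost:8000/health
```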

migrations/env.py (new file, 88 lines)
@@ -0,0 +1,88 @@
from logging.config import fileConfig

from sqlalchemy import engine_from_config
from sqlalchemy import pool

from alembic import context

# Import models to register them with the metadata
from app.db.session import Base

# These imports are needed to register models with SQLAlchemy metadata
# even though they appear unused to static analyzers
import app.models.scrape_job  # noqa: F401
import app.models.scrape_result  # noqa: F401

# this is the Alembic Config object, which provides
# access to the values within the .ini file in use.
config = context.config

# Interpret the config file for Python logging.
# This line sets up loggers basically.
fileConfig(config.config_file_name)

# add your model's MetaData object here
# for 'autogenerate' support
target_metadata = Base.metadata

# other values from the config, defined by the needs of env.py,
# can be acquired:
# my_important_option = config.get_main_option("my_important_option")
# ... etc.


def run_migrations_offline():
    """Run migrations in 'offline' mode.

    This configures the context with just a URL
    and not an Engine, though an Engine is acceptable
    here as well. By skipping the Engine creation
    we don't even need a DBAPI to be available.

    Calls to context.execute() here emit the given string to the
    script output.

    """
    url = config.get_main_option("sqlalchemy.url")
    context.configure(
        url=url,
        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )

    with context.begin_transaction():
        context.run_migrations()


def run_migrations_online():
    """Run migrations in 'online' mode.

    In this scenario we need to create an Engine
    and associate a connection with the context.

    """
    connectable = engine_from_config(
        config.get_section(config.config_ini_section),
        prefix="sqlalchemy.",
        poolclass=pool.NullPool,
    )

    with connectable.connect() as connection:
        # Check if we're using SQLite
        is_sqlite = connection.dialect.name == "sqlite"

        context.configure(
            connection=connection,
            target_metadata=target_metadata,
            render_as_batch=is_sqlite,  # Critical for SQLite to handle alter table operations
        )

        with context.begin_transaction():
            context.run_migrations()


if context.is_offline_mode():
    run_migrations_offline()
else:
    run_migrations_online()

migrations/script.py.mako (new file, 24 lines)
@@ -0,0 +1,24 @@
"""${message}

Revision ID: ${up_revision}
Revises: ${down_revision | comma,n}
Create Date: ${create_date}

"""
from alembic import op
import sqlalchemy as sa
${imports if imports else ""}

# revision identifiers, used by Alembic.
revision = ${repr(up_revision)}
down_revision = ${repr(down_revision)}
branch_labels = ${repr(branch_labels)}
depends_on = ${repr(depends_on)}


def upgrade():
    ${upgrades if upgrades else "pass"}


def downgrade():
    ${downgrades if downgrades else "pass"}

migrations/versions/0001_initial_migration.py (new file, 75 lines)
@@ -0,0 +1,75 @@
"""Initial migration

Revision ID: 0001
Revises:
Create Date: 2023-06-25

"""

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import sqlite

# revision identifiers, used by Alembic.
revision = "0001"
down_revision = None
branch_labels = None
depends_on = None


def upgrade():
    # Create scrape_jobs table
    op.create_table(
        "scrape_jobs",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("url", sa.String(length=2048), nullable=False),
        sa.Column(
            "status",
            sa.Enum("pending", "in_progress", "completed", "failed", name="jobstatus"),
            nullable=False,
            default="pending",
        ),
        sa.Column(
            "created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
        ),
        sa.Column(
            "updated_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
        ),
        sa.Column("started_at", sa.DateTime(), nullable=True),
        sa.Column("completed_at", sa.DateTime(), nullable=True),
        sa.Column("selector", sa.String(length=255), nullable=True),
        sa.Column("error", sa.Text(), nullable=True),
        sa.Column("result", sqlite.JSON(), nullable=True),
        sa.Column("user_agent", sa.String(length=255), nullable=True),
        sa.Column("timeout", sa.Integer(), nullable=True),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index(op.f("ix_scrape_jobs_id"), "scrape_jobs", ["id"], unique=False)
    op.create_index(op.f("ix_scrape_jobs_url"), "scrape_jobs", ["url"], unique=False)

    # Create scrape_results table
    op.create_table(
        "scrape_results",
        sa.Column("id", sa.Integer(), nullable=False),
        sa.Column("job_id", sa.Integer(), nullable=False),
        sa.Column(
            "created_at", sa.DateTime(), nullable=False, server_default=sa.func.now()
        ),
        sa.Column("content_type", sa.String(length=100), nullable=True),
        sa.Column("headers", sqlite.JSON(), nullable=True),
        sa.Column("html_content", sa.Text(), nullable=True),
        sa.Column("extracted_data", sqlite.JSON(), nullable=True),
        sa.ForeignKeyConstraint(["job_id"], ["scrape_jobs.id"], ondelete="CASCADE"),
        sa.PrimaryKeyConstraint("id"),
    )
    op.create_index(
        op.f("ix_scrape_results_id"), "scrape_results", ["id"], unique=False
    )


def downgrade():
    op.drop_index(op.f("ix_scrape_results_id"), table_name="scrape_results")
    op.drop_table("scrape_results")
    op.drop_index(op.f("ix_scrape_jobs_url"), table_name="scrape_jobs")
    op.drop_index(op.f("ix_scrape_jobs_id"), table_name="scrape_jobs")
    op.drop_table("scrape_jobs")

migrations/versions/__init__.py (new file, 1 line)
@@ -0,0 +1 @@
# This file is intentionally empty to make the directory a Python package

requirements.txt (new file, 14 lines)
@@ -0,0 +1,14 @@
fastapi==0.110.0
uvicorn==0.27.1
sqlalchemy==2.0.27
alembic==1.13.1
pydantic==2.6.1
python-dotenv==1.0.1
beautifulsoup4==4.12.2
requests==2.31.0
typer==0.9.0
rich==13.7.0
httpx==0.26.0
lxml==4.9.3
aiohttp==3.9.3
ruff==0.2.2