import json
from typing import Optional

import typer
from rich.console import Console
from rich.table import Table
from sqlalchemy.orm import Session

from app.db.session import SessionLocal
from app.crud import scrape_job, scrape_result
from app.models.scrape_job import JobStatus
from app.services.scraper import Scraper

app = typer.Typer(help="Web Scraper CLI")
console = Console()


def get_db() -> Session:
    """
    Get a database session. Callers are responsible for closing it.
    """
    return SessionLocal()


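# Note: each command below opens a session via get_db() and closes it in a
# try/finally block. A context-manager variant could centralize that cleanup;
# the sketch below is an illustration only (db_session is a hypothetical name,
# not used by the commands in this module):
#
#     from contextlib import contextmanager
#
#     @contextmanager
#     def db_session():
#         db = SessionLocal()
#         try:
#             yield db
#         finally:
#             db.close()

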
@app.command("scrape")
|
|
def scrape_url(
|
|
url: str = typer.Argument(..., help="URL to scrape"),
|
|
selector: Optional[str] = typer.Option(
|
|
None, help="CSS selector to extract content"
|
|
),
|
|
user_agent: Optional[str] = typer.Option(
|
|
None, help="User agent to use for request"
|
|
),
|
|
timeout: Optional[int] = typer.Option(None, help="Timeout for request in seconds"),
|
|
output: Optional[str] = typer.Option(
|
|
None, help="Output file path for results (JSON)"
|
|
),
|
|
):
|
|
"""
|
|
Scrape a URL and extract content.
|
|
"""
|
|
console.print(f"Scraping [bold]{url}[/bold]...")
|
|
|
|
db = get_db()
|
|
|
|
try:
|
|
# Create a new scrape job
|
|
job_data = {
|
|
"url": url,
|
|
"selector": selector,
|
|
"user_agent": user_agent,
|
|
"timeout": timeout,
|
|
}
|
|
job_in = {k: v for k, v in job_data.items() if v is not None}
|
|
|
|
# Create and run the job
|
|
job = scrape_job.create(db=db, obj_in=job_in)
|
|
console.print(f"Created scrape job with ID [bold]{job.id}[/bold]")
|
|
|
|
# Run the job
|
|
scraper = Scraper(db=db, user_agent=user_agent, timeout=timeout)
|
|
job = scraper.run_job(job_id=job.id)
|
|
|
|
if job.status == JobStatus.COMPLETED:
|
|
console.print("[bold green]Scraping completed successfully![/bold green]")
|
|
|
|
# Get the result
|
|
result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)
|
|
|
|
# Print basic info
|
|
console.print("\n[bold]Basic Information:[/bold]")
|
|
table = Table(show_header=True, header_style="bold")
|
|
table.add_column("Attribute")
|
|
table.add_column("Value")
|
|
|
|
if result and result.extracted_data:
|
|
data = result.extracted_data
|
|
|
|
# Add rows to table
|
|
if "title" in data:
|
|
table.add_row("Title", data["title"] or "")
|
|
|
|
if "meta_description" in data:
|
|
table.add_row("Description", data["meta_description"] or "")
|
|
|
|
if "h1" in data:
|
|
table.add_row(
|
|
"H1 Tags", ", ".join(data["h1"]) if data["h1"] else ""
|
|
)
|
|
|
|
if "links" in data:
|
|
link_count = len(data["links"]) if data["links"] else 0
|
|
table.add_row("Links", str(link_count))
|
|
|
|
if selector and "selected_content" in data:
|
|
content_count = (
|
|
len(data["selected_content"]) if data["selected_content"] else 0
|
|
)
|
|
table.add_row(f"Selected Content ({selector})", str(content_count))
|
|
|
|
console.print(table)
|
|
|
|
# Write results to file if specified
|
|
if output:
|
|
with open(output, "w") as f:
|
|
json.dump(data, f, indent=2)
|
|
console.print(f"\nResults saved to [bold]{output}[/bold]")
|
|
|
|
# Ask if user wants to see more details
|
|
if typer.confirm("\nDo you want to see the full extracted data?"):
|
|
console.print_json(json.dumps(data))
|
|
else:
|
|
console.print("[yellow]No data extracted.[/yellow]")
|
|
else:
|
|
console.print(f"[bold red]Scraping failed:[/bold red] {job.error}")
|
|
|
|
except Exception as e:
|
|
console.print(f"[bold red]Error:[/bold red] {str(e)}")
|
|
|
|
finally:
|
|
db.close()
|
|
|
|
|
|
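# Example invocation (the "python -m app.cli" entry point is an assumption;
# adjust to however this module is actually installed or exposed):
#
#     python -m app.cli scrape https://example.com --selector "h1" --output out.json

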
@app.command("list")
|
|
def list_jobs(
|
|
status: Optional[str] = typer.Option(
|
|
None, help="Filter by status (pending, in_progress, completed, failed)"
|
|
),
|
|
limit: int = typer.Option(10, help="Limit number of jobs"),
|
|
):
|
|
"""
|
|
List scrape jobs.
|
|
"""
|
|
db = get_db()
|
|
|
|
try:
|
|
# Get jobs based on status
|
|
if status:
|
|
try:
|
|
job_status = JobStatus(status)
|
|
jobs = scrape_job.get_by_status(db=db, status=job_status, limit=limit)
|
|
total = scrape_job.count_by_status(db=db, status=job_status)
|
|
console.print(
|
|
f"Found [bold]{total}[/bold] jobs with status [bold]{status}[/bold]"
|
|
)
|
|
except ValueError:
|
|
console.print(f"[bold red]Invalid status:[/bold red] {status}")
|
|
return
|
|
else:
|
|
jobs = scrape_job.get_multi(db=db, limit=limit)
|
|
total = scrape_job.count(db=db)
|
|
console.print(f"Found [bold]{total}[/bold] jobs")
|
|
|
|
if not jobs:
|
|
console.print("[yellow]No jobs found.[/yellow]")
|
|
return
|
|
|
|
# Create table
|
|
table = Table(show_header=True, header_style="bold")
|
|
table.add_column("ID")
|
|
table.add_column("URL")
|
|
table.add_column("Status")
|
|
table.add_column("Created")
|
|
table.add_column("Updated")
|
|
|
|
# Add rows
|
|
for job in jobs:
|
|
table.add_row(
|
|
str(job.id),
|
|
job.url,
|
|
job.status.value,
|
|
job.created_at.strftime("%Y-%m-%d %H:%M:%S"),
|
|
job.updated_at.strftime("%Y-%m-%d %H:%M:%S"),
|
|
)
|
|
|
|
console.print(table)
|
|
|
|
except Exception as e:
|
|
console.print(f"[bold red]Error:[/bold red] {str(e)}")
|
|
|
|
finally:
|
|
db.close()
|
|
|
|
|
|
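# Example (same hypothetical entry point as above):
#
#     python -m app.cli list --status completed --limit 5

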
@app.command("show")
|
|
def show_job(
|
|
job_id: int = typer.Argument(..., help="ID of the job to show"),
|
|
):
|
|
"""
|
|
Show details of a scrape job.
|
|
"""
|
|
db = get_db()
|
|
|
|
try:
|
|
# Get job
|
|
job = scrape_job.get(db=db, id=job_id)
|
|
|
|
if not job:
|
|
console.print(f"[bold red]Job not found:[/bold red] {job_id}")
|
|
return
|
|
|
|
# Print job details
|
|
console.print(f"\n[bold]Job {job_id}[/bold]")
|
|
console.print(f"URL: [bold]{job.url}[/bold]")
|
|
console.print(f"Status: [bold]{job.status.value}[/bold]")
|
|
console.print(f"Created: [bold]{job.created_at}[/bold]")
|
|
console.print(f"Updated: [bold]{job.updated_at}[/bold]")
|
|
|
|
if job.started_at:
|
|
console.print(f"Started: [bold]{job.started_at}[/bold]")
|
|
|
|
if job.completed_at:
|
|
console.print(f"Completed: [bold]{job.completed_at}[/bold]")
|
|
|
|
if job.selector:
|
|
console.print(f"Selector: [bold]{job.selector}[/bold]")
|
|
|
|
if job.error:
|
|
console.print(f"Error: [bold red]{job.error}[/bold red]")
|
|
|
|
# Get results if job is completed
|
|
if job.status == JobStatus.COMPLETED:
|
|
result = scrape_result.get_latest_by_job_id(db=db, job_id=job.id)
|
|
|
|
if result and result.extracted_data:
|
|
console.print("\n[bold]Extracted Data:[/bold]")
|
|
|
|
# Ask if user wants to see the data
|
|
if typer.confirm("Do you want to see the extracted data?"):
|
|
console.print_json(json.dumps(result.extracted_data))
|
|
else:
|
|
console.print("[yellow]No data extracted.[/yellow]")
|
|
|
|
except Exception as e:
|
|
console.print(f"[bold red]Error:[/bold red] {str(e)}")
|
|
|
|
finally:
|
|
db.close()
|
|
|
|
|
|
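# Example (same hypothetical entry point as above):
#
#     python -m app.cli show 42

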
@app.command("run")
|
|
def run_job(
|
|
job_id: int = typer.Argument(..., help="ID of the job to run"),
|
|
):
|
|
"""
|
|
Run a scrape job.
|
|
"""
|
|
db = get_db()
|
|
|
|
try:
|
|
# Get job
|
|
job = scrape_job.get(db=db, id=job_id)
|
|
|
|
if not job:
|
|
console.print(f"[bold red]Job not found:[/bold red] {job_id}")
|
|
return
|
|
|
|
console.print(f"Running job [bold]{job_id}[/bold]...")
|
|
|
|
# Run the job
|
|
scraper = Scraper(db=db)
|
|
job = scraper.run_job(job_id=job.id)
|
|
|
|
if job.status == JobStatus.COMPLETED:
|
|
console.print("[bold green]Job completed successfully![/bold green]")
|
|
else:
|
|
console.print(f"[bold red]Job failed:[/bold red] {job.error}")
|
|
|
|
except Exception as e:
|
|
console.print(f"[bold red]Error:[/bold red] {str(e)}")
|
|
|
|
finally:
|
|
db.close()
|
|
|
|
|
|
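# Example (same hypothetical entry point as above):
#
#     python -m app.cli run 42

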
if __name__ == "__main__":
    app()
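# Typer generates --help output for the app and each subcommand, e.g.:
#
#     python -m app.cli --help
#     python -m app.cli scrape --help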