# webscrapercli-wbi8nl/app/utils/html.py

from typing import List, Dict, Any
from bs4 import BeautifulSoup


def extract_metadata(html: str) -> Dict[str, Any]:
    """
    Extract page metadata from HTML.

    Collects the document title plus a fixed set of ``<meta>`` values:
    ``name="description"``, ``name="keywords"`` and the Open Graph
    properties ``og:title``, ``og:description`` and ``og:image``.
    Attribute values are matched case-insensitively.

    Args:
        html: Raw HTML markup to parse.

    Returns:
        A dict with the keys ``title``, ``description``, ``keywords``,
        ``og_title``, ``og_description`` and ``og_image``. A key stays
        ``None`` when the document has no matching element; a matching
        ``<meta>`` tag without a ``content`` attribute yields ``""``.
        When several tags match the same key, the last one wins.
    """
    # Lower-cased attribute value -> result key it populates.
    name_keys = {
        "description": "description",
        "keywords": "keywords",
    }
    property_keys = {
        "og:title": "og_title",
        "og:description": "og_description",
        "og:image": "og_image",
    }

    soup = BeautifulSoup(html, "lxml")
    result: Dict[str, Any] = {
        "title": None,
        "description": None,
        "keywords": None,
        "og_title": None,
        "og_description": None,
        "og_image": None,
    }

    # Title
    if soup.title:
        result["title"] = soup.title.text.strip()

    # Meta tags
    for meta in soup.find_all("meta"):
        meta_name = meta.get("name", "").lower()
        meta_property = meta.get("property", "").lower()
        content = meta.get("content", "")

        # The two attributes are checked independently so that a combined
        # tag (e.g. name="description" property="og:description") fills
        # both entries instead of only the first one that matches.
        if meta_name in name_keys:
            result[name_keys[meta_name]] = content
        if meta_property in property_keys:
            result[property_keys[meta_property]] = content

    return result


def extract_links(html: str) -> List[Dict[str, str]]:
    """
    Extract links from HTML.

    Args:
        html: Raw HTML markup to parse.

    Returns:
        One dict per ``<a>`` element that has a non-empty ``href``, in
        document order, with the keys:

        * ``href``  - the link target exactly as written in the markup
        * ``text``  - the anchor's text content, stripped of surrounding
          whitespace
        * ``title`` - the ``title`` attribute, or ``""`` when absent
        * ``rel``   - the ``rel`` tokens joined by single spaces, or ``""``
          when absent
    """
    soup = BeautifulSoup(html, "lxml")
    links = []

    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            # Anchors without a target (missing or empty href) are skipped.
            continue

        # BeautifulSoup treats ``rel`` as a multi-valued attribute and
        # returns it as a list of tokens, while the fallback here is a
        # string. Normalise to a string so every entry has the same type
        # and the declared Dict[str, str] return type actually holds.
        rel = anchor.get("rel", "")
        if isinstance(rel, (list, tuple)):
            rel = " ".join(rel)

        links.append(
            {
                "href": href,
                "text": anchor.text.strip(),
                "title": anchor.get("title", ""),
                "rel": rel,
            }
        )

    return links


def extract_images(html: str) -> List[Dict[str, str]]:
    """
    Collect the images referenced by an HTML document.

    Every ``<img>`` element with a non-empty ``src`` produces one dict
    holding its ``src``, ``alt`` and ``title`` attributes, in document
    order. ``alt`` and ``title`` fall back to ``""`` when the attribute
    is absent; images without a usable ``src`` are left out.
    """
    document = BeautifulSoup(html, "lxml")
    return [
        {
            "src": tag["src"],
            "alt": tag.get("alt", ""),
            "title": tag.get("title", ""),
        }
        for tag in document.find_all("img")
        if tag.get("src")  # drop images with a missing or empty src
    ]