from typing import Any, Dict, List

from bs4 import BeautifulSoup

# Lower-cased <meta name="..."> value -> key of the metadata result it fills.
_META_NAME_FIELDS = {
    "description": "description",
    "keywords": "keywords",
}

# Lower-cased <meta property="..."> value -> key of the metadata result it fills.
_META_PROPERTY_FIELDS = {
    "og:title": "og_title",
    "og:description": "og_description",
    "og:image": "og_image",
}


def extract_metadata(html: str) -> Dict[str, Any]:
    """
    Extract metadata from HTML.

    Parses ``html`` with BeautifulSoup's ``lxml`` parser and collects the
    document title plus a fixed set of ``<meta>`` values.

    Returns a dict with the keys ``title``, ``description``, ``keywords``,
    ``og_title``, ``og_description`` and ``og_image``. A key stays ``None``
    when the document has no matching tag. ``title`` is whitespace-stripped;
    the meta values are the raw ``content`` attribute (``""`` when the tag
    has no ``content``). When several tags match the same key, the last one
    in document order wins.
    """
    soup = BeautifulSoup(html, "lxml")

    result: Dict[str, Any] = {
        "title": None,
        "description": None,
        "keywords": None,
        "og_title": None,
        "og_description": None,
        "og_image": None,
    }

    # Title
    title_tag = soup.title
    if title_tag is not None:
        result["title"] = title_tag.text.strip()

    # Meta tags
    for meta in soup.find_all("meta"):
        meta_name = meta.get("name", "").lower()
        meta_property = meta.get("property", "").lower()

        # A recognised name= takes precedence over property= on the same tag.
        field = _META_NAME_FIELDS.get(meta_name)
        if field is None:
            field = _META_PROPERTY_FIELDS.get(meta_property)
        if field is not None:
            result[field] = meta.get("content", "")

    return result


def extract_links(html: str) -> List[Dict[str, str]]:
    """
    Extract links from HTML.

    Parses ``html`` with BeautifulSoup's ``lxml`` parser and returns one dict
    per ``<a>`` element that has a non-empty ``href``, in document order.
    Each dict has the keys ``href``, ``text`` (whitespace-stripped element
    text), ``title`` and ``rel``; ``title`` and ``rel`` are ``""`` when the
    attribute is absent.
    """
    soup = BeautifulSoup(html, "lxml")

    links = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue

        # BeautifulSoup parses rel as a multi-valued attribute and hands back
        # a list of tokens; join them so the value is always a string, as the
        # return annotation and the "" default promise.
        rel = anchor.get("rel", "")
        if not isinstance(rel, str):
            rel = " ".join(rel)

        links.append(
            {
                "href": href,
                "text": anchor.text.strip(),
                "title": anchor.get("title", ""),
                "rel": rel,
            }
        )

    return links


def extract_images(html: str) -> List[Dict[str, str]]:
    """
    Extract images from HTML.

    Parses ``html`` with BeautifulSoup's ``lxml`` parser and returns one dict
    per ``<img>`` element that has a non-empty ``src``, in document order.
    Each dict has the keys ``src``, ``alt`` and ``title``; ``alt`` and
    ``title`` are ``""`` when the attribute is absent.
    """
    soup = BeautifulSoup(html, "lxml")

    return [
        {
            "src": img.get("src"),
            "alt": img.get("alt", ""),
            "title": img.get("title", ""),
        }
        for img in soup.find_all("img")
        if img.get("src")
    ]