84 lines
2.0 KiB
Python

from typing import List, Dict, Any
from bs4 import BeautifulSoup
# <meta name="..."> values (lower-cased) mapped to the result key they fill.
_META_NAME_FIELDS = {
    "description": "description",
    "keywords": "keywords",
}

# <meta property="..."> values (lower-cased) mapped to the result key they fill.
_META_PROPERTY_FIELDS = {
    "og:title": "og_title",
    "og:description": "og_description",
    "og:image": "og_image",
}


def extract_metadata(html: str) -> Dict[str, Any]:
    """
    Extract page metadata from HTML.

    Returns a dict with the keys ``title``, ``description``, ``keywords``,
    ``og_title``, ``og_description`` and ``og_image``. A key stays ``None``
    when the document has no matching element. ``title`` is the stripped text
    of the ``<title>`` element; the other keys hold the ``content`` attribute
    of the matching ``<meta>`` tag (``""`` when that attribute is missing).

    ``name`` and ``property`` values are compared case-insensitively. When
    several tags match the same key, the last one in document order wins.
    """
    soup = BeautifulSoup(html, "lxml")
    result = {
        "title": None,
        "description": None,
        "keywords": None,
        "og_title": None,
        "og_description": None,
        "og_image": None,
    }

    if soup.title:
        result["title"] = soup.title.text.strip()

    for meta in soup.find_all("meta"):
        content = meta.get("content", "")

        # ``name`` and ``property`` are checked independently: a single tag
        # such as <meta name="description" property="og:description" ...>
        # is meant to supply both values, so one match must not hide the other.
        name_field = _META_NAME_FIELDS.get(meta.get("name", "").lower())
        if name_field is not None:
            result[name_field] = content

        property_field = _META_PROPERTY_FIELDS.get(
            meta.get("property", "").lower()
        )
        if property_field is not None:
            result[property_field] = content

    return result
def extract_links(html: str) -> List[Dict[str, str]]:
    """
    Extract hyperlinks from HTML.

    Each ``<a>`` element with a non-empty ``href`` produces one dict with the
    keys ``href``, ``text`` (the stripped anchor text), ``title`` and ``rel``.
    Missing ``title`` or ``rel`` attributes are reported as ``""``. Anchors
    with no ``href``, or an empty one, are skipped. Links are returned in
    document order.
    """
    soup = BeautifulSoup(html, "lxml")
    links = []
    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            continue

        # BeautifulSoup treats ``rel`` as a multi-valued attribute and hands
        # back a list of tokens when it is present. Join them so the value is
        # always a string, matching both the "" default and the annotation.
        rel = anchor.get("rel", "")
        if isinstance(rel, (list, tuple)):
            rel = " ".join(rel)

        links.append(
            {
                "href": href,
                "text": anchor.text.strip(),
                "title": anchor.get("title", ""),
                "rel": rel,
            }
        )
    return links
def extract_images(html: str) -> List[Dict[str, str]]:
    """
    Collect the images referenced in an HTML document.

    Each ``<img>`` element with a non-empty ``src`` produces one dict with
    the keys ``src``, ``alt`` and ``title``. Missing ``alt`` or ``title``
    attributes are reported as ``""``. Images with no ``src``, or an empty
    one, are skipped. Entries are returned in document order.
    """
    document = BeautifulSoup(html, "lxml")
    return [
        {
            "src": tag.get("src"),
            "alt": tag.get("alt", ""),
            "title": tag.get("title", ""),
        }
        for tag in document.find_all("img")
        if tag.get("src")  # skip images without a usable source
    ]