# 84 lines, 2.0 KiB, Python
from typing import List, Dict, Any
|
|
from bs4 import BeautifulSoup
|
|
|
|
|
|
def extract_metadata(html: str) -> Dict[str, Any]:
    """
    Extract page-level metadata from HTML.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser. The
    ``<title>`` text and the ``content`` attribute of selected ``<meta>``
    tags are collected.

    Args:
        html: HTML markup to parse.

    Returns:
        A dict with the keys ``title``, ``description``, ``keywords``,
        ``og_title``, ``og_description`` and ``og_image``. A key is ``None``
        when no matching tag was found. When several ``<meta>`` tags match
        the same key, the last one in document order wins. A matching tag
        without a ``content`` attribute yields an empty string.
    """
    # <meta name="..."> values (lower-cased) -> result key.
    name_fields = {
        "description": "description",
        "keywords": "keywords",
    }
    # <meta property="..."> values (lower-cased) -> result key.
    property_fields = {
        "og:title": "og_title",
        "og:description": "og_description",
        "og:image": "og_image",
    }

    soup = BeautifulSoup(html, "lxml")
    result: Dict[str, Any] = {
        "title": None,
        "description": None,
        "keywords": None,
        "og_title": None,
        "og_description": None,
        "og_image": None,
    }

    # Title
    title_tag = soup.title
    if title_tag is not None:
        result["title"] = title_tag.text.strip()

    # Meta tags
    for meta in soup.find_all("meta"):
        meta_name = meta.get("name", "").lower()
        meta_property = meta.get("property", "").lower()
        content = meta.get("content", "")

        # The two attributes are checked independently: a single tag may
        # carry both ``name`` and ``property`` (for example
        # ``name="description" property="og:description"``), and it should
        # then populate both fields rather than only the first match.
        if meta_name in name_fields:
            result[name_fields[meta_name]] = content
        if meta_property in property_fields:
            result[property_fields[meta_property]] = content

    return result
|
|
|
|
|
|
def extract_links(html: str) -> List[Dict[str, str]]:
    """
    Extract links from HTML.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser, and
    every ``<a>`` tag with a non-empty ``href`` is reported.

    Args:
        html: HTML markup to parse.

    Returns:
        One dict per link, in document order, with the keys:

        * ``href``  - the raw ``href`` attribute value.
        * ``text``  - the anchor's text content, stripped of surrounding
          whitespace.
        * ``title`` - the ``title`` attribute, or ``""`` when absent.
        * ``rel``   - the ``rel`` attribute as a single space-separated
          string, or ``""`` when absent.
    """
    soup = BeautifulSoup(html, "lxml")
    links = []

    for anchor in soup.find_all("a"):
        href = anchor.get("href")
        if not href:
            # Anchors with a missing or empty href are not links; skip them.
            continue

        rel = anchor.get("rel", "")
        if isinstance(rel, (list, tuple)):
            # Beautiful Soup treats ``rel`` as a multi-valued attribute and
            # may hand it back as a list of tokens. Join them so the value
            # is always a string, matching the declared return type and the
            # ``""`` used when the attribute is absent.
            rel = " ".join(rel)

        links.append(
            {
                "href": href,
                "text": anchor.text.strip(),
                "title": anchor.get("title", ""),
                "rel": rel,
            }
        )

    return links
|
|
|
|
|
|
def extract_images(html: str) -> List[Dict[str, str]]:
    """
    Collect the images referenced by an HTML document.

    The markup is parsed with BeautifulSoup using the ``lxml`` parser.
    ``<img>`` tags whose ``src`` attribute is missing or empty are ignored.

    Args:
        html: HTML markup to parse.

    Returns:
        One dict per image, in document order, with the keys ``src``,
        ``alt`` and ``title``. ``alt`` and ``title`` are ``""`` when the
        attribute is absent.
    """
    document = BeautifulSoup(html, "lxml")
    return [
        {
            "src": tag.get("src"),
            "alt": tag.get("alt", ""),
            "title": tag.get("title", ""),
        }
        for tag in document.find_all("img")
        if tag.get("src")
    ]
|