import lxml.html
from bs4 import BeautifulSoup
import requests
import json
def flag_html_anomalies(url, timeout=5):
    """
    Fetch ``url`` and flag anomalies in the returned HTML document.

    Uses lxml for strict parsing plus a BeautifulSoup fallback pass for a
    lenient attribute scan, and returns on the FIRST anomaly found.

    Parameters
    ----------
    url : str
        Address of the page to inspect.
    timeout : int or float, optional
        Seconds to wait for the HTTP response (default 5).

    Returns
    -------
    dict
        Always has keys ``url``, ``anomaly`` and ``details``; ``anomaly``
        is ``"No Anomalies Detected"`` when nothing was flagged.
    """
    try:
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        html_content = response.text
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return {"url": url, "anomaly": "Fetch Error", "details": str(e)}
    return _inspect_html(url, html_content)


def _inspect_html(url, html_content):
    """Analyze already-fetched HTML text and return the first anomaly found."""
    try:
        # BUG FIX: the old code read ``tree.is_well_formed``, an attribute
        # that does not exist on lxml HTML elements. lxml signals unparseable
        # input by raising ParserError from fromstring(), so catch that.
        try:
            tree = lxml.html.fromstring(html_content)
        except lxml.etree.ParserError as e:
            return {"url": url, "anomaly": "Invalid HTML", "details": str(e)}

        # Check for external scripts/styles (tags carrying a src attribute).
        for element in tree.iter():
            if element.tag in ["script", "style"] and element.get("src"):
                return {"url": url, "anomaly": "External Scripts/Styles", "details": f"External script/style found: {element.get('src')}"}

        # BUG FIX: ``tree.getsouls()`` does not exist (and both counts were
        # taken from the same call). Count the actual tags instead.
        script_count = len(tree.findall(".//script"))
        style_count = len(tree.findall(".//style"))
        if script_count > 10 or style_count > 10:
            return {"url": url, "anomaly": "Excessive Scripts/Styles", "details": f"High number of script/style tags ({script_count} scripts, {style_count} styles)"}

        # Fallback: use BeautifulSoup for more lenient parsing and a scan
        # for attributes outside a small allowlist.
        soup = BeautifulSoup(html_content, 'html.parser')
        allowed_attrs = {'id', 'class', 'style'}
        for tag in soup.find_all():
            for attr in tag.attrs:
                # BUG FIX: the literal string 'data-*' never matches real
                # data attributes; use a prefix test for the data- family.
                if attr not in allowed_attrs and not attr.startswith('data-'):
                    return {"url": url, "anomaly": f"Unusual Attribute: {attr}", "details": f"Found unusual attribute '{attr}' on tag '{tag.name}'"}

        return {"url": url, "anomaly": "No Anomalies Detected", "details": "HTML appears valid and within normal bounds."}
    except Exception as e:
        # Last-resort guard: a single malformed page must never crash the
        # caller; report it as a generic anomaly instead.
        return {"url": url, "anomaly": "General Error", "details": str(e)}
if __name__ == '__main__':
    # Example usage: a healthy page, a simulated 404, a simulated DNS
    # failure, and a large real-world page — checked in order.
    example_urls = [
        "https://www.example.com",
        "https://httpstat.us/404",       # Simulates a 404 error
        "https://www.invalid-url.com",   # Simulates a DNS error
        "https://www.wikipedia.org",
    ]
    for example_url in example_urls:
        print(flag_html_anomalies(example_url))
# NOTE: trailing "Add your comment" boilerplate (copy/paste artifact) removed —
# it was a bare expression that made the module a syntax error.