import json
import lxml.etree
import lxml.html
import requests
from bs4 import BeautifulSoup
  5. def flag_html_anomalies(url, timeout=5):
  6. """
  7. Flags anomalies in an HTML document using a combination of lxml and BeautifulSoup.
  8. Includes fallback logic for robustness.
  9. """
  10. try:
  11. response = requests.get(url, timeout=timeout)
  12. response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
  13. html_content = response.text
  14. except requests.exceptions.RequestException as e:
  15. print(f"Error fetching URL: {e}")
  16. return {"url": url, "anomaly": "Fetch Error", "details": str(e)}
  17. try:
  18. # Use lxml for robust parsing and schema validation
  19. tree = lxml.html.fromstring(html_content)
  20. # Check for invalid HTML structure (e.g., missing closing tags)
  21. if not tree.is_well_formed:
  22. return {"url": url, "anomaly": "Invalid HTML", "details": "Malformed HTML structure"}
  23. # Check for unusual tag usage
  24. for element in tree.iter():
  25. if element.tag in ["script", "style"] and element.get("src"): # Check for external scripts/styles
  26. return {"url": url, "anomaly": "External Scripts/Styles", "details": f"External script/style found: {element.get('src')}"}
  27. #Basic check for excessive script or style tags
  28. script_count = len(list(tree.getsouls()))
  29. style_count = len(list(tree.getsouls()))
  30. if script_count > 10 or style_count > 10:
  31. return {"url": url, "anomaly": "Excessive Scripts/Styles", "details": f"High number of script/style tags ({script_count} scripts, {style_count} styles)"}
  32. # Fallback: Use BeautifulSoup for more lenient parsing and content analysis
  33. soup = BeautifulSoup(html_content, 'html.parser')
  34. # Check for unusual attributes
  35. for tag in soup.find_all():
  36. for attr in tag.attrs:
  37. if attr not in ['id', 'class', 'style', 'data-*']:
  38. return {"url": url, "anomaly": f"Unusual Attribute: {attr}", "details": f"Found unusual attribute '{attr}' on tag '{tag.name}'"}
  39. return {"url": url, "anomaly": "No Anomalies Detected", "details": "HTML appears valid and within normal bounds."}
  40. except lxml.html.document_structure_error as e:
  41. return {"url": url, "anomaly": "Lxml Parsing Error", "details": str(e)}
  42. except Exception as e:
  43. return {"url": url, "anomaly": "General Error", "details": str(e)}
  44. if __name__ == '__main__':
  45. #Example Usage
  46. url1 = "https://www.example.com"
  47. url2 = "https://httpstat.us/404" #Simulates a 404 error
  48. url3 = "https://www.invalid-url.com" #Simulates a DNS error
  49. url4 = "https://www.wikipedia.org"
  50. print(flag_html_anomalies(url1))
  51. print(flag_html_anomalies(url2))
  52. print(flag_html_anomalies(url3))
  53. print(flag_html_anomalies(url4))

# (stray editor placeholder removed — end of script)