1. import requests
  2. from bs4 import BeautifulSoup
  3. import logging
  4. import jsonschema
  5. import json
  6. # Configure logging
  7. logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
  8. def validate_html(html_content, schema_path):
  9. """
  10. Validates HTML content against a JSON schema.
  11. Args:
  12. html_content (str): The HTML content to validate.
  13. schema_path (str): Path to the JSON schema file.
  14. Returns:
  15. bool: True if the HTML is valid, False otherwise.
  16. """
  17. try:
  18. # Load the schema
  19. with open(schema_path, 'r') as f:
  20. schema = json.load(f)
  21. # Parse the HTML content
  22. soup = BeautifulSoup(html_content, 'html.parser')
  23. # Extract relevant data from the parsed HTML (customize based on your needs)
  24. # This is a placeholder; adapt to extract data specific to your schema.
  25. extracted_data = {
  26. 'title': soup.title.text if soup.title else None,
  27. 'headings': [h.text for h in soup.find_all(['h1', 'h2', 'h3'])]
  28. }
  29. # Validate the extracted data against the schema
  30. jsonschema.validate(instance=extracted_data, schema=schema)
  31. logging.info("HTML validation successful.")
  32. return True
  33. except jsonschema.exceptions.ValidationError as e:
  34. logging.error(f"HTML validation failed: {e}")
  35. return False
  36. except FileNotFoundError:
  37. logging.error(f"Schema file not found: {schema_path}")
  38. return False
  39. except Exception as e:
  40. logging.error(f"An unexpected error occurred: {e}")
  41. return False
  42. def validate_url(url, schema_path):
  43. """
  44. Fetches HTML from a URL and validates it against a JSON schema.
  45. Args:
  46. url (str): The URL to fetch and validate.
  47. schema_path (str): Path to the JSON schema file.
  48. Returns:
  49. bool: True if the HTML is valid, False otherwise.
  50. """
  51. try:
  52. response = requests.get(url)
  53. response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
  54. html_content = response.text
  55. return validate_html(html_content, schema_path)
  56. except requests.exceptions.RequestException as e:
  57. logging.error(f"Error fetching URL {url}: {e}")
  58. return False
  59. except Exception as e:
  60. logging.error(f"An unexpected error occurred: {e}")
  61. return False
  62. if __name__ == '__main__':
  63. # Example usage
  64. url_to_validate = "https://www.example.com" # Replace with your URL
  65. schema_file = "schema.json" # Replace with your schema file
  66. # Validate a URL
  67. if validate_url(url_to_validate, schema_file):
  68. print(f"URL {url_to_validate} is valid.")
  69. else:
  70. print(f"URL {url_to_validate} is invalid.")
  71. # Example with local HTML content
  72. html_content = """
  73. <html>
  74. <head>
  75. <title>My Page</title>
  76. </head>
  77. <body>
  78. <h1>Hello</h1>
  79. <h2>World</h2>
  80. </body>
  81. </html>
  82. """
  83. if validate_html(html_content, schema_file):
  84. print("Local HTML is valid.")
  85. else:
  86. print("Local HTML is invalid.")

Add your comment