import requests
from bs4 import BeautifulSoup
import logging
import jsonschema
import json
# Configure the root logger when this module is loaded: records at INFO level
# and above are emitted, each formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
def validate_html(html_content, schema_path):
    """
    Validates data extracted from HTML content against a JSON schema.

    The title text and the text of every h1/h2/h3 heading are extracted from
    the HTML and checked against the schema. This function never raises for
    ordinary errors: every failure is logged and reported as False.

    Args:
        html_content (str): The HTML content to validate.
        schema_path (str): Path to the JSON schema file.

    Returns:
        bool: True if the extracted data satisfies the schema, False if
        validation fails, the schema cannot be loaded, or the schema itself
        is invalid.
    """
    # Step 1: load the schema. Kept in its own try block so that schema
    # loading problems are reported as such, not as HTML validation errors.
    try:
        # JSON files are UTF-8; do not depend on the platform's locale default.
        with open(schema_path, 'r', encoding='utf-8') as f:
            schema = json.load(f)
    except FileNotFoundError:
        logging.error(f"Schema file not found: {schema_path}")
        return False
    except Exception as e:
        # Covers unreadable paths, malformed JSON, bad encodings, bad path types.
        logging.error(f"Could not load schema file {schema_path}: {e}")
        return False

    # Step 2: parse the HTML, extract the data and validate it.
    try:
        soup = BeautifulSoup(html_content, 'html.parser')
        # Extract relevant data from the parsed HTML (customize based on your needs).
        # This is a placeholder; adapt to extract data specific to your schema.
        extracted_data = {
            'title': soup.title.text if soup.title else None,
            'headings': [h.text for h in soup.find_all(['h1', 'h2', 'h3'])],
        }
        jsonschema.validate(instance=extracted_data, schema=schema)
    except jsonschema.exceptions.ValidationError as e:
        logging.error(f"HTML validation failed: {e}")
        return False
    except jsonschema.exceptions.SchemaError as e:
        # The schema file parsed as JSON but is not a valid JSON Schema.
        logging.error(f"Invalid JSON schema in {schema_path}: {e}")
        return False
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return False

    logging.info("HTML validation successful.")
    return True
def validate_url(url, schema_path, timeout=10):
    """
    Fetches HTML from a URL and validates it against a JSON schema.

    Args:
        url (str): The URL to fetch and validate.
        schema_path (str): Path to the JSON schema file.
        timeout (float or tuple): Seconds to wait for the server, passed
            straight to ``requests.get``. Defaults to 10 so an unresponsive
            host cannot block the caller indefinitely.

    Returns:
        bool: True if the page was fetched and its HTML is valid, False if
        the request failed (connection error, timeout, 4xx/5xx status) or
        validation failed.
    """
    try:
        # Without a timeout, requests waits forever on a silent server.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        html_content = response.text
        return validate_html(html_content, schema_path)
    except requests.exceptions.RequestException as e:
        logging.error(f"Error fetching URL {url}: {e}")
        return False
    except Exception as e:
        logging.error(f"An unexpected error occurred: {e}")
        return False
if __name__ == '__main__':
    # Demo inputs; point these at a real page and schema before running.
    schema_file = "schema.json"  # Replace with your schema file
    url_to_validate = "https://www.example.com"  # Replace with your URL

    # Demo 1: fetch a remote page and report the validation outcome.
    url_ok = validate_url(url_to_validate, schema_file)
    print(
        f"URL {url_to_validate} is valid." if url_ok
        else f"URL {url_to_validate} is invalid."
    )

    # Demo 2: validate an HTML document held in memory.
    sample_html = """
<html>
<head>
<title>My Page</title>
</head>
<body>
<h1>Hello</h1>
<h2>World</h2>
</body>
</html>
"""
    local_ok = validate_html(sample_html, schema_file)
    print("Local HTML is valid." if local_ok else "Local HTML is invalid.")
# Add your comment