import re
import urllib.error
import urllib.request
from urllib.parse import urlparse
def validate_url_list(url_list):
    """
    Validate a list of URLs, splitting them into reachable and invalid ones.

    Each URL is checked in three stages: a regex format check, a parse check
    for scheme and netloc, and finally a live fetch (5-second timeout) to
    confirm the URL actually resolves.

    Args:
        url_list: Iterable of URL strings to validate.

    Returns:
        Tuple ``(validated_urls, invalid_urls)`` of two lists; every input
        URL lands in exactly one of them.
    """
    validated_urls = []
    invalid_urls = []
    for url in url_list:
        try:
            # Basic URL format validation using regex.
            if not re.match(r"^(http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$", url):
                raise ValueError("Invalid URL format")
            # Parse the URL to check for scheme and netloc.
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError("Missing scheme or netloc")
            # Further validation by attempting to connect; this catches URLs
            # that are syntactically valid but don't resolve. `with` ensures
            # the response is closed (the original leaked the connection).
            with urllib.request.urlopen(url, timeout=5):
                pass
            validated_urls.append(url)
        except ValueError as e:
            print(f"Invalid URL: {url} - {e}")
            invalid_urls.append(url)
        except urllib.error.URLError as e:
            print(f"URL Error: {url} - {e}")
            invalid_urls.append(url)
        except Exception as e:
            print(f"Unexpected Error: {url} - {e}")
            invalid_urls.append(url)
    return validated_urls, invalid_urls
if __name__ == '__main__':
    # Example usage: a mix of well-formed, malformed, and unresolvable URLs.
    sample_urls = [
        "https://www.google.com",
        "http://example.com",
        "invalid-url",
        "ftp://example.com",
        "https://notarealdomain.xyz",
        "www.example.com",  # missing scheme
        "https://www.google.com/search?q=python",
        "https://subdomain.example.co.uk",
    ]
    validated, invalid = validate_url_list(sample_urls)
    # Report both buckets under their respective headings.
    for heading, bucket in (("\nValidated URLs:", validated),
                            ("\nInvalid URLs:", invalid)):
        print(heading)
        for entry in bucket:
            print(entry)
# (stray editor placeholder text converted to a comment — it was not valid Python)