import re
import urllib.error
import urllib.request
from urllib.parse import urlparse
def validate_url_list(url_list):
    """
    Validates a list of URLs and returns them split into valid and invalid lists.
    """
    validated_urls = []
    invalid_urls = []
    for url in url_list:
        try:
            # Basic URL format validation using regex
            if not re.match(r"^(http(s)?:\/\/)?[\w.-]+(?:\.[\w\.-]+)+[\w\-\._~:/?#[\]@!\$&'\(\)\*\+,;=.]+$", url):
                raise ValueError("Invalid URL format")
            # Parse the URL to check for scheme and netloc
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                raise ValueError("Missing scheme or netloc")
            # Further validation by attempting to connect.
            # This helps catch URLs that are technically valid but don't resolve.
            with urllib.request.urlopen(url, timeout=5):  # timeout avoids hanging on unresponsive hosts
                pass
            validated_urls.append(url)
        except ValueError as e:
            print(f"Invalid URL: {url} - {e}")
            invalid_urls.append(url)
        except urllib.error.URLError as e:
            print(f"URL Error: {url} - {e}")
            invalid_urls.append(url)
        except Exception as e:
            print(f"Unexpected Error: {url} - {e}")
            invalid_urls.append(url)
    return validated_urls, invalid_urls
if __name__ == '__main__':
    # Example usage
    url_list = [
        "https://www.google.com",
        "http://example.com",
        "invalid-url",
        "ftp://example.com",
        "https://notarealdomain.xyz",
        "www.example.com",  # missing scheme
        "https://www.google.com/search?q=python",
        "https://subdomain.example.co.uk"
    ]
    validated, invalid = validate_url_list(url_list)

    print("\nValidated URLs:")
    for url in validated:
        print(url)

    print("\nInvalid URLs:")
    for url in invalid:
        print(url)
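
A possible refinement, not part of the original code: the connectivity check above issues a full GET request for every URL, which downloads the whole response body. A HEAD request asks the server for headers only, so the check is usually faster. Below is a minimal sketch, assuming Python 3.3+ (where urllib.request.Request accepts a method argument); the helper name is_reachable is my own and is not part of the function above.

import urllib.error
import urllib.request

def is_reachable(url, timeout=5):
    """Return True if the server answers a HEAD request within the timeout."""
    # HEAD asks for headers only, so no response body is transferred.
    request = urllib.request.Request(url, method="HEAD")
    try:
        with urllib.request.urlopen(request, timeout=timeout):
            return True
    except (urllib.error.URLError, ValueError):
        # URLError covers DNS failures, refused connections, and HTTP errors;
        # ValueError covers strings urlopen cannot interpret as a URL at all.
        return False

One caveat: some servers reject HEAD requests (for example with 405 Method Not Allowed), which this sketch treats as unreachable, so a GET fallback may still be needed for such hosts.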
