import json
from urllib.parse import urlparse
def extract_urls_from_config(config_file):
    """
    Extracts URLs from a JSON configuration file.

    The top-level JSON value must be an object; its 'urls' entry is read.
    Each entry of 'urls' may be:
      - a string, which is taken as a URL;
      - an object with a string 'value' field, whose value is taken;
      - a list, from which every string item is taken (other items are
        skipped).
    Any other entry is reported with a warning and skipped. A 'urls' entry
    that is a single string or a single object is treated as a one-element
    list. No URL validation is performed here.

    Args:
        config_file (str): Path to the configuration file (JSON).

    Returns:
        list: The extracted URL strings, in file order. Returns an empty
        list if the file can't be read, is not valid JSON, or has no usable
        'urls' entry.
    """
    try:
        # JSON text is UTF-8; don't depend on the platform's locale default.
        with open(config_file, 'r', encoding='utf-8') as f:
            config = json.load(f)
    except FileNotFoundError:
        print(f"Error: Configuration file not found: {config_file}")
        return []
    except json.JSONDecodeError:
        print(f"Error: Invalid JSON format in {config_file}")
        return []
    except (OSError, UnicodeDecodeError) as exc:
        # Permission problems, a directory path, undecodable bytes, ...
        print(f"Error: Could not read {config_file}: {exc}")
        return []

    # Valid JSON may still be a list, number, string or null at the top level.
    if not isinstance(config, dict):
        print(f"Warning: Expected a JSON object at the top level of {config_file}")
        return []

    entries = config.get('urls')
    if entries is None:
        return []

    if isinstance(entries, (str, dict)):
        # Iterating a bare string would yield single characters, and a bare
        # dict would yield its keys; treat either as one entry instead.
        entries = [entries]
    elif not isinstance(entries, list):
        print(f"Warning: Unexpected 'urls' format: {entries}")
        return []

    urls = []
    for url_data in entries:
        if isinstance(url_data, str):  # Simple string URL
            urls.append(url_data)
        elif isinstance(url_data, dict) and isinstance(url_data.get('value'), str):
            # Dictionary carrying the URL in its 'value' field
            urls.append(url_data['value'])
        elif isinstance(url_data, list):  # Nested list of strings
            urls.extend(item for item in url_data if isinstance(item, str))
        else:
            print(f"Warning: Unexpected URL data format: {url_data}")
    return urls
def validate_url(url):
    """
    Validates a URL using urllib.parse.urlparse.

    A URL is considered valid when it parses and has both a non-empty
    scheme and a non-empty network location. No other checks are made.

    Args:
        url (str): The URL to validate.

    Returns:
        bool: True if the URL is valid, False otherwise (including when
        the input cannot be parsed at all).
    """
    try:
        result = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed input such as an unterminated IPv6 literal.
        # TypeError / AttributeError: the argument is not string-like.
        # Deliberately not a bare except, so KeyboardInterrupt and
        # SystemExit still propagate.
        return False
    return bool(result.scheme and result.netloc)
def main(config_file='config.json'):
    """
    Main function to demonstrate URL extraction.

    Reads URLs from the configuration file, keeps those accepted by
    validate_url, and prints them one per line under an
    "Extracted URLs:" heading.

    Args:
        config_file (str): Path to the JSON configuration file. Defaults
            to 'config.json', so calling main() with no arguments behaves
            as before.
    """
    extracted_urls = extract_urls_from_config(config_file)
    valid_urls = [url for url in extracted_urls if validate_url(url)]
    print("Extracted URLs:")
    for url in valid_urls:
        print(url)
# Run the demo only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()