import requests
from bs4 import BeautifulSoup
def extract_data_from_html(url, limit_name1, limit_value1, limit_name2, limit_value2):
    """
    Extract links and paragraphs from an HTML document, applying limits.

    Args:
        url (str): The URL of the HTML document to fetch.
        limit_name1 (str): Label for the first limit. Informational only —
            currently unused in processing.
        limit_value1 (int): Maximum number of ``<a>`` tags to return.
        limit_name2 (str): Label for the second limit. Informational only —
            currently unused in processing.
        limit_value2 (int): Maximum character length a paragraph's text may
            have to be included.

    Returns:
        dict: ``{"limited_links": [...], "limited_paragraphs": [...]}`` where
        ``limited_links`` holds BeautifulSoup ``<a>`` tag objects and
        ``limited_paragraphs`` holds paragraph text strings.
        Returns an empty dictionary if any error occurs.
    """
    try:
        # Bound the request so a stalled server cannot hang the caller
        # indefinitely (requests has no default timeout).
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # Raise HTTPError for bad responses (4xx or 5xx)
        soup = BeautifulSoup(response.content, 'html.parser')

        # Keep at most limit_value1 anchor tags.
        limited_links = soup.find_all('a')[:limit_value1]

        # Keep only paragraphs whose text is at most limit_value2 characters.
        limited_paragraphs = []
        for paragraph in soup.find_all('p'):
            text = paragraph.get_text()
            if len(text) <= limit_value2:
                limited_paragraphs.append(text)

        return {
            "limited_links": limited_links,
            "limited_paragraphs": limited_paragraphs,
        }
    except requests.exceptions.RequestException as e:
        print(f"Error fetching URL: {e}")
        return {}
    except Exception as e:
        # Broad catch kept deliberately: a malformed document degrades to {}
        # (reported via print) rather than crashing the caller.
        print(f"Error processing HTML: {e}")
        return {}
if __name__ == '__main__':
    # Example usage with hard-coded limits
    target_url = "https://www.example.com"  # Replace with your target URL
    result = extract_data_from_html(
        target_url,
        "link_count", 5,           # keep at most 5 links
        "paragraph_length", 50,    # keep paragraphs of <= 50 characters
    )
    print("Extracted Data:") if result else None
    if result:
        print(result)
    else:
        print("Data extraction failed.")
# Add your comment