1. import requests
  2. from bs4 import BeautifulSoup
  3. def extract_data_from_html(url, limit_name1, limit_value1, limit_name2, limit_value2):
  4. """
  5. Extracts data from an HTML document, applying hard-coded limits.
  6. Args:
  7. url (str): The URL of the HTML document.
  8. limit_name1 (str): Name of the first parameter to limit.
  9. limit_value1 (int): Maximum value for the first parameter.
  10. limit_name2 (str): Name of the second parameter to limit.
  11. limit_value2 (int): Maximum value for the second parameter.
  12. Returns:
  13. dict: A dictionary containing the extracted and limited data.
  14. Returns an empty dictionary if an error occurs.
  15. """
  16. try:
  17. response = requests.get(url)
  18. response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
  19. soup = BeautifulSoup(response.content, 'html.parser')
  20. # Example: Extracting all links and limiting the number
  21. links = soup.find_all('a')
  22. limited_links = links[:limit_value1] # Limit to limit_value1
  23. # Example: Extracting all paragraphs and limiting their length
  24. paragraphs = soup.find_all('p')
  25. limited_paragraphs = []
  26. for p in paragraphs:
  27. text = p.get_text()
  28. if len(text) <= limit_value2: # Limit paragraph length
  29. limited_paragraphs.append(text)
  30. data = {
  31. "limited_links": limited_links,
  32. "limited_paragraphs": limited_paragraphs
  33. }
  34. return data
  35. except requests.exceptions.RequestException as e:
  36. print(f"Error fetching URL: {e}")
  37. return {}
  38. except Exception as e:
  39. print(f"Error processing HTML: {e}")
  40. return {}
  41. if __name__ == '__main__':
  42. # Example usage with hard-coded limits
  43. url = "https://www.example.com" # Replace with your target URL
  44. limit_name1 = "link_count"
  45. limit_value1 = 5
  46. limit_name2 = "paragraph_length"
  47. limit_value2 = 50
  48. extracted_data = extract_data_from_html(url, limit_name1, limit_value1, limit_name2, limit_value2)
  49. if extracted_data:
  50. print("Extracted Data:")
  51. print(extracted_data)
  52. else:
  53. print("Data extraction failed.")

Add your comment