/**
 * Merges table rows scraped from multiple HTML pages into a single JSON file.
 *
 * Pages are fetched one at a time, in the order given. For every element
 * matching `dataSelector`, each `<tr>` becomes an array holding the trimmed
 * text of its `<td>` cells. A row without any `<td>` (for example a header row
 * made only of `<th>`) therefore contributes an empty array. Rows from all
 * matched elements of all pages are concatenated, in order, and written as
 * pretty-printed JSON (an array of string arrays).
 *
 * Environment: relies on global `fetch` and `DOMParser` and on Node's
 * `fs/promises` module. The file-system module is loaded with a dynamic
 * `import()` so the function works from both CommonJS and ES modules.
 *
 * Error handling is best-effort: a page that responds with a non-OK status or
 * throws while being fetched/parsed is logged and skipped; any other failure
 * (including the file write) is logged. The returned promise never rejects.
 *
 * @param {string[]} pageUrls An array of URLs to fetch data from.
 * @param {string} outputJsonPath The path to save the merged data as a JSON file.
 * @param {string} [dataSelector='table'] The CSS selector for the data to extract. Defaults to 'table'.
 * @returns {Promise<void>} A promise that resolves once the merge attempt has finished.
 */
async function mergeDatasets(pageUrls, outputJsonPath, dataSelector = 'table') {
  try {
    // One entry per matched element; each entry is that element's list of rows.
    const allTables = [];

    for (const pageUrl of pageUrls) {
      try {
        const response = await fetch(pageUrl);
        if (!response.ok) {
          console.error(`Error fetching ${pageUrl}: ${response.status}`);
          continue;
        }

        const html = await response.text();
        const doc = new DOMParser().parseFromString(html, 'text/html');

        for (const element of Array.from(doc.querySelectorAll(dataSelector))) {
          // Build the whole table first so a failure part-way through a table
          // does not leave a partial table in the output.
          const tableRows = Array.from(element.querySelectorAll('tr'), (row) =>
            Array.from(row.querySelectorAll('td'), (cell) => cell.textContent.trim()),
          );
          allTables.push(tableRows);
        }
      } catch (error) {
        // Best-effort: one bad page must not abort the remaining pages.
        console.error(`Error processing ${pageUrl}:`, error);
      }
    }

    // Drop only the per-table level: tables -> rows -> cells becomes rows -> cells.
    const flattenedData = allTables.flat();

    // Dynamic import instead of require(): `require` does not exist in ES modules.
    const { writeFile } = await import('node:fs/promises');
    await writeFile(outputJsonPath, JSON.stringify(flattenedData, null, 2), 'utf8');
    console.log(`Data merged and saved to ${outputJsonPath}`);
  } catch (error) {
    console.error("An error occurred:", error);
  }
}

Add your comment