/**
* Merges datasets from multiple HTML pages.
*
* @param {string[]} pageUrls An array of URLs to fetch data from.
* @param {string} outputJsonPath The path to save the merged data as a JSON file.
* @param {string} [dataSelector='table'] The CSS selector for the data to extract. Defaults to 'table'.
* @returns {Promise<void>} A promise that resolves when the merging is complete.
*/
async function mergeDatasets(pageUrls, outputJsonPath, dataSelector = 'table') {
try {
const allData = [];
for (const pageUrl of pageUrls) {
try {
const response = await fetch(pageUrl);
if (!response.ok) {
console.error(`Error fetching ${pageUrl}: ${response.status}`);
continue;
}
const html = await response.text();
const parser = new DOMParser();
const doc = parser.parseFromString(html, 'text/html');
const elements = doc.querySelectorAll(dataSelector);
elements.forEach(element => {
const tableData = [];
const rows = element.querySelectorAll('tr');
rows.forEach(row => {
const cells = row.querySelectorAll('td');
const rowData = [];
cells.forEach(cell => {
rowData.push(cell.textContent.trim());
});
tableData.push(rowData);
});
allData.push(tableData);
});
} catch (error) {
console.error(`Error processing ${pageUrl}:`, error);
}
}
// Flatten the 2D array into a single array
const flattenedData = allData.flat();
// Save the merged data to a JSON file
const fs = require('fs').promises; // Import fs module
await fs.writeFile(outputJsonPath, JSON.stringify(flattenedData, null, 2), 'utf8');
console.log(`Data merged and saved to ${outputJsonPath}`);
} catch (error) {
console.error("An error occurred:", error);
}
}
// Add your comment