<?php
/**
 * Tokenizes a DOM tree into a flat list of tokens, logging each step via error_log().
 *
 * The tree is walked depth-first, in document order, starting at the document
 * element. Tokens are produced as follows:
 *  - element node: ['type' => 'ELEMENT', 'value' => <node name>], followed by one
 *    ['type' => 'ATTRIBUTE', 'name' => <name>, 'value' => <value>] per attribute;
 *  - text node: ['type' => 'TEXT', 'value' => <trimmed text>]; skipped when the
 *    trimmed text is empty.
 * Any other node kind (comment, CDATA section, ...) produces no token, but its
 * children, if any, are still visited.
 *
 * @param DOMDocument $domDocument The document to tokenize.
 * @return array<int, array<string, string>> Tokens in document order; an empty
 *                                           array when the document has no root element.
 */
function tokenizeDom(DOMDocument $domDocument): array
{
    $tokens = [];

    // An empty document has no root element; there is nothing to walk.
    $root = $domDocument->documentElement;
    if ($root === null) {
        return $tokens;
    }

    // A closure is used instead of a nested named function: a named function
    // declared here would be registered globally on the first call (fatal
    // "cannot redeclare" on the second) and could not see $tokens at all.
    $traverse = static function (DOMNode $node, string $path = '') use (&$traverse, &$tokens): void {
        $nodeName = $node->nodeName;
        $isText = $node->nodeType === XML_TEXT_NODE;
        $isElement = $node->nodeType === XML_ELEMENT_NODE;

        // Classify before logging so the log line reports the real token type.
        if ($isText) {
            $tokenType = 'TEXT';
            $tokenValue = trim($node->textContent);
        } else {
            $tokenType = $isElement ? 'ELEMENT' : 'NODE';
            $tokenValue = $nodeName;
        }

        error_log("Processing node: {$path} - {$nodeName} ({$tokenType} - {$tokenValue})");

        if ($isText) {
            // Whitespace-only text (e.g. indentation between tags) carries no token.
            if ($tokenValue !== '') {
                $tokens[] = ['type' => $tokenType, 'value' => $tokenValue];
            }

            return;
        }

        if ($isElement) {
            $tokens[] = ['type' => $tokenType, 'value' => $tokenValue];

            foreach ($node->attributes as $attribute) {
                $attributeName = $attribute->name;
                $attributeValue = $attribute->value;
                error_log(" Attribute: {$path}/{$nodeName} - {$attributeName} = {$attributeValue}");
                $tokens[] = ['type' => 'ATTRIBUTE', 'name' => $attributeName, 'value' => $attributeValue];
            }
        }

        // The child path is the same for every child, so build it once.
        $childPath = $path . '/' . $nodeName;
        foreach ($node->childNodes as $childNode) {
            $traverse($childNode, $childPath);
        }
    };

    $traverse($root);

    return $tokens;
}
// Example usage: tokenize a small HTML fragment and dump the resulting tokens.
// Replace $html with your own markup (or build the DOMDocument another way).
$html = '<div id="root" class="container"><p>Hello, <b>World</b>!</p><a href="https://example.com">Link</a></div>';
$dom = new DOMDocument();
$dom->loadHTML($html);
$tokenizedData = tokenizeDom($dom);
// Output the tokenized data. Token values come straight from the document, so
// the dump is escaped to keep any markup in them from being interpreted by the
// browser when shown inside <pre>.
echo "<pre>";
echo htmlspecialchars(print_r($tokenizedData, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
echo "</pre>";
?>
Add your comment