<?php
  /**
   * Tokenizes the content of a DOM tree, logging every visited node and attribute.
   *
   * The tree is walked depth-first in document order, starting at the document
   * element. The following tokens are produced:
   *  - ['type' => 'ELEMENT', 'value' => <node name>] for every element node,
   *  - ['type' => 'ATTRIBUTE', 'name' => <name>, 'value' => <value>] for each of
   *    its attributes, emitted right after the owning element,
   *  - ['type' => 'TEXT', 'value' => <trimmed text>] for every text or CDATA node
   *    whose trimmed content is not empty.
   * Other node types (comments, processing instructions, ...) are logged but
   * produce no token.
   *
   * @param DOMDocument $domDocument The DOMDocument object.
   * @return array<int, array<string, string>> The tokens in document order; an
   *                                           empty array when the document has
   *                                           no root element.
   */
  function tokenizeDom(DOMDocument $domDocument): array
  {
      $tokens = [];

      // A closure is used instead of a nested named function: a nested
      // `function traverse()` would be declared globally on the first call
      // (fatal "cannot redeclare" on the second call) and would not see the
      // local $tokens array. Both are captured by reference here.
      $traverse = static function (DOMNode $node, string $path = '') use (&$traverse, &$tokens): void {
          $nodeName = $node->nodeName;

          // Log the node being processed.
          error_log("Processing node: {$path} - {$nodeName} (NODE - {$nodeName})");

          // Character data: keep only non-blank content.
          if ($node->nodeType === XML_TEXT_NODE || $node->nodeType === XML_CDATA_SECTION_NODE) {
              $text = trim($node->textContent);
              if ($text !== '') {
                  $tokens[] = ['type' => 'TEXT', 'value' => $text];
              }

              return;
          }

          // Anything that is neither text nor an element carries no token.
          if ($node->nodeType !== XML_ELEMENT_NODE) {
              return;
          }

          $tokens[] = ['type' => 'ELEMENT', 'value' => $nodeName];

          // Attributes follow their owning element.
          foreach ($node->attributes as $attribute) {
              $attributeName = $attribute->name;
              $attributeValue = $attribute->value;
              error_log(" Attribute: {$path}/{$nodeName} - {$attributeName} = {$attributeValue}");
              $tokens[] = ['type' => 'ATTRIBUTE', 'name' => $attributeName, 'value' => $attributeValue];
          }

          // The path is the same for every child, so build it once.
          $childPath = $path . '/' . $nodeName;
          foreach ($node->childNodes as $childNode) {
              $traverse($childNode, $childPath);
          }
      };

      // An empty document has no documentElement; there is nothing to tokenize.
      $root = $domDocument->documentElement;
      if ($root !== null) {
          $traverse($root);
      }

      return $tokens;
  }
  // Example usage (replace with your DOMDocument).
  // The sample markup is well-formed: every opened tag, including the outer <div>, is closed.
  $html = '<div id="root" class="container"><p>Hello, <b>World</b>!</p><a href="https://example.com">Link</a></div>';
  $dom = new DOMDocument();
  // loadHTML() reports failure through its return value; stop instead of tokenizing a broken document.
  if ($dom->loadHTML($html) === false) {
      throw new RuntimeException('Failed to parse the example HTML.');
  }
  $tokenizedData = tokenizeDom($dom);
  // Output the tokenized data. Token values come from parsed markup and may
  // contain characters that are special in HTML, so escape the dump before
  // placing it inside <pre>.
  echo "<pre>";
  echo htmlspecialchars(print_r($tokenizedData, true), ENT_QUOTES | ENT_SUBSTITUTE, 'UTF-8');
  echo "</pre>";
  ?>

Add your comment