1. <?php
  /**
   * Cleans up whitespace (and optionally HTML) artifacts in a list of text blocks.
   *
   * Each block is processed independently, in this order:
   *   1. strip HTML tags            (removeHTMLTags, default false)
   *   2. trim leading whitespace    (removeLeadingWhitespace, default true)
   *   3. trim trailing whitespace   (removeTrailingWhitespace, default true)
   *   4. collapse line-break runs   (removeExcessNewlines, default true)
   *   5. collapse space/tab runs    (normalizeWhitespace, default true)
   *
   * Tags are stripped first so that whitespace exposed by a removed tag is
   * still trimmed and collapsed by the later steps.
   *
   * @param array $textBlocks List of blocks. Each block should carry a 'text' entry;
   *                          a missing or null 'text' is treated as an empty string,
   *                          other scalars are cast to string.
   * @param array $options    Optional overrides for the flags listed above, keyed by
   *                          flag name. Unknown keys are ignored. Values are evaluated
   *                          for truthiness.
   * @return array List of ['text' => string] entries, one per input block, in input
   *               order. Only the 'text' key is returned; input keys are not preserved.
   */
  function cleanupTextBlocks(array $textBlocks, array $options = []): array
  {
      $defaults = [
          'removeLeadingWhitespace' => true,
          'removeTrailingWhitespace' => true,
          'removeExcessNewlines' => true,
          'normalizeWhitespace' => true,
          'removeHTMLTags' => false, // Default: don't remove HTML
      ];
      // Only recognised flag names may override a default; stray keys
      // (e.g. numeric keys from an argv-style list) are dropped.
      $params = array_replace($defaults, array_intersect_key($options, $defaults));

      // Space, tab, LF, CR, NUL, vertical tab and form feed.
      $trimChars = " \t\n\r\0\x0B\x0C";

      $cleanedBlocks = [];
      foreach ($textBlocks as $block) {
          $text = $block['text'] ?? '';
          if (!is_string($text)) {
              // Keep the string functions below from receiving arrays/objects.
              $text = is_scalar($text) ? (string) $text : '';
          }

          if ($params['removeHTMLTags']) {
              $text = strip_tags($text);
          }

          if ($params['removeLeadingWhitespace']) {
              $text = ltrim($text, $trimChars);
          }
          if ($params['removeTrailingWhitespace']) {
              $text = rtrim($text, $trimChars);
          }

          // The patterns below are deliberately byte-safe: they avoid \h and \R,
          // which in non-UTF mode also match bytes 0xA0 / 0x85 and would corrupt
          // multi-byte UTF-8 characters. preg_replace() returns null on failure,
          // in which case the text is left as it was.
          if ($params['removeExcessNewlines']) {
              // Every run of line breaks (with any spaces/tabs before each break,
              // i.e. blank or whitespace-only lines) becomes a single "\n".
              $text = preg_replace('/(?:[^\S\r\n]*(?:\r\n|\r|\n))+/', "\n", $text) ?? $text;
          }
          if ($params['normalizeWhitespace']) {
              // Collapse runs of non-line-break whitespace to one space; line
              // breaks are left alone so the previous step's result survives.
              $text = preg_replace('/[^\S\r\n]+/', ' ', $text) ?? $text;
          }

          $cleanedBlocks[] = ['text' => $text];
      }

      return $cleanedBlocks;
  }
  // Example usage:
  /*
  $textBlocks = [
      // Double quotes so that \n\n is two real line breaks, not literal backslash-n.
      ['text' => " This is a test.\n\nWith some extra whitespace. "],
      ['text' => '<div>This has HTML</div>'],
      ['text' => 'No HTML here'],
  ];
  $cleaned = cleanupTextBlocks($textBlocks);
  print_r($cleaned);
  */
  47. ?>

Add your comment