<?php
/**
 * Cleans up whitespace and markup artifacts in a list of text blocks.
 *
 * Each cleanup step can be switched on or off through $params. Keys that are
 * omitted fall back to the defaults listed below; unrecognised keys are ignored.
 *
 * Steps, in the order they are applied:
 *  - removeHTMLTags           (default false) strip HTML/PHP tags via strip_tags().
 *  - removeLeadingWhitespace  (default true)  strip whitespace at the start of the text.
 *  - removeTrailingWhitespace (default true)  strip whitespace at the end of the text.
 *  - removeExcessNewlines     (default true)  collapse every run of newlines (and the
 *                                             whitespace preceding it) into a single "\n".
 *  - normalizeWhitespace      (default true)  collapse runs of non-newline whitespace
 *                                             into a single space.
 *
 * @param array<int|string, mixed> $textBlocks Text block data. Each element should contain a 'text' key;
 *                                             a missing or non-stringable value is treated as ''.
 * @param array<string, bool>      $params     Optional overrides for the cleanup flags described above.
 * @return list<array{text: string}> One entry per input block, holding only the cleaned 'text'.
 */
function cleanupTextBlocks(array $textBlocks, array $params = []): array
{
    $defaultParams = [
        'removeLeadingWhitespace' => true,
        'removeTrailingWhitespace' => true,
        'removeExcessNewlines' => true,
        'normalizeWhitespace' => true,
        'removeHTMLTags' => false, // Default: don't remove HTML
    ];
    // Overrides come from the caller, never from globals such as $argv (which is
    // not visible in function scope anyway). Unknown keys are dropped.
    $params = array_merge($defaultParams, array_intersect_key($params, $defaultParams));

    // Space, tab, LF, CR, NUL, vertical tab and form feed.
    $whitespace = " \t\n\r\0\x0B\x0C";

    $cleanedBlocks = [];
    foreach ($textBlocks as $block) {
        $rawText = $block['text'] ?? ''; // Default to empty string if 'text' key is missing
        $isStringable = is_scalar($rawText)
            || (is_object($rawText) && method_exists($rawText, '__toString'));
        $text = $isStringable ? (string) $rawText : '';

        // Strip tags first so that whitespace exposed by removing them is
        // still handled by the whitespace steps below.
        if ($params['removeHTMLTags']) {
            $text = strip_tags($text);
        }

        // Leading and trailing whitespace are controlled independently.
        if ($params['removeLeadingWhitespace']) {
            $text = ltrim($text, $whitespace);
        }
        if ($params['removeTrailingWhitespace']) {
            $text = rtrim($text, $whitespace);
        }

        // Collapse every run of newlines, not just the first one.
        // preg_replace() returns null on a PCRE error; keep the text unchanged then.
        if ($params['removeExcessNewlines']) {
            $text = preg_replace('/\s*\n+/', "\n", $text) ?? $text;
        }

        // Collapse horizontal whitespace only ([^\S\n] = whitespace except LF),
        // so the line structure produced above is preserved.
        if ($params['normalizeWhitespace']) {
            $text = preg_replace('/[^\S\n]+/', ' ', $text) ?? $text;
        }

        $cleanedBlocks[] = ['text' => $text]; // Store cleaned text
    }

    return $cleanedBlocks;
}
// Example usage:
/*
$textBlocks = [
['text' => " This is a test.\n\nWith some extra whitespace. "],
['text' => '<div>This has HTML</div>'],
['text' => 'No HTML here'],
];
$cleaned = cleanupTextBlocks($textBlocks);
print_r($cleaned);
*/
?>
Add your comment