/**
 * Tokenizes a text string into an array of lowercase word tokens.
 *
 * Non-string input yields an empty array. The text is lowercased,
 * punctuation is stripped, the result is split on whitespace, and
 * any supplied stopwords plus empty tokens are removed.
 *
 * @param {string} text The input text to tokenize.
 * @param {string[]} [stopwords=[]] Optional array of stopwords to remove.
 * @returns {string[]} An array of non-empty, lowercase tokens.
 */
function tokenize(text, stopwords = []) {
  // Guard clause: only strings can be tokenized.
  if (typeof text !== 'string') {
    return [];
  }
  // Normalize case, strip punctuation/special characters (keep word
  // characters and whitespace), then split on runs of whitespace.
  // Note: `const tokens` must not be reassigned below — the original
  // code reassigned it, throwing a TypeError when stopwords were given.
  const tokens = text
    .toLowerCase()
    .replace(/[^\w\s]/g, '')
    .split(/\s+/);
  // Set gives O(1) stopword membership checks vs O(m) Array#includes.
  const stopSet = new Set(stopwords);
  // Drop empty tokens (produced by leading/trailing whitespace) and
  // stopwords in a single pass.
  return tokens.filter((token) => token !== '' && !stopSet.has(token));
}
// Example usage (for testing)
// const text = "This is a sample text, with some punctuation!";
// const tokens = tokenize(text, ["is", "a", "with"]);
// console.log(tokens); // Output: ["this", "sample", "text", "some", "punctuation"]
//Export the function
export default tokenize;
// NOTE(review): neutralized stray non-code text ("Add your comment") that made the module fail to parse.