/**
 * Tokenizes a text string into an array of lowercase word tokens.
 *
 * Non-string input yields an empty array. Punctuation and special
 * characters are stripped before splitting on whitespace, and any
 * tokens appearing in `stopwords` are removed.
 *
 * @param {string} text The input text to tokenize.
 * @param {string[]} [stopwords=[]] Optional array of stopwords to remove.
 * @returns {string[]} An array of tokens (never contains empty strings).
 */
function tokenize(text, stopwords = []) {
  if (typeof text !== 'string') {
    return []; // Handle non-string input gracefully
  }
  // Set gives O(1) membership checks vs O(m) Array#includes per token.
  const stopSet = new Set(stopwords);
  return text
    .toLowerCase() // normalize case so stopword matching is consistent
    .replace(/[^\w\s]/g, '') // strip punctuation and special characters
    .split(/\s+/) // split on runs of whitespace
    // Drop empty tokens (from leading/trailing whitespace) and stopwords.
    .filter((token) => token !== '' && !stopSet.has(token));
}
  25. // Example usage (for testing)
  26. // const text = "This is a sample text, with some punctuation!";
  27. // const tokens = tokenize(text, ["is", "a", "with"]);
  28. // console.log(tokens); // Output: ["sample", "text", "punctuation"]
  29. //Export the function
  30. export default tokenize;

Add your comment