/**
 * Tokenize a multi-line log stream by matching every line against a list of
 * regex patterns. A line may yield multiple tokens (one per matching pattern).
 *
 * @param {string} logStream - Raw log text; lines are split on '\n'.
 * @param {Array<RegExp|{source: RegExp}>} regexPatterns - Patterns to try on
 *   each line. Each entry is either a plain RegExp or an object whose `source`
 *   property holds the RegExp (the wrapper shape used by the example below).
 * @returns {Array<{type: string, value: string, groups: string[]}>} One token
 *   per (line, pattern) match: `type` is the regex source text, `value` is the
 *   full matched text, `groups` holds the captured groups (empty if none).
 */
function tokenizeLogStream(logStream, regexPatterns) {
  // Normalize the patterns once, outside the per-line loop:
  // - Accept both RegExp entries and { source: RegExp } wrappers. The original
  //   passed the wrapper object straight to String#match, which coerced it to
  //   the string "[object Object]" and matched garbage.
  // - Strip the g/y flags: String#match with /g returns all match strings and
  //   discards capture groups, and g/y regexes carry a stateful lastIndex.
  const compiled = regexPatterns.map((pattern) => {
    const raw = pattern instanceof RegExp ? pattern : pattern.source;
    return raw.global || raw.sticky
      ? new RegExp(raw.source, raw.flags.replace(/[gy]/g, ''))
      : raw;
  });
  const tokens = [];
  for (const line of logStream.split('\n')) {
    for (const re of compiled) {
      const match = line.match(re);
      if (match) {
        tokens.push({
          type: re.source,        // regex source text identifies the token type
          value: match[0],        // full matched text
          groups: match.slice(1), // captured groups, if any
        });
      }
    }
  }
  return tokens;
}
// Example usage:
// Patterns are hoisted to module scope so they are compiled once, not per call.
// They are plain RegExp literals (the shape String#match actually needs — the
// old { source: ... } wrappers were coerced to "[object Object]" and matched
// garbage). The 'g' flag is omitted because String#match with /g returns all
// match strings and drops capture groups.
const regexPatterns = [
  /(\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2})/, // Date/time (the sample log has no brackets around it)
  /(?:ERROR:|\[ERROR\])\s*(.*)/,           // Error messages — "ERROR: ..." or "[ERROR] ..."
  /(?:WARN:|\[WARN\])\s*(.*)/,             // Warning messages — both forms
  /(?:INFO:|\[INFO\])\s*(.*)/,             // Info messages — both forms
  /(\w+)=(\w+)/,                           // Key-value pairs
];
// Simulate a log stream (replace with your actual log data)
const logStream = `
2023-10-27 10:00:00 [INFO] Application started
2023-10-27 10:00:05 ERROR: Failed to connect to database
user=john
password=secret
2023-10-27 10:00:10 [WARN] Low disk space
`;
const tokens = tokenizeLogStream(logStream, regexPatterns);
console.log(JSON.stringify(tokens, null, 2)); // Output tokens as JSON
// Add your comment here.