Add tokenizer support for rules whose regex contains capture groups, emitting one token per group from an array of token types.

This commit is contained in:
Chris Spencer 2011-05-19 02:36:00 -07:00
commit ae5e83ea02

View file

@ -41,13 +41,24 @@ var Tokenizer = function(rules) {
this.rules = rules;
this.regExps = {};
this.matchMappings = {};
for ( var key in this.rules) {
var rule = this.rules[key];
var state = rule;
var ruleRegExps = [];
for ( var i = 0; i < state.length; i++)
var matchTotal = 0;
var mapping = this.matchMappings[key] = {};
for ( var i = 0; i < state.length; i++) {
var matchcount = new RegExp("(?:(" + state[i].regex + ")|(.))").exec("a");
mapping[matchTotal] = {
rule: i,
len: matchcount.length - 2
};
matchTotal += matchcount.length - 2;
ruleRegExps.push(state[i].regex);
}
this.regExps[key] = new RegExp("(?:(" + ruleRegExps.join(")|(") + ")|(.))", "g");
@ -59,25 +70,30 @@ var Tokenizer = function(rules) {
this.getLineTokens = function(line, startState) {
var currentState = startState;
var state = this.rules[currentState];
var mapping = this.matchMappings[currentState];
var re = this.regExps[currentState];
re.lastIndex = 0;
var match, tokens = [];
var lastIndex = 0;
var token = {
type: null,
value: ""
};
while (match = re.exec(line)) {
var type = "text";
var value = match[0];
var value = [match[0]];
for ( var i = 0; i < state.length; i++) {
if (match[i + 1] !== undefined) {
var rule = state[i];
var rule = state[mapping[i].rule];
if (mapping[i].len > 1) {
value = match.slice(i+2, i+1+mapping[i].len);
}
if (typeof rule.token == "function")
type = rule.token(match[0]);
@ -87,6 +103,7 @@ var Tokenizer = function(rules) {
if (rule.next && rule.next !== currentState) {
currentState = rule.next;
state = this.rules[currentState];
mapping = this.matchMappings[currentState];
lastIndex = re.lastIndex;
re = this.regExps[currentState];
@ -96,17 +113,26 @@ var Tokenizer = function(rules) {
}
};
if (token.type !== type) {
if (token.type)
tokens.push(token);
if (typeof type == "string") {
if (typeof value != "string") {
value = [value.join("")];
}
type = [type];
}
for ( var i = 0; i < value.length; i++) {
if (token.type !== type[i]) {
if (token.type) {
tokens.push(token);
}
token = {
type: type,
value: value
};
} else {
token.value += value;
token = {
type: type[i],
value: value[i]
}
} else {
token.value += value;
}
}
if (lastIndex == line.length)