Merge pull request #253 from fivesixty/master

Add matching group and back reference support to tokenizer rules.
This commit is contained in:
Fabian Jakobs 2011-05-20 00:57:51 -07:00
commit 993f2a17ee
3 changed files with 61 additions and 27 deletions

View file

@ -51,21 +51,21 @@ module.exports = {
"test: tokenize pixel number" : function() {
// Checks that the pixel value "-12px" comes back from the tokenizer as a
// single token whose type is "constant.numeric".
var line = "-12px";
// NOTE(review): `tokens` is assigned twice, the calls differing only in the
// start state ("start" vs. "ruleset"). This listing looks like a diff shown
// without +/- markers, so the "start" call is presumably the replaced line
// and "ruleset" its replacement - confirm against the commit.
var tokens = this.tokenizer.getLineTokens(line, "start").tokens;
var tokens = this.tokenizer.getLineTokens(line, "ruleset").tokens;
assert.equal(1, tokens.length);
assert.equal("constant.numeric", tokens[0].type);
},
"test: tokenize hex3 color" : function() {
// Checks that the three-digit hex color "#abc" comes back from the tokenizer
// as a single token whose type is "constant.numeric".
// NOTE(review): `tokens` is assigned twice, the calls differing only in the
// start state ("start" vs. "ruleset"). This listing looks like a diff shown
// without +/- markers, so the "start" call is presumably the replaced line
// and "ruleset" its replacement - confirm against the commit.
var tokens = this.tokenizer.getLineTokens("#abc", "start").tokens;
var tokens = this.tokenizer.getLineTokens("#abc", "ruleset").tokens;
assert.equal(1, tokens.length);
assert.equal("constant.numeric", tokens[0].type);
},
"test: tokenize hex6 color" : function() {
var tokens = this.tokenizer.getLineTokens("#abc012", "start").tokens;
var tokens = this.tokenizer.getLineTokens("#abc012", "ruleset").tokens;
assert.equal(1, tokens.length);
assert.equal("constant.numeric", tokens[0].type);
@ -81,7 +81,7 @@ module.exports = {
},
"test for last rule in ruleset to catch capturing group bugs" : function() {
var tokens = this.tokenizer.getLineTokens("top", "start").tokens;
var tokens = this.tokenizer.getLineTokens("top", "ruleset").tokens;
assert.equal(1, tokens.length);
assert.equal("support.type", tokens[0].type);

View file

@ -1,5 +1,5 @@
// Test bootstrap: loads jsdom's `level2/html` DOM module, passes it to
// `windowAugmentation` from jsdom's browser module, and publishes the
// resulting `document` and `window` objects as globals.
// NOTE(review): `dom` and `browser` are each declared twice, once with
// 'jsdom/...' paths and once with 'jsdom/lib/jsdom/...' paths. This listing
// looks like a diff shown without +/- markers, so the first pair is presumably
// the replaced version - confirm against the commit and the installed jsdom
// package layout.
var dom = require('jsdom/level2/html').dom.level2.html;
var browser = require('jsdom/browser/index').windowAugmentation(dom);
var dom = require('jsdom/lib/jsdom/level2/html').dom.level2.html;
var browser = require('jsdom/lib/jsdom/browser/index').windowAugmentation(dom);
// Expose the DOM objects globally so code under test can reference
// `document` and `window` without a real browser.
global.document = browser.document;
global.window = browser.window;

View file

@ -41,13 +41,32 @@ var Tokenizer = function(rules) {
this.rules = rules;
this.regExps = {};
this.matchMappings = {};
for ( var key in this.rules) {
var rule = this.rules[key];
var state = rule;
var ruleRegExps = [];
for ( var i = 0; i < state.length; i++)
ruleRegExps.push(state[i].regex);
var matchTotal = 0;
var mapping = this.matchMappings[key] = {};
for ( var i = 0; i < state.length; i++) {
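// Count the capturing groups this rule contributes to the combined regex:
// its wrapper group plus any groups inside the rule itself. exec() returns
// the full match followed by one entry per group, so subtract 2 to drop the
// full match and the trailing catch-all group (present only to force a match).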
var matchcount = new RegExp("(?:(" + state[i].regex + ")|(.))").exec("a").length - 2;
// Replace any backreferences and offset appropriately.
var adjustedregex = state[i].regex.replace(/\\([0-9]+)/g, function (match, digit) {
return "\\" + (parseInt(digit, 10) + matchTotal + 1);
});
mapping[matchTotal] = {
rule: i,
len: matchcount
};
matchTotal += matchcount;
ruleRegExps.push(adjustedregex);
}
this.regExps[key] = new RegExp("(?:(" + ruleRegExps.join(")|(") + ")|(.))", "g");
@ -59,34 +78,40 @@ var Tokenizer = function(rules) {
this.getLineTokens = function(line, startState) {
var currentState = startState;
var state = this.rules[currentState];
var mapping = this.matchMappings[currentState];
var re = this.regExps[currentState];
re.lastIndex = 0;
var match, tokens = [];
var lastIndex = 0;
var token = {
type: null,
value: ""
};
while (match = re.exec(line)) {
var type = "text";
var value = match[0];
var value = [match[0]];
for ( var i = 0; i < state.length; i++) {
for ( var i = 0; i < match.length-2; i++) {
if (match[i + 1] !== undefined) {
var rule = state[i];
var rule = state[mapping[i].rule];
if (mapping[i].len > 1) {
value = match.slice(i+2, i+1+mapping[i].len);
}
if (typeof rule.token == "function")
type = rule.token(match[0]);
type = rule.token.apply(this, value);
else
type = rule.token;
if (rule.next && rule.next !== currentState) {
currentState = rule.next;
state = this.rules[currentState];
mapping = this.matchMappings[currentState];
lastIndex = re.lastIndex;
re = this.regExps[currentState];
@ -96,17 +121,26 @@ var Tokenizer = function(rules) {
}
};
if (token.type !== type) {
if (token.type)
tokens.push(token);
if (typeof type == "string") {
if (typeof value != "string") {
value = [value.join("")];
}
type = [type];
}
for ( var i = 0; i < value.length; i++) {
if (token.type !== type[i]) {
if (token.type) {
tokens.push(token);
}
token = {
type: type,
value: value
};
} else {
token.value += value;
token = {
type: type[i],
value: value[i]
}
} else {
token.value += value[i];
}
}
if (lastIndex == line.length)