You can now pass more than one .tmLanguage file to tmlanguage.js and it will process each one in turn. This can vastly speed up the processing of multiple files as you don't have to pay the node/V8 startup cost for each file. Note that the script's "dev mode" is now controlled by a --dev flag rather than an unnamed second argument.
697 lines
20 KiB
JavaScript
697 lines
20 KiB
JavaScript
require("amd-loader");
|
|
|
|
var fs = require("fs");
|
|
var util = require("util");
|
|
var lib = require("./lib");
|
|
var pathlib = require("path");
|
|
var parseLanguage = lib.parsePlist;
|
|
|
|
var tk = require("./regexp_tokenizer");
|
|
var tokenize = tk.tokenize;
|
|
var toStr = tk.toStr;
|
|
|
|
/**
 * Returns the final element of `array`, or undefined when it has no elements.
 */
function last(array) {
    var finalIndex = array.length - 1;
    return array[finalIndex];
}
|
|
|
|
/**
 * Rewrites the `\h` / `\H` character-type tokens (which JavaScript regexps do
 * not have) into explicit hex-digit character classes. Tokens are mutated in
 * place.
 *
 * - `\h` becomes `[\da-fA-F]`, or just `\da-fA-F` when already inside a
 *   character class.
 * - `\H` becomes `[^\da-fA-F]`; inside a character class it is left untouched
 *   and a warning is printed.
 *
 * @param {Array} tokens token objects with `type` and `value` properties
 * @returns {Array} the same `tokens` array
 */
function convertHexEscape(tokens) {
    var HEX_RANGE = "\\da-fA-F";
    var insideClass = false;
    tokens.forEach(function(token) {
        switch (token.type) {
            case "charclass":
                insideClass = true;
                break;
            case "charclass.end":
                insideClass = false;
                break;
            case "charType":
                if (token.value == "\\h") {
                    token.type = "text";
                    token.value = insideClass ? HEX_RANGE : "[" + HEX_RANGE + "]";
                } else if (token.value == "\\H") {
                    if (insideClass) {
                        // a negated range cannot be spliced into an existing class
                        console.warn("can't convert \\H in charclass");
                        break;
                    }
                    token.type = "text";
                    token.value = "[^" + HEX_RANGE + "]";
                }
                break;
        }
    });
    return tokens;
}
|
|
|
|
/**
 * Replaces `\n` char tokens in a regex source with the `$` anchor.
 *
 * - `\n` not followed by a quantifier becomes `$`; any run of further `\n`
 *   or quantifier tokens directly after it is blanked.
 * - `\n` followed by a quantifier whose text contains `?`, `*`, `{,` or `{0,`
 *   is dropped together with that quantifier.
 * - `\n` followed by any other quantifier keeps its value; only the
 *   quantifier is blanked.
 *
 * Consecutive `$` characters in the result are collapsed into one.
 *
 * @param {string} str regex source
 * @returns {string} converted regex source
 */
function convertNewLinesTo$(str) {
    var tokens = tokenize(str);
    var optionalQuantifier = /\?|\*|{,|{0,/;

    for (var pos = 0; pos < tokens.length; pos++) {
        var current = tokens[pos];
        if (current.type != "char" || current.value != "\\n")
            continue;

        var following = tokens[pos + 1] || {};
        if (following.type == "quantifier") {
            // must test the quantifier text before it is blanked
            if (optionalQuantifier.test(following.value))
                current.value = "";
            following.value = "";
            continue;
        }

        current.value = "$";
        while (following.value == "\\n" || following.type == "quantifier") {
            following.value = "";
            pos += 1;
            following = tokens[pos + 1] || {};
        }
    }

    return toStr(tokens).replace(/[$]+/g, "$");
}
|
|
|
|
/**
 * Converts regex constructs that JavaScript lacks:
 *  - `\h` / `\H` hex-digit escapes (delegated to convertHexEscape);
 *  - possessive quantifiers: any quantifier token longer than one character
 *    that ends in `+` has that trailing `+` removed.
 *
 * When at least one possessive quantifier was rewritten, a single log line
 * reports the last one together with what it was replaced by.
 *
 * @param {string} str regex source
 * @returns {string} converted regex source
 */
function convertCharacterTypes(str) {
    var tokens = convertHexEscape(tokenize(str));

    var lastPossessive = null;
    tokens.forEach(function(t) {
        if (t.type != "quantifier")
            return;
        var val = t.value;
        // a lone "+" is an ordinary quantifier; only a trailing extra "+" is possessive
        if (val.length > 1 && val.slice(-1) == "+") {
            t.value = val.slice(0, -1);
            lastPossessive = val;
        }
    });

    if (lastPossessive) {
        console.log("converted possessive quantifier " + lastPossessive
            + " to " + lastPossessive.slice(0, -1));
    }
    return toStr(tokens);
}
|
|
|
|
/**
 * Strips inline flag letters from group-start tokens.
 *
 * Every `group.start` token whose text contains one of `i`, `m`, `s`, `x` has
 * those letters and any `-` removed. If the token is immediately followed by
 * a `group.end` token, both tokens are blanked.
 *
 * @param {string} str regex source
 * @param {Object} [rule] when given and an `i` was seen in such a group,
 *     `rule.caseInsensitive` is set to true
 * @returns {string} regex source without the inline flags
 */
function removeInlineFlags(str, rule) {
    var tokens = tokenize(str);
    var sawIgnoreCase = false;

    for (var idx = 0; idx < tokens.length; idx++) {
        var token = tokens[idx];
        if (token.type != "group.start" || !/[imsx]/.test(token.value))
            continue;

        if (/i/.test(token.value))
            sawIgnoreCase = true;
        token.value = token.value.replace(/[imsx\-]/g, "");

        var following = tokens[idx + 1];
        if (following && following.type == "group.end")
            token.value = following.value = "";
    }

    if (rule && sawIgnoreCase)
        rule.caseInsensitive = true;
    return toStr(tokens);
}
|
|
|
|
/**
 * Turns every plain capturing group opener `(` into a non-capturing `(?:`.
 * Group-start tokens with any other text are left alone.
 *
 * @param {string} str regex source
 * @returns {string} regex source with plain groups made non-capturing
 */
function convertToNonCapturingGroups(str) {
    var tokens = tokenize(str);
    for (var idx = 0; idx < tokens.length; idx++) {
        var token = tokens[idx];
        if (token.type == "group.start" && token.value == "(")
            token.value = "(?:";
    }
    return toStr(tokens);
}
|
|
|
|
/**
 * Drops `(?:` ... `)` wrappers that do not change the meaning of the regex.
 *
 * A non-capturing group is removed when
 *  - it is the first token and its end is the last token, or
 *  - it is not followed by a quantifier and contains no alternation at its
 *    own nesting level (alternations inside nested groups are skipped).
 *
 * A `(?:` token without an `end` link is reported as a malformed regex and
 * left as is.
 *
 * @param {string} str regex source
 * @returns {string} simplified regex source
 */
function simplifyNonCapturingGroups(str) {
    var tokens = tokenize(str);

    var first = tokens[0] || {};
    var wrapsEverything = first.type == "group.start" && first.value == "(?:"
        && first.end == tokens[tokens.length - 1];
    if (wrapsEverything)
        first.value = first.end.value = "";

    // index of the group currently examined by the main loop; scanGroup
    // always starts right after it
    var cursor = 0;

    // Walks the tokens after `cursor` up to `groupEnd`, calling `visit` for
    // each. When `visit` returns an index beyond the current one, the walk
    // jumps there. Returns the index at which the walk stopped.
    function scanGroup(groupEnd, visit) {
        var pos = cursor + 1;
        while (pos < tokens.length) {
            var inner = tokens[pos];
            if (inner == groupEnd)
                break;
            var jumpTo = visit && visit(inner);
            if (jumpTo > pos)
                pos = jumpTo;
            pos++;
        }
        return pos;
    }

    for (cursor = 0; cursor < tokens.length; cursor++) {
        var group = tokens[cursor];
        if (group.type != "group.start" || group.value != "(?:")
            continue;
        if (!group.end) {
            console.error("malformed regex: " + str);
            continue;
        }

        var after = tokens[tokens.indexOf(group.end, cursor) + 1];
        if (after && after.type == "quantifier")
            continue;

        var hasAlternation = false;
        scanGroup(group.end, function(inner) {
            if (inner.type == "alternation")
                hasAlternation = true;
            else if (inner.type == "group.start" && inner.end)
                return scanGroup(inner.end); // skip over the nested group
        });

        if (!hasAlternation)
            group.value = group.end.value = "";
    }

    return toStr(tokens);
}
|
|
|
|
/**
 * Removes groups whose opening token contains `<` from a regex source.
 *
 * All tokens from such a group's start through its end are blanked. If the
 * text between the opening and closing tokens contains `^`, that text is kept,
 * wrapped in a non-capturing group. Groups nested inside a group being
 * removed are not processed separately.
 *
 * @param {string} str regex source
 * @returns {string} regex source without those groups
 */
function removeLookBehinds(str) {
    var tokens = tokenize(str);
    // end token of the group currently being removed, or null
    var closing = null;

    for (var idx = 0; idx < tokens.length; idx++) {
        var token = tokens[idx];

        var startsLookBehind = !closing && token.type == "group.start"
            && /</.test(token.value);
        if (startsLookBehind) {
            closing = token.end;
            closing.content = [];
        }

        if (closing) {
            closing.content.push(token.value);
            token.value = "";
        }

        if (token == closing) {
            // drop the opening and closing tokens themselves
            var inner = closing.content.slice(1, -1).join("");
            if (/\^/.test(inner))
                closing.value = "(?:" + inner + ")";
            closing = null;
        }
    }

    return toStr(tokens);
}
|
|
|
|
/**
 * Inlines back-references from a rule's `end` regex to groups of its `begin`
 * regex.
 *
 * Does nothing unless `rule.end` contains a backslash followed by a digit.
 * Otherwise every `backRef` token in `end` that refers to a numbered group of
 * `begin` with non-empty content is replaced by that group's source wrapped
 * in `(?:...)`. `rule.end` is overwritten and a warning is printed.
 *
 * @param {Object} rule object with `begin` and `end` regex sources; mutated
 */
function convertBeginEndBackrefs(rule) {
    var endHasBackref = /\\\d/.test(rule.end);
    if (!endHasBackref)
        return;

    var beginTokens = tokenize(rule.begin);
    var endTokens = tokenize(rule.end);

    // group number -> source text between the group's start and end tokens
    var groupSource = {};
    for (var idx = 0; idx < beginTokens.length; idx++) {
        var opener = beginTokens[idx];
        if (opener.type != "group.start" || !opener.number || !opener.end)
            continue;
        var closeIdx = beginTokens.indexOf(opener.end, idx + 1);
        groupSource[opener.number] = toStr(beginTokens.slice(idx + 1, closeIdx));
    }

    for (var j = 0; j < endTokens.length; j++) {
        var ref = endTokens[j];
        if (ref.type != "backRef")
            continue;
        // token text is a backslash followed by the group number
        var source = groupSource[ref.value.slice(1)];
        if (source)
            ref.value = "(?:" + source + ")";
    }

    rule.end = toStr(endTokens);

    console.warn("Begin-End-Backreference is detected", rule);
}
|
|
|
|
/**
 * Prints a warning for every named group and every back-reference token found
 * in `str`. The regex itself is not modified.
 *
 * @param {string} str regex source
 */
function checkForNamedCaptures(str) {
    var tokens = tokenize(str);
    for (var idx = 0; idx < tokens.length; idx++) {
        var token = tokens[idx];
        if (token.type == "group.start" && token.name)
            console.warn("named capture not implemented", str);
        if (token.type == "backRef")
            console.warn("backRef not implemented ", str);
    }
}
|
|
|
|
/**
 * Rewrites `regex` so that its capturing groups line up with a flat list of
 * token names.
 *
 * Works in four passes over the token list:
 *  1. "groupify": wrap every run of tokens that is not already inside a
 *     group in a synthetic capturing group;
 *  2. decide a token name for each capturing group, turning groups that only
 *     contain other named captures into non-capturing ones;
 *  3. wrap quantified capturing groups so that the quantifier ends up inside
 *     the capture;
 *  4. collect the token names of the remaining capturing groups, in order.
 *
 * @param {Array} captures capture names indexed by group number
 * @param {string} defaultName name used for groups without their own capture name
 * @param {string} regex regex source
 * @returns {{names: Array, regex: string}} the names of the capturing groups
 *     and the rewritten regex source
 */
function fixGroups(captures, defaultName, regex) {
    var tokens = tokenize(regex);

    // `i` is the shared cursor of iter(); open/close/skip/peek all work
    // relative to it.
    var opened = [], isStart = true, i = 0;

    // Inserts a synthetic "(" before the current token.
    function open() {
        var t = {value: "(", type: "group.start", isGroup: true};
        opened.push(t);
        tokens.splice(i++, 0, t);
    }
    // Inserts the matching synthetic ")" before the current token and links
    // it with its opener. It is a group *end* token, like a tokenized ")".
    function close() {
        var t = {value: ")", type: "group.end"};
        t.start = opened.pop();
        t.start.end = t;
        tokens.splice(i++, 0, t);
    }
    function tryOpen(){if (isStart) {open(); isStart = false}}
    function tryClose(){if (opened.length) close()}
    // Moves the cursor to the end token of group `t`, if it lies ahead.
    function skip(t) {
        var i1 = tokens.indexOf(t.end, i);
        if (i1 > i)
            i = i1;
    }
    function lst(t) {return t[t.length - 1]}
    function iter(f) {
        for (i = 0; i < tokens.length; i++)
            f(tokens[i]);
    }
    // Calls `f` for every token after the cursor up to (excluding) `end`.
    function iterGroup(end, f) {
        for (var i1 = i + 1; i1 < tokens.length; i1++) {
            var t = tokens[i1];
            if (t == end)
                break;
            f(t);
        }
    }
    function peek() { return tokens[i + 1] || {}}

    // groupify
    iter(function(t){
        if (t.type == "group.start") {
            tryClose();
            isStart = true;
            if (!t.hasChildren || t.isSpecial)
                skip(t);
        } else if (t.type == "group.end") {
            isStart = true;
            tryClose();
        } else if (t.type == "alternation") {
            isStart = true;
            tryClose();
        } else if (t.type != "anchor" && t.type != "quantifier"){
            tryOpen();
        }
    });
    tryClose();

    // remove redundant groups
    var names = [defaultName];
    iter(function(t){
        if (t.type == "group.start" && !t.isSpecial) {
            var captureName = captures[t.number];

            if (!t.hasChildren) {
                t.tokenName = captureName || lst(names);
                skip(t);
            } else {
                var hasCapture = false;
                iterGroup(t.end, function(t1) {
                    if (t1.type == "group.start" && captures[t1.number])
                        hasCapture = true;
                });
                if (hasCapture) {
                    // children carry the names; this group only scopes the
                    // default name for unnamed children
                    t.value = "(?:";
                    if (captureName) {
                        names.push(captureName);
                        t.isTokenGroup = true;
                    }
                } else {
                    t.tokenName = captureName || lst(names);
                    iterGroup(t.end, function(t1) {
                        if (t1.value == "(")
                            t1.value = "(?:";
                    });
                }
            }
        } else if (t.type == "group.end") {
            if (t.start.isTokenGroup)
                names.pop();
        }
    });

    // wrap capturing groups with quantifier
    iter(function(t){
        if (t.type == "group.end" && t.start.value == "(" && peek().type == "quantifier") {
            peek().value += ")";
            t.start.value += "(?:";
        }
    });

    names = [];
    tokens.forEach(function(t) {
        if (t.value == "(" || t.value == "((?:" )
            t.tokenName && names.push(t.tokenName);
    });
    return {
        names: names,
        regex: toStr(tokens)
    };
}
|
|
|
|
/***** converter */
|
|
|
|
/**
 * Writes `string` followed by `obj` to standard output via console.log.
 */
function logDebug(string, obj) {
    console.log.call(console, string, obj);
}
|
|
|
|
|
|
// tmLanguage processor
|
|
|
|
// for tracking token states; rebuilt by processRules for every grammar
var states = {start: []};

/**
 * Converts a parsed grammar into a table of states.
 *
 * The top-level `patterns` become the "start" state; every `repository`
 * entry is added by processRepository under the key "#" + its name.
 *
 * The table is recreated on every call. Several grammars can be converted in
 * one run, and without the reset the "#..." states of an earlier grammar
 * would remain in the table returned for a later one.
 *
 * @param {Object} rules parsed grammar with optional `patterns` and `repository`
 * @returns {Object} the state table built for this grammar
 */
function processRules(rules){
    states = {start: []};
    if (rules.patterns)
        states.start = processPatterns(rules.patterns);
    if (rules.repository)
        processRepository(rules.repository);
    return states;
}
|
|
/**
 * Adds one state per repository entry to the module-level `states` table,
 * keyed by "#" + the entry's name.
 *
 * An entry that has `patterns` but neither `begin` nor a nested `repository`
 * becomes the list of its converted patterns; any other entry becomes a
 * one-element list holding the converted entry itself.
 *
 * @param {Object} r repository object mapping names to pattern definitions
 */
function processRepository(r) {
    for (var key in r) {
        var entry = r[key];
        var isPatternList = !entry.begin && entry.patterns && !entry.repository;
        var stateObj = isPatternList
            ? processPatterns(entry.patterns)
            : [processPattern(entry)];

        if (stateObj)
            states["#" + key] = stateObj;
    }
}
|
|
/**
 * Converts a list of pattern definitions, one rule per pattern.
 *
 * @param {Array} pl pattern definitions
 * @returns {Array} the result of processPattern for each entry, in order
 */
function processPatterns(pl) {
    return pl.map(function(pattern) {
        return processPattern(pattern);
    });
}
|
|
/**
 * Converts one pattern definition into a rule object.
 *
 * Cases, checked in this order:
 *  - `end` is `(?!\G)` and there is exactly one sub-pattern: the result is
 *    the converted sub-pattern;
 *  - both `begin` and `end` are present: a rule for `begin` whose `push` list
 *    holds the converted sub-patterns, the `end` rule (marked `next: "pop"`)
 *    and, when the pattern has a name or contentName, a defaultToken entry;
 *  - `match`: a simple rule;
 *  - `include`: an `{include: ...}` rule;
 *  - anything else: `{todo: p}`.
 *
 * The pattern's `comment` is appended to the rule's comment, and a nested
 * `repository` is registered through processRepository.
 *
 * @param {Object} p pattern definition
 * @returns {Object} the converted rule
 */
function processPattern(p) {
    var rule;

    if (p.end == "(?!\\G)" && p.patterns && p.patterns.length == 1) {
        rule = processPattern(p.patterns[0]);
    } else if (p.begin != null && p.end != null) {
        convertBeginEndBackrefs(p);

        rule = simpleRule(p.begin, p.name, p.beginCaptures || p.captures);

        var bodyRules = processPatterns(p.patterns || []);
        var closer = simpleRule(p.end, p.name, p.endCaptures || p.captures);
        closer.next = "pop";
        // the end rule goes first unless the grammar asks for it last
        if (p.applyEndPatternLast)
            bodyRules.push(closer);
        else
            bodyRules.unshift(closer);

        var fallbackToken = p.name || p.contentName;
        if (fallbackToken)
            bodyRules.push({defaultToken: fallbackToken});

        rule.push = bodyRules;

        rule = removeIncludeSelf(rule);
    } else if (p.match) {
        rule = simpleRule(p.match, p.name, p.captures);
    } else if (p.include) {
        rule = {include: p.include};
    } else {
        rule = {todo: p};
    }

    if (p.comment)
        rule.comment = (rule.comment || "") + p.comment;

    if (p.repository)
        processRepository(p.repository);

    return rule;
}
|
|
/**
 * Builds a `{token, regex}` rule from a regex source and its capture names.
 *
 * The regex is first run through transformRegExp (which may also set flags
 * on the rule). With `captures`, the names are laid out by group number:
 * if only group 0 is present its name becomes the token; otherwise
 * fixGroups rewrites the regex and returns the list of token names (a
 * one-element list is unwrapped to a plain string). When the token is a
 * single string all plain groups are made non-capturing. Finally redundant
 * non-capturing groups are removed.
 *
 * If the resulting regex cannot be compiled by `new RegExp`, the rule gets a
 * `TODO` note and keeps the untransformed source in `originalRegex`.
 *
 * @param {string} regex regex source
 * @param {string} [name] token name; "text" when falsy
 * @param {Object} [captures] map from group number to `{name: ...}`
 * @returns {Object} the rule
 */
function simpleRule(regex, name, captures) {
    var tokenName = name || "text";
    // created before transformRegExp so `token` and `regex` stay the first keys
    var rule = {token: "", regex: ""};

    var source = regex;
    var converted = transformRegExp(source, rule);

    if (captures) {
        var captureNames = [];
        var captureKeys = Object.keys(captures);
        for (var k = 0; k < captureKeys.length; k++) {
            var key = captureKeys[k];
            captureNames[key] = captures[key] && captures[key].name;
        }

        if (captureNames.length == 1) {
            tokenName = captureNames[0];
        } else {
            var grouped = fixGroups(captureNames, tokenName, converted);
            converted = grouped.regex;
            tokenName = grouped.names.length == 1 ? grouped.names[0] : grouped.names;
        }
    }

    if (typeof tokenName == "string")
        converted = convertToNonCapturingGroups(converted);

    converted = simplifyNonCapturingGroups(converted);

    try {
        new RegExp(converted);
    } catch (e) {
        rule.TODO = "FIXME: regexp doesn't have js equivalent";
        rule.originalRegex = source;

        // lookbehinds are mostly used to force ordering
        // regex = removeLookBehinds(regex);
    }

    rule.token = tokenName;
    rule.regex = converted;
    return rule;
}
|
|
|
|
/**
 * Flags rules whose pushed state includes `$self`, which cannot be converted.
 *
 * A rule without a `push` list, or whose `push` list has no
 * `{include: "$self"}` entry, is returned unchanged. Otherwise a warning is
 * printed and the rule is returned wrapped as `{todo: rule}`; the rule itself
 * is not modified.
 *
 * NOTE(review): an earlier partial conversion (returning `[rule, escapeRule]`
 * with `includeSelf` markers) sat after the `return` above it and could never
 * run; it has been removed rather than re-enabled.
 *
 * @param {Object} rule rule produced for a begin/end pattern
 * @returns {Object} `rule` itself, or `{todo: rule}` when it includes `$self`
 */
function removeIncludeSelf(rule) {
    if (!rule.push)
        return rule;

    var hasSelfInclude = rule.push.some(function(sub) {
        return sub.include == "$self";
    });
    if (!hasSelfInclude)
        return rule;

    console.warn("can't convert include $self");
    return {todo: rule};
}
|
|
|
|
// regex transformation
|
|
|
|
/**
 * Tokenizes `str` and turns the tokens straight back into a string; no token
 * is modified here.
 *
 * @param {string} str regex source
 * @returns {string} the re-serialized regex source
 */
function removeXFlag(str) {
    return toStr(tokenize(str));
}
|
|
|
|
/**
 * Runs the regex conversions in a fixed order:
 *  1. `\n` tokens to `$` (convertNewLinesTo$);
 *  2. inline flags removed, possibly setting `rule.caseInsensitive`;
 *  3. braces dropped from `\x{...}` / `\u{...}` hex escapes;
 *  4. character types and possessive quantifiers (convertCharacterTypes);
 *  5. warnings for named captures and back-references.
 *
 * @param {string} str regex source
 * @param {Object} rule rule object that receives flags found in the regex
 * @returns {string} converted regex source
 */
function transformRegExp(str, rule) {
    var withoutNewlines = convertNewLinesTo$(str);

    var withoutFlags = removeInlineFlags(withoutNewlines, rule);

    var unbraced = withoutFlags.replace(/(\\[xu]){([a-fA-F\d]+)}/g, '$1$2');

    var converted = convertCharacterTypes(unbraced, rule);

    checkForNamedCaptures(converted);

    return converted;
}
|
|
|
|
//
|
|
/**
 * Entry point of the grammar conversion: hands the parsed grammar to
 * processRules and returns its state table.
 *
 * @param {Object} tmRules parsed grammar
 * @returns {Object} state table produced by processRules
 */
function extractPatterns(tmRules) {
    var extracted = processRules(tmRules);
    return extracted;
}
|
|
|
|
|
|
/**
 * Searches the state table for reference cycles reachable from "start" and
 * prints them to stderr, one comma-separated path per line.
 *
 * First a reference graph is built: a state refers to the states named by
 * its rules' `push`/`next` strings and `include`s; inline (array) `push` /
 * `next` targets are registered as anonymous states named after their parent.
 * Then every path from "start" is walked; a path that reaches a state it has
 * already visited, or `$self` / `$base`, is recorded once as a cycle. Walks
 * stop at paths longer than 30 entries.
 *
 * @param {Object} states map from state name to its array of rules
 */
function detectLoops(states) {
    // state name -> {refs: [names of states it can reach in one step]}
    var graph = {};
    // work list; grows while anonymous states are discovered
    var queue = Object.keys(states);
    var flattenedStates = {};

    function link(node, target) {
        if (node.refs.indexOf(target) == -1)
            node.refs.push(target);
    }
    // Finds a name based on `base` that is not used by any known state.
    function uniqueAnonName(base) {
        var candidate = base;
        var suffix = 0;
        while (flattenedStates[candidate] || states[candidate]) {
            candidate = base + "_" + suffix;
            suffix++;
        }
        return candidate;
    }
    // Remembers `rules` under `name` unless already known; returns the rules.
    function register(name, rules) {
        if (rules && !flattenedStates[name])
            flattenedStates[name] = rules;
        return rules || flattenedStates[name];
    }

    for (var q = 0; q < queue.length; q++) {
        var stateName = queue[q];
        var stateRules = register(stateName, states[stateName]);

        var node = graph[stateName] || (graph[stateName] = {refs: []});
        stateRules.forEach(function(rule) {
            var target = rule.push || rule.next;
            if (target == "pop")
                return;
            if (typeof target == "string") {
                link(node, target);
            } else if (target) {
                var anonName = uniqueAnonName(stateName);
                register(anonName, target);
                if (rule.push)
                    link(node, anonName);
                queue.push(anonName);
            } else if (rule.include) {
                link(node, rule.include);
            }
        });
    }

    var cycles = [];
    function walk(name, trail) {
        var current = graph[name];
        trail.push(name);
        if (!current || !current.refs)
            console.log(name);

        // `name` was just appended, so an earlier hit means it was revisited
        var firstSeen = trail.indexOf(name);
        var revisited = firstSeen != trail.length - 1;
        if (revisited || name == "$self" || name == "$base") {
            trail = trail.slice(firstSeen);
            var signature = trail + "";
            for (var c = 0; c < cycles.length; c++) {
                if (cycles[c] + "" == signature)
                    return;
            }
            return cycles.push(trail);
        }

        if (!current || !current.refs || !current.refs.length || trail.length > 30)
            return;
        current.refs.forEach(function(ref) {
            walk(ref, trail.concat());
        });
    }
    walk("start", []);

    console.error(cycles.join("\n"));
}
|
|
|
|
|
|
/**
 * Smoke-tests a generated mode file: loads it, instantiates its first export
 * and tokenizes one sample line. Any error is printed, not rethrown.
 *
 * @param {string} fileName path of the generated mode module
 */
function test(fileName) {
    console.log("testing highlighter");
    try {
        var exported = require(fileName);
        var firstExport = Object.keys(exported)[0];
        var Mode = exported[firstExport];
        var mode = new Mode();
        mode.getTokenizer().getLineTokens("hello world");
    } catch(e) {
        console.log(e);
    }
}
|
|
|
|
/**
 * Guesses the line-comment marker from the converted rules.
 *
 * Every rule whose `token` is a string containing the word "comment" sets
 * `line` to that rule's regex; the last such rule wins. Rules with array
 * tokens are ignored.
 *
 * @param {Object} patterns map from state name to its array of rules
 * @returns {Object} `{line: regex}` or an empty object when nothing matched
 */
function guessComment(patterns) {
    var comment = {};
    for (var stateName in patterns) {
        patterns[stateName].forEach(function(rule) {
            var isCommentToken = typeof rule.token == "string"
                && /\bcomment\b/.test(rule.token);
            if (isCommentToken)
                comment.line = rule.regex;
        });
    }

    return comment;
}
|
|
|
|
// cli stuff
|
|
// Templates for the generated mode and highlight-rules files, read
// synchronously from the "templates" directory next to this script.
var templateDir = __dirname + "/templates";
var modeTemplate = fs.readFileSync(templateDir + "/mode.js", "utf8");
var modeHighlightTemplate = fs.readFileSync(templateDir + "/highlight_rules.js", "utf8");
|
|
|
|
/**
 * Loads one grammar and converts it.
 *
 * Names starting with "http" are downloaded through lib.download (github.com
 * blob URLs are first rewritten to their raw.github.com form) and converted
 * from the download callback. Anything else is treated as a file path:
 * paths starting with "/" or a drive letter are used as given, others are
 * resolved against the current working directory, and the file is read
 * synchronously.
 *
 * @param {string} name URL or file path of the grammar
 * @returns the result of lib.download for URLs, otherwise undefined
 */
function fetchAndConvert(name) {
    console.log("Converting " + name);

    var isRemote = /^http/.test(name);
    if (!isRemote) {
        var isAbsolute = /^(\/|\w:)/.test(name);
        var path = isAbsolute ? name : process.cwd() + "/" + name;
        var langStr = fs.readFileSync(path, "utf8");
        convertTmLanguage(name, langStr);
        return;
    }

    var onGithub = /:\/\/github.com/.test(name);
    if (onGithub)
        name = name.replace(/\/blob\//, "/").replace("github.com", "raw.github.com");

    return lib.download(name, function(data) {
        convertTmLanguage(name, data);
    });
}
|
|
|
|
|
|
/**
 * Parses a grammar source and generates the Ace mode and highlight-rules
 * files for it.
 *
 * Steps, all performed inside the parse callback:
 *  - derive file and mode names from `language.name`;
 *  - register the mode and its file types through ./add_mode;
 *  - convert the grammar to a state table and report reference loops;
 *  - fill the mode and highlight-rules templates;
 *  - in dev mode print everything, otherwise write both files and smoke-test
 *    the generated mode.
 *
 * @param {string} name path or URL the grammar was loaded from; passed to the
 *     highlight-rules template as `name`
 * @param {string} langStr grammar source text
 */
function convertTmLanguage(name, langStr) {
    parseLanguage(langStr, function(language) {
        var highlighterFilename = lib.snakeCase(language.name).replace(/[^\w]/g, "");
        var languageNameSanitized = lib.camelCase(language.name).replace(/[^\w]/g, "");

        require("./add_mode")(languageNameSanitized, (language.fileTypes || []).join("|"));

        var highlighterFile = pathlib.normalize(lib.AceLib + "ace/mode/" + highlighterFilename + "_highlight_rules.js");
        var modeFile = pathlib.normalize(lib.AceLib + "ace/mode/" + highlighterFilename + ".js");

        if (devMode) {
            console.log(util.inspect(language.patterns, false, 4));
            console.log(util.inspect(language.repository, false, 4));
        }

        var patterns = extractPatterns(language);
        detectLoops(patterns);

        // Keep the uuid for the template before stripping it: everything
        // left on `language` after these deletes is emitted as metadata.
        var uuid = language.uuid;
        delete language.uuid;
        delete language.patterns;
        delete language.repository;

        var comment = guessComment(patterns);
        var languageMode = lib.fillTemplate(modeTemplate, {
            language: languageNameSanitized,
            languageHighlightFilename: highlighterFilename,
            lineCommentStart: JSON.stringify(comment.line || "//"),
            blockCommentStart: JSON.stringify(comment.start || "/*"),
            blockCommentEnd: JSON.stringify(comment.end || "*/")
        });

        var languageHighlightRules = lib.fillTemplate(modeHighlightTemplate, {
            language: languageNameSanitized,
            languageTokens: lib.formatJSON(patterns, "            ").trim(),
            uuid: uuid,
            name: name,
            metaData: lib.formatJSON(language, "").trim()
        });

        if (devMode) {
            console.log(languageMode);
            console.log(languageHighlightRules);
            console.log("Not writing, 'cause we're in dev mode, baby.");
        }
        else {
            fs.writeFileSync(highlighterFile, languageHighlightRules);
            fs.writeFileSync(modeFile, languageMode);
            console.log("created file " + highlighterFile);
            test(modeFile);
        }
    });
}
|
|
|
|
// Command-line entry point. When this file is loaded by another module it
// only exports fetchAndConvert; when run directly it converts every grammar
// named on the command line, in order.
if (module.parent) {
    exports.fetchAndConvert = fetchAndConvert;
} else {
    var args = process.argv.splice(2);
    // `var` makes devMode visible to convertTmLanguage; --dev is only
    // recognised as the first argument
    var devMode = args[0] == "--dev";
    if (devMode)
        args.shift();
    if (args.length < 1) {
        console.error("Usage: node tmlanguage.js [--dev] path/or/url/to/syntax.file ...");
        process.exit(1);
    }
    args.forEach(function(arg, index, all) {
        fetchAndConvert(arg, index, all);
    });
}
|