Rename tokenizer stuff to include cx prefix
This commit is contained in:
parent
6b55a13a83
commit
cfac41c7b5
16 changed files with 121 additions and 120 deletions
|
|
@ -1,481 +0,0 @@
|
|||
// Copyright (C) 2017 Jonathan Müller <jonathanmueller.dev@gmail.com>
|
||||
// This file is subject to the license terms in the LICENSE file
|
||||
// found in the top-level directory of this distribution.
|
||||
|
||||
#include "tokenizer.hpp"
|
||||
|
||||
#include <cctype>
|
||||
|
||||
#include "libclang_visitor.hpp"
|
||||
#include "parse_error.hpp"
|
||||
|
||||
using namespace cppast;
|
||||
|
||||
detail::token::token(const CXTranslationUnit& tu_unit, const CXToken& token)
|
||||
: value_(clang_getTokenSpelling(tu_unit, token)), kind_(clang_getTokenKind(token))
|
||||
{
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
bool cursor_is_function(CXCursorKind kind)
|
||||
{
|
||||
return kind == CXCursor_FunctionDecl || kind == CXCursor_CXXMethod
|
||||
|| kind == CXCursor_Constructor || kind == CXCursor_Destructor
|
||||
|| kind == CXCursor_ConversionFunction;
|
||||
}
|
||||
|
||||
CXSourceLocation get_next_location(const CXTranslationUnit& tu, CXFile file,
|
||||
const CXSourceLocation& loc, int inc = 1)
|
||||
{
|
||||
unsigned offset;
|
||||
clang_getSpellingLocation(loc, nullptr, nullptr, nullptr, &offset);
|
||||
if (inc >= 0)
|
||||
offset += unsigned(inc);
|
||||
else
|
||||
offset -= unsigned(-inc);
|
||||
return clang_getLocationForOffset(tu, file, offset);
|
||||
}
|
||||
|
||||
class simple_tokenizer
|
||||
{
|
||||
public:
|
||||
explicit simple_tokenizer(const CXTranslationUnit& tu, const CXSourceRange& range,
|
||||
const CXCursor& cur)
|
||||
: tu_(tu)
|
||||
{
|
||||
clang_tokenize(tu, range, &tokens_, &no_);
|
||||
DEBUG_ASSERT(no_ >= 1u, detail::parse_error_handler{}, cur, "no tokens available");
|
||||
}
|
||||
|
||||
~simple_tokenizer()
|
||||
{
|
||||
clang_disposeTokens(tu_, tokens_, no_);
|
||||
}
|
||||
|
||||
simple_tokenizer(const simple_tokenizer&) = delete;
|
||||
simple_tokenizer& operator=(const simple_tokenizer&) = delete;
|
||||
|
||||
unsigned size() const noexcept
|
||||
{
|
||||
return no_;
|
||||
}
|
||||
|
||||
const CXToken& operator[](unsigned i) const noexcept
|
||||
{
|
||||
return tokens_[i];
|
||||
}
|
||||
|
||||
private:
|
||||
CXTranslationUnit tu_;
|
||||
CXToken* tokens_;
|
||||
unsigned no_;
|
||||
};
|
||||
|
||||
bool token_after_is(const CXTranslationUnit& tu, const CXFile& file, const CXCursor& cur,
|
||||
const CXSourceLocation& loc, const char* token_str)
|
||||
{
|
||||
auto loc_after = get_next_location(tu, file, loc);
|
||||
|
||||
simple_tokenizer tokenizer(tu, clang_getRange(loc, loc_after), cur);
|
||||
detail::cxstring spelling(clang_getTokenSpelling(tu, tokenizer[0u]));
|
||||
return spelling == token_str;
|
||||
}
|
||||
|
||||
// clang_getCursorExtent() is somehow broken in various ways
|
||||
// this function returns the actual CXSourceRange that covers all parts required for parsing
|
||||
// might include more tokens
|
||||
// this function is the reason you shouldn't use libclang
|
||||
CXSourceRange get_extent(const CXTranslationUnit& tu, const CXFile& file, const CXCursor& cur,
|
||||
bool& unmunch)
|
||||
{
|
||||
unmunch = false;
|
||||
|
||||
auto extent = clang_getCursorExtent(cur);
|
||||
auto begin = clang_getRangeStart(extent);
|
||||
auto end = clang_getRangeEnd(extent);
|
||||
|
||||
auto kind = clang_getCursorKind(cur);
|
||||
if (cursor_is_function(kind) || cursor_is_function(clang_getTemplateCursorKind(cur)))
|
||||
{
|
||||
auto is_definition = false;
|
||||
|
||||
// if a function we need to remove the body
|
||||
// it does not need to be parsed
|
||||
detail::visit_children(cur, [&](const CXCursor& child) {
|
||||
if (clang_getCursorKind(child) == CXCursor_CompoundStmt
|
||||
|| clang_getCursorKind(child) == CXCursor_CXXTryStmt
|
||||
|| clang_getCursorKind(child) == CXCursor_InitListExpr)
|
||||
{
|
||||
auto child_extent = clang_getCursorExtent(child);
|
||||
end = clang_getRangeStart(child_extent);
|
||||
is_definition = true;
|
||||
}
|
||||
});
|
||||
|
||||
if (!is_definition)
|
||||
{
|
||||
// i have no idea why this is necessary
|
||||
is_definition = token_after_is(tu, file, cur, end, "{")
|
||||
|| token_after_is(tu, file, cur, end, "try")
|
||||
|| token_after_is(tu, file, cur, end, ":");
|
||||
if (is_definition)
|
||||
// need to extend range here to include the token
|
||||
end = get_next_location(tu, file, end);
|
||||
}
|
||||
|
||||
if (!is_definition && !token_after_is(tu, file, cur, end, ";"))
|
||||
{
|
||||
// we do not have a body, but it is not a declaration either
|
||||
do
|
||||
{
|
||||
end = get_next_location(tu, file, end);
|
||||
} while (!token_after_is(tu, file, cur, end, ";"));
|
||||
}
|
||||
else if (kind == CXCursor_CXXMethod)
|
||||
// necessary for some reason
|
||||
begin = get_next_location(tu, file, begin, -1);
|
||||
else if (kind == CXCursor_Destructor && token_after_is(tu, file, cur, end, ")"))
|
||||
// necessary for some other reason
|
||||
end = get_next_location(tu, file, end);
|
||||
}
|
||||
else if (kind == CXCursor_TemplateTypeParameter && token_after_is(tu, file, cur, end, "("))
|
||||
{
|
||||
// if you have decltype as default argument for a type template parameter
|
||||
// libclang doesn't include the parameters
|
||||
auto next = get_next_location(tu, file, end);
|
||||
auto prev = end;
|
||||
for (auto paren_count = 1; paren_count != 0; next = get_next_location(tu, file, next))
|
||||
{
|
||||
if (token_after_is(tu, file, cur, next, "("))
|
||||
++paren_count;
|
||||
else if (token_after_is(tu, file, cur, next, ")"))
|
||||
--paren_count;
|
||||
prev = next;
|
||||
}
|
||||
#if CINDEX_VERSION_MINOR < 37
|
||||
end = prev;
|
||||
#else
|
||||
end = next;
|
||||
#endif
|
||||
}
|
||||
else if (kind == CXCursor_TemplateTemplateParameter
|
||||
&& token_after_is(tu, file, cur, end, "<"))
|
||||
{
|
||||
// if you have a template template parameter in a template template parameter,
|
||||
// the tokens are all messed up, only contain the `template`
|
||||
|
||||
// first: skip to closing angle bracket
|
||||
// luckily no need to handle expressions here
|
||||
auto next = get_next_location(tu, file, end, 2);
|
||||
for (auto angle_count = 1; angle_count != 0; next = get_next_location(tu, file, next))
|
||||
{
|
||||
if (token_after_is(tu, file, cur, next, ">"))
|
||||
--angle_count;
|
||||
else if (token_after_is(tu, file, cur, next, ">>"))
|
||||
angle_count -= 2;
|
||||
else if (token_after_is(tu, file, cur, next, "<"))
|
||||
++angle_count;
|
||||
}
|
||||
|
||||
// second: skip until end of parameter
|
||||
// no need to handle default, so look for '>' or ','
|
||||
while (!token_after_is(tu, file, cur, next, ">")
|
||||
&& !token_after_is(tu, file, cur, next, ","))
|
||||
next = get_next_location(tu, file, next);
|
||||
// now we found the proper end of the token
|
||||
end = get_next_location(tu, file, next, -1);
|
||||
}
|
||||
else if ((kind == CXCursor_TemplateTypeParameter
|
||||
|| kind == CXCursor_NonTypeTemplateParameter
|
||||
|| kind == CXCursor_TemplateTemplateParameter)
|
||||
&& token_after_is(tu, file, cur, end, "..."))
|
||||
{
|
||||
// variadic tokens in unnamed parameter not included
|
||||
end = get_next_location(tu, file, end, 3);
|
||||
DEBUG_ASSERT(token_after_is(tu, file, cur, end, ">")
|
||||
|| token_after_is(tu, file, cur, end, ","),
|
||||
detail::parse_error_handler{}, cur,
|
||||
"unexpected token in variadic parameter workaround");
|
||||
}
|
||||
else if ((kind == CXCursor_TemplateTypeParameter
|
||||
|| kind == CXCursor_NonTypeTemplateParameter
|
||||
|| kind == CXCursor_TemplateTemplateParameter)
|
||||
&& !token_after_is(tu, file, cur, end, ">")
|
||||
&& !token_after_is(tu, file, cur, end, ","))
|
||||
{
|
||||
DEBUG_ASSERT(token_after_is(tu, file, cur, get_next_location(tu, file, end, -2), ">>"),
|
||||
detail::parse_error_handler{}, cur,
|
||||
"unexpected token in maximal munch workaround");
|
||||
unmunch = true;
|
||||
// need to shrink range anyway
|
||||
end = get_next_location(tu, file, end, -1);
|
||||
}
|
||||
else if (kind == CXCursor_EnumDecl && !token_after_is(tu, file, cur, end, ";"))
|
||||
{
|
||||
while (!token_after_is(tu, file, cur, end, ";"))
|
||||
end = get_next_location(tu, file, end);
|
||||
}
|
||||
else if (kind == CXCursor_FieldDecl || kind == CXCursor_ParmDecl
|
||||
|| kind == CXCursor_NonTypeTemplateParameter
|
||||
|| kind == CXCursor_TemplateTemplateParameter
|
||||
#if CINDEX_VERSION_MINOR < 37
|
||||
|| clang_isExpression(kind) || kind == CXCursor_CXXBaseSpecifier
|
||||
|| kind == CXCursor_TemplateTypeParameter
|
||||
#endif
|
||||
)
|
||||
// need to shrink range by one
|
||||
end = get_next_location(tu, file, end, -1);
|
||||
else if (kind == CXCursor_UnexposedDecl)
|
||||
{
|
||||
// include semicolon, if necessary
|
||||
if (token_after_is(tu, file, cur, end, ";"))
|
||||
end = get_next_location(tu, file, end);
|
||||
}
|
||||
|
||||
return clang_getRange(begin, end);
|
||||
}
|
||||
}
|
||||
|
||||
detail::tokenizer::tokenizer(const CXTranslationUnit& tu, const CXFile& file, const CXCursor& cur)
|
||||
{
|
||||
auto extent = get_extent(tu, file, cur, unmunch_);
|
||||
|
||||
simple_tokenizer tokenizer(tu, extent, cur);
|
||||
tokens_.reserve(tokenizer.size());
|
||||
for (auto i = 0u; i != tokenizer.size(); ++i)
|
||||
tokens_.emplace_back(tu, tokenizer[i]);
|
||||
}
|
||||
|
||||
void detail::skip(detail::token_stream& stream, const char* str)
|
||||
{
|
||||
if (*str)
|
||||
{
|
||||
// non-empty string
|
||||
DEBUG_ASSERT(!stream.done(), parse_error_handler{}, stream.cursor(),
|
||||
format("expected '", str, "', got exhausted stream"));
|
||||
auto& token = stream.peek();
|
||||
DEBUG_ASSERT(token == str, parse_error_handler{}, stream.cursor(),
|
||||
format("expected '", str, "', got '", token.c_str(), "'"));
|
||||
stream.bump();
|
||||
}
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
bool starts_with(const char*& str, const detail::token& t)
|
||||
{
|
||||
if (std::strncmp(str, t.c_str(), t.value().length()) != 0)
|
||||
return false;
|
||||
str += t.value().length();
|
||||
while (*str == ' ' || *str == '\t')
|
||||
++str;
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
bool detail::skip_if(detail::token_stream& stream, const char* str, bool multi_token)
|
||||
{
|
||||
if (!*str)
|
||||
return true;
|
||||
else if (stream.done())
|
||||
return false;
|
||||
auto save = stream.cur();
|
||||
do
|
||||
{
|
||||
auto& token = stream.peek();
|
||||
if (!starts_with(str, token) || (!multi_token && *str != '\0'))
|
||||
{
|
||||
stream.set_cur(save);
|
||||
return false;
|
||||
}
|
||||
stream.bump();
|
||||
} while (multi_token && *str);
|
||||
return true;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
// whether or not the current angle bracket can be a comparison
|
||||
// note: this is a heuristic I hope works often enough
|
||||
bool is_comparison(CXTokenKind last_kind, const detail::token& cur, CXTokenKind next_kind)
|
||||
{
|
||||
if (cur == "<")
|
||||
return last_kind == CXToken_Literal;
|
||||
else if (cur == ">")
|
||||
return next_kind == CXToken_Literal;
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
detail::token_iterator detail::find_closing_bracket(detail::token_stream stream)
|
||||
{
|
||||
auto template_bracket = false;
|
||||
auto open_bracket = stream.peek().c_str();
|
||||
const char* close_bracket = nullptr;
|
||||
if (skip_if(stream, "("))
|
||||
close_bracket = ")";
|
||||
else if (skip_if(stream, "{"))
|
||||
close_bracket = "}";
|
||||
else if (skip_if(stream, "["))
|
||||
close_bracket = "]";
|
||||
else if (skip_if(stream, "<"))
|
||||
{
|
||||
close_bracket = ">";
|
||||
template_bracket = true;
|
||||
}
|
||||
else
|
||||
DEBUG_UNREACHABLE(parse_error_handler{}, stream.cursor(),
|
||||
format("expected a bracket, got '", stream.peek().c_str(), "'"));
|
||||
|
||||
auto bracket_count = 1;
|
||||
auto paren_count = 0; // internal nested parenthesis
|
||||
auto last_token = CXToken_Comment;
|
||||
while (!stream.done() && bracket_count != 0)
|
||||
{
|
||||
auto& cur = stream.get();
|
||||
if (paren_count == 0 && cur == open_bracket
|
||||
&& !is_comparison(last_token, cur, stream.peek().kind()))
|
||||
++bracket_count;
|
||||
else if (paren_count == 0 && cur == close_bracket
|
||||
&& !is_comparison(last_token, cur, stream.peek().kind()))
|
||||
--bracket_count;
|
||||
else if (paren_count == 0 && template_bracket && cur == ">>")
|
||||
// maximal munch
|
||||
bracket_count -= 2;
|
||||
else if (cur == "(" || cur == "{" || cur == "[")
|
||||
++paren_count;
|
||||
else if (cur == ")" || cur == "}" || cur == "]")
|
||||
--paren_count;
|
||||
|
||||
last_token = cur.kind();
|
||||
}
|
||||
stream.bump_back();
|
||||
// only check first parameter, token might be ">>"
|
||||
DEBUG_ASSERT(bracket_count == 0 && paren_count == 0
|
||||
&& stream.peek().value()[0] == close_bracket[0],
|
||||
parse_error_handler{}, stream.cursor(),
|
||||
"find_closing_bracket() internal parse error");
|
||||
return stream.cur();
|
||||
}
|
||||
|
||||
void detail::skip_brackets(detail::token_stream& stream)
|
||||
{
|
||||
auto closing = find_closing_bracket(stream);
|
||||
stream.set_cur(std::next(closing));
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
bool skip_attribute_impl(detail::token_stream& stream)
|
||||
{
|
||||
if (skip_if(stream, "[") && stream.peek() == "[")
|
||||
{
|
||||
// C++11 attribute
|
||||
// [[<attribute>]]
|
||||
// ^
|
||||
skip_brackets(stream);
|
||||
// [[<attribute>]]
|
||||
// ^
|
||||
skip(stream, "]");
|
||||
return true;
|
||||
}
|
||||
else if (skip_if(stream, "__attribute__"))
|
||||
{
|
||||
// GCC/clang attributes
|
||||
// __attribute__(<attribute>)
|
||||
// ^
|
||||
skip_brackets(stream);
|
||||
return true;
|
||||
}
|
||||
else if (skip_if(stream, "__declspec"))
|
||||
{
|
||||
// MSVC declspec
|
||||
// __declspec(<attribute>)
|
||||
// ^
|
||||
skip_brackets(stream);
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
bool detail::skip_attribute(detail::token_stream& stream)
|
||||
{
|
||||
auto any = false;
|
||||
while (skip_attribute_impl(stream))
|
||||
any = true;
|
||||
return any;
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
cpp_token_kind get_kind(CXTokenKind kind)
|
||||
{
|
||||
switch (kind)
|
||||
{
|
||||
case CXToken_Punctuation:
|
||||
return cpp_token_kind::punctuation;
|
||||
case CXToken_Keyword:
|
||||
return cpp_token_kind::keyword;
|
||||
case CXToken_Identifier:
|
||||
return cpp_token_kind::identifier;
|
||||
case CXToken_Literal:
|
||||
return cpp_token_kind::literal;
|
||||
case CXToken_Comment:
|
||||
break;
|
||||
}
|
||||
|
||||
DEBUG_UNREACHABLE(detail::assert_handler{});
|
||||
return cpp_token_kind ::literal;
|
||||
}
|
||||
}
|
||||
|
||||
cpp_token_string detail::to_string(token_stream& stream, token_iterator end)
|
||||
{
|
||||
cpp_token_string::builder builder;
|
||||
|
||||
while (stream.cur() != end)
|
||||
{
|
||||
auto& token = stream.get();
|
||||
builder.add_token(cpp_token(get_kind(token.kind()), token.c_str()));
|
||||
}
|
||||
|
||||
if (stream.unmunch())
|
||||
builder.unmunch();
|
||||
|
||||
return builder.finish();
|
||||
}
|
||||
|
||||
bool detail::append_scope(detail::token_stream& stream, std::string& scope)
|
||||
{
|
||||
// add identifiers and "::" to current scope name,
|
||||
// clear if there is any other token in between, or mismatched combination
|
||||
if (stream.peek().kind() == CXToken_Identifier)
|
||||
{
|
||||
if (!scope.empty() && scope.back() != ':')
|
||||
scope.clear();
|
||||
scope += stream.get().c_str();
|
||||
}
|
||||
else if (stream.peek() == "::")
|
||||
{
|
||||
if (!scope.empty() && scope.back() == ':')
|
||||
scope.clear();
|
||||
scope += stream.get().c_str();
|
||||
}
|
||||
else if (stream.peek() == "<")
|
||||
{
|
||||
auto iter = detail::find_closing_bracket(stream);
|
||||
scope += detail::to_string(stream, iter).as_string();
|
||||
if (!detail::skip_if(stream, ">>"))
|
||||
detail::skip(stream, ">");
|
||||
scope += ">";
|
||||
}
|
||||
else
|
||||
{
|
||||
scope.clear();
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
Loading…
Add table
Add a link
Reference in a new issue