Add tokenizer

This commit is contained in:
Jonathan Müller 2017-02-21 20:06:07 +01:00
commit 1d083978ca
5 changed files with 282 additions and 11 deletions

View file

@ -5,12 +5,30 @@
#include "debug_helper.hpp"
#include <cstdio>
#include <clang-c/Index.h>
#include <mutex>
#include "tokenizer.hpp"
using namespace cppast;
namespace
{
// File-local mutex; the debug print helpers in this file lock it for their whole body,
// so their output is not interleaved when they are called concurrently.
std::mutex mtx;
}
void detail::print_cursor_info(const CXCursor& cur) noexcept
{
std::lock_guard<std::mutex> lock(mtx);
std::printf("[debug] cursor '%s' (%s)\n", cxstring(clang_getCursorDisplayName(cur)).c_str(),
cxstring(clang_getCursorKindSpelling(cur.kind)).c_str());
}
// Tokenizes `cur` and writes every token to stdout, each followed by a single space.
// The file-local mutex is held for the whole call.
void detail::print_tokens(const detail::cxtranslation_unit& tu, const CXFile& file,
                          const CXCursor& cur) noexcept
{
    std::lock_guard<std::mutex> guard(mtx);

    detail::tokenizer cursor_tokens(tu, file, cur);
    const auto        last = cursor_tokens.end();
    for (auto iter = cursor_tokens.begin(); iter != last; ++iter)
        std::printf("%s ", iter->c_str());

    // puts() appends a newline of its own, so this finishes the token line
    // and additionally emits an empty line
    std::puts("\n");
}

View file

@ -12,6 +12,9 @@ namespace cppast
namespace detail
{
// Debug helper: prints the display name and the kind spelling of `cur`
// as a single "[debug]" line on stdout.
void print_cursor_info(const CXCursor& cur) noexcept;
// Debug helper: builds a detail::tokenizer for `cur` and prints its tokens,
// each followed by a space, on stdout.
void print_tokens(const cxtranslation_unit& tu, const CXFile& file,
const CXCursor& cur) noexcept;
}
} // namespace cppast::detail

View file

@ -10,6 +10,7 @@
#include <utility>
#include <clang-c/Index.h>
#include <type_safe/optional.hpp>
#include <cppast/detail/assert.hpp>
@ -89,22 +90,33 @@ namespace cppast
class cxstring
{
public:
explicit cxstring(CXString str) noexcept
: str_(str), c_str_(clang_getCString(str)), length_(std::strlen(c_str_))
explicit cxstring(CXString str) noexcept : str_(string(str))
{
}
cxstring(const cxstring&) = delete;
cxstring& operator=(const cxstring&) = delete;
// Move constructor: takes over the optional string state of `other`
// and then resets `other`, so `other` no longer holds a value afterwards.
cxstring(cxstring&& other) noexcept : str_(other.str_)
{
other.str_.reset();
}
// Move assignment: disposes the string currently held (if any), takes over the state
// of `other` and resets `other`, so `other` no longer holds a value afterwards.
// Assigning an object to itself is a no-op and keeps the held string intact.
cxstring& operator=(cxstring&& other) noexcept
{
    // without this guard a self-move would dispose the string and leave the object empty
    if (this != &other)
    {
        if (str_)
            clang_disposeString(str_.value().str);
        str_ = other.str_;
        other.str_.reset();
    }
    return *this;
}
~cxstring() noexcept
{
clang_disposeString(str_);
if (str_)
clang_disposeString(str_.value().str);
}
const char* c_str() const noexcept
{
return c_str_;
return str_ ? str_.value().c_str : "";
}
char operator[](std::size_t i) const noexcept
@ -114,13 +126,22 @@ namespace cppast
std::size_t length() const noexcept
{
return length_;
return str_ ? str_.value().length : 0u;
}
private:
CXString str_;
const char* c_str_;
std::size_t length_;
struct string
{
CXString str;
const char* c_str;
std::size_t length;
explicit string(CXString str)
: str(std::move(str)), c_str(clang_getCString(str)), length(std::strlen(c_str))
{
}
};
type_safe::optional<string> str_;
};
inline bool operator==(const cxstring& a, const cxstring& b) noexcept

166
src/libclang/tokenizer.cpp Normal file
View file

@ -0,0 +1,166 @@
// Copyright (C) 2017 Jonathan Müller <jonathanmueller.dev@gmail.com>
// This file is subject to the license terms in the LICENSE file
// found in the top-level directory of this distribution.
#include "tokenizer.hpp"
#include "libclang_visitor.hpp"
using namespace cppast;
detail::token::token(const detail::cxtranslation_unit& tu_unit, const CXToken& token)
: value_(clang_getTokenSpelling(tu_unit.get(), token)), kind_(clang_getTokenKind(token))
{
}
namespace
{
// Returns true if `kind` is one of the function-like cursor kinds:
// free function, member function, constructor, destructor or conversion function.
bool cursor_is_function(CXCursorKind kind)
{
    switch (kind)
    {
    case CXCursor_FunctionDecl:
    case CXCursor_CXXMethod:
    case CXCursor_Constructor:
    case CXCursor_Destructor:
    case CXCursor_ConversionFunction:
        return true;

    default:
        return false;
    }
}
// Returns the location in `file` whose offset is the spelling offset of `loc` plus `inc`.
// `inc` may be negative to step backwards; the sum is computed in unsigned arithmetic.
CXSourceLocation get_next_location(const CXTranslationUnit& tu, CXFile file,
                                   const CXSourceLocation& loc, int inc = 1)
{
    unsigned spelling_offset = 0u;
    clang_getSpellingLocation(loc, nullptr, nullptr, nullptr, &spelling_offset);

    const unsigned shifted_offset = spelling_offset + static_cast<unsigned>(inc);
    return clang_getLocationForOffset(tu, file, shifted_offset);
}
class simple_tokenizer
{
public:
explicit simple_tokenizer(const CXTranslationUnit& tu, const CXSourceRange& range) : tu_(tu)
{
clang_tokenize(tu, range, &tokens_, &no_);
DEBUG_ASSERT(no_ >= 1u, detail::assert_handler{});
}
~simple_tokenizer()
{
clang_disposeTokens(tu_, tokens_, no_);
}
simple_tokenizer(const simple_tokenizer&) = delete;
simple_tokenizer& operator=(const simple_tokenizer&) = delete;
unsigned size() const noexcept
{
return no_;
}
const CXToken& operator[](unsigned i) const noexcept
{
return tokens_[i];
}
private:
CXTranslationUnit tu_;
CXToken* tokens_;
unsigned no_;
};
bool token_after_is(const CXTranslationUnit& tu, const CXFile& file,
const CXSourceLocation& loc, const char* token_str)
{
auto loc_after = get_next_location(tu, file, loc);
simple_tokenizer tokenizer(tu, clang_getRange(loc, loc_after));
detail::cxstring spelling(clang_getTokenSpelling(tu, tokenizer[0u]));
return spelling == token_str;
}
// Computes the source range of `cur` that is handed to the tokenizer.
// clang_getCursorExtent() is somehow broken in various ways, so its result is only the
// starting point: depending on the cursor kind the begin/end locations are adjusted below.
// The returned CXSourceRange covers all parts required for parsing,
// but it might include more tokens.
// This function is the reason you shouldn't use libclang.
CXSourceRange get_extent(const CXTranslationUnit& tu, const CXFile& file, const CXCursor& cur)
{
auto extent = clang_getCursorExtent(cur);
auto begin = clang_getRangeStart(extent);
auto end = clang_getRangeEnd(extent);
// case 1: functions and function templates
if (cursor_is_function(clang_getCursorKind(cur))
|| cursor_is_function(clang_getTemplateCursorKind(cur)))
{
auto range_shrunk = false;
// if a function we need to remove the body
// it does not need to be parsed
// the range is cut at the start of the first visited child that is a compound statement,
// a try statement or an init list expression
detail::visit_children(cur, [&](const CXCursor& child) {
if (clang_getCursorKind(child) == CXCursor_CompoundStmt
|| clang_getCursorKind(child) == CXCursor_CXXTryStmt
|| clang_getCursorKind(child) == CXCursor_InitListExpr)
{
auto child_extent = clang_getCursorExtent(child);
end = clang_getRangeStart(child_extent);
range_shrunk = true;
// one match is enough, request the end of the traversal
return CXChildVisit_Break;
}
return CXChildVisit_Continue;
});
if (!range_shrunk && !token_after_is(tu, file, end, ";"))
{
// we do not have a body, but it is not a declaration either
// move end forward one offset at a time until the token found at end is ';'
do
{
end = get_next_location(tu, file, end);
} while (!token_after_is(tu, file, end, ";"));
}
// this else belongs to the if directly above:
// it is reached when the range was shrunk or a ';' was found at end
else if (clang_getCursorKind(cur) == CXCursor_CXXMethod)
// necessary for some reason
// (moves begin back by one offset)
begin = get_next_location(tu, file, begin, -1);
}
// case 2: template parameters and function parameters
// only a type template parameter followed by '(' is adjusted, the others keep the extent
else if (clang_getCursorKind(cur) == CXCursor_TemplateTypeParameter
|| clang_getCursorKind(cur) == CXCursor_NonTypeTemplateParameter
|| clang_getCursorKind(cur) == CXCursor_TemplateTemplateParameter
|| clang_getCursorKind(cur) == CXCursor_ParmDecl)
{
if (clang_getCursorKind(cur) == CXCursor_TemplateTypeParameter
&& token_after_is(tu, file, end, "("))
{
// if you have decltype as default argument for a type template parameter
// libclang doesn't include the parameters
// walk forward until the parenthesis opened at end is closed again:
// paren_count starts at 1 for that '(' and the scan starts one offset after it
auto next = get_next_location(tu, file, end);
auto prev = end;
for (auto paren_count = 1; paren_count != 0;
next = get_next_location(tu, file, next))
{
if (token_after_is(tu, file, next, "("))
++paren_count;
else if (token_after_is(tu, file, next, ")"))
--paren_count;
// remember the location that was just examined,
// next is advanced once more before the loop condition is checked
prev = next;
}
// prev is the location at which paren_count dropped to zero
end = prev;
}
}
// case 3: type alias whose extent is not directly followed by ';'
else if (clang_getCursorKind(cur) == CXCursor_TypeAliasDecl
&& !token_after_is(tu, file, end, ";"))
{
// type alias tokens don't include everything
// move end forward one offset at a time until the token found at end is ';' ...
do
{
end = get_next_location(tu, file, end);
} while (!token_after_is(tu, file, end, ";"));
// ... and then one offset further
end = get_next_location(tu, file, end);
}
return clang_getRange(begin, end);
}
}
// Tokenizes the adjusted extent of `cur` (see get_extent())
// and stores one detail::token per raw libclang token.
detail::tokenizer::tokenizer(const detail::cxtranslation_unit& tu, const CXFile& file,
                             const CXCursor& cur)
{
    const auto       range = get_extent(tu.get(), file, cur);
    simple_tokenizer raw_tokens(tu.get(), range);

    // the raw token count is known up front, so allocate once
    const unsigned count = raw_tokens.size();
    tokens_.reserve(count);
    for (unsigned idx = 0u; idx < count; ++idx)
        tokens_.emplace_back(tu, raw_tokens[idx]);
}

View file

@ -0,0 +1,63 @@
// Copyright (C) 2017 Jonathan Müller <jonathanmueller.dev@gmail.com>
// This file is subject to the license terms in the LICENSE file
// found in the top-level directory of this distribution.
#ifndef CPPAST_TOKENIZER_HPP_INCLUDED
#define CPPAST_TOKENIZER_HPP_INCLUDED
#include <vector>
#include "raii_wrapper.hpp"
namespace cppast
{
namespace detail
{
// A single token: its spelling together with its libclang token kind.
class token
{
public:
// Creates the token from the raw libclang `token` that belongs to `tu_unit`.
explicit token(const cxtranslation_unit& tu_unit, const CXToken& token);
// The spelling of the token as a cxstring.
const cxstring& value() const noexcept
{
return value_;
}
// The spelling of the token as a C string; it points into value().
const char* c_str() const noexcept
{
return value_.c_str();
}
// The libclang kind of the token.
CXTokenKind kind() const noexcept
{
return kind_;
}
private:
cxstring value_;
CXTokenKind kind_;
};
// Holds the tokens created for a cursor and exposes them as a read-only range.
class tokenizer
{
public:
// Creates the tokens for `cur`, which is located in `file` of the translation unit `tu`.
explicit tokenizer(const cxtranslation_unit& tu, const CXFile& file,
const CXCursor& cur);
// Iterator to the first stored token.
std::vector<token>::const_iterator begin() const noexcept
{
return tokens_.begin();
}
// Iterator one past the last stored token.
std::vector<token>::const_iterator end() const noexcept
{
return tokens_.end();
}
private:
std::vector<token> tokens_;
};
}
} // namespace cppast::detail
#endif // CPPAST_TOKENIZER_HPP_INCLUDED