From 1d083978ca342b796a4bda9c624729e17ff32e19 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jonathan=20M=C3=BCller?= Date: Tue, 21 Feb 2017 20:06:07 +0100 Subject: [PATCH] Add tokenizer --- src/libclang/debug_helper.cpp | 20 +++- src/libclang/debug_helper.hpp | 3 + src/libclang/raii_wrapper.hpp | 41 +++++++-- src/libclang/tokenizer.cpp | 166 ++++++++++++++++++++++++++++++++++ src/libclang/tokenizer.hpp | 63 +++++++++++++ 5 files changed, 282 insertions(+), 11 deletions(-) create mode 100644 src/libclang/tokenizer.cpp create mode 100644 src/libclang/tokenizer.hpp diff --git a/src/libclang/debug_helper.cpp b/src/libclang/debug_helper.cpp index a60c029..316e0df 100644 --- a/src/libclang/debug_helper.cpp +++ b/src/libclang/debug_helper.cpp @@ -5,12 +5,30 @@ #include "debug_helper.hpp" #include -#include +#include + +#include "tokenizer.hpp" using namespace cppast; +namespace +{ + std::mutex mtx; +} + void detail::print_cursor_info(const CXCursor& cur) noexcept { + std::lock_guard lock(mtx); std::printf("[debug] cursor '%s' (%s)\n", cxstring(clang_getCursorDisplayName(cur)).c_str(), cxstring(clang_getCursorKindSpelling(cur.kind)).c_str()); } + +void detail::print_tokens(const detail::cxtranslation_unit& tu, const CXFile& file, + const CXCursor& cur) noexcept +{ + std::lock_guard lock(mtx); + detail::tokenizer tokenizer(tu, file, cur); + for (auto& token : tokenizer) + std::printf("%s ", token.c_str()); + std::puts("\n"); +} diff --git a/src/libclang/debug_helper.hpp b/src/libclang/debug_helper.hpp index 7e7a890..946775b 100644 --- a/src/libclang/debug_helper.hpp +++ b/src/libclang/debug_helper.hpp @@ -12,6 +12,9 @@ namespace cppast namespace detail { void print_cursor_info(const CXCursor& cur) noexcept; + + void print_tokens(const cxtranslation_unit& tu, const CXFile& file, + const CXCursor& cur) noexcept; } } // namespace cppast::detail diff --git a/src/libclang/raii_wrapper.hpp b/src/libclang/raii_wrapper.hpp index a5deda6..f35aa4e 100644 --- a/src/libclang/raii_wrapper.hpp +++ b/src/libclang/raii_wrapper.hpp @@ -10,6 +10,7 @@ #include #include +#include #include @@ -89,22 +90,33 @@ namespace cppast class cxstring { public: - explicit cxstring(CXString str) noexcept - : str_(str), c_str_(clang_getCString(str)), length_(std::strlen(c_str_)) + explicit cxstring(CXString str) noexcept : str_(string(str)) { } - cxstring(const cxstring&) = delete; - cxstring& operator=(const cxstring&) = delete; + cxstring(cxstring&& other) noexcept : str_(other.str_) + { + other.str_.reset(); + } + + cxstring& operator=(cxstring&& other) noexcept + { + if (str_) + clang_disposeString(str_.value().str); + str_ = other.str_; + other.str_.reset(); + return *this; + } ~cxstring() noexcept { - clang_disposeString(str_); + if (str_) + clang_disposeString(str_.value().str); } const char* c_str() const noexcept { - return c_str_; + return str_ ? str_.value().c_str : ""; } char operator[](std::size_t i) const noexcept @@ -114,13 +126,22 @@ namespace cppast std::size_t length() const noexcept { - return length_; + return str_ ? str_.value().length : 0u; } private: - CXString str_; - const char* c_str_; - std::size_t length_; + struct string + { + CXString str; + const char* c_str; + std::size_t length; + + explicit string(CXString str) + : str(std::move(str)), c_str(clang_getCString(str)), length(std::strlen(c_str)) + { + } + }; + type_safe::optional str_; }; inline bool operator==(const cxstring& a, const cxstring& b) noexcept diff --git a/src/libclang/tokenizer.cpp b/src/libclang/tokenizer.cpp new file mode 100644 index 0000000..c3001cd --- /dev/null +++ b/src/libclang/tokenizer.cpp @@ -0,0 +1,166 @@ +// Copyright (C) 2017 Jonathan Müller +// This file is subject to the license terms in the LICENSE file +// found in the top-level directory of this distribution. + +#include "tokenizer.hpp" + +#include "libclang_visitor.hpp" + +using namespace cppast; + +detail::token::token(const detail::cxtranslation_unit& tu_unit, const CXToken& token) +: value_(clang_getTokenSpelling(tu_unit.get(), token)), kind_(clang_getTokenKind(token)) +{ +} + +namespace +{ + bool cursor_is_function(CXCursorKind kind) + { + return kind == CXCursor_FunctionDecl || kind == CXCursor_CXXMethod + || kind == CXCursor_Constructor || kind == CXCursor_Destructor + || kind == CXCursor_ConversionFunction; + } + + CXSourceLocation get_next_location(const CXTranslationUnit& tu, CXFile file, + const CXSourceLocation& loc, int inc = 1) + { + unsigned offset; + clang_getSpellingLocation(loc, nullptr, nullptr, nullptr, &offset); + return clang_getLocationForOffset(tu, file, offset + inc); + } + + class simple_tokenizer + { + public: + explicit simple_tokenizer(const CXTranslationUnit& tu, const CXSourceRange& range) : tu_(tu) + { + clang_tokenize(tu, range, &tokens_, &no_); + DEBUG_ASSERT(no_ >= 1u, detail::assert_handler{}); + } + + ~simple_tokenizer() + { + clang_disposeTokens(tu_, tokens_, no_); + } + + simple_tokenizer(const simple_tokenizer&) = delete; + simple_tokenizer& operator=(const simple_tokenizer&) = delete; + + unsigned size() const noexcept + { + return no_; + } + + const CXToken& operator[](unsigned i) const noexcept + { + return tokens_[i]; + } + + private: + CXTranslationUnit tu_; + CXToken* tokens_; + unsigned no_; + }; + + bool token_after_is(const CXTranslationUnit& tu, const CXFile& file, + const CXSourceLocation& loc, const char* token_str) + { + auto loc_after = get_next_location(tu, file, loc); + + simple_tokenizer tokenizer(tu, clang_getRange(loc, loc_after)); + detail::cxstring spelling(clang_getTokenSpelling(tu, tokenizer[0u])); + return spelling == token_str; + } + + // clang_getCursorExtent() is somehow broken in various ways + // this function returns the actual CXSourceRange that covers all parts required for parsing + // might include more tokens + // this function is the reason you shouldn't use libclang + CXSourceRange get_extent(const CXTranslationUnit& tu, const CXFile& file, const CXCursor& cur) + { + auto extent = clang_getCursorExtent(cur); + auto begin = clang_getRangeStart(extent); + auto end = clang_getRangeEnd(extent); + + if (cursor_is_function(clang_getCursorKind(cur)) + || cursor_is_function(clang_getTemplateCursorKind(cur))) + { + auto range_shrunk = false; + + // if a function we need to remove the body + // it does not need to be parsed + detail::visit_children(cur, [&](const CXCursor& child) { + if (clang_getCursorKind(child) == CXCursor_CompoundStmt + || clang_getCursorKind(child) == CXCursor_CXXTryStmt + || clang_getCursorKind(child) == CXCursor_InitListExpr) + { + auto child_extent = clang_getCursorExtent(child); + end = clang_getRangeStart(child_extent); + range_shrunk = true; + return CXChildVisit_Break; + } + return CXChildVisit_Continue; + }); + + if (!range_shrunk && !token_after_is(tu, file, end, ";")) + { + // we do not have a body, but it is not a declaration either + do + { + end = get_next_location(tu, file, end); + } while (!token_after_is(tu, file, end, ";")); + } + else if (clang_getCursorKind(cur) == CXCursor_CXXMethod) + // necessary for some reason + begin = get_next_location(tu, file, begin, -1); + } + else if (clang_getCursorKind(cur) == CXCursor_TemplateTypeParameter + || clang_getCursorKind(cur) == CXCursor_NonTypeTemplateParameter + || clang_getCursorKind(cur) == CXCursor_TemplateTemplateParameter + || clang_getCursorKind(cur) == CXCursor_ParmDecl) + { + if (clang_getCursorKind(cur) == CXCursor_TemplateTypeParameter + && token_after_is(tu, file, end, "(")) + { + // if you have decltype as default argument for a type template parameter + // libclang doesn't include the parameters + auto next = get_next_location(tu, file, end); + auto prev = end; + for (auto paren_count = 1; paren_count != 0; + next = get_next_location(tu, file, next)) + { + if (token_after_is(tu, file, next, "(")) + ++paren_count; + else if (token_after_is(tu, file, next, ")")) + --paren_count; + prev = next; + } + end = prev; + } + } + else if (clang_getCursorKind(cur) == CXCursor_TypeAliasDecl + && !token_after_is(tu, file, end, ";")) + { + // type alias tokens don't include everything + do + { + end = get_next_location(tu, file, end); + } while (!token_after_is(tu, file, end, ";")); + end = get_next_location(tu, file, end); + } + + return clang_getRange(begin, end); + } +} + +detail::tokenizer::tokenizer(const detail::cxtranslation_unit& tu, const CXFile& file, + const CXCursor& cur) +{ + auto extent = get_extent(tu.get(), file, cur); + + simple_tokenizer tokenizer(tu.get(), extent); + tokens_.reserve(tokenizer.size()); + for (auto i = 0u; i != tokenizer.size(); ++i) + tokens_.emplace_back(tu, tokenizer[i]); +} diff --git a/src/libclang/tokenizer.hpp b/src/libclang/tokenizer.hpp new file mode 100644 index 0000000..08e4906 --- /dev/null +++ b/src/libclang/tokenizer.hpp @@ -0,0 +1,63 @@ +// Copyright (C) 2017 Jonathan Müller +// This file is subject to the license terms in the LICENSE file +// found in the top-level directory of this distribution. + +#ifndef CPPAST_TOKENIZER_HPP_INCLUDED +#define CPPAST_TOKENIZER_HPP_INCLUDED + +#include + +#include "raii_wrapper.hpp" + +namespace cppast +{ + namespace detail + { + class token + { + public: + explicit token(const cxtranslation_unit& tu_unit, const CXToken& token); + + const cxstring& value() const noexcept + { + return value_; + } + + const char* c_str() const noexcept + { + return value_.c_str(); + } + + CXTokenKind kind() const noexcept + { + return kind_; + } + + private: + cxstring value_; + CXTokenKind kind_; + }; + + class tokenizer + { + public: + explicit tokenizer(const cxtranslation_unit& tu, const CXFile& file, + const CXCursor& cur); + + std::vector::const_iterator begin() const noexcept + { + return tokens_.begin(); + } + + std::vector::const_iterator end() const noexcept + { + return tokens_.end(); + } + + private: + std::vector tokens_; + }; + } +} // namespace cppast::detail + +#endif // CPPAST_TOKENIZER_HPP_INCLUDED