diff --git a/include/cppast/cpp_token.hpp b/include/cppast/cpp_token.hpp index 1daf036..47d1073 100644 --- a/include/cppast/cpp_token.hpp +++ b/include/cppast/cpp_token.hpp @@ -15,12 +15,13 @@ namespace cppast /// The kinds of C++ tokens. enum class cpp_token_kind { - identifier, //< Any identifier. - keyword, //< Any keyword. - literal, //< Any literal. - punctuation, //< Any other punctuation. - - unknown, //< An unknown token. + identifier, //< Any identifier. + keyword, //< Any keyword. + int_literal, //< An integer literal. + float_literal, //< A floating point literal. + char_literal, //< A character literal. + string_literal, //< A string literal. + punctuation //< Any other punctuation. }; /// A C++ token. @@ -74,16 +75,15 @@ namespace cppast std::vector tokens_; }; + /// Tokenizes a string. + /// \effects Splits the string into C++ tokens. + /// The string must contain valid tokens and must already be preprocessed (i.e. translation phase 6 is already done). + /// \returns The tokenized string. + static cpp_token_string tokenize(std::string str); + /// \effects Creates it from a sequence of tokens. cpp_token_string(std::vector tokens) : tokens_(std::move(tokens)) {} - /// \effects Creates from a string. - /// \notes This does not do tokenization, it will only store a single, unknown token! - static cpp_token_string from_string(std::string str) - { - return cpp_token_string({cpp_token(cpp_token_kind::unknown, std::move(str))}); - } - /// \exclude target using iterator = std::vector::const_iterator; diff --git a/src/code_generator.cpp b/src/code_generator.cpp index 3837352..1546ea6 100644 --- a/src/code_generator.cpp +++ b/src/code_generator.cpp @@ -1158,7 +1158,7 @@ void detail::write_template_arguments( void detail::write_token_string(code_generator::output& output, const cpp_token_string& tokens) { - auto last_kind = cpp_token_kind::unknown; + auto last_kind = cpp_token_kind::punctuation; // neutral regarding whitespace for (auto& token : tokens) { switch (token.kind) @@ -1177,14 +1177,15 @@ void detail::write_token_string(code_generator::output& output, const cpp_token_ output << operator_ws; break; - case cpp_token_kind::literal: - // determine kind of literal - if (token.spelling.front() == '\"') - output << string_literal(token.spelling); - else if (token.spelling.find('.') != std::string::npos) - output << float_literal(token.spelling); - else - output << int_literal(token.spelling); + case cpp_token_kind::int_literal: + output << int_literal(token.spelling); + break; + case cpp_token_kind::float_literal: + output << float_literal(token.spelling); + break; + case cpp_token_kind::char_literal: + case cpp_token_kind::string_literal: + output << string_literal(token.spelling); break; case cpp_token_kind::punctuation: @@ -1206,9 +1207,6 @@ void detail::write_token_string(code_generator::output& output, const cpp_token_ else output << punctuation(token.spelling); break; - - case cpp_token_kind::unknown: - output << token_seq(token.spelling); } last_kind = token.kind; diff --git a/src/cpp_token.cpp b/src/cpp_token.cpp index 3d68d0c..04d9ec5 100644 --- a/src/cpp_token.cpp +++ b/src/cpp_token.cpp @@ -6,6 +6,9 @@ #include #include +#include +#include + #include using namespace cppast; @@ -16,6 +19,586 @@ void cpp_token_string::builder::unmunch() tokens_.back().spelling = ">"; } +namespace +{ + template + bool starts_with(const char* ptr, const char (&str)[N]) + { + return std::strncmp(ptr, str, N - 1u) == 0; + } + + bool starts_with(const char* ptr, const std::string& str) + { + return std::strncmp(ptr, str.c_str(), str.size()) == 0; + } + + template + bool bump_if(const char*& ptr, const char (&str)[N]) + { + if (starts_with(ptr, str)) + { + ptr += N - 1; + return true; + } + else + return false; + } + + bool bump_if(const char*& ptr, const std::string& str) + { + if (starts_with(ptr, str)) + { + ptr += str.size(); + return true; + } + else + return false; + } + + bool is_identifier_nondigit(char c) + { + // assume ASCII + if (c >= 'a' && c <= 'z') + return true; + else if (c >= 'A' && c <= 'Z') + return true; + else if (c == '_') + return true; + else + // technically \uXXX is allowed as well, but I haven't seen that used ever + return false; + } + + bool is_digit(char c) + { + return c >= '0' && c <= '9'; + } + + bool is_hexadecimal_digit(char c) + { + return is_digit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); + } + + type_safe::optional bump_identifier(const char*& ptr) + { + if (is_identifier_nondigit(*ptr)) + { + std::string result; + result += *ptr++; + + while (is_identifier_nondigit(*ptr) || is_digit(*ptr)) + result += *ptr++; + + return result; + } + else + return type_safe::nullopt; + } + + type_safe::optional identifier_token(const char*& ptr) + { + auto identifier = bump_identifier(ptr); + if (!identifier) + return type_safe::nullopt; + + static constexpr const char* keywords[] = {"alignas", + "alignof", + "asm", + "auto", + "bool", + "break", + "case", + "catch", + "char", + "char16_t", + "char32_t", + "class", + "const", + "constexpr", + "const_cast", + "continue", + "decltype", + "default", + "delete", + "do", + "double", + "dynamic_cast", + "else", + "enum", + "explicit", + "export", + "extern", + "false", + "float", + "for", + "friend", + "goto", + "if", + "inline", + "int", + "long", + "mutable", + "namespace", + "new", + "noexcept", + "nullptr", + "operator", + "private", + "protected", + "public", + "register", + "reinterpret_cast", + "return", + "short", + "signed", + "sizeof", + "static", + "static_assert", + "static_cast", + "struct", + "switch", + "template", + "this", + "thread_local", + "throw", + "true", + "try", + "typedef", + "typeid", + "typename", + "union", + "unsigned", + "using", + "virtual", + "void", + "volatile", + "wchar_t", + "while"}; + auto find_keyword = std::find(std::begin(keywords), std::end(keywords), identifier.value()); + if (find_keyword != std::end(keywords)) + return cpp_token(cpp_token_kind::keyword, identifier.value()); + else if (identifier == "and") + return cpp_token(cpp_token_kind::punctuation, "&&"); + else if (identifier == "and_eq") + return cpp_token(cpp_token_kind::punctuation, "&="); + else if (identifier == "bitand") + return cpp_token(cpp_token_kind::punctuation, "&"); + else if (identifier == "bitor") + return cpp_token(cpp_token_kind::punctuation, "|"); + else if (identifier == "compl") + return cpp_token(cpp_token_kind::punctuation, "~"); + else if (identifier == "not") + return cpp_token(cpp_token_kind::punctuation, "!"); + else if (identifier == "not_eq") + return cpp_token(cpp_token_kind::punctuation, "!="); + else if (identifier == "or") + return cpp_token(cpp_token_kind::punctuation, "||"); + else if (identifier == "or_eq") + return cpp_token(cpp_token_kind::punctuation, "|="); + else if (identifier == "xor") + return cpp_token(cpp_token_kind::punctuation, "^"); + else if (identifier == "xor_eq") + return cpp_token(cpp_token_kind::punctuation, "^="); + else + return cpp_token(cpp_token_kind::identifier, identifier.value()); + } + + void append_udl_suffix(std::string& literal, const char*& ptr) + { + if (auto id = identifier_token(ptr)) + literal += id.value().spelling; + } + + template + std::string parse_digit_sequence(const char*& ptr, DigitPredicate is_digit) + { + std::string result; + for (; is_digit(*ptr) || *ptr == '\''; ++ptr) + if (*ptr != '\'') + result += *ptr; + DEBUG_ASSERT(result.back() != '\'', detail::assert_handler{}); + return result; + } + + void append_integer_suffix(std::string& literal, const char*& ptr) + { + auto append_unsigned_suffix = [](std::string& literal, const char*& ptr) { + if (*ptr == 'u' || *ptr == 'U') + { + literal += *ptr++; + return true; + } + else + return false; + }; + auto append_long_suffix = [](std::string& literal, const char*& ptr) { + if (starts_with(ptr, "ll") || starts_with(ptr, "LL")) + { + literal += *ptr++; + literal += *ptr++; + return true; + } + else if (*ptr == 'l' || *ptr == 'L') + { + literal += *ptr++; + return true; + } + else + return false; + }; + + if (append_unsigned_suffix(literal, ptr)) + append_long_suffix(literal, ptr); + else if (append_long_suffix(literal, ptr)) + append_unsigned_suffix(literal, ptr); + else + append_udl_suffix(literal, ptr); + } + + void append_floating_point_suffix(std::string& literal, const char*& ptr) + { + if (*ptr == 'f' || *ptr == 'F') + literal += *ptr++; + else if (*ptr == 'l' || *ptr == 'L') + literal += *ptr++; + else + append_udl_suffix(literal, ptr); + } + + type_safe::optional parse_floating_point_exponent(const char*& ptr) + { + if (*ptr == 'e' || *ptr == 'E' || *ptr == 'p' || *ptr == 'P') + { + std::string result; + result += *ptr++; + if (*ptr == '+' || *ptr == '-') + result += *ptr++; + + result += parse_digit_sequence(ptr, &is_digit); + return result; + } + else + return type_safe::nullopt; + } + + type_safe::optional numeric_literal_token(const char*& ptr) + { + if (starts_with(ptr, "0b") || starts_with(ptr, "0B")) // binary integer literal + { + std::string result; + result += *ptr++; + result += *ptr++; + result += parse_digit_sequence(ptr, [](char c) { return c == '0' || c == '1'; }); + append_integer_suffix(result, ptr); + return cpp_token(cpp_token_kind::int_literal, result); + } + else if (starts_with(ptr, "0x") || starts_with(ptr, "0X")) // hexadecimal literal + { + std::string result; + result += *ptr++; + result += *ptr++; + result += parse_digit_sequence(ptr, &is_hexadecimal_digit); + + auto is_float = false; + if (*ptr == '.') + { + // floating point hexadecimal + is_float = true; + result += *ptr++; + result += parse_digit_sequence(ptr, &is_hexadecimal_digit); + } + + if (auto exp = parse_floating_point_exponent(ptr)) + { + is_float = true; + // floating point exponent + result += exp.value(); + } + + if (is_float) + append_floating_point_suffix(result, ptr); + else + append_integer_suffix(result, ptr); + + return cpp_token(is_float ? cpp_token_kind::float_literal : cpp_token_kind::int_literal, + result); + } + else if (is_digit(*ptr)) // octal and decimal literals + { + std::string result; + result += parse_digit_sequence(ptr, &is_digit); + + auto is_float = false; + if (*ptr == '.') + { + // floating point decimal + is_float = true; + result += *ptr++; + result += parse_digit_sequence(ptr, &is_hexadecimal_digit); + } + + if (auto exp = parse_floating_point_exponent(ptr)) + { + // floating point exponent + is_float = true; + result += exp.value(); + } + + if (is_float) + append_floating_point_suffix(result, ptr); + else + append_integer_suffix(result, ptr); + + return cpp_token(is_float ? cpp_token_kind::float_literal : cpp_token_kind::int_literal, + result); + } + else if (*ptr == '.' && is_digit(ptr[1])) + { + std::string result; + + // floating point fraction + result += *ptr++; + result += parse_digit_sequence(ptr, &is_digit); + + if (auto exp = parse_floating_point_exponent(ptr)) + result += exp.value(); + + append_floating_point_suffix(result, ptr); + return cpp_token(cpp_token_kind::float_literal, result); + } + else + return type_safe::nullopt; + } + + type_safe::optional parse_encoding_prefix(const char*& ptr) + { + if (bump_if(ptr, "u8")) + return "u8"; + else if (bump_if(ptr, "u")) + return "u"; + else if (bump_if(ptr, "U")) + return "U"; + else if (bump_if(ptr, "L")) + return "L"; + else + return type_safe::nullopt; + } + + type_safe::optional character_literal(const char*& ptr) + { + auto save = ptr; + auto prefix = parse_encoding_prefix(ptr); + if (*ptr != '\'') + { + ptr = save; + return type_safe::nullopt; + } + else + { + auto result = prefix.value_or(""); + result += *ptr++; + + while (*ptr != '\'') + { + DEBUG_ASSERT(*ptr, detail::assert_handler{}); + + if (*ptr == '\\') + result += *ptr++; + result += *ptr++; + } + result += *ptr++; + + append_udl_suffix(result, ptr); + return cpp_token(cpp_token_kind::char_literal, result); + } + } + + type_safe::optional string_literal(const char*& ptr) + { + auto save = ptr; + auto prefix = parse_encoding_prefix(ptr); + if (starts_with(ptr, "R\"")) + { + // raw string literal + auto result = prefix.value_or(""); + result += *ptr++; + result += *ptr++; + + std::string terminator; + terminator += ")"; + while (*ptr != '(') + { + result += *ptr; + terminator += *ptr++; + } + result += *ptr++; + terminator += '"'; + + while (!bump_if(ptr, terminator)) + { + DEBUG_ASSERT(ptr, detail::assert_handler{}); + result += *ptr++; + } + result += terminator; + + append_udl_suffix(result, ptr); + return cpp_token(cpp_token_kind::string_literal, result); + } + else if (starts_with(ptr, "\"")) + { + // regular string literal + auto result = prefix.value_or(""); + result += *ptr++; + + while (*ptr != '"') + { + DEBUG_ASSERT(*ptr, detail::assert_handler{}); + + if (*ptr == '\\') + result += *ptr++; + result += *ptr++; + } + result += *ptr++; + + append_udl_suffix(result, ptr); + return cpp_token(cpp_token_kind::string_literal, result); + } + else + { + ptr = save; + return type_safe::nullopt; + } + } + + type_safe::optional digraph_token(const char*& ptr) + { + if (bump_if(ptr, "<%")) + return cpp_token(cpp_token_kind::punctuation, "{"); + else if (bump_if(ptr, "%>")) + return cpp_token(cpp_token_kind::punctuation, "}"); + else if (starts_with(ptr, "<::") && ptr[3] != ':' && ptr[3] != '>') + // don't detect digraph in std::vector<::std::string> + return type_safe::nullopt; + else if (bump_if(ptr, "<:")) + return cpp_token(cpp_token_kind::punctuation, "["); + else if (bump_if(ptr, ":>")) + return cpp_token(cpp_token_kind::punctuation, "]"); + else if (bump_if(ptr, "%:%:")) + return cpp_token(cpp_token_kind::punctuation, "##"); + else if (bump_if(ptr, "%:")) + return cpp_token(cpp_token_kind::punctuation, "#"); + else + return type_safe::nullopt; + } + + type_safe::optional punctuation_token(const char*& ptr) + { + static constexpr const char* punctuations[] = { + // tokens staring with # + "##", + "#", + // tokens starting with . + "...", + ".*", + ".", + // tokens starting with : + "::", + ":", + // tokens starting with + + "+=", + "++", + "+", + // tokens starting with - + "->*", + "->", + "--", + "-=", + "-", + // tokens starting with * + "*=", + "*", + // tokens starting with / + "/=", + "/", + // tokens starting with % + "%=", + "%", + // tokens starting with ^ + "^=", + "^", + // tokens starting with & + "&=", + "&&", + "&", + // tokens starting with | + "|=", + "||", + "|", + // tokens starting with < + "<<=", + "<<", + "<=", + "<", + // tokens starting with > + ">>=", + ">>", + ">=", + ">", + // tokens starting with ! + "!=", + "!", + // tokens starting with = + "==", + "=", + // single tokens + "~", + ";", + "?", + ",", + "{", + "}", + "[", + "]", + "(", + ")", + }; + + for (auto punct : punctuations) + if (bump_if(ptr, punct)) + return cpp_token(cpp_token_kind::punctuation, punct); + + return type_safe::nullopt; + } +} + +cpp_token_string cpp_token_string::tokenize(std::string str) +{ + cpp_token_string::builder builder; + + auto ptr = str.c_str(); + while (*ptr) + { + if (auto num = numeric_literal_token(ptr)) + builder.add_token(num.value()); + else if (auto char_lit = character_literal(ptr)) + builder.add_token(char_lit.value()); + else if (auto str_lit = string_literal(ptr)) + builder.add_token(str_lit.value()); + else if (auto digraphs = digraph_token(ptr)) + builder.add_token(digraphs.value()); + else if (auto punct = punctuation_token(ptr)) + builder.add_token(punct.value()); + else if (auto id = identifier_token(ptr)) + builder.add_token(id.value()); + else if (*ptr == ' ' || *ptr == '\t' || *ptr == '\n' || *ptr == '\r') + ++ptr; + else + DEBUG_UNREACHABLE(detail::assert_handler{}); + } + + return builder.finish(); +} + namespace { bool is_identifier(char c) diff --git a/src/libclang/cxtokenizer.cpp b/src/libclang/cxtokenizer.cpp index 51c3a87..cc0c6de 100644 --- a/src/libclang/cxtokenizer.cpp +++ b/src/libclang/cxtokenizer.cpp @@ -412,9 +412,9 @@ bool detail::skip_attribute(detail::cxtoken_stream& stream) namespace { - cpp_token_kind get_kind(CXTokenKind kind) + cpp_token_kind get_kind(const detail::cxtoken& token) { - switch (kind) + switch (token.kind()) { case CXToken_Punctuation: return cpp_token_kind::punctuation; @@ -422,14 +422,26 @@ namespace return cpp_token_kind::keyword; case CXToken_Identifier: return cpp_token_kind::identifier; + case CXToken_Literal: - return cpp_token_kind::literal; + { + auto spelling = token.value().std_str(); + if (spelling.find('.') != std::string::npos) + return cpp_token_kind::float_literal; + else if (std::isdigit(spelling.front())) + return cpp_token_kind::int_literal; + else if (spelling.back() == '\'') + return cpp_token_kind::char_literal; + else + return cpp_token_kind::string_literal; + } + case CXToken_Comment: break; } DEBUG_UNREACHABLE(detail::assert_handler{}); - return cpp_token_kind ::literal; + return cpp_token_kind::punctuation; } } @@ -440,7 +452,7 @@ cpp_token_string detail::to_string(cxtoken_stream& stream, cxtoken_iterator end) while (stream.cur() != end) { auto& token = stream.get(); - builder.add_token(cpp_token(get_kind(token.kind()), token.c_str())); + builder.add_token(cpp_token(get_kind(token), token.c_str())); } if (stream.unmunch()) diff --git a/src/libclang/type_parser.cpp b/src/libclang/type_parser.cpp index c850e8a..5e0dd62 100644 --- a/src/libclang/type_parser.cpp +++ b/src/libclang/type_parser.cpp @@ -246,7 +246,7 @@ namespace return size_expr.empty() ? nullptr : cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_ulonglong), - cpp_token_string::from_string( + cpp_token_string::tokenize( std::string(size_expr.rbegin(), size_expr.rend()))); } @@ -488,7 +488,7 @@ namespace return cpp_decltype_type::build( cpp_unexposed_expression::build(cpp_unexposed_type::build(""), - cpp_token_string::from_string(spelling))); + cpp_token_string::tokenize(spelling))); }); } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 172e1cb..85282a6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -24,6 +24,7 @@ set(tests cpp_preprocessor.cpp cpp_static_assert.cpp cpp_template_parameter.cpp + cpp_token.cpp cpp_type_alias.cpp cpp_variable.cpp integration.cpp diff --git a/test/cpp_function.cpp b/test/cpp_function.cpp index 1208093..53cf6ed 100644 --- a/test/cpp_function.cpp +++ b/test/cpp_function.cpp @@ -104,7 +104,7 @@ void ns::l() *cpp_unexposed_expression:: build(cpp_pointer_type::build( cpp_builtin_type::build(cpp_float)), - cpp_token_string::from_string("nullptr")))); + cpp_token_string::tokenize("nullptr")))); } else REQUIRE(false); @@ -135,7 +135,7 @@ void ns::l() *cpp_decltype_type::build( cpp_unexposed_expression:: build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string("42"))))); + cpp_token_string::tokenize("42"))))); REQUIRE(!param.default_value()); } else @@ -162,16 +162,17 @@ void ns::l() equal_expressions(func.noexcept_condition().value(), *cpp_literal_expression::build(std::move(bool_t), "true"))); else if (func.name() == "e") - REQUIRE(equal_expressions(func.noexcept_condition().value(), - *cpp_unexposed_expression:: - build(std::move(bool_t), - cpp_token_string::from_string("false")))); + REQUIRE( + equal_expressions(func.noexcept_condition().value(), + *cpp_unexposed_expression::build(std::move(bool_t), + cpp_token_string::tokenize( + "false")))); else if (func.name() == "f") REQUIRE( equal_expressions(func.noexcept_condition().value(), - *cpp_unexposed_expression:: - build(std::move(bool_t), - cpp_token_string::from_string("noexcept(d())")))); + *cpp_unexposed_expression::build(std::move(bool_t), + cpp_token_string::tokenize( + "noexcept(d())")))); } else if (func.name() == "g" || func.name() == "h" || func.name() == "i" || func.name() == "j") diff --git a/test/cpp_member_function.cpp b/test/cpp_member_function.cpp index 6165a94..8cf9708 100644 --- a/test/cpp_member_function.cpp +++ b/test/cpp_member_function.cpp @@ -402,12 +402,11 @@ d::~d() {} REQUIRE(!dtor.is_virtual()); REQUIRE(dtor.body_kind() == cpp_function_definition); REQUIRE(dtor.noexcept_condition()); - REQUIRE( - equal_expressions(dtor.noexcept_condition().value(), - *cpp_unexposed_expression::build(cpp_builtin_type::build( - cpp_bool), - cpp_token_string::from_string( - "false")))); + REQUIRE(equal_expressions(dtor.noexcept_condition().value(), + *cpp_unexposed_expression::build(cpp_builtin_type::build( + cpp_bool), + cpp_token_string::tokenize( + "false")))); } else if (dtor.name() == "~c") { diff --git a/test/cpp_member_variable.cpp b/test/cpp_member_variable.cpp index 52db1d5..a8f339e 100644 --- a/test/cpp_member_variable.cpp +++ b/test/cpp_member_variable.cpp @@ -39,7 +39,7 @@ struct foo // all initializers are unexposed auto def = cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_float), - cpp_token_string::from_string("3.14f")); + cpp_token_string::tokenize("3.14f")); REQUIRE(var.default_value()); REQUIRE(equal_expressions(var.default_value().value(), *def)); diff --git a/test/cpp_static_assert.cpp b/test/cpp_static_assert.cpp index ceed7b4..1a9c38b 100644 --- a/test/cpp_static_assert.cpp +++ b/test/cpp_static_assert.cpp @@ -34,17 +34,15 @@ struct foo REQUIRE(equal_expressions(assert.expression(), *cpp_literal_expression::build(std::move(bool_t), "true"))); else if (assert.message() == "a") - REQUIRE( - equal_expressions(assert.expression(), - *cpp_unexposed_expression::build(std::move(bool_t), - cpp_token_string::from_string( - "true||false")))); + REQUIRE(equal_expressions(assert.expression(), + *cpp_unexposed_expression::build(std::move(bool_t), + cpp_token_string::tokenize( + "true||false")))); else if (assert.message() == "b") - REQUIRE( - equal_expressions(assert.expression(), - *cpp_unexposed_expression::build(std::move(bool_t), - cpp_token_string::from_string( - "!B")))); + REQUIRE(equal_expressions(assert.expression(), + *cpp_unexposed_expression::build(std::move(bool_t), + cpp_token_string::tokenize( + "!B")))); else REQUIRE(false); }); diff --git a/test/cpp_template_parameter.cpp b/test/cpp_template_parameter.cpp index 74d5f26..e0a5de6 100644 --- a/test/cpp_template_parameter.cpp +++ b/test/cpp_template_parameter.cpp @@ -151,13 +151,13 @@ using d = void; cpp_builtin_type::build(cpp_char)))); REQUIRE(!param.is_variadic()); REQUIRE(param.default_value()); - REQUIRE(equal_expressions(param.default_value().value(), - *cpp_unexposed_expression:: - build(cpp_builtin_type::build( - cpp_nullptr), - cpp_token_string:: - from_string( - "nullptr")))); + REQUIRE( + equal_expressions(param.default_value().value(), + *cpp_unexposed_expression:: + build(cpp_builtin_type::build( + cpp_nullptr), + cpp_token_string::tokenize( + "nullptr")))); } else if (param.name() == "C") { diff --git a/test/cpp_token.cpp b/test/cpp_token.cpp new file mode 100644 index 0000000..2996e72 --- /dev/null +++ b/test/cpp_token.cpp @@ -0,0 +1,129 @@ +// Copyright (C) 2017 Jonathan Müller +// This file is subject to the license terms in the LICENSE file +// found in the top-level directory of this distribution. + +#include + +#include + +#include +#include + +using namespace cppast; + +void check_equal_tokens(const std::string& str, std::initializer_list tokens) +{ + auto token_str = cpp_token_string::tokenize(str); + INFO(str); + REQUIRE(token_str.end() - token_str.begin() == tokens.size()); + REQUIRE(std::equal(token_str.begin(), token_str.end(), tokens.begin())); +} + +TEST_CASE("tokenizer") +{ + SECTION("integer literals") + { + check_equal_tokens(" 1234 ", {cpp_token(cpp_token_kind::int_literal, "1234")}); + check_equal_tokens("1, 2", {cpp_token(cpp_token_kind::int_literal, "1"), + cpp_token(cpp_token_kind::punctuation, ","), + cpp_token(cpp_token_kind::int_literal, "2")}); + + // integer suffixes + check_equal_tokens("1234ul", {cpp_token(cpp_token_kind::int_literal, "1234ul")}); + check_equal_tokens("12'34LU", {cpp_token(cpp_token_kind::int_literal, "1234LU")}); + + // other integer formats + check_equal_tokens("01234", {cpp_token(cpp_token_kind::int_literal, "01234")}); + check_equal_tokens("0x1234AF", {cpp_token(cpp_token_kind::int_literal, "0x1234AF")}); + check_equal_tokens("0b101101", {cpp_token(cpp_token_kind::int_literal, "0b101101")}); + } + SECTION("floating point literals") + { + // floating point suffixes + check_equal_tokens("3.14", {cpp_token(cpp_token_kind::float_literal, "3.14")}); + check_equal_tokens("3.14f", {cpp_token(cpp_token_kind::float_literal, "3.14f")}); + check_equal_tokens("3.14L", {cpp_token(cpp_token_kind::float_literal, "3.14L")}); + + // missing parts + check_equal_tokens(".5", {cpp_token(cpp_token_kind::float_literal, ".5")}); + check_equal_tokens("1.", {cpp_token(cpp_token_kind::float_literal, "1.")}); + + // exponents + check_equal_tokens("1.0e4", {cpp_token(cpp_token_kind::float_literal, "1.0e4")}); + check_equal_tokens("1e4", {cpp_token(cpp_token_kind::float_literal, "1e4")}); + check_equal_tokens(".5e-2", {cpp_token(cpp_token_kind::float_literal, ".5e-2")}); + + // hexadecimal + check_equal_tokens("0xabc.def", {cpp_token(cpp_token_kind::float_literal, "0xabc.def")}); + check_equal_tokens("0x123p42", {cpp_token(cpp_token_kind::float_literal, "0x123p42")}); + } + SECTION("character literals") + { + check_equal_tokens(R"('a')", {cpp_token(cpp_token_kind::char_literal, R"('a')")}); + check_equal_tokens(R"(u8'a')", {cpp_token(cpp_token_kind::char_literal, R"(u8'a')")}); + check_equal_tokens(R"(U'a')", {cpp_token(cpp_token_kind::char_literal, R"(U'a')")}); + check_equal_tokens(R"('\'')", {cpp_token(cpp_token_kind::char_literal, R"('\'')")}); + } + SECTION("string literals") + { + check_equal_tokens(R"("hello")", {cpp_token(cpp_token_kind::string_literal, R"("hello")")}); + check_equal_tokens(R"(u8"he\"llo")", + {cpp_token(cpp_token_kind::string_literal, R"(u8"he\"llo")")}); + + check_equal_tokens(R"*(R"(hel\"lo)")*", + {cpp_token(cpp_token_kind::string_literal, R"*(R"(hel\"lo)")*")}); + check_equal_tokens(R"**(R"*(hello R"(foo)")*")**", + {cpp_token(cpp_token_kind::string_literal, + R"**(R"*(hello R"(foo)")*")**")}); + } + SECTION("UDLs") + { + check_equal_tokens("123_foo", {cpp_token(cpp_token_kind::int_literal, "123_foo")}); + check_equal_tokens("123.456_foo", + {cpp_token(cpp_token_kind::float_literal, "123.456_foo")}); + check_equal_tokens(R"("hi"_foo)", + {cpp_token(cpp_token_kind::string_literal, R"("hi"_foo)")}); + } + SECTION("identifiers") + { + check_equal_tokens("foo bar baz_a", {cpp_token(cpp_token_kind::identifier, "foo"), + cpp_token(cpp_token_kind::identifier, "bar"), + cpp_token(cpp_token_kind::identifier, "baz_a")}); + check_equal_tokens("constant", {cpp_token(cpp_token_kind::identifier, "constant")}); + } + SECTION("keywords") + { + // just test some + check_equal_tokens("const float auto", {cpp_token(cpp_token_kind::keyword, "const"), + cpp_token(cpp_token_kind::keyword, "float"), + cpp_token(cpp_token_kind::keyword, "auto")}); + } + SECTION("punctuations") + { + // just test munch things + check_equal_tokens("<< <= <", {cpp_token(cpp_token_kind::punctuation, "<<"), + cpp_token(cpp_token_kind::punctuation, "<="), + cpp_token(cpp_token_kind::punctuation, "<")}); + check_equal_tokens("- -- -> ->*", {cpp_token(cpp_token_kind::punctuation, "-"), + cpp_token(cpp_token_kind::punctuation, "--"), + cpp_token(cpp_token_kind::punctuation, "->"), + cpp_token(cpp_token_kind::punctuation, "->*")}); + check_equal_tokens("--->>>>", {cpp_token(cpp_token_kind::punctuation, "--"), + cpp_token(cpp_token_kind::punctuation, "->"), + cpp_token(cpp_token_kind::punctuation, ">>"), + cpp_token(cpp_token_kind::punctuation, ">")}); + + // alternative spellings + check_equal_tokens("and not xor", {cpp_token(cpp_token_kind::punctuation, "&&"), + cpp_token(cpp_token_kind::punctuation, "!"), + cpp_token(cpp_token_kind::punctuation, "^")}); + + // digraphs + check_equal_tokens("<% foo<::bar>", {cpp_token(cpp_token_kind::punctuation, "{"), + cpp_token(cpp_token_kind::identifier, "foo"), + cpp_token(cpp_token_kind::punctuation, "<"), + cpp_token(cpp_token_kind::punctuation, "::"), + cpp_token(cpp_token_kind::identifier, "bar"), + cpp_token(cpp_token_kind::punctuation, ">")}); + } +} diff --git a/test/cpp_type_alias.cpp b/test/cpp_type_alias.cpp index 2454efd..27a0939 100644 --- a/test/cpp_type_alias.cpp +++ b/test/cpp_type_alias.cpp @@ -334,7 +334,7 @@ typedef decltype(0) w; return cpp_literal_expression::build(std::move(type), std::move(size)); else return cpp_unexposed_expression::build(std::move(type), - cpp_token_string::from_string(std::move(size))); + cpp_token_string::tokenize(std::move(size))); }; cpp_entity_index idx; @@ -507,7 +507,7 @@ typedef decltype(0) w; { auto type = cpp_decltype_type::build( cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string("0"))); + cpp_token_string::tokenize("0"))); REQUIRE(equal_types(idx, alias.underlying_type(), *type)); } else diff --git a/test/cpp_variable.cpp b/test/cpp_variable.cpp index 83c6bd4..76e4905 100644 --- a/test/cpp_variable.cpp +++ b/test/cpp_variable.cpp @@ -101,14 +101,13 @@ int r[] = {0}; // unexposed due to implicit cast, I think type_safe::ref( *cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string( - "42"))), + cpp_token_string::tokenize("42"))), cpp_storage_class_none, false, false); else if (var.name() == "c") check_variable(var, *cpp_builtin_type::build(cpp_float), type_safe::ref( *cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_float), - cpp_token_string::from_string( + cpp_token_string::tokenize( "3.f+0.14f"))), cpp_storage_class_none, false, false); else if (var.name() == "d") @@ -126,8 +125,7 @@ int r[] = {0}; cpp_cv_const), type_safe::ref( *cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string( - "12"))), + cpp_token_string::tokenize("12"))), cpp_storage_class_none, true, false); else if (var.name() == "i") { @@ -147,7 +145,7 @@ int r[] = {0}; *cpp_unexposed_expression::build(cpp_user_defined_type::build( cpp_type_ref(cpp_entity_id(""), "bar")), - cpp_token_string::from_string( + cpp_token_string::tokenize( "bar()"))), cpp_storage_class_none, false, false); return false; @@ -169,8 +167,7 @@ int r[] = {0}; check_variable(var, *cpp_auto_type::build(), type_safe::ref( *cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string( - "128"))), + cpp_token_string::tokenize("128"))), cpp_storage_class_none, false, false); else if (var.name() == "n") check_variable(var, @@ -180,14 +177,13 @@ int r[] = {0}; cpp_ref_lvalue), type_safe::ref( *cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string( - "m"))), + cpp_token_string::tokenize("m"))), cpp_storage_class_none, false, false); else if (var.name() == "o") check_variable(var, *cpp_decltype_type::build( cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string("0"))), + cpp_token_string::tokenize("0"))), nullptr, cpp_storage_class_none, false, false); else if (var.name() == "p") check_variable(var, @@ -196,13 +192,12 @@ int r[] = {0}; build(cpp_decltype_type::build( cpp_unexposed_expression:: build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string("o"))), + cpp_token_string::tokenize("o"))), cpp_cv_const), cpp_ref_lvalue), type_safe::ref( *cpp_unexposed_expression::build(cpp_builtin_type::build(cpp_int), - cpp_token_string::from_string( - "o"))), + cpp_token_string::tokenize("o"))), cpp_storage_class_none, false, false); else if (var.name() == "q") check_variable(var, @@ -219,8 +214,7 @@ int r[] = {0}; "1")), type_safe::ref( *cpp_unexposed_expression::build(cpp_unexposed_type::build(""), - cpp_token_string::from_string( - "{0}"))), + cpp_token_string::tokenize("{0}"))), cpp_storage_class_none, false, false); else REQUIRE(false);