diff options
author | Boris Kolpackov <boris@codesynthesis.com> | 2017-05-24 13:24:31 +0200 |
---|---|---|
committer | Boris Kolpackov <boris@codesynthesis.com> | 2017-05-24 13:24:31 +0200 |
commit | 0cef93b4e2e9bf39b0ca542876f9ab1af6d0f01d (patch) | |
tree | 187b83b65f28cdf4f8a2b0feadf392b49554fbf3 | |
parent | b3526a5c925169b3be00a5dd4d8c8222f3a475cd (diff) |
Implement support for tokenization of preprocessed C/C++ source
-rw-r--r-- | build2/buildfile | 1 | ||||
-rw-r--r-- | build2/cc/lexer.cxx | 683 | ||||
-rw-r--r-- | build2/cc/lexer.hxx | 166 | ||||
-rw-r--r-- | unit-tests/cc/lexer/buildfile | 17 | ||||
-rw-r--r-- | unit-tests/cc/lexer/char-literal.test | 67 | ||||
-rw-r--r-- | unit-tests/cc/lexer/comment.test | 88 | ||||
-rw-r--r-- | unit-tests/cc/lexer/driver.cxx | 66 | ||||
-rw-r--r-- | unit-tests/cc/lexer/line.test | 67 | ||||
-rw-r--r-- | unit-tests/cc/lexer/number.test | 48 | ||||
-rw-r--r-- | unit-tests/cc/lexer/preprocessor.test | 38 | ||||
-rw-r--r-- | unit-tests/cc/lexer/raw-string-literal.test | 90 | ||||
-rw-r--r-- | unit-tests/cc/lexer/string-literal.test | 65 |
12 files changed, 1396 insertions, 0 deletions
diff --git a/build2/buildfile b/build2/buildfile index 2d65001..69dfc94 100644 --- a/build2/buildfile +++ b/build2/buildfile @@ -54,6 +54,7 @@ exe{b}: \ cc/{hxx cxx}{ guess } \ cc/{hxx cxx}{ init } \ cc/{hxx cxx}{ install } \ + cc/{hxx cxx}{ lexer } \ cc/{hxx cxx}{ link } \ cc/{hxx cxx}{ module } \ cc/{ cxx}{ msvc } \ diff --git a/build2/cc/lexer.cxx b/build2/cc/lexer.cxx new file mode 100644 index 0000000..3eb5d5b --- /dev/null +++ b/build2/cc/lexer.cxx @@ -0,0 +1,683 @@ +// file : build2/cc/lexer.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <build2/cc/lexer.hxx> + +using namespace std; +using namespace butl; + +// Diagnostics plumbing. +// +namespace butl // ADL +{ + inline build2::location + get_location (const butl::char_scanner::xchar& c, const void* data) + { + using namespace build2; + + assert (data != nullptr); // E.g., must be &lexer::name_. + return location (static_cast<const path*> (data), c.line, c.column); + } +} + +namespace build2 +{ + namespace cc + { + inline auto lexer:: + get (bool e) -> xchar + { + if (unget_) + { + unget_ = false; + return ungetc_; + } + else + { + xchar c (peek (e)); + base::get (c); + return c; + } + } + + auto lexer:: + peek (bool e) -> xchar + { + if (unget_) + return ungetc_; + + if (unpeek_) + return unpeekc_; + + xchar c (base::peek ()); + + if (e && c == '\\') + { + base::get (c); + xchar p (base::peek ()); + + if (p == '\n') + { + base::get (p); + return peek (e); // Recurse. + } + + // Save in the unpeek buffer so that it is returned on the subsequent + // calls to peek() (until get()). + // + unpeek_ = true; + unpeekc_ = c; + } + + return c; + } + + using type = token_type; + + void lexer:: + next (token& t, xchar c) + { + for (;; c = skip_spaces ()) + { + t.line = c.line; + t.column = c.column; + + if (eos (c)) + { + t.type = type::eos; + return; + } + + switch (c) + { + // Preprocessor lines. 
+ // + case '#': + { + // It is tempting to simply scan until the newline ignoring + // anything in between. However, these lines can start a + // multi-line C-style comment. So we have to tokenize it. Note + // that we assume there cannot be #include directives. + // + // This may not work for things like #error that can contain + // pretty much anything. + // + for (;;) + { + c = skip_spaces (false); // Stop at newline. + + if (eos (c) || c == '\n') + break; + + next (t, c); // Keep using the passed token for buffers. + } + break; + } + // Single-letter punctuation. + // + case ';': t.type = type::semi; return; + case '{': t.type = type::lcbrace; return; + case '}': t.type = type::rcbrace; return; + // Other single-letter punctuation. + // + case '(': + case ')': + case '[': + case ']': + case ',': + case '?': + case '~': + case '\\': t.type = type::punctuation; return; + // Potentially multi-letter punctuation. + // + case '.': // . .* .<N> ... + { + xchar p (peek ()); + + if (p == '*') + { + get (p); + t.type = type::punctuation; + return; + } + else if (p >= '0' && p <= '9') + { + number_literal (t, c); + return; + } + else if (p == '.') + { + get (p); + xchar q (peek ()); + if (q == '.') + { + get (q); + t.type = type::punctuation; + return; + } + unget (p); + // Fall through. + } + + t.type = type::dot; + return; + } + case '=': // = == + case '!': // ! 
!= + case '*': // * *= + case '/': // / /= (/* and // handled by skip_spaces() above) + case '%': // % %= + case '^': // ^ ^= + { + xchar p (peek ()); + + if (p == '=') + get (p); + + t.type = type::punctuation; + return; + } + case '>': // > >= >> >>= + case '<': // < <= << <<= + { + xchar p (peek ()); + + if (p == c) + { + get (p); + if ((p = peek ()) == '=') + get (p); + } + else if (p == '=') + get (p); + + t.type = type::punctuation; + return; + } + case '+': // + ++ += + case '-': // - -- -= -> ->* + { + xchar p (peek ()); + + if (p == c) + get (p); + else if (p == '=') + get (p); + else if (c == '-' && p == '>') + { + get (p); + if ((p = peek ()) == '*') + get (p); + } + + t.type = type::punctuation; + return; + } + case '&': // & && &= + case '|': // | || |= + { + xchar p (peek ()); + + if (p == c) + get (p); + else if (p == '=') + get (p); + + t.type = type::punctuation; + return; + } + case ':': // : :: + { + xchar p (peek ()); + + if (p == ':') + get (p); + + t.type = type::punctuation; + return; + } + // Number (and also .<N> above). + // + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + case '8': + case '9': + { + number_literal (t, c); + return; + } + // Char/string literal, identifier, or other (\, $, @, `). + // + default: + { + bool raw (false); // Raw string literal. + + if (alpha (c) || c == '_') + { + string& id (t.value); + id.clear (); + + for (id += c; (c = peek ()) == '_' || alnum (c); get (c)) + id += c; + + // If the following character is a quote, see if the identifier + // is one of the literal prefixes. + // + if (c == '\'' || c == '\"') + { + size_t n (id.size ()), i (0); + switch (id[0]) + { + case 'u': + { + if (n > 1 && id[1] == '8') + ++i; + // Fall through. 
+ } + case 'L': + case 'U': + { + ++i; + + if (c == '\"' && n > i && id[i] == 'R') + { + ++i; + raw = true; + } + break; + } + case 'R': + { + if (c == '\"') + { + ++i; + raw = true; + } + break; + } + } + + if (i == n) // All characters "consumed". + { + get (c); + id.clear (); + } + } + + if (!id.empty ()) + { + t.type = type::identifier; + return; + } + } + + switch (c) + { + case '\'': + { + char_literal (t, c); + return; + } + case '\"': + { + if (raw) + raw_string_literal (t, c); + else + string_literal (t, c); + return; + } + default: + { + t.type = type::other; + return; + } + } + } + } + } + } + + void lexer:: + number_literal (token& t, xchar c) + { + t.line = c.line; + t.column = c.column; + + // A number (integer or floating point literal) can: + // + // 1. Start with a dot (which must be followed by a digit, e.g., .123). + // + // 2. Can have a radix prefix (0b101, 0123, 0X12AB). + // + // 3. Can have an exponent (1e10, 0x1.p-10, 1.). + // + // 4. Digits can be separated with ' (123'456, 0xff00'00ff). + // + // 5. End with a built-in or user defined literal (123f, 123UL, 123_X) + // + // Quoting from GCC's preprocessor documentation: + // + // "Formally preprocessing numbers begin with an optional period, a + // required decimal digit, and then continue with any sequence of + // letters, digits, underscores, periods, and exponents. Exponents are + // the two-character sequences 'e+', 'e-', 'E+', 'E-', 'p+', 'p-', 'P+', + // and 'P-'." + // + // So it looks like a "C++ number" is then any unseparated (with + // whitespace or punctuation) sequence of those plus '. The only mildly + // tricky part is then to recognize +/- as being part of the exponent. + // + while (!eos ((c = peek ()))) + { + switch (c) + { + // All the whitespace, punctuation, and other characters that end + // the number. 
+ // + case ' ': + case '\n': + case '\t': + case '\r': + case '\f': + case '\v': + + case '#': + case ';': + case '{': + case '}': + case '(': + case ')': + case '[': + case ']': + case ',': + case '?': + case '~': + case '=': + case '!': + case '*': + case '/': + case '%': + case '^': + case '>': + case '<': + case '&': + case '|': + case ':': + case '+': // The exponent case is handled below. + case '-': // The exponent case is handled below. + case '"': + case '\\': + + case '@': + case '$': + case '`': + break; + + // Recognize +/- after the exponent. + // + case 'e': + case 'E': + case 'p': + case 'P': + { + get (c); + c = peek (); + if (c == '+' || c == '-') + get (c); + continue; + } + + case '_': + case '.': + case '\'': + default: // Digits and letters. + { + get (c); + continue; + } + } + + break; + } + + t.type = type::number; + } + + void lexer:: + char_literal (token& t, xchar c) + { + t.line = c.line; + t.column = c.column; + + char p (c); // Previous character (see below). + + for (;;) + { + c = get (); + + if (eos (c)) + fail (location (&name_, t.line, t.column)) << "unterminated literal"; + + if (c == '\'' && p != '\\') + break; + + // Keep track of \\-escapings so we don't confuse them with \', as in + // '\\'. + // + p = (c == '\\' && p == '\\') ? '\0' : static_cast<char> (c); + } + + // See if we have a user-defined suffix (which is an identifier). + // + if ((c = peek ()) == '_' || alpha (c)) + literal_suffix (c); + + t.type = type::character; + } + + void lexer:: + string_literal (token& t, xchar c) + { + t.line = c.line; + t.column = c.column; + + char p (c); // Previous character (see below). + + for (;;) + { + c = get (); + + if (eos (c)) + fail (location (&name_, t.line, t.column)) << "unterminated literal"; + + if (c == '\"' && p != '\\') + break; + + // Keep track of \\-escapings so we don't confuse them with \", as in + // "\\". + // + p = (c == '\\' && p == '\\') ? 
'\0' : static_cast<char> (c); + } + + // See if we have a user-defined suffix (which is an identifier). + // + if ((c = peek ()) == '_' || alpha (c)) + literal_suffix (c); + + t.type = type::string; + } + + void lexer:: + raw_string_literal (token& t, xchar c) + { + t.line = c.line; + t.column = c.column; + + // The overall form is: + // + // R"<delimiter>(<raw_characters>)<delimiter>" + // + // Where <delimiter> is a potentially-empty character sequence made of + // any source character but parentheses, backslash and spaces. It can be + // at most 16 characters long. + // + // Note that the <raw_characters> are not processed in any way, not even + // for line continuations. + // + + // As a first step, parse the delimiter (including the opening paren). + // + string d (1, ')'); + + for (;;) + { + c = get (); + + if (eos (c) || c == '\"' || c == ')' || c == '\\' || c == ' ') + fail (location (&name_, t.line, t.column)) << "invalid raw literal"; + + if (c == '(') + break; + + d += c; + } + + d += '"'; + + // Now parse the raw characters while trying to match the closing + // delimiter. + // + for (size_t i (0);;) // Position to match in d. + { + c = get (false); // No newline escaping. + + if (eos (c)) + fail (location (&name_, t.line, t.column)) << "invalid raw literal"; + + if (c != d[i] && i != 0) // Restart from the beginning. + i = 0; + + if (c == d[i]) + { + if (++i == d.size ()) + break; + } + } + + // See if we have a user-defined suffix (which is an identifier). + // + if ((c = peek ()) == '_' || alpha (c)) + literal_suffix (c); + + t.type = type::string; + } + + void lexer:: + literal_suffix (xchar c) + { + // Parse a user-defined literal suffix identifier. + // + for (get (c); (c = peek ()) == '_' || alnum (c); get (c)) ; + } + + auto lexer:: + skip_spaces (bool nl) -> xchar + { + xchar c (get ()); + + for (; !eos (c); c = get ()) + { + switch (c) + { + case '\n': + { + if (!nl) + break; + + // Fall through. 
+ } + case ' ': + case '\t': + case '\r': + case '\f': + case '\v': continue; + + case '/': + { + xchar p (peek ()); + + // C++ comment. + // + if (p == '/') + { + get (p); + do { c = get (); } while (!eos (c) && c != '\n'); + + if (!nl) + break; + + continue; + } + + // C comment. + // + if (p == '*') + { + get (p); + + for (;;) + { + c = get (); + + if (eos (c)) + fail (p) << "unterminated comment"; + + if (c == '*' && (c = peek ()) == '/') + { + get (c); + break; + } + } + continue; + } + break; + } + } + break; + } + + return c; + } + + ostream& + operator<< (ostream& o, const token& t) + { + switch (t.type) + { + case type::dot: o << "'.'"; break; + case type::semi: o << "';'"; break; + case type::lcbrace: o << "'{'"; break; + case type::rcbrace: o << "'}'"; break; + case type::punctuation: o << "<punctuation>"; break; + + case type::identifier: o << '\'' << t.value << '\''; break; + + case type::number: o << "<number literal>"; break; + case type::character: o << "<char literal>"; break; + case type::string: o << "<string literal>"; break; + + case type::other: o << "<other>"; break; + case type::eos: o << "<end of file>"; break; + } + + return o; + } + } +} diff --git a/build2/cc/lexer.hxx b/build2/cc/lexer.hxx new file mode 100644 index 0000000..0735b45 --- /dev/null +++ b/build2/cc/lexer.hxx @@ -0,0 +1,166 @@ +// file : build2/cc/lexer.hxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#ifndef BUILD2_CC_LEXER_HXX +#define BUILD2_CC_LEXER_HXX + +#include <libbutl/char-scanner.hxx> + +#include <build2/types.hxx> +#include <build2/utility.hxx> + +#include <build2/diagnostics.hxx> + +namespace build2 +{ + namespace cc + { + // Preprocessor-level tokenization of C/C++ source. In other words, the + // sequence of tokens returned is similar to what a real C/C++ compiler + // would see from its preprocessor. 
+ // + // The input is a (partially-)preprocessed translation unit that may still + // contain comments, line continuations, and preprocessor directives such + // as #line, #pragma, etc. Currently all preprocessor directives are + // discarded and no values are saved for literals. + // + enum class token_type + { + // NOTE: remember to update operator<<() if changing anything here! + // + eos, + + dot, // . + semi, // ; + lcbrace, // { + rcbrace, // } + punctuation, // Other punctuation. + + identifier, + + number, // Number literal. + character, // Char literal. + string, // String literal. + + other // Other token. + }; + + struct token + { + token_type type; + string value; + + uint64_t line; + uint64_t column; + + public: + token () + : token (token_type::eos, 0, 0) {} + + token (token_type t, uint64_t l, uint64_t c) + : token (t, string (), l, c) {} + + token (token_type t, string v, uint64_t l, uint64_t c) + : type (t), value (move (v)), line (l), column (c) {} + }; + + // Output the token value in a format suitable for diagnostics. + // + ostream& + operator<< (ostream&, const token&); + + class lexer: protected butl::char_scanner + { + public: + lexer (istream& is, const path& name) + : char_scanner (is, false), name_ (name), fail ("error", &name_) {} + + const path& + name () const {return name_;} + + // Note that it is ok to call next() again after getting eos. + // + token + next () + { + token t; + next (t, skip_spaces ()); + return t; + } + + // As above but reuse the token to avoid a (potential) memory + // allocation. Typical usage: + // + // for (token t; l.next (t) != token_type::eos; ) + // ... 
+ // + token_type + next (token& t) + { + next (t, skip_spaces ()); + return t.type; + } + + private: + void + next (token&, xchar); + + void + number_literal (token&, xchar); + + void + char_literal (token&, xchar); + + void + string_literal (token&, xchar); + + void + raw_string_literal (token&, xchar); + + void + literal_suffix (xchar); + + xchar + skip_spaces (bool newline = true); + + // The char_scanner adaptation for newline escape sequence processing. + // Enabled by default and is only disabled in the raw string literals. + // + private: + using base = char_scanner; + + xchar + get (bool escape = true); + + void + get (const xchar& peeked) {base::get (peeked);} + + xchar + peek (bool escape = true); + + private: + const path name_; + fail_mark fail; + }; + + // Diagnostics plumbing. We assume that any diag stream for which we can + // use token as location has its aux data pointing to pointer to path. + // + inline location + get_location (const token& t, const path& p) + { + return location (&p, t.line, t.column); + } + + inline location + get_location (const token& t, const void* data) + { + assert (data != nullptr); // E.g., must be &parser::path_. + const path* p (*static_cast<const path* const*> (data)); + return get_location (t, *p); + } + } +} + +#endif // BUILD2_CC_LEXER_HXX diff --git a/unit-tests/cc/lexer/buildfile b/unit-tests/cc/lexer/buildfile new file mode 100644 index 0000000..ff4e0b3 --- /dev/null +++ b/unit-tests/cc/lexer/buildfile @@ -0,0 +1,17 @@ +# file : unit-tests/cc/lexer/buildfile +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +#@@ Temporary until we get utility library support. 
+# +import libs = libbutl%lib{butl} +src = cc/lexer token lexer diagnostics utility variable name b-options types-parsers \ +context scope parser target operation rule prerequisite file module function \ +functions-builtin functions-path functions-process-path functions-string \ +functions-target-triplet algorithm search dump filesystem scheduler \ +config/{utility init operation module} spec + +exe{driver}: cxx{driver} ../../../build2/cxx{$src} ../../../build2/liba{b} \ +$libs test{*} + +include ../../../build2/ diff --git a/unit-tests/cc/lexer/char-literal.test b/unit-tests/cc/lexer/char-literal.test new file mode 100644 index 0000000..f256785 --- /dev/null +++ b/unit-tests/cc/lexer/char-literal.test @@ -0,0 +1,67 @@ +# file : unit-tests/cc/lexer/char-literal.test +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +# Test character literals. +# + +: normal +: +$* <<EOI >>EOO +'a' +'aa' +'"' +EOI +<char literal> +<char literal> +<char literal> +EOO + +: prefix +: +$* <<EOI >>EOO +L'a' +U'a' +u'a' +u8'a' +u8R'a' +EOI +<char literal> +<char literal> +<char literal> +<char literal> +'u8R' +<char literal> +EOO + +: suffix +: +$* <<EOI >>EOO +'a'x +'a'_X123 +EOI +<char literal> +<char literal> +EOO + +: escape +: +$* <<EOI >>EOO +'\'' +'\\' +'\\\'' +'\n' +U'\U0001f34c' +EOI +<char literal> +<char literal> +<char literal> +<char literal> +<char literal> +EOO + +: unterminated +: +$* <"'a" 2>>EOE != 0 +stdin:1:1: error: unterminated literal +EOE diff --git a/unit-tests/cc/lexer/comment.test b/unit-tests/cc/lexer/comment.test new file mode 100644 index 0000000..e90d8e0 --- /dev/null +++ b/unit-tests/cc/lexer/comment.test @@ -0,0 +1,88 @@ +# file : unit-tests/cc/lexer/comment.test +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +# Test C and C++ comments. 
+# + +: c-comment +: +$* <<EOI +/* 'one' */ +/* "two" // three +*/ +/** +four +// five */ +/** +six /* +*/ +EOI + +: cxx-comment +: +$* <<EOI +// 'one' +// "two" // three +// four /* five */ +EOI + +: commented-out +: +$* <<EOI >"';'" +// /* +; +// */ +EOI + +: c-unterminated +: +$* <<EOI 2>>EOE != 0 +/* +comment +EOI +stdin:1:2: error: unterminated comment +EOE + +: cxx-unterminated +: +$* <<:EOI +// comment +EOI + +: in-char-literal +: +$* <<EOI >>EOO +'//' +'/*'*/ +EOI +<char literal> +<char literal> +<punctuation> +<punctuation> +EOO + +: in-string-literal +: +$* <<EOI >>EOO +"//foo" +"/*"*/ +EOI +<string literal> +<string literal> +<punctuation> +<punctuation> +EOO + +: in-raw-string-literal +: +$* <<EOI >>EOO +R"X( +// foo +/* bar +)X"*/ +EOI +<string literal> +<punctuation> +<punctuation> +EOO diff --git a/unit-tests/cc/lexer/driver.cxx b/unit-tests/cc/lexer/driver.cxx new file mode 100644 index 0000000..db3f516 --- /dev/null +++ b/unit-tests/cc/lexer/driver.cxx @@ -0,0 +1,66 @@ +// file : unit-tests/cc/lexer/driver.cxx -*- C++ -*- +// copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +// license : MIT; see accompanying LICENSE file + +#include <cassert> +#include <iostream> + +#include <build2/types.hxx> +#include <build2/utility.hxx> + +#include <build2/cc/lexer.hxx> + +using namespace std; + +namespace build2 +{ + namespace cc + { + // Usage: argv[0] [<file>] + // + int + main (int argc, char* argv[]) + { + try + { + istream* is; + const char* in; + + // Reading from file is several times faster. + // + ifdstream ifs; + if (argc > 1) + { + in = argv[1]; + ifs.open (in); + is = &ifs; + } + else + { + in = "stdin"; + cin.exceptions (istream::failbit | istream::badbit); + is = &cin; + } + + lexer l (*is, path (in)); + + // No use printing eos since we will either get it or loop forever. 
+ // + for (token t; l.next (t) != token_type::eos; ) + cout << t << endl; + } + catch (const failed&) + { + return 1; + } + + return 0; + } + } +} + +int +main (int argc, char* argv[]) +{ + return build2::cc::main (argc, argv); +} diff --git a/unit-tests/cc/lexer/line.test b/unit-tests/cc/lexer/line.test new file mode 100644 index 0000000..9eda9c3 --- /dev/null +++ b/unit-tests/cc/lexer/line.test @@ -0,0 +1,67 @@ +# file : unit-tests/cc/lexer/line.test +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +# Test line continuations. +# + +: identifier +: +$* <<EOI >"'foo123'" +fo\ +o\ +1\ +2\ +3 +EOI + +: punctuation +: +$* <<EOI >'<punctuation>' +.\ +.\ +. +EOI + +: c-comment +: +$* <<EOI +/\ +* +comment +*\ +/\ + +EOI + +: cxx-comment +: +$* <<EOI +/\ +/ comment\ +more\ +more +EOI + +: other +: +$* <<EOI >>EOO +\abc +EOI +<punctuation> +'abc' +EOO + +: multiple +: +$* <<EOI >>EOO +\\ +EOI +<punctuation> +EOO + +: unterminated +: +$* <<:EOI >'<punctuation>' +\ +EOI diff --git a/unit-tests/cc/lexer/number.test b/unit-tests/cc/lexer/number.test new file mode 100644 index 0000000..1d9b9c5 --- /dev/null +++ b/unit-tests/cc/lexer/number.test @@ -0,0 +1,48 @@ +# file : unit-tests/cc/lexer/number.test +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +# Test numbers. +# + +$* <'1' >'<number literal>' +$* <'.1' >'<number literal>' +$* <'1.' 
>'<number literal>' + +$* <'0b101' >'<number literal>' +$* <'0123' >'<number literal>' +$* <'0X12AB' >'<number literal>' + +$* <'1e10' >'<number literal>' +$* <'1E+10' >'<number literal>' +$* <'0x1.p10' >'<number literal>' +$* <'0x1.P-10' >'<number literal>' + +$* <"123'456" >'<number literal>' +$* <"0xff00'00ff" >'<number literal>' + +$* <'123f' >'<number literal>' +$* <'123UL' >'<number literal>' +$* <'123_X' >'<number literal>' + +: separate-punctuation +: +$* <'123;' >>EOO +<number literal> +';' +EOO + +: separate-plus-minus +: +$* <'1.0_a+2.0' >>EOO +<number literal> +<punctuation> +<number literal> +EOO + +: separate-whitespace +: +$* <'123 abc' >>EOO +<number literal> +'abc' +EOO diff --git a/unit-tests/cc/lexer/preprocessor.test b/unit-tests/cc/lexer/preprocessor.test new file mode 100644 index 0000000..2917649 --- /dev/null +++ b/unit-tests/cc/lexer/preprocessor.test @@ -0,0 +1,38 @@ +# file : unit-tests/cc/lexer/preprocessor.test +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +# Test preprocessor lines. +# + +: normal +: +$* <<EOI +#pragma message("abc") +EOI + +: multiline +: +$* <<EOI +#pragma message \ +( \ +"abc" \ +) +EOI + +: comment +: +$* <<EOI +#pragma foo /* +bar +baz +*/ +#pragma foo // bar baz +EOI + +: line +: +$* <<EOI +# 1 "test.cxx" 2 +#line 8 "z:\\tmp\\test.hxx" +EOI diff --git a/unit-tests/cc/lexer/raw-string-literal.test b/unit-tests/cc/lexer/raw-string-literal.test new file mode 100644 index 0000000..e8e8b6b --- /dev/null +++ b/unit-tests/cc/lexer/raw-string-literal.test @@ -0,0 +1,90 @@ +# file : unit-tests/cc/lexer/raw-string-literal.test +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +# Test raw string literals. 
+# + +: normal +: +$* <<EOI >>EOO +R"()" +R"(ab)" +R"(a"b)" +R"(a)b)" +R"%(a%)b)%" +R"X(a + b)X" +R"X(a\ + b)X" +EOI +<string literal> +<string literal> +<string literal> +<string literal> +<string literal> +<string literal> +<string literal> +EOO + +: prefix +: +$* <<EOI >>EOO +LR"(ab)" +UR"(ab)" +uR"(ab)" +u8R"(ab)" +EOI +<string literal> +<string literal> +<string literal> +<string literal> +EOO + +: suffix +: +$* <<EOI >>EOO +R"(ab)"x +R"(ab)"_X123 +EOI +<string literal> +<string literal> +EOO + +: escape +: +$* <<EOI >>EOO +R"(\)" +EOI +<string literal> +EOO + +: invalid-no-paren +: +$* <'R"a"' 2>>EOE != 0 +stdin:1:2: error: invalid raw literal +EOE + +: invalid-paren +: +$* <'R")()("' 2>>EOE != 0 +stdin:1:2: error: invalid raw literal +EOE + +: invalid-unterminated-paren +: +$* <'R"(abc"' 2>>EOE != 0 +stdin:1:2: error: invalid raw literal +EOE + +: invalid-unterminated-delimiter +: +$* <'R"X(abc)"' 2>>EOE != 0 +stdin:1:2: error: invalid raw literal +EOE + +: invalid-unterminated-quote +: +$* <'R"X(abc)X' 2>>EOE != 0 +stdin:1:2: error: invalid raw literal +EOE diff --git a/unit-tests/cc/lexer/string-literal.test b/unit-tests/cc/lexer/string-literal.test new file mode 100644 index 0000000..062d290 --- /dev/null +++ b/unit-tests/cc/lexer/string-literal.test @@ -0,0 +1,65 @@ +# file : unit-tests/cc/lexer/string-literal.test +# copyright : Copyright (c) 2014-2017 Code Synthesis Ltd +# license : MIT; see accompanying LICENSE file + +# Test string literals (except raw). 
+# + +: normal +: +$* <<EOI >>EOO +"aa" +"'" +"a""b" +EOI +<string literal> +<string literal> +<string literal> +<string literal> +EOO + +: prefix +: +$* <<EOI >>EOO +L"ab" +U"ab" +u"ab" +u8"ab" +EOI +<string literal> +<string literal> +<string literal> +<string literal> +EOO + +: suffix +: +$* <<EOI >>EOO +"ab"x +"ab"_X123 +EOI +<string literal> +<string literal> +EOO + +: escape +: +$* <<EOI >>EOO +"\"\"" +"\\\\" +"\\\"\\" +"\n\t" +U"a\U0001f34c" +EOI +<string literal> +<string literal> +<string literal> +<string literal> +<string literal> +EOO + +: unterminated +: +$* <'"ab' 2>>EOE != 0 +stdin:1:1: error: unterminated literal +EOE |