From df1ef68cd8e8582724ce1192bfc202e0b9aeaf0c Mon Sep 17 00:00:00 2001 From: Karen Arutyunov Date: Tue, 28 Sep 2021 19:24:31 +0300 Subject: Get rid of C++ modules related code and rename *.mxx files to *.hxx --- libbutl/char-scanner.hxx | 246 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 246 insertions(+) create mode 100644 libbutl/char-scanner.hxx (limited to 'libbutl/char-scanner.hxx') diff --git a/libbutl/char-scanner.hxx b/libbutl/char-scanner.hxx new file mode 100644 index 0000000..b7ea14b --- /dev/null +++ b/libbutl/char-scanner.hxx @@ -0,0 +1,246 @@ +// file : libbutl/char-scanner.hxx -*- C++ -*- +// license : MIT; see accompanying LICENSE file + +#pragma once + +#include // char_traits +#include +#include // size_t +#include // uint64_t +#include // INT_* +#include // pair, make_pair() +#include + +#include + +#include + +namespace butl +{ + // Refer to utf8_validator for details. + // + struct noop_validator + { + std::pair + validate (char) {return std::make_pair (true, true);} + + std::pair + validate (char c, std::string&) {return validate (c);} + }; + + // Low-level character stream scanner. Normally used as a base for + // higher-level lexers. + // + template + class char_scanner + { + public: + using validator_type = V; + static constexpr const std::size_t unget_depth = N; + + // If the crlf argument is true, then recognize Windows newlines (0x0D + // 0x0A) and convert them to just '\n' (0x0A). Note that a standalone + // 0x0D is treated "as if" it was followed by 0x0A and multiple 0x0D + // are treated as one. + // + // Note also that if the stream happens to be bufstreambuf-based, then it + // includes a number of optimizations that assume nobody else is messing + // with the stream. + // + // The line and position arguments can be used to override the start line + // and position in the stream (useful when re-scanning data saved with the + // save_* facility). + // + char_scanner (std::istream&, + bool crlf = true, + std::uint64_t line = 1, + std::uint64_t position = 0); + + char_scanner (std::istream&, + validator_type, + bool crlf = true, + std::uint64_t line = 1, + std::uint64_t position = 0); + + char_scanner (const char_scanner&) = delete; + char_scanner& operator= (const char_scanner&) = delete; + + // Scanner interface. + // + public: + + // Extended character. It includes line/column/position information and is + // capable of representing EOF and invalid characters. + // + // Note that implicit conversion of EOF/invalid to char_type results in + // NUL character (which means in most cases it is safe to compare xchar to + // char without checking for EOF). + // + class xchar + { + public: + using traits_type = std::char_traits; + using int_type = traits_type::int_type; + using char_type = traits_type::char_type; + + int_type value; + + // Note that the column is of the codepoint this byte belongs to. + // + std::uint64_t line; + std::uint64_t column; + + // Logical character position (see bufstreambuf for details on the + // logical part) if the scanned stream is bufstreambuf-based and always + // zero otherwise. + // + std::uint64_t position; + + static int_type + invalid () {return traits_type::eof () != INT_MIN ? INT_MIN : INT_MAX;} + + operator char_type () const + { + return value != traits_type::eof () && value != invalid () + ? static_cast (value) + : char_type (0); + } + + xchar (int_type v = 0, + std::uint64_t l = 0, + std::uint64_t c = 0, + std::uint64_t p = 0) + : value (v), line (l), column (c), position (p) {} + }; + + // Note that if any of the get() or peek() functions return an invalid + // character, then the scanning has failed and none of them should be + // called again. + + xchar + get (); + + // As above but in case of an invalid character also return the + // description of why it is invalid. + // + xchar + get (std::string& what); + + void + get (const xchar& peeked); // Get previously peeked character (faster). + + void + unget (const xchar&); + + // Note that if there is an "ungot" character, peek() will return that. + // + xchar + peek (); + + // As above but in case of an invalid character also return the + // description of why it is invalid. + // + xchar + peek (std::string& what); + + // Tests. In the future we can add tests line alpha(), alnum(), etc. + // + static bool + eos (const xchar& c) {return c.value == xchar::traits_type::eof ();} + + static bool + invalid (const xchar& c) {return c.value == xchar::invalid ();} + + // Line, column and position of the next character to be extracted from + // the stream by peek() or get(). + // + std::uint64_t line; + std::uint64_t column; + std::uint64_t position; + + // Ability to save raw data as it is being scanned. Note that the + // character is only saved when it is got, not peeked. + // + public: + void + save_start (std::string& b) + { + assert (save_ == nullptr); + save_ = &b; + } + + void + save_stop () + { + assert (save_ != nullptr); + save_ = nullptr; + } + + struct save_guard + { + explicit + save_guard (char_scanner& s, std::string& b): s_ (&s) {s.save_start (b);} + + void + stop () {if (s_ != nullptr) {s_->save_stop (); s_ = nullptr;}} + + ~save_guard () {stop ();} + + private: + char_scanner* s_; + }; + + protected: + using int_type = typename xchar::int_type; + using char_type = typename xchar::char_type; + + int_type + peek_ (); + + void + get_ (); + + std::uint64_t + pos_ () const; + + xchar + get (std::string* what); + + xchar + peek (std::string* what); + + protected: + std::istream& is_; + + validator_type val_; + bool decoded_ = true; // The peeked character is last byte of sequence. + bool validated_ = false; // The peeked character has been validated. + + // Note that if you are reading from the buffer directly, then it is also + // your responsibility to call the validator and save the data (see + // save_*(). + // + // Besides that, make sure that the peek() call preceding the scan is + // followed by the get() call (see validated_, decoded_, and unpeek_ for + // the hairy details; realistically, you would probably only direct-scan + // ASCII fragments). + // + bufstreambuf* buf_; // NULL if not bufstreambuf-based. + const char_type* gptr_; + const char_type* egptr_; + + std::string* save_ = nullptr; + + bool crlf_; + bool eos_ = false; + + std::size_t ungetn_ = 0; + xchar ungetb_[N]; + + bool unpeek_ = false; + xchar unpeekc_ = '\0'; + }; +} + +#include +#include -- cgit v1.1