| // Copyright 2018 The Amber Authors. |
| // |
| // Licensed under the Apache License, Version 2.0 (the "License"); |
| // you may not use this file except in compliance with the License. |
| // You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, software |
| // distributed under the License is distributed on an "AS IS" BASIS, |
| // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| // See the License for the specific language governing permissions and |
| // limitations under the License. |
| |
| #include "src/tokenizer.h" |
| |
| #include <cctype> |
| #include <cstdlib> |
| #include <limits> |
| #include <sstream> |
| |
| #include "src/make_unique.h" |
| |
| namespace amber { |
| |
| Token::Token(TokenType type) : type_(type) {} |
| |
| Token::~Token() = default; |
| |
| Result Token::ConvertToDouble() { |
| if (IsDouble()) |
| return {}; |
| |
| if (IsString() || IsEOL() || IsEOS()) |
| return Result("Invalid conversion to double"); |
| |
| if (IsInteger()) { |
| if (is_negative_ || |
| uint_value_ <= |
| static_cast<uint64_t>(std::numeric_limits<int64_t>::max())) { |
| double_value_ = static_cast<double>(AsInt64()); |
| } else { |
| return Result("uint64_t value too big to fit in double"); |
| } |
| |
| uint_value_ = 0; |
| } else if (IsHex()) { |
| double_value_ = static_cast<double>(AsHex()); |
| string_value_ = ""; |
| } |
| type_ = TokenType::kDouble; |
| return {}; |
| } |
| |
| Tokenizer::Tokenizer(const std::string& data) : data_(data) {} |
| |
| Tokenizer::~Tokenizer() = default; |
| |
| std::unique_ptr<Token> Tokenizer::NextToken() { |
| SkipWhitespace(); |
| if (current_position_ >= data_.length()) |
| return MakeUnique<Token>(TokenType::kEOS); |
| |
| if (data_[current_position_] == '#') { |
| SkipComment(); |
| SkipWhitespace(); |
| } |
| if (current_position_ >= data_.length()) |
| return MakeUnique<Token>(TokenType::kEOS); |
| |
| if (data_[current_position_] == '\n') { |
| ++current_line_; |
| ++current_position_; |
| return MakeUnique<Token>(TokenType::kEOL); |
| } |
| |
| // If the current position is a , ( or ) then handle it specially as we don't |
| // want to consume any other characters. |
| if (data_[current_position_] == ',' || data_[current_position_] == '(' || |
| data_[current_position_] == ')') { |
| auto tok = MakeUnique<Token>(TokenType::kString); |
| std::string str(1, data_[current_position_]); |
| tok->SetStringValue(str); |
| ++current_position_; |
| return tok; |
| } |
| |
| size_t end_pos = current_position_; |
| while (end_pos < data_.length()) { |
| if (data_[end_pos] == ' ' || data_[end_pos] == '\r' || |
| data_[end_pos] == '\n' || data_[end_pos] == ')' || |
| data_[end_pos] == ',' || data_[end_pos] == '(') { |
| break; |
| } |
| ++end_pos; |
| } |
| |
| std::string tok_str = |
| data_.substr(current_position_, end_pos - current_position_); |
| current_position_ = end_pos; |
| |
| // Check for "NaN" explicitly. |
| bool is_nan = |
| (tok_str.size() == 3 && std::tolower(tok_str[0]) == 'n' && |
| std::tolower(tok_str[1]) == 'a' && std::tolower(tok_str[2]) == 'n'); |
| |
| // Starts with an alpha is a string. |
| if (!is_nan && !std::isdigit(tok_str[0]) && |
| !(tok_str[0] == '-' && tok_str.size() >= 2 && std::isdigit(tok_str[1])) && |
| !(tok_str[0] == '.' && tok_str.size() >= 2 && std::isdigit(tok_str[1]))) { |
| // If we've got a continuation, skip over the end of line and get the next |
| // token. |
| if (tok_str == "\\") { |
| if ((current_position_ < data_.length() && |
| data_[current_position_] == '\n')) { |
| ++current_line_; |
| ++current_position_; |
| return NextToken(); |
| } else if (current_position_ + 1 < data_.length() && |
| data_[current_position_] == '\r' && |
| data_[current_position_ + 1] == '\n') { |
| ++current_line_; |
| current_position_ += 2; |
| return NextToken(); |
| } |
| } |
| |
| auto tok = MakeUnique<Token>(TokenType::kString); |
| tok->SetStringValue(tok_str); |
| return tok; |
| } |
| |
| // Handle hex strings |
| if (!is_nan && tok_str.size() > 2 && tok_str[0] == '0' && tok_str[1] == 'x') { |
| auto tok = MakeUnique<Token>(TokenType::kHex); |
| tok->SetStringValue(tok_str); |
| return tok; |
| } |
| |
| bool is_double = false; |
| if (is_nan) { |
| is_double = true; |
| } else { |
| for (const char ch : tok_str) { |
| if (ch == '.') { |
| is_double = true; |
| break; |
| } |
| } |
| } |
| |
| std::unique_ptr<Token> tok; |
| |
| char* final_pos = nullptr; |
| if (is_double) { |
| tok = MakeUnique<Token>(TokenType::kDouble); |
| |
| double val = strtod(tok_str.c_str(), &final_pos); |
| tok->SetDoubleValue(val); |
| } else { |
| tok = MakeUnique<Token>(TokenType::kInteger); |
| |
| uint64_t val = uint64_t(std::strtoull(tok_str.c_str(), &final_pos, 10)); |
| tok->SetUint64Value(static_cast<uint64_t>(val)); |
| } |
| if (tok_str.size() > 1 && tok_str[0] == '-') |
| tok->SetNegative(); |
| |
| tok->SetOriginalString( |
| tok_str.substr(0, static_cast<size_t>(final_pos - tok_str.c_str()))); |
| |
| // If the number isn't the whole token then move back so we can then parse |
| // the string portion. |
| auto diff = size_t(final_pos - tok_str.c_str()); |
| if (diff > 0) |
| current_position_ -= tok_str.length() - diff; |
| |
| return tok; |
| } |
| |
| std::string Tokenizer::ExtractToNext(const std::string& str) { |
| size_t pos = data_.find(str, current_position_); |
| std::string ret; |
| if (pos == std::string::npos) { |
| ret = data_.substr(current_position_); |
| current_position_ = data_.length(); |
| } else { |
| ret = data_.substr(current_position_, pos - current_position_); |
| current_position_ = pos; |
| } |
| |
| // Account for any new lines in the extracted text so our current line |
| // number stays correct. |
| for (const char c : ret) { |
| if (c == '\n') |
| ++current_line_; |
| } |
| |
| return ret; |
| } |
| |
| bool Tokenizer::IsWhitespace(char ch) { |
| return ch == '\0' || ch == '\t' || ch == '\r' || ch == 0x0c /* ff */ || |
| ch == ' '; |
| } |
| |
| void Tokenizer::SkipWhitespace() { |
| while (current_position_ < data_.size() && |
| IsWhitespace(data_[current_position_])) { |
| ++current_position_; |
| } |
| } |
| |
| void Tokenizer::SkipComment() { |
| while (current_position_ < data_.length() && |
| data_[current_position_] != '\n') { |
| ++current_position_; |
| } |
| } |
| |
| } // namespace amber |