// blob: 3ccc04ac71b597994e5500720b3c2d840c443dea [file] [log] [blame]
// file_input.hpp
// Copyright (c) 2008 Ben Hanson (http://www.benhanson.net/)
//
// Distributed under the Boost Software License, Version 1.0. (See accompanying
// file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
#ifndef BOOST_LEXER_FILE_INPUT
#define BOOST_LEXER_FILE_INPUT
#include "char_traits.hpp"
#include "size_t.hpp"
#include "state_machine.hpp"
#include <cstddef>
#include <cstring>
#include <fstream>
#include <vector>
namespace boost
{
namespace lexer
{
template<typename CharT, typename Traits = char_traits<CharT> >
class basic_file_input
{
public:
class iterator
{
public:
#if defined _MSC_VER && _MSC_VER <= 1200
friend basic_file_input;
#else
friend class basic_file_input;
#endif
struct data
{
std::size_t id;
const CharT *start;
const CharT *end;
std::size_t state;
// Construct in end() state.
data () :
id (0),
state (npos)
{
}
bool operator == (const data &rhs_) const
{
return id == rhs_.id && start == rhs_.start &&
end == rhs_.end && state == rhs_.state;
}
};
iterator () :
_input (0)
{
}
bool operator == (const iterator &rhs_) const
{
return _data == rhs_._data;
}
bool operator != (const iterator &rhs_) const
{
return !(*this == rhs_);
}
data &operator * ()
{
return _data;
}
data *operator -> ()
{
return &_data;
}
// Let compiler generate operator = ().
// prefix version
iterator &operator ++ ()
{
next_token ();
return *this;
}
// postfix version
iterator operator ++ (int)
{
iterator iter_ = *this;
next_token ();
return iter_;
}
void next_token ()
{
_data.start = _data.end;
if (_input->_state_machine->_dfa->size () == 1)
{
_data.id = _input->next (&_input->_state_machine->_lookup->
front ()->front (), _input->_state_machine->_dfa_alphabet.
front (), &_input->_state_machine->_dfa->front ()->
front (), _data.start, _data.end);
}
else
{
_data.id = _input->next (*_input->_state_machine, _data.state,
_data.start, _data.end);
}
if (_data.id == 0)
{
_data.start = 0;
_data.end = 0;
// Ensure current state matches that returned by end().
_data.state = npos;
}
}
private:
// Not owner (obviously!)
basic_file_input *_input;
data _data;
};
// Grant the nested iterator access to the private members it uses
// (the next () overloads and _state_machine).
// NOTE(review): the _MSC_VER <= 1200 form without the class keyword is
// presumably a workaround for old Visual C++ compilers - confirm.
#if defined _MSC_VER && _MSC_VER <= 1200
friend iterator;
#else
friend class iterator;
#endif
// Make it explicit that we are NOT taking a copy of state_machine_!
// Both state_machine_ and is_ are stored as plain pointers, so the objects
// they point to must stay alive for as long as this object is used.
//
// state_machine_:    tables used to recognise tokens.
// is_:               stream the input characters are read from.
// buffer_size_:      number of characters in the initial buffer. Must be
//                    positive: the buffer's first element is addressed
//                    below.
// buffer_increment_: number of characters the buffer grows by when a token
//                    does not fit.
basic_file_input (const basic_state_machine<CharT> *state_machine_,
    std::basic_ifstream<CharT> *is_,
    const std::streamsize buffer_size_ = 4096,
    const std::streamsize buffer_increment_ = 1024) :
    // Listed in declaration order, which is the order members are really
    // initialised in (avoids -Wreorder and misleading reading order).
    _state_machine (state_machine_),
    _buffer_size (buffer_size_),
    _buffer_increment (buffer_increment_),
    _buffer (_buffer_size, '!'),
    _start_buffer (0),
    _stream (is_),
    _start_token (0),
    _end_token (0),
    _end_buffer (0)
{
    _start_buffer = &_buffer.front ();
    _end_buffer = _start_buffer + _buffer.size ();
    // Nothing has been read yet: placing the token pointers at the end of
    // the buffer makes the first scan load data from the stream.
    _start_token = _end_buffer;
    _end_token = _end_buffer;
}
// Returns an iterator that refers to the next token of the input.
// The token is scanned here, so calling begin () consumes input.
iterator begin ()
{
    iterator first_;

    first_._input = this;
    // Seed the token data: lexer state 0, no previous token text and an
    // id that differs from the end () marker.
    first_._data.state = 0;
    first_._data.end = 0;
    first_._data.start = 0;
    first_._data.id = npos;
    // Scan the first token.
    first_.next_token ();
    return first_;
}
// Returns the past-the-end iterator: id 0 and state npos (from the data
// constructor) with null start and end pointers, which is the state
// next_token () leaves an iterator in once the input is exhausted.
iterator end ()
{
    iterator last_;

    last_._data.end = 0;
    last_._data.start = 0;
    last_._input = this;
    return last_;
}
void flush ()
{
// This temporary is mandatory, otherwise the
// pointer calculations won't work!
const CharT *temp_ = _end_buffer;
_start_token = _end_token = _end_buffer;
reload_buffer (temp_, true, _end_token);
}
private:
// Stream type that reload_buffer () reads from.
typedef std::basic_istream<CharT> istream;
// Storage type for the buffered input characters.
typedef std::vector<CharT> buffer;
// Tables used by next () to recognise tokens. Not a copy (see constructor).
const basic_state_machine<CharT> *_state_machine;
// Number of characters the buffer is created with.
const std::streamsize _buffer_size;
// Number of characters the buffer grows by when reload_buffer () runs out
// of space.
const std::streamsize _buffer_increment;
// Buffered input characters.
buffer _buffer;
// Address of the first element of _buffer.
CharT *_start_buffer;
// Source of the input characters. Nothing in this class deletes it.
istream *_stream;
// Start of the token currently being scanned.
const CharT *_start_token;
// One past the end of the most recently matched token.
const CharT *_end_token;
// One past the last character read from the stream. The constructor sets it
// to the end of the (still unfilled) buffer so the first scan triggers a read.
CharT *_end_buffer;
// Scans the next token using the tables of lexer state start_state_.
//
// state_machine_: tables to scan with.
// start_state_:   in: lexer state to scan in. out: the state stored in the
//                 last accepting DFA row that was reached.
// start_, end_:   out only: [start, end) of the token inside the buffer.
//
// Returns the id of the longest match, npos if nothing matched (exactly one
// character is then skipped), or 0 when no input is left. Matches whose id
// is 0 are not returned: scanning restarts right after them.
std::size_t next (const basic_state_machine<CharT> &state_machine_,
    std::size_t &start_state_, const CharT * &start_, const CharT * &end_)
{
again:
    // Every scan - including the re-scan after a skipped (id 0) match -
    // starts where the previous match ended. Doing this before the label
    // made a skipped match be scanned again from the same position, which
    // never terminates if the lexer state did not change.
    _start_token = _end_token;

    const std::size_t * lookup_ = &state_machine_._lookup[start_state_]->
        front ();
    std::size_t dfa_alphabet_ = state_machine_._dfa_alphabet[start_state_];
    const std::size_t *dfa_ = &state_machine_._dfa[start_state_]->front ();
    // Row 1 of the table is the start state of the DFA.
    const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
    const CharT *curr_ = _start_token;
    bool end_state_ = *ptr_ != 0;
    std::size_t id_ = *(ptr_ + id_index);
    const CharT *end_token_ = curr_;
    // Set when the scan stopped because the stream has nothing more to give.
    bool eof_ = false;

    for (;;)
    {
        if (curr_ >= _end_buffer)
        {
            // Also stop if the reload reported success but delivered no
            // characters: *curr_ would not be input data in that case.
            if (!reload_buffer (curr_, end_state_, end_token_) ||
                curr_ >= _end_buffer)
            {
                // EOF
                eof_ = true;
                break;
            }
        }

        const std::size_t BOL_state_ = ptr_[bol_index];
        const std::size_t EOL_state_ = ptr_[eol_index];

        if (BOL_state_ && (_start_token == _start_buffer ||
            *(_start_token - 1) == '\n'))
        {
            // Beginning of line: take the transition without consuming
            // a character.
            ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
        }
        else if (EOL_state_ && *curr_ == '\n')
        {
            // End of line: the '\n' itself is not consumed here.
            ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
        }
        else
        {
            const std::size_t state_ =
                ptr_[lookup_[static_cast<typename Traits::index_type>
                (*curr_++)]];

            if (state_ == 0)
            {
                // No transition for this character.
                break;
            }

            ptr_ = &dfa_[state_ * dfa_alphabet_];
        }

        if (*ptr_)
        {
            // Accepting row: remember it as the longest match so far.
            end_state_ = true;
            id_ = *(ptr_ + id_index);
            start_state_ = *(ptr_ + state_index);
            end_token_ = curr_;
        }
    }

    if (_start_token >= _end_buffer)
    {
        // No more tokens...
        return 0;
    }

    const std::size_t EOL_state_ = ptr_[eol_index];

    // End of input also counts as end of line. This used to compare curr_
    // with end_, but end_ is an output parameter that still holds the
    // previous token's end at this point, so the test never did its job.
    if (EOL_state_ && eof_)
    {
        ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];

        if (*ptr_)
        {
            end_state_ = true;
            id_ = *(ptr_ + id_index);
            start_state_ = *(ptr_ + state_index);
            end_token_ = curr_;
        }
    }

    if (end_state_)
    {
        // return longest match
        _end_token = end_token_;

        if (id_ == 0) goto again;
    }
    else
    {
        // No match causes char to be skipped
        _end_token = _start_token + 1;
        id_ = npos;
    }

    start_ = _start_token;
    end_ = _end_token;
    return id_;
}
// Scans the next token with a single set of tables (no lexer states).
//
// lookup_:       maps a character index to a column of the DFA table.
// dfa_alphabet_: number of entries per DFA row.
// dfa_:          the DFA table; row 1 is the start state.
// start_, end_:  out only: [start, end) of the token inside the buffer.
//
// Returns the id of the longest match, npos if nothing matched (exactly one
// character is then skipped), or 0 when no input is left.
std::size_t next (const std::size_t * const lookup_,
    const std::size_t dfa_alphabet_, const std::size_t * const dfa_,
    const CharT * &start_, const CharT * &end_)
{
    // The new token starts where the previous one ended.
    _start_token = _end_token;

    const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
    const CharT *curr_ = _start_token;
    bool end_state_ = *ptr_ != 0;
    // (Was "id_ = id_ = ...", which assigned to id_ inside its own
    // initialiser.)
    std::size_t id_ = *(ptr_ + id_index);
    const CharT *end_token_ = curr_;
    // Set when the scan stopped because the stream has nothing more to give.
    bool eof_ = false;

    for (;;)
    {
        if (curr_ >= _end_buffer)
        {
            // Also stop if the reload reported success but delivered no
            // characters: *curr_ would not be input data in that case.
            if (!reload_buffer (curr_, end_state_, end_token_) ||
                curr_ >= _end_buffer)
            {
                // EOF
                eof_ = true;
                break;
            }
        }

        const std::size_t BOL_state_ = ptr_[bol_index];
        const std::size_t EOL_state_ = ptr_[eol_index];

        if (BOL_state_ && (_start_token == _start_buffer ||
            *(_start_token - 1) == '\n'))
        {
            // Beginning of line: take the transition without consuming
            // a character.
            ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
        }
        else if (EOL_state_ && *curr_ == '\n')
        {
            // End of line: the '\n' itself is not consumed here.
            ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
        }
        else
        {
            const std::size_t state_ =
                ptr_[lookup_[static_cast<typename Traits::index_type>
                (*curr_++)]];

            if (state_ == 0)
            {
                // No transition for this character.
                break;
            }

            ptr_ = &dfa_[state_ * dfa_alphabet_];
        }

        if (*ptr_)
        {
            // Accepting row: remember it as the longest match so far.
            end_state_ = true;
            id_ = *(ptr_ + id_index);
            end_token_ = curr_;
        }
    }

    if (_start_token >= _end_buffer)
    {
        // No more tokens...
        return 0;
    }

    const std::size_t EOL_state_ = ptr_[eol_index];

    // End of input also counts as end of line. This used to compare curr_
    // with end_, but end_ is an output parameter that still holds the
    // previous token's end at this point, so the test never did its job.
    if (EOL_state_ && eof_)
    {
        ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];

        if (*ptr_)
        {
            end_state_ = true;
            id_ = *(ptr_ + id_index);
            end_token_ = curr_;
        }
    }

    if (end_state_)
    {
        // return longest match
        _end_token = end_token_;
    }
    else
    {
        // No match causes char to be skipped
        _end_token = _start_token + 1;
        id_ = npos;
    }

    start_ = _start_token;
    end_ = _end_token;
    return id_;
}
// Reads more characters from the stream while keeping the partially
// scanned token (and, where one exists, the character before it, which the
// beginning-of-line test in next () looks at) in the buffer.
//
// curr_:      in/out: scan position; rebased onto the new token position.
// end_state_: true if end_token_ holds a match end that must be rebased.
// end_token_: in/out: end of the longest match so far (see end_state_).
//
// Returns true only if at least one new character was read. Returns false
// without touching anything if the stream is already at EOF.
bool reload_buffer (const CharT * &curr_, const bool end_state_,
    const CharT * &end_token_)
{
    if (_stream->eof ())
    {
        return false;
    }

    // Remember positions as offsets from the token start. The buffer may
    // be reallocated below, after which the old pointers must not be used.
    const std::ptrdiff_t curr_offset_ = curr_ - _start_token;
    const std::ptrdiff_t end_offset_ =
        end_state_ ? end_token_ - _start_token : 0;
    const std::size_t old_size_ = _buffer.size ();
    std::size_t count_ = 0;

    if (_start_token == _start_buffer && _start_token < _end_buffer)
    {
        // The token starts on the very first buffered character, so there
        // is no preceding character to keep and nothing to shift: grow the
        // buffer and append. (Shifting from _start_token - 1 here would
        // read before the buffer and write past its end.)
        const std::size_t used_ = _end_buffer - _start_buffer;

        _buffer.resize (old_size_ + _buffer_increment, '!');
        _start_buffer = &_buffer.front ();
        _start_token = _start_buffer;
        _stream->read (_start_buffer + used_,
            static_cast<std::streamsize> (_buffer.size () - used_));
        count_ = static_cast<std::size_t> (_stream->gcount ());
        _end_buffer = _start_buffer + used_ + count_;
    }
    else if (_start_token == _start_buffer + 1)
    {
        // Run out of buffer space, so increase.
        _buffer.resize (old_size_ + _buffer_increment, '!');
        _start_buffer = &_buffer.front ();
        _start_token = _start_buffer + 1;
        _stream->read (_start_buffer + old_size_,
            _buffer_increment);
        count_ = static_cast<std::size_t> (_stream->gcount ());
        _end_buffer = _start_buffer + old_size_ + count_;
    }
    else if (_start_token < _end_buffer)
    {
        // Move the unfinished token, together with the character before
        // it, to the front of the buffer and fill the rest from the stream.
        const std::size_t len_ = _end_buffer - _start_token;

        // Source and destination can overlap, so memmove, not memcpy.
        std::memmove (_start_buffer, _start_token - 1,
            (len_ + 1) * sizeof (CharT));
        _stream->read (_start_buffer + len_ + 1,
            static_cast<std::streamsize> (_buffer.size () - len_ - 1));
        count_ = static_cast<std::size_t> (_stream->gcount ());
        _start_token = _start_buffer + 1;
        _end_buffer = _start_buffer + len_ + 1 + count_;
    }
    else
    {
        // Nothing to keep: refill the whole buffer.
        _stream->read (_start_buffer, static_cast<std::streamsize>
            (_buffer.size ()));
        count_ = static_cast<std::size_t> (_stream->gcount ());
        _start_token = _start_buffer;
        _end_buffer = _start_buffer + count_;
    }

    if (end_state_)
    {
        end_token_ = _start_token + end_offset_;
    }

    curr_ = _start_token + curr_offset_;

    // A read that delivers nothing (e.g. the previous read ended exactly
    // at the end of the file) must not be reported as success, otherwise
    // the caller would scan characters that were never read.
    return count_ != 0;
}
// Disallow copying of buffer: the copy constructor and the assignment
// operator are declared private here and are not defined in this file.
basic_file_input (const basic_file_input &);
const basic_file_input &operator = (const basic_file_input &);
};
// Convenience names for the char and wchar_t instantiations.
typedef basic_file_input<char> file_input;
typedef basic_file_input<wchar_t> wfile_input;
}
}
#endif