blob: 4430ffe4a48413aa3221acbba30636ffc2cbe381 [file] [log] [blame]
/** \file
* \brief The ANTLR3 C filestream is used when the source character stream
* is a filesystem based input set and all the characters in the filestream
* can be loaded at once into memory and away the lexer goes.
*
* A number of initializers are provided in order that various character
* sets can be supported from input files. The ANTLR3 C runtime expects
* to deal with UTF32 characters only (the reasons for this are to
* do with the simplification of C code when using this form of Unicode
* encoding, though this is not a panacea). More information can be
* found on this by consulting:
* - http://www.unicode.org/versions/Unicode4.0.0/ch02.pdf#G11178
* where a well-grounded discussion of the available encoding formats
* may be found.
*
*/
// [The "BSD licence"]
// Copyright (c) 2005-2009 Jim Idle, Temporal Wave LLC
// http://www.temporal-wave.com
// http://www.linkedin.com/in/jimidle
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions
// are met:
// 1. Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// 2. Redistributions in binary form must reproduce the above copyright
// notice, this list of conditions and the following disclaimer in the
// documentation and/or other materials provided with the distribution.
// 3. The name of the author may not be used to endorse or promote products
// derived from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
// IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
// OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
// IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
// INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
// NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
// THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <antlr3.h>
// Forward declarations of the file-local helpers defined later in this file.
//
// Installs the encoding specific API on a stream, based on input->encoding.
static void setupInputStream (pANTLR3_INPUT_STREAM input);
// Builds a stream whose buffer is loaded from the named file; NULL on failure.
static pANTLR3_INPUT_STREAM antlr3CreateFileStream (pANTLR3_UINT8 fileName);
// Builds a stream over caller supplied memory; NULL on failure.
static pANTLR3_INPUT_STREAM antlr3CreateStringStream (pANTLR3_UINT8 data);
/** \brief Create an input stream from the contents of an operating system file.
 *
 * The file is read into memory as raw bytes, then the API functions that
 * match the requested encoding are installed on the stream.
 *
 * \param fileName Name of the file to load.
 * \param encoding Encoding the caller says the file uses (one of the
 *                 ANTLR3_ENC_ values).
 * \return
 *  - Pointer to the new input stream upon success
 *  - NULL if the stream could not be created
 */
ANTLR3_API pANTLR3_INPUT_STREAM
antlr3FileStreamNew(pANTLR3_UINT8 fileName, ANTLR3_UINT32 encoding)
{
    // Load the raw bytes of the file first; interpretation of those bytes
    // is decided afterwards.
    //
    pANTLR3_INPUT_STREAM stream = antlr3CreateFileStream(fileName);

    if (stream != NULL)
    {
        // Record the encoding the user asked for, then let the setup
        // routine install whatever API differs from the 8 bit default.
        //
        stream->encoding = encoding;
        setupInputStream(stream);

        // The stream is named after the file it was read from.
        //
        stream->istream->streamName = stream->strFactory->newStr8(stream->strFactory, fileName);
        stream->fileName            = stream->istream->streamName;
    }

    return stream;
}
/** \brief Create an input stream over a block of memory supplied by the caller.
 *
 * The data pointer is installed as is (it is not copied here), then the API
 * functions that match the requested encoding are installed on the stream.
 *
 * \param data     Pointer to the input data.
 * \param encoding Encoding the caller says the data uses (one of the
 *                 ANTLR3_ENC_ values).
 * \param size     Size, in bytes, of the input data.
 * \param name     Name to give the stream.
 * \return
 *  - Pointer to the new input stream upon success
 *  - NULL if the stream could not be created
 */
ANTLR3_API pANTLR3_INPUT_STREAM
antlr3StringStreamNew(pANTLR3_UINT8 data, ANTLR3_UINT32 encoding, ANTLR3_UINT32 size, pANTLR3_UINT8 name)
{
    // Build the basic stream around the supplied data pointer.
    //
    pANTLR3_INPUT_STREAM stream = antlr3CreateStringStream(data);

    if (stream != NULL)
    {
        // Remember how many bytes we were given and which encoding the
        // user claims they are in.
        //
        stream->sizeBuf  = size;
        stream->encoding = encoding;

        // Work out byte order and install any API that differs from 8 bit.
        //
        setupInputStream(stream);

        // Finally give the stream its name.
        //
        stream->istream->streamName = stream->strFactory->newStr8(stream->strFactory, name);
        stream->fileName            = stream->istream->streamName;
    }

    return stream;
}
/// Determine endianness of the input stream and install the
/// API required for the encoding in that format.
///
/// For the UTF-8, UTF-16 and UTF-32 encodings a byte order mark (BOM) at the
/// current read position is detected and skipped. BOM detection never reads
/// beyond the number of bytes recorded in input->sizeBuf.
///
/// \param input Stream to configure; its data, sizeBuf, nextChar and encoding
///              fields are read, and nextChar is advanced past any BOM found.
///
static void
setupInputStream(pANTLR3_INPUT_STREAM input)
{
    ANTLR3_BOOLEAN  isBigEndian;
    pANTLR3_UINT8   bytes;      // Current read position viewed as raw bytes
    ANTLR3_UINT32   consumed;   // Bytes between the buffer start and the read position
    ANTLR3_UINT32   avail;      // Bytes that may safely be inspected for a BOM

    // Used to determine the endianness of the machine we are currently
    // running on.
    //
    ANTLR3_UINT16   bomTest = 0xFEFF;

    // What endianness is the machine we are running on? If the incoming
    // encoding endianness is the same as this machine's natural byte order
    // then we can use more efficient API calls.
    //
    if (*((pANTLR3_UINT8)(&bomTest)) == 0xFE)
    {
        isBigEndian = ANTLR3_TRUE;
    }
    else
    {
        isBigEndian = ANTLR3_FALSE;
    }

    // Work out how many bytes remain from the read position to the end of
    // the buffer, so that BOM probing cannot run off the end of short
    // (or empty) input.
    //
    bytes    = (pANTLR3_UINT8)input->nextChar;
    consumed = (ANTLR3_UINT32)(bytes - (pANTLR3_UINT8)input->data);
    avail    = (consumed < input->sizeBuf) ? (input->sizeBuf - consumed) : 0;

    // What encoding did the user tell us the input was in?
    //
    switch (input->encoding)
    {
        case ANTLR3_ENC_UTF8:

            // See if there is a BOM at the start of this UTF-8 sequence
            // and just eat it if there is. Windows .TXT files have this for
            // instance; it identifies UTF-8 even though UTF-8 has no byte order.
            //
            if (   avail >= 3
                && bytes[0] == 0xEF
                && bytes[1] == 0xBB
                && bytes[2] == 0xBF
               )
            {
                // The UTF8 BOM is present so skip it
                //
                input->nextChar = (void *)(bytes + 3);
            }

            // Install the UTF8 input routines
            //
            antlr3UTF8SetupStream(input);
            break;

        case ANTLR3_ENC_UTF16:

            // See if there is a BOM at the start of the input. If not then
            // we assume that the byte order is the natural order of this
            // machine (or it is really UCS2). If there is a BOM we determine
            // if the encoding is the same as the natural order of this machine.
            //
            if (   avail >= 2
                && bytes[0] == 0xFE
                && bytes[1] == 0xFF
               )
            {
                // BOM present, indicates Big Endian
                //
                input->nextChar = (void *)(bytes + 2);
                antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
            }
            else if (   avail >= 2
                     && bytes[0] == 0xFF
                     && bytes[1] == 0xFE
                    )
            {
                // BOM present, indicates Little Endian
                //
                input->nextChar = (void *)(bytes + 2);
                antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
            }
            else
            {
                // No BOM present, assume local computer byte order
                //
                antlr3UTF16SetupStream(input, isBigEndian, isBigEndian);
            }
            break;

        case ANTLR3_ENC_UTF32:

            // See if there is a BOM at the start of the input. If not then
            // we assume that the byte order is the natural order of this
            // machine. If there is we determine if the encoding
            // is the same as the natural order of this machine.
            //
            if (   avail >= 4
                && bytes[0] == 0x00
                && bytes[1] == 0x00
                && bytes[2] == 0xFE
                && bytes[3] == 0xFF
               )
            {
                // BOM present, indicates Big Endian
                //
                input->nextChar = (void *)(bytes + 4);
                antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
            }
            else if (   avail >= 4
                     && bytes[0] == 0xFF
                     && bytes[1] == 0xFE
                     && bytes[2] == 0x00    // Bytes 2 and 3 must be inspected here; testing
                     && bytes[3] == 0x00    // byte 1 again could never match after 0xFE.
                    )
            {
                // BOM present, indicates Little Endian
                //
                input->nextChar = (void *)(bytes + 4);
                antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
            }
            else
            {
                // No BOM present, assume local computer byte order
                //
                antlr3UTF32SetupStream(input, isBigEndian, isBigEndian);
            }
            break;

        case ANTLR3_ENC_UTF16BE:

            // Encoding is definitely Big Endian with no BOM
            //
            antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_TRUE);
            break;

        case ANTLR3_ENC_UTF16LE:

            // Encoding is definitely Little Endian with no BOM
            //
            antlr3UTF16SetupStream(input, isBigEndian, ANTLR3_FALSE);
            break;

        case ANTLR3_ENC_UTF32BE:

            // Encoding is definitely Big Endian with no BOM
            //
            antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_TRUE);
            break;

        case ANTLR3_ENC_UTF32LE:

            // Encoding is definitely Little Endian with no BOM
            //
            antlr3UTF32SetupStream(input, isBigEndian, ANTLR3_FALSE);
            break;

        case ANTLR3_ENC_EBCDIC:

            // EBCDIC is basically the same as ASCII but with an on the
            // fly translation to ASCII
            //
            antlr3EBCDICSetupStream(input);
            break;

        case ANTLR3_ENC_8BIT:
        default:

            // Standard 8bit/ASCII
            //
            antlr38BitSetupStream(input);
            break;
    }
}
/** \brief Use the contents of an operating system file as the input
 * for an input stream.
 *
 * The stream structure is allocated, the file is read into it as raw bytes
 * and the generic stream API is installed. If reading the file fails the
 * partly built stream is closed again.
 *
 * \param fileName Name of operating system file to read.
 * \return
 *  - Pointer to new input stream context upon success
 *  - NULL if fileName is NULL, the structure cannot be allocated, or the
 *    file cannot be read.
 */
static pANTLR3_INPUT_STREAM
antlr3CreateFileStream(pANTLR3_UINT8 fileName)
{
    pANTLR3_INPUT_STREAM    stream;         // The input stream being built
    ANTLR3_UINT32           readStatus;     // Outcome of loading the file

    // Nothing to open without a name
    //
    if (fileName == NULL)
    {
        return NULL;
    }

    // Zero filled storage for the stream structure
    //
    stream = (pANTLR3_INPUT_STREAM) ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));
    if (stream == NULL)
    {
        return NULL;
    }

    // Load the file, then install the common stream API. The API is
    // installed even when the read failed, because close() is only
    // available once that has been done.
    //
    readStatus = antlr3read8Bit(stream, fileName);
    antlr3GenericSetupStream(stream);

    if (readStatus == ANTLR3_SUCCESS)
    {
        return stream;
    }

    // The file was not there or could not be loaded; tear the stream down.
    //
    stream->close(stream);
    return NULL;
}
/** \brief Read the whole of an operating system file into a newly
 * allocated buffer owned by the input stream.
 *
 * On success input->data points at the buffer, input->sizeBuf holds its
 * size in bytes and input->isAllocated is set to ANTLR3_TRUE. The bytes are
 * stored exactly as read; no character conversion takes place here.
 *
 * \param input    Input stream that receives the buffer.
 * \param fileName Name of the file to read.
 * \return
 *  - ANTLR3_SUCCESS if the file was opened and a buffer was allocated
 *  - ANTLR3_ERR_NOFILE if the file could not be opened
 *  - ANTLR3_ERR_NOMEM if the buffer could not be allocated
 */
ANTLR3_API ANTLR3_UINT32
antlr3read8Bit(pANTLR3_INPUT_STREAM input, pANTLR3_UINT8 fileName)
{
    ANTLR3_FDSC     infile;
    ANTLR3_UINT32   fSize;

    /* Open the OS file in read binary mode
     */
    infile = antlr3Fopen(fileName, "rb");

    /* Check that it was there
     */
    if (infile == NULL)
    {
        return (ANTLR3_UINT32)ANTLR3_ERR_NOFILE;
    }

    /* It was there, so we can read the bytes now
     */
    fSize = antlr3Fsize(fileName);      /* Size of input file */

    /* Allocate buffer for this input set
     */
    input->data     = ANTLR3_MALLOC((size_t)fSize);
    input->sizeBuf  = fSize;

    if (input->data == NULL)
    {
        /* Do not leak the open file handle when the allocation fails
         */
        antlr3Fclose(infile);
        return (ANTLR3_UINT32)ANTLR3_ERR_NOMEM;
    }

    input->isAllocated = ANTLR3_TRUE;

    /* Now we read the file. Characters are not converted to
     * the internal ANTLR encoding until they are read from the buffer.
     * NOTE(review): the result of the read is not checked, so a short or
     * failed read still reports success - confirm whether callers rely on that.
     */
    antlr3Fread(infile, fSize, input->data);

    /* And close the file handle
     */
    antlr3Fclose(infile);

    return ANTLR3_SUCCESS;
}
/** \brief Open an operating system file and return the descriptor.
 *
 * This is a thin wrapper around the standard fopen() call. Later we might
 * find better ways on systems such as Windows and OpenVMS for instance, but
 * the idea is to read the whole file at once anyway, so it may be irrelevant.
 *
 * \param filename Name of the file to open.
 * \param mode     Mode string, passed to fopen() unchanged.
 * \return Whatever fopen() returned, cast to ANTLR3_FDSC.
 */
ANTLR3_API ANTLR3_FDSC
antlr3Fopen(pANTLR3_UINT8 filename, const char * mode)
{
    const char * path = (const char *)filename;

    return (ANTLR3_FDSC)fopen(path, mode);
}
/** \brief Close an operating system file and free any handles
 * etc.
 *
 * \param fd Descriptor to close; it is handed straight to fclose().
 */
ANTLR3_API void
antlr3Fclose(ANTLR3_FDSC fd)
{
    // The result of the close is deliberately not reported to the caller.
    //
    (void)fclose(fd);
}
/** \brief Return the size, in bytes, of the named operating system file.
 *
 * \param fileName Name of the file to examine.
 * \return
 *  - The file size reported by _stat(), converted to ANTLR3_UINT32
 *  - 0 if the file could not be examined
 */
ANTLR3_API ANTLR3_UINT32
antlr3Fsize(pANTLR3_UINT8 fileName)
{
    struct _stat    statbuf;

    // If the call fails the buffer is not filled in, so its st_size
    // field must not be read; report an empty file instead.
    //
    if (_stat((const char *)fileName, &statbuf) != 0)
    {
        return 0;
    }

    return (ANTLR3_UINT32)statbuf.st_size;
}
/** \brief Read a block of bytes from an open file into memory.
 *
 * The request is issued to fread() as a single item of \p count bytes.
 *
 * \param fdsc  Descriptor of the file to read from.
 * \param count Number of bytes to read.
 * \param data  Destination buffer.
 * \return The item count returned by fread(): 1 when the whole block was
 *         read, otherwise 0. This is not a byte count.
 */
ANTLR3_API ANTLR3_UINT32
antlr3Fread(ANTLR3_FDSC fdsc, ANTLR3_UINT32 count, void * data)
{
    size_t itemsRead = fread(data, (size_t)count, 1, fdsc);

    return (ANTLR3_UINT32)itemsRead;
}
/** \brief Use the supplied 'string' as input to the stream
 *
 * The pointer is stored as is and the stream is marked as not owning the
 * memory (isAllocated is ANTLR3_FALSE).
 *
 * \param data Pointer to the input data
 * \return
 *  - Pointer to new input stream context upon success
 *  - NULL if data is NULL or the stream structure cannot be allocated.
 */
static pANTLR3_INPUT_STREAM
antlr3CreateStringStream(pANTLR3_UINT8 data)
{
    pANTLR3_INPUT_STREAM    stream;     // The input stream being built

    // No data, no stream
    //
    if (data == NULL)
    {
        return NULL;
    }

    // Zero filled storage for the stream structure
    //
    stream = (pANTLR3_INPUT_STREAM) ANTLR3_CALLOC(1, sizeof(ANTLR3_INPUT_STREAM));

    if (stream != NULL)
    {
        // Install the caller's pointer; the memory stays the caller's.
        //
        stream->isAllocated = ANTLR3_FALSE;
        stream->data        = data;

        // Install the common 8 bit input stream API.
        //
        antlr3GenericSetupStream(stream);
    }

    return stream;
}