/* Parser/tokenizer/string_tokenizer.c - platform/external/python/cpython3 - Git at Google */

 #include "Python.h"
 #include "errcode.h"

 #include "helpers.h"
 #include "../lexer/state.h"

 static int
 tok_underflow_string(struct tok_state *tok) {
     char *end = strchr(tok->inp, '\n');
     if (end != NULL) {
         end++;
     }
     else {
         end = strchr(tok->inp, '\0');
         if (end == tok->inp) {
             tok->done = E_EOF;
             return 0;
         }
     }
     if (tok->start == NULL) {
         tok->buf = tok->cur;
     }
     tok->line_start = tok->cur;
     ADVANCE_LINENO();
     tok->inp = end;
     return 1;
 }

 /* Fetch a byte from TOK, using the string buffer. */
 static int
 buf_getc(struct tok_state *tok) {
     return Py_CHARMASK(*tok->str++);
 }

 /* Unfetch a byte from TOK, using the string buffer. */
 static void
 buf_ungetc(int c, struct tok_state *tok) {
     tok->str--;
     assert(Py_CHARMASK(*tok->str) == c);        /* tok->cur may point to read-only segment */
 }

 /* Set the readline function for TOK to ENC. For the string-based
    tokenizer, this means to just record the encoding. */
 static int
 buf_setreadl(struct tok_state *tok, const char* enc) {
     tok->enc = enc;
     return 1;
 }

 /* Decode a byte string STR for use as the buffer of TOK.
    Look for encoding declarations inside STR, and record them
    inside TOK.  */
 static char *
 decode_str(const char *input, int single, struct tok_state *tok, int preserve_crlf)
 {
     PyObject* utf8 = NULL;
     char *str;
     const char *s;
     const char *newl[2] = {NULL, NULL};
     int lineno = 0;
     tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
     if (str == NULL)
         return NULL;
     tok->enc = NULL;
     tok->str = str;
     if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
         return _PyTokenizer_error_ret(tok);
     str = tok->str;             /* string after BOM if any */
     assert(str);
     if (tok->enc != NULL) {
         utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
         if (utf8 == NULL)
             return _PyTokenizer_error_ret(tok);
         str = PyBytes_AsString(utf8);
     }
     for (s = str;; s++) {
         if (*s == '\0') break;
         else if (*s == '\n') {
             assert(lineno < 2);
             newl[lineno] = s;
             lineno++;
             if (lineno == 2) break;
         }
     }
     tok->enc = NULL;
     /* need to check line 1 and 2 separately since check_coding_spec
        assumes a single line as input */
     if (newl[0]) {
         if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
             return NULL;
         }
         if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
             if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
                                    tok, buf_setreadl))
                 return NULL;
         }
     }
     if (tok->enc != NULL) {
         assert(utf8 == NULL);
         utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
         if (utf8 == NULL)
             return _PyTokenizer_error_ret(tok);
         str = PyBytes_AS_STRING(utf8);
     }
     assert(tok->decoding_buffer == NULL);
     tok->decoding_buffer = utf8; /* CAUTION */
     return str;
 }

 /* Set up tokenizer for string */
 struct tok_state *
 _PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
 {
     struct tok_state *tok = _PyTokenizer_tok_new();
     char *decoded;

     if (tok == NULL)
         return NULL;
     decoded = decode_str(str, exec_input, tok, preserve_crlf);
     if (decoded == NULL) {
         _PyTokenizer_Free(tok);
         return NULL;
     }

     tok->buf = tok->cur = tok->inp = decoded;
     tok->end = decoded;
     tok->underflow = &tok_underflow_string;
     return tok;
 }
	#include "Python.h"
	#include "errcode.h"

	#include "helpers.h"
	#include "../lexer/state.h"

	static int
	tok_underflow_string(struct tok_state *tok) {
	char *end = strchr(tok->inp, '\n');
	if (end != NULL) {
	end++;
	}
	else {
	end = strchr(tok->inp, '\0');
	if (end == tok->inp) {
	tok->done = E_EOF;
	return 0;
	}
	}
	if (tok->start == NULL) {
	tok->buf = tok->cur;
	}
	tok->line_start = tok->cur;
	ADVANCE_LINENO();
	tok->inp = end;
	return 1;
	}

	/* Fetch a byte from TOK, using the string buffer. */
	static int
	buf_getc(struct tok_state *tok) {
	return Py_CHARMASK(*tok->str++);
	}

	/* Unfetch a byte from TOK, using the string buffer. */
	static void
	buf_ungetc(int c, struct tok_state *tok) {
	tok->str--;
	assert(Py_CHARMASK(tok->str) == c); / tok->cur may point to read-only segment */
	}

	/* Set the readline function for TOK to ENC. For the string-based
	tokenizer, this means to just record the encoding. */
	static int
	buf_setreadl(struct tok_state tok, const char enc) {
	tok->enc = enc;
	return 1;
	}

	/* Decode a byte string STR for use as the buffer of TOK.
	Look for encoding declarations inside STR, and record them
	inside TOK. */
	static char *
	decode_str(const char input, int single, struct tok_state tok, int preserve_crlf)
	{
	PyObject* utf8 = NULL;
	char *str;
	const char *s;
	const char *newl[2] = {NULL, NULL};
	int lineno = 0;
	tok->input = str = _PyTokenizer_translate_newlines(input, single, preserve_crlf, tok);
	if (str == NULL)
	return NULL;
	tok->enc = NULL;
	tok->str = str;
	if (!_PyTokenizer_check_bom(buf_getc, buf_ungetc, buf_setreadl, tok))
	return _PyTokenizer_error_ret(tok);
	str = tok->str; /* string after BOM if any */
	assert(str);
	if (tok->enc != NULL) {
	utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
	if (utf8 == NULL)
	return _PyTokenizer_error_ret(tok);
	str = PyBytes_AsString(utf8);
	}
	for (s = str;; s++) {
	if (*s == '\0') break;
	else if (*s == '\n') {
	assert(lineno < 2);
	newl[lineno] = s;
	lineno++;
	if (lineno == 2) break;
	}
	}
	tok->enc = NULL;
	/* need to check line 1 and 2 separately since check_coding_spec
	assumes a single line as input */
	if (newl[0]) {
	if (!_PyTokenizer_check_coding_spec(str, newl[0] - str, tok, buf_setreadl)) {
	return NULL;
	}
	if (tok->enc == NULL && tok->decoding_state != STATE_NORMAL && newl[1]) {
	if (!_PyTokenizer_check_coding_spec(newl[0]+1, newl[1] - newl[0],
	tok, buf_setreadl))
	return NULL;
	}
	}
	if (tok->enc != NULL) {
	assert(utf8 == NULL);
	utf8 = _PyTokenizer_translate_into_utf8(str, tok->enc);
	if (utf8 == NULL)
	return _PyTokenizer_error_ret(tok);
	str = PyBytes_AS_STRING(utf8);
	}
	assert(tok->decoding_buffer == NULL);
	tok->decoding_buffer = utf8; /* CAUTION */
	return str;
	}

	/* Set up tokenizer for string */
	struct tok_state *
	_PyTokenizer_FromString(const char *str, int exec_input, int preserve_crlf)
	{
	struct tok_state *tok = _PyTokenizer_tok_new();
	char *decoded;

	if (tok == NULL)
	return NULL;
	decoded = decode_str(str, exec_input, tok, preserve_crlf);
	if (decoded == NULL) {
	_PyTokenizer_Free(tok);
	return NULL;
	}

	tok->buf = tok->cur = tok->inp = decoded;
	tok->end = decoded;
	tok->underflow = &tok_underflow_string;
	return tok;
	}