%top{
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * The rules in this scanner implementation are based on the following.
 *
 *     - openCypher
 *           - Cypher Query Language Reference (Version 9)
 *           - Grammar Specification (M13)
 *           - ANTLR Grammar (M13)
 *     - JSON (RFC 8259)
 */

#include "postgres.h"

#include "common/string.h"
#include "mb/pg_wchar.h"

#include "parser/ag_scanner.h"
}

/*
 * Generate a reentrant, 8-bit, non-interactive scanner. Generated symbols are
 * prefixed with "ag_yy", and the per-scanner state (yyextra) is an
 * ag_yy_extra value.
 */
%option 8bit
%option never-interactive
%option noyywrap
%option reentrant
%option extra-type="ag_yy_extra"
%option prefix="ag_yy"
%option nounistd
%option fast noread
%option backup
/* perf-report is given twice; flex then also reports minor performance losses */
%option perf-report perf-report
/* no default rule: every input must be matched by an explicit rule */
%option nodefault
%option warn

/* to override the default memory management */
%option noyyalloc noyyrealloc noyyfree

/* remove warnings */
%option noinput nounput
/* remove unneeded routines */
%option noyy_scan_bytes noyy_scan_string
%option noyyget_leng noyyget_text
%option noyyget_lineno noyyset_lineno
%option noyyget_in noyyset_in noyyget_out noyyset_out
%option noyyget_lval noyyset_lval noyyget_lloc noyyset_lloc
%option noyyget_debug noyyset_debug

/*
 * whitespace rule in Cypher handles twenty-four characters out of the
 * twenty-five characters defined as whitespace characters, four extra control
 * characters (FS, GS, RS, and US), and Mongolian vowel separator in Unicode.
 *
 * Only six of them below have been considered as whitespace characters here.
 * This character set is a superset of whitespace characters in JSON.
 *
 *     [\t\n\v\f\r ]
 *         U+0009 CHARACTER TABULATION (HT, Horizontal Tab)
 *         U+000A LINE FEED (LF)
 *         U+000B LINE TABULATION (VT, Vertical Tab)
 *         U+000C FORM FEED (FF)
 *         U+000D CARRIAGE RETURN (CR)
 *         U+0020 SPACE
 *
 * The other characters are listed below for future reference. To handle them,
 * you may use the patterns that match UTF-8 encoded code points of them.
 *
 *     \xC2[\x85\xA0]
 *         U+0085 NEXT LINE (NEL) -- not in Cypher
 *         U+00A0 NO-BREAK SPACE
 *     \xE1\x9A\x80
 *         U+1680 OGHAM SPACE MARK
 *     \xE2\x80[\x80-\x8A\xA8\xA9\xAF]
 *         U+2000 EN QUAD
 *         U+2001 EM QUAD
 *         U+2002 EN SPACE
 *         U+2003 EM SPACE
 *         U+2004 THREE-PER-EM SPACE
 *         U+2005 FOUR-PER-EM SPACE
 *         U+2006 SIX-PER-EM SPACE
 *         U+2007 FIGURE SPACE
 *         U+2008 PUNCTUATION SPACE
 *         U+2009 THIN SPACE
 *         U+200A HAIR SPACE
 *         U+2028 LINE SEPARATOR
 *         U+2029 PARAGRAPH SEPARATOR
 *         U+202F NARROW NO-BREAK SPACE
 *     \xE2\x81\x9F
 *         U+205F MEDIUM MATHEMATICAL SPACE
 *     \xE3\x80\x80
 *         U+3000 IDEOGRAPHIC SPACE
 *
 *     [\x1C-\x1F]
 *         U+001C INFORMATION SEPARATOR FOUR (FS, File Separator)
 *         U+001D INFORMATION SEPARATOR THREE (GS, Group Separator)
 *         U+001E INFORMATION SEPARATOR TWO (RS, Record Separator)
 *         U+001F INFORMATION SEPARATOR ONE (US, Unit Separator)
 *
 *     \xE1\xA0\x8E
 *         U+180E MONGOLIAN VOWEL SEPARATOR -- not a whitespace anymore
 */
/* one or more consecutive characters out of the six ASCII whitespace characters */
whitespace [\t\n\v\f\r ]+

/*
 * Comment rule for multi-line comment in Cypher does not match comments that
 * end with an odd number of "*"s before the closing sequence.
 * Therefore, the rule has been modified so that it can match such comments.
 */
/* exclusive start condition that is active while inside a multi-line comment */
%x mlcomment
/* opening sequence of a multi-line comment */
mlcstart  "/*"
/* comment body: a run of characters other than "*", or a run of "*"s */
mlcchars  [^*]+|\*+
/* closing sequence: one or more "*"s immediately followed by "/" */
mlcstop    \*+\/
/* single-line comment: "//" and everything up to (not including) CR or LF */
slcomment "//"[^\n\r]*

/*
 * For numbers, unary plus and minus are handled as operators later in Cypher
 * grammar although JSON numbers may be prefixed with an optional minus sign.
 *
 * JSON does not support octal and hexadecimal integer literals.
 */

/* a single decimal digit */
digit    [0-9]
/* a single hexadecimal digit, in either letter case */
hexdigit [0-9A-Fa-f]

/*
 * digitseq pattern covers DecimalInteger and OctalInteger rules in Cypher.
 * Integer in JSON is represented in "0|[1-9][0-9]*" pattern that is covered by
 * digitseq pattern.
 */
/* one or more decimal digits; leading zeros are accepted by the pattern */
digitseq {digit}+

/*
 * hexint pattern covers HexInteger rule in Cypher and also accepts "0X" prefix
 * for convenience.
 */
/* "0x" or "0X" followed by at least one hexadecimal digit */
hexint     0[Xx]{hexdigit}+
/* a hexadecimal prefix with no digit after it; its rule reports an error */
hexintfail 0[Xx]

/*
 * decimal pattern covers RegularDecimalReal rule in Cypher and also accepts
 * "{digitseq}\." pattern (e.g. "1.") which RegularDecimalReal rule doesn't.
 * Decimal in JSON is represented in "(0|[1-9][0-9]*)\.[0-9]+" pattern that is
 * covered by decimal pattern.
 *
 * decimalfail pattern is for ranges (e.g. "0..1"). The action for the pattern
 * consumes digitseq and returns dot_dot back to the input stream so that
 * dot_dot can be matched next.
 */
/* digits, ".", and optional fraction digits; or "." followed by digits */
decimal     {digitseq}\.{digit}*|\.{digitseq}
/* digits immediately followed by ".." (the beginning of a range) */
decimalfail {digitseq}\.\.

/*
 * decimalsci pattern covers ExponentDecimalReal rule in Cypher. It also
 * accepts coefficients in "{digitseq}\." pattern and explicit positive
 * exponents ("+") which ExponentDecimalReal rule doesn't.
 * Scientific notation in JSON is represented in
 * "(0|[1-9][0-9]*)(\.[0-9]+)?[Ee][+-]?[0-9]+" pattern that is covered by
 * decimalsci pattern.
 */
/* coefficient, "e" or "E", an optional sign, and at least one exponent digit */
decimalsci      ({digitseq}|{decimal})[Ee][+-]?{digitseq}
/* coefficient and "e" or "E" only; its rule reports an error */
decimalscifail1 ({digitseq}|{decimal})[Ee]
/* coefficient, "e" or "E", and a sign, but no exponent digit; also an error */
decimalscifail2 ({digitseq}|{decimal})[Ee][+-]

/*
 * These patterns cover StringLiteral rule in Cypher and JSON strings.
 * The escape sequence "\/" has been added for JSON strings.
 *
 * esasciifail and esunicodefail patterns handle escape sequences that are not
 * accepted by esascii and esunicode patterns respectively.
 *
 * Since esasciifail pattern can match anything that esascii pattern can,
 * esascii must appear first before esasciifail in the rules section.
 *
 * qstru start condition is for Unicode low surrogates.
 */
/*
 * Exclusive start conditions: dqstr and sqstr are active inside double-quoted
 * and single-quoted strings respectively; qstru is active right after a high
 * surrogate escape until the matching low surrogate has been scanned.
 */
%x dqstr sqstr qstru
/* delimiter of a double-quoted string */
dquote        \"
/* run of ordinary characters inside a double-quoted string */
dqchars       [^"\\]+
/* delimiter of a single-quoted string */
squote        '
/* run of ordinary characters inside a single-quoted string */
sqchars       [^'\\]+
/* the accepted single-character escape sequences */
esascii       \\["'/\\bfnrt]
/* a backslash alone, or followed by any character other than "U" and "u" */
esasciifail   \\[^Uu]?
/* "\U" with exactly 8 hexadecimal digits or "\u" with exactly 4 */
esunicode     \\(U{hexdigit}{8}|u{hexdigit}{4})
/* "\U" or "\u" followed by too few hexadecimal digits */
esunicodefail \\(U{hexdigit}{0,7}|u{hexdigit}{0,3})
/* any single character, including newline */
any           (?s:.)

/* id pattern is for UnescapedSymbolicName rule in Cypher. */
/* one start character followed by any number of continuation characters */
id      {idstart}{idcont}*
/* ASCII letters, "_", and every byte in the range 0x80-0xFF */
idstart [A-Z_a-z\x80-\xFF]
/* everything in idstart plus "$" and the decimal digits */
idcont  [$0-9A-Z_a-z\x80-\xFF]

/* These are for EscapedSymbolicName rule in Cypher. */
/* exclusive start condition that is active inside a backquoted identifier */
%x bqid
/* delimiter of a backquoted identifier */
bquote   `
/* run of characters other than a backquote */
bqchars  [^`]+
/* two consecutive backquotes, which stand for one literal backquote */
esbquote {bquote}{bquote}

/*
 * Parameter rule in Cypher is "$" followed by SymbolicName or DecimalInteger
 * rule. However, according to "Cypher Query Language Reference",
 *
 *     Parameters may consist of letters and numbers, and any combination of
 *     these, but cannot start with a number or a currency symbol.
 *
 * So, a modified version of Parameter rule that follows the above explanation
 * has been used.
 */
/* "$" immediately followed by an identifier */
param \${id}

/*
 * These are tokens that are used as operators and language constructs in
 * Cypher, and some of them are structural characters in JSON.
 */
/* multi-character operators; each one has its own token type */
left_contains  "<@"
right_contains "@>"
any_exists     "?|"
all_exists     "?&"
concat         "||"
access_path    "#>"
lt_gt          "<>"
lt_eq          "<="
gt_eq          ">="
dot_dot        ".."
plus_eq        "+="
eq_tilde       "=~"
typecast       "::"
/* single-character tokens that are returned as the character itself */
self           [?%()*+,\-./:;<=>[\]^{|}]

/* any other single character except newline; its rule reports an error */
other .

%{
/*
 * A growable byte buffer. The strbuf_*() functions below keep at least one
 * byte free after "length" so that strbuf_get_str() can terminate the content.
 */
typedef struct strbuf
{
    // storage for the accumulated bytes
    char *buffer;
    // allocated size of "buffer" in bytes
    int capacity;
    // number of bytes currently stored, not counting any terminating '\0'
    int length;
} strbuf;

// strbuf operations; the definitions are in the user code section below
static void strbuf_init(strbuf *sb, int capacity);
static void strbuf_cleanup(strbuf *sb);
static void strbuf_append_buf(strbuf *sb, const char *b, const int len);
static void strbuf_append_char(strbuf *sb, const char c);
static void strbuf_append_codepoint(strbuf *sb, const pg_wchar c);
static void strbuf_ensure_capacity(strbuf *sb, int len);
static const char *strbuf_get_str(strbuf *sb);
static void strbuf_reset(strbuf *sb);

/*
 * Per-scanner state. It is stored in the scanner by value (see the extra-type
 * option), which is why the actions access it as "yyextra.field".
 */
typedef struct ag_yy_extra
{
    /*
     * accumulate matched strings to build a complete literal if multiple rules
     * are needed to scan it, or keep a decimal integer literal that is
     * converted from a hexadecimal or an octal integer literal if it is too
     * large to fit in "int" type
     */
    strbuf literal_buf;

    /*
     * for Unicode surrogate pairs: the high surrogate that has been scanned
     * and the start condition (dqstr or sqstr) to return to once the low
     * surrogate has been scanned in qstru
     */
    pg_wchar high_surrogate;
    int start_cond;

    /*
     * scan_buf is the beginning of the buffer being scanned, and last_loc is
     * the byte offset from scan_buf that update_location() recorded last
     */
    const char *scan_buf;
    int last_loc;
} ag_yy_extra;

// build an integer token, or a decimal token if the value does not fit in "int"
static void integer_literal_to_token(const char *s, ag_token *token,
                                     ag_yy_extra *extra);
// both expect "numstr" without its prefix and append the decimal digits to "sb"
#define hexadecimal_to_decimal(numstr, sb) _numstr_to_decimal(numstr, 16, sb)
#define octal_to_decimal(numstr, sb) _numstr_to_decimal(numstr, 8, sb)
static void _numstr_to_decimal(const char *numstr, const int base, strbuf *sb);
static uint32 hexdigit_value(const char c);
static uint32 octdigit_value(const char c);

// range checks for UTF-16 surrogate code units
static bool is_high_surrogate(const pg_wchar c);
static bool is_low_surrogate(const pg_wchar c);

/*
 * update_location() records the byte offset of the current match (yytext)
 * from the beginning of the scan buffer, and get_location() reads it back.
 * Both can be used only where "yyextra" is available, i.e. in the actions.
 */
#define update_location() \
    do \
    { \
        yyextra.last_loc = yytext - yyextra.scan_buf; \
    } while (0)
#define get_location() (yyextra.last_loc)

/*
 * Shorthands for the actions: they build the primary error message and the
 * error cursor position from the location recorded by update_location().
 */
#define scan_errmsg(msg) _scan_errmsg(msg, &yyextra)
static int _scan_errmsg(const char *msg, const ag_yy_extra *extra);
#define scan_errposition() _scan_errposition(yyextra.last_loc, &yyextra)
static int _scan_errposition(const int location, const ag_yy_extra *extra);

/*
 * Avoid exit() on fatal scanner errors by raising a PostgreSQL ERROR instead.
 * yy_fatal_error() is called after ereport() just to keep the compiler quiet;
 * presumably the call is never reached because ereport(ERROR, ...) does not
 * return.
 */
#define YY_FATAL_ERROR(msg) \
    do \
    { \
        ereport(ERROR, (errmsg_internal("%s", msg))); \
        yy_fatal_error(NULL, NULL); \
    } while (0)

/*
 * Signature of the generated scanner function.
 *
 * "yyscanner" must be used for the name of the parameter because it is
 * referenced internally. "yyscan_t" is OK because it is actually "void *"
 * and is the same with "ag_scanner_t".
 */
#define YY_DECL ag_token ag_scanner_next_token(yyscan_t yyscanner)
// number of decimal digits that one remainder yields in _numstr_to_decimal()
#define NDIGITS_PER_REMAINDER 9
%}

%%

%{
// Declared at the top of the scanner function; the actions below fill it in
// and return it by value.
ag_token token;
%}

{whitespace} {
    // whitespace produces no token
}

{mlcstart} {
    // Record where the comment opens so that an unterminated comment is
    // reported at that position, then switch to the comment start condition.
    update_location();
    BEGIN(mlcomment);
}

<mlcomment>{mlcchars} {
    // comment body; discard it
}

<mlcomment>{mlcstop} {
    // end of the comment; go back to normal scanning
    BEGIN(INITIAL);
}

<mlcomment><<EOF>> {
    // the input ended inside a comment; last_loc still refers to its opening
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated /* comment"),
                    scan_errposition()));
}

{slcomment} {
    // single-line comment; discard it
}

{digitseq} |
{hexint} {
    // Decimal, octal, and hexadecimal integer literals share one helper. It
    // sets the token type and value: an integer token, or a decimal token
    // when the value is too large for int.
    update_location();
    integer_literal_to_token(yytext, &token, &yyextra);
    token.location = get_location();
    return token;
}

{hexintfail} {
    // a hexadecimal prefix that is not followed by any hexadecimal digit
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("invalid hexadecimal integer literal"),
                    scan_errposition()));
}

{decimal} |
{decimalsci} {
    // The matched text is passed on unconverted. value.s is yytext itself,
    // not a copy. NOTE(review): it is presumably valid only until the next
    // token is scanned -- confirm against the caller.
    update_location();
    token.type = AG_TOKEN_DECIMAL;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{decimalfail} {
    // return dot_dot back to the input stream
    // (after this, yytext holds the digit sequence only)
    yyless(yyleng - 2);

    update_location();

    // consume digitseq
    integer_literal_to_token(yytext, &token, &yyextra);
    token.location = get_location();
    return token;
}

{decimalscifail1} |
{decimalscifail2} {
    // an exponent marker, with or without a sign, that lacks exponent digits
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("invalid scientific notation literal"),
                    scan_errposition()));
}

{dquote} {
    // Opening quote of a double-quoted string: record its location, empty
    // the literal buffer, and scan the content in the dqstr start condition.
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(dqstr);
}

{squote} {
    // same as above, for a single-quoted string
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(sqstr);
}

<dqstr>{dqchars} |
<sqstr>{sqchars} {
    // ordinary characters are copied to the literal buffer as they are
    strbuf_append_buf(&yyextra.literal_buf, yytext, yyleng);
}

<dqstr,sqstr>{esascii} {
    // Translate a single-character escape sequence. yytext[0] is the
    // backslash and yytext[1] is the character that selects the escape.
    char c;

    switch (yytext[1])
    {
    case 'b':
        c = '\b';
        break;
    case 'f':
        c = '\f';
        break;
    case 'n':
        c = '\n';
        break;
    case 'r':
        c = '\r';
        break;
    case 't':
        c = '\t';
        break;
    default:
        // '"', '\'', '/', and '\\'
        // these stand for themselves
        c = yytext[1];
        break;
    }

    strbuf_append_char(&yyextra.literal_buf, c);
}

<dqstr,sqstr>{esasciifail} {
    // This rule comes after the esascii rule, so it is chosen only for
    // escape sequences that the esascii rule does not accept.
    if (yyleng == 1)
    {
        /*
         * This happens when the scanner meets "\"<<EOF>>. Just consume "\"
         * so that <dqstr,sqstr,qstru><<EOF>> rule can do the rest.
         */
        strbuf_append_char(&yyextra.literal_buf, '\\');
    }
    else
    {
        // report the error at the position of the offending backslash
        update_location();
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                 scan_errmsg("invalid escape sequence"),
                 errdetail("Valid escape sequences are \\\", \\', \\/, \\\\, \\b, \\f, \\n, \\r, \\t, \\uXXXX, and \\UXXXXXXXX."),
                 scan_errposition()));
    }
}

<dqstr,sqstr>{esunicode} {
    // Decode a Unicode escape sequence with 4 or 8 hexadecimal digits and
    // append the character to the literal buffer, or report why it cannot
    // be used.
    pg_wchar c;

    // It is unnecessary to check endptr and errno here.
    // (yytext + 2 skips the backslash and the u or U that follows it)
    c = strtoul(yytext + 2, NULL, 16);
    if (c > 0x10FFFF)
    {
        // c is greater than the maximum value of a Unicode code point.
        update_location();
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                 scan_errmsg("invalid Unicode escape value"),
                 errdetail("Unicode escape values cannot be greater than 10FFFF, which is the maximum value of a code point."),
                 scan_errposition()));
    }
    else if (c > 0x7F)
    {
        // non-ASCII code points are accepted only with UTF8 server encoding
        if (GetDatabaseEncoding() != PG_UTF8)
        {
            update_location();
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                     scan_errmsg("unsupported Unicode escape value"),
                     errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
                     scan_errposition()));
        }

        if (is_high_surrogate(c))
        {
            // Keep the high surrogate and the current start condition, and
            // expect the low surrogate next in the qstru start condition.
            yyextra.high_surrogate = c;
            yyextra.start_cond = YY_START;
            BEGIN(qstru);
        }
        else if (is_low_surrogate(c))
        {
            // a low surrogate that is not preceded by a high surrogate
            update_location();
            ereport(ERROR,
                    (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                     scan_errmsg("invalid Unicode surrogate pair"),
                     errdetail("A low surrogate must follow a high surrogate."),
                     scan_errposition()));
        }
        else
        {
            // an ordinary non-ASCII code point
            strbuf_append_codepoint(&yyextra.literal_buf, c);
        }
    }
    else if (c > 0)
    {
        // c is an ASCII character.
        strbuf_append_char(&yyextra.literal_buf, (char)c);
    }
    else
    {
        /*
         * U+0000 NUL is the minimum value of a Unicode code point.
         * However, it is invalid in quoted strings as well as query strings.
         */
        update_location();
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                 scan_errmsg("unsupported Unicode escape value"),
                 errdetail("Unicode code point value 0000 is not allowed in quoted strings."),
                 scan_errposition()));
    }
}

<qstru>{esunicode} {
    // The escape sequence right after a high surrogate: it has to be a low
    // surrogate, and the pair is then combined into one code point.
    pg_wchar c;

    c = strtoul(yytext + 2, NULL, 16);
    if (is_low_surrogate(c))
    {
        c = surrogate_pair_to_codepoint(yyextra.high_surrogate, c);
        // 0x010000 <= c <= 0x10FFFF always holds for surrogate pairs.
        strbuf_append_codepoint(&yyextra.literal_buf, c);
        // go back to the string start condition that was active before
        BEGIN(yyextra.start_cond);
    }
    else
    {
        update_location();
        ereport(ERROR,
                (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                 scan_errmsg("invalid Unicode surrogate pair"),
                 errdetail("A low surrogate must follow a high surrogate."),
                 scan_errposition()));
    }
}

<dqstr,sqstr,qstru>{esunicodefail} {
    // a Unicode escape sequence with too few hexadecimal digits
    update_location();
    ereport(ERROR,
            (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
             scan_errmsg("invalid Unicode escape sequence"),
             errhint("Unicode escape sequences must be \\uXXXX or \\UXXXXXXXX."),
             scan_errposition()));
}

<qstru>{any} {
    // anything other than a Unicode escape sequence after a high surrogate
    update_location();
    ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                    scan_errmsg("invalid Unicode surrogate pair"),
                    errdetail("A low surrogate must follow a high surrogate."),
                    scan_errposition()));
}

<dqstr>{dquote} |
<sqstr>{squote} {
    // closing quote: the accumulated content becomes a string token
    BEGIN(INITIAL);

    /*
     * In quoted strings, only Unicode escape sequences need to be verified,
     * and the actions for <dqstr,sqstr>{esunicode} and <qstru>{esunicode}
     * rules verify the code point values. So, quoted strings are always valid.
     */

    // value.s points into the literal buffer, which later tokens reuse; the
    // location is the one recorded when the opening quote was scanned
    token.type = AG_TOKEN_STRING;
    token.value.s = strbuf_get_str(&yyextra.literal_buf);
    token.location = get_location();
    return token;
}

<dqstr,sqstr,qstru><<EOF>> {
    // The input ended before the closing quote. No location update is done
    // here, so the error uses the location that was recorded last.
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated quoted string"),
                    scan_errposition()));
}

{id} {
    // Unescaped identifier. value.s is yytext itself, not a copy.
    update_location();
    token.type = AG_TOKEN_IDENTIFIER;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{bquote} {
    // Opening backquote of an escaped identifier: record its location, empty
    // the literal buffer, and scan the content in the bqid start condition.
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(bqid);
}

<bqid>{bqchars} {
    // ordinary characters are copied to the literal buffer as they are
    strbuf_append_buf(&yyextra.literal_buf, yytext, yyleng);
}

<bqid>{esbquote} {
    // a doubled backquote stands for one backquote
    strbuf_append_char(&yyextra.literal_buf, '`');
}

<bqid>{bquote} {
    // closing backquote: the accumulated content becomes an identifier token
    BEGIN(INITIAL);

    // an empty pair of backquotes is not a valid identifier
    if (yyextra.literal_buf.length == 0)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_NAME),
                        scan_errmsg("zero-length quoted identifier"),
                        scan_errposition()));
    }

    token.type = AG_TOKEN_IDENTIFIER;
    token.value.s = strbuf_get_str(&yyextra.literal_buf);
    token.location = get_location();
    return token;
}

<bqid><<EOF>> {
    // the input ended before the closing backquote
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated quoted identifier"),
                    scan_errposition()));
}

{param} {
    // Parameter. The value is the name only: yytext + 1 skips the dollar sign.
    update_location();
    token.type = AG_TOKEN_PARAMETER;
    token.value.s = yytext + 1;
    token.location = get_location();
    return token;
}

{concat} {
    // All multi-character operator rules below have the same shape: record
    // the location, set the token type of the operator, and pass the matched
    // text (yytext itself, not a copy) as the value.
    update_location();
    token.type = AG_TOKEN_CONCAT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{access_path} {
    update_location();
    token.type = AG_TOKEN_ACCESS_PATH;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{any_exists} {
    update_location();
    token.type = AG_TOKEN_ANY_EXISTS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{left_contains} {
    update_location();
    token.type = AG_TOKEN_LEFT_CONTAINS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{right_contains} {
    update_location();
    token.type = AG_TOKEN_RIGHT_CONTAINS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{all_exists} {
    update_location();
    token.type = AG_TOKEN_ALL_EXISTS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{lt_gt} {
    update_location();
    token.type = AG_TOKEN_LT_GT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{lt_eq} {
    update_location();
    token.type = AG_TOKEN_LT_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{gt_eq} {
    update_location();
    token.type = AG_TOKEN_GT_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{dot_dot} {
    update_location();
    token.type = AG_TOKEN_DOT_DOT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{plus_eq} {
    update_location();
    token.type = AG_TOKEN_PLUS_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{eq_tilde} {
    update_location();
    token.type = AG_TOKEN_EQ_TILDE;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{typecast} {
    update_location();
    token.type = AG_TOKEN_TYPECAST;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{self} {
    // single-character token: the value is the character itself (value.c)
    update_location();
    token.type = AG_TOKEN_CHAR;
    token.value.c = yytext[0];
    token.location = get_location();
    return token;
}

{other} {
    // a character that none of the rules above accepts
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unexpected character"),
                    scan_errposition()));
}

<<EOF>> {
    // end of input in the INITIAL start condition is reported to the caller
    // as a token of type AG_TOKEN_NULL
    update_location();
    token.type = AG_TOKEN_NULL;
    token.value.c = '\0';
    token.location = get_location();
    return token;
}

%%

/*
 * Override the default memory management to make flex use palloc() instead of
 * malloc().
 */

// palloc()-backed replacement for the allocator of flex; yyscanner is unused
void *ag_yyalloc(yy_size_t size, yyscan_t yyscanner)
{
    void *chunk = palloc(size);

    return chunk;
}

// replacement for the reallocator of flex that follows realloc(3) semantics
void *ag_yyrealloc(void *ptr, yy_size_t size, yyscan_t yyscanner)
{
    // without an existing chunk, this is a plain allocation
    if (ptr == NULL)
        return palloc(size);

    // resizing an existing chunk to nothing releases it
    if (size == 0)
    {
        pfree(ptr);
        return NULL;
    }

    return repalloc(ptr, size);
}

// replacement for the deallocator of flex; a NULL pointer is ignored
void ag_yyfree(void *ptr, yyscan_t yyscanner)
{
    if (ptr == NULL)
        return;

    pfree(ptr);
}

// set up "sb" as an empty buffer with "capacity" bytes of storage
static void strbuf_init(strbuf *sb, int capacity)
{
    char *storage = palloc(capacity);

    sb->buffer = storage;
    sb->capacity = capacity;
    sb->length = 0;
}

// release the storage of "sb", if there is any
static void strbuf_cleanup(strbuf *sb)
{
    if (sb->buffer == NULL)
        return;

    pfree(sb->buffer);
}

// append "len" bytes starting at "b" to "sb", growing the storage if needed
static void strbuf_append_buf(strbuf *sb, const char *b, const int len)
{
    char *dest;

    strbuf_ensure_capacity(sb, sb->length + len);

    // take the address only now because the storage may have been reallocated
    dest = sb->buffer + sb->length;
    memcpy(dest, b, len);
    sb->length = sb->length + len;
}

// append the single byte "c" to "sb", growing the storage if needed
static void strbuf_append_char(strbuf *sb, const char c)
{
    strbuf_ensure_capacity(sb, sb->length + 1);
    sb->buffer[sb->length++] = c;
}

// append the UTF-8 encoding of the code point "c" to "sb"
static void strbuf_append_codepoint(strbuf *sb, const pg_wchar c)
{
    unsigned char utf8[6];
    int nbytes;

    unicode_to_utf8(c, utf8);
    // the length of the sequence is derived from the bytes just written
    nbytes = pg_utf_mblen(utf8);
    strbuf_append_buf(sb, (char *)utf8, nbytes);
}

/*
 * Make sure that "sb" can hold "len" bytes plus a terminating '\0', doubling
 * the capacity as many times as it takes.
 *
 * len cannot be greater than MaxAllocSize because ReadCommand() reads
 * a message and places the message body in StringInfo.
 */
static void strbuf_ensure_capacity(strbuf *sb, int len)
{
    // one byte beyond "len" has to stay available for the last '\0' character
    if (sb->capacity > len)
        return;

    // capacity <= len holds here, so at least one doubling takes place
    while (sb->capacity <= len)
        sb->capacity *= 2;

    sb->buffer = repalloc(sb->buffer, sb->capacity);
}

/*
 * Terminate the content of "sb" with '\0' and return it. The returned pointer
 * is the storage of "sb" itself, not a copy.
 */
static const char *strbuf_get_str(strbuf *sb)
{
    char *str = sb->buffer;

    str[sb->length] = '\0';
    return str;
}

// discard the content of "sb"; the storage and its capacity are kept as is
static void strbuf_reset(strbuf *sb)
{
    sb->length = 0;
}

/*
 * Fill in the type and the value of "token" from the integer literal "s".
 *
 * s:     a decimal, octal ("0" prefix), or hexadecimal ("0x"/"0X" prefix)
 *        integer literal
 * token: receives AG_TOKEN_INTEGER with value.i, or AG_TOKEN_DECIMAL with
 *        value.s if the value does not fit in "int"; the location is not set
 * extra: its literal_buf keeps the decimal digits when a hexadecimal or an
 *        octal literal has to be converted; last_loc is used for the error
 *
 * An ERROR is raised if "s" cannot be consumed entirely.
 */
static void integer_literal_to_token(const char *s, ag_token *token,
                                     ag_yy_extra *extra)
{
    strbuf *digits = &extra->literal_buf;
    char *rest;
    int value;

    errno = 0;
    value = strtoint(s, &rest, 0);

    /*
     * This is only needed for invalid octal integer literals. (e.g. "08")
     * Other cases cannot happen because of digitseq and hexint rules.
     */
    if (*rest != '\0')
    {
        ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                        _scan_errmsg("invalid octal integer literal", extra),
                        _scan_errposition(extra->last_loc, extra)));
    }

    // the common case: the value fits in an "int"
    if (errno != ERANGE)
    {
        token->type = AG_TOKEN_INTEGER;
        token->value.i = value;
        return;
    }

    /*
     * The value is too large, so it is treated as a decimal.
     *
     * Accessing s[0] and s[1] is safe because ERANGE is returned only if
     * there are 10 or more characters in s. In this case, the shortest
     * integer literals for decimal, hexadecimal, and octal integers are
     * "2147483648", "0x80000000", and "020000000000" respectively.
     */
    if (s[0] == '0')
    {
        strbuf_reset(digits);

        /*
         * No matter how many characters s has, if all digits in s are
         * zeros, strtoint() returns 0 without an error.
         * So, _numstr_to_decimal() assumes that there is at least one
         * non-zero digit in s.
         */
        if (s[1] == 'X' || s[1] == 'x')
            hexadecimal_to_decimal(s + 2, digits);
        else
            octal_to_decimal(s + 1, digits);

        // from here on, s is the converted text in the literal buffer
        s = strbuf_get_str(digits);
    }

    token->type = AG_TOKEN_DECIMAL;
    token->value.s = s;
}

/*
 * convert a string of a hexadecimal or an octal integer to a string of the
 * corresponding decimal integer
 *
 * numstr: digits only, without the "0x"/"0X" or "0" prefix; leading zeros are
 *         skipped, and at least one non-zero digit is expected
 * base:   16 or 8
 * sb:     receives the decimal digits; they are appended to the existing
 *         content, and no terminating '\0' is written here
 *
 * The digits are packed into an array of words, most significant word first.
 * The array is then divided by 10^9 repeatedly; each remainder is a group of
 * up to nine decimal digits of the result, least significant group first.
 */
static void _numstr_to_decimal(const char *numstr, const int base, strbuf *sb)
{
    // constants for each base
    int ndigits_per_word;
    int nbits_per_digit;
    uint32 (*digit_value)(const char);

    /*
     * constants for the conversion
     *
     *     "divisor" is 10^9.
     *
     *     At most 3 divisions are needed to eliminate 1 word.
     *         hex: 4294967295999999999 -> 4294967295 -> 4 -> 0
     *         oct: 1073741823999999999 -> 1073741823 -> 1 -> 0
     */
    const uint64 divisor = 1000000000;
    const int ndivisions = 3;

    int ndigits;
    int nwords;
    uint32 *words;
    const char *digitp;
    int word_i;
    int ndigits_word0;
    uint32 word;
    uint32 *remainders;
    int nremainders;
    int i;

    // set constants for each base
    switch (base)
    {
    case 16:
        /*
         * Hexadecimal
         *
         * Maximum value for each word
         *     0xFFFFFFFF = 4294967295
         * Divisor
         *     0x3B9ACA00 = 1000000000
         * Maximum remainder
         *     0x3B9AC9FF = 999999999
         *
         * Maximum dividend
         *     0x3B9AC9FFFFFFFFFF = 4294967295999999999
         * Quotient of the maximum dividend and the divisor
         *     0xFFFFFFFF = 4294967295
         * Remainder of the above division
         *     0x3B9AC9FF = 999999999
         */
        ndigits_per_word = 8;
        nbits_per_digit = 4;
        digit_value = hexdigit_value;
        break;
    case 8:
        /*
         * Octal
         *
         * Maximum value for each word
         *     07777777777 = 1073741823
         * Divisor
         *     07346545000 = 1000000000
         * Maximum remainder
         *     07346544777 = 999999999
         *
         * Maximum dividend
         *     073465447777777777777 = 1073741823999999999
         * Quotient of the maximum dividend and the divisor
         *     07777777777 = 1073741823
         * Remainder of the above division
         *     07346544777 = 999999999
         */
        ndigits_per_word = 10;
        nbits_per_digit = 3;
        digit_value = octdigit_value;
        break;
    default:
        Assert(!"invalid base");
        return;
    }

    // skip leading zeros
    while (*numstr == '0')
        numstr++;

    // number of digits in "numstr"
    ndigits = strlen(numstr);
    Assert(ndigits > 0);

    // prepare "words" to store the value of "numstr" as an array of
    // fixed-width unsigned words, most significant word first
    nwords = (ndigits + (ndigits_per_word - 1)) / ndigits_per_word;
    words = palloc(sizeof(*words) * nwords);

    digitp = numstr;
    word_i = 0;

    // number of digits for the first word
    // (the first word takes the odd digits so that all the others are full)
    ndigits_word0 = ndigits % ndigits_per_word;
    if (ndigits_word0 == 0)
        ndigits_word0 = ndigits_per_word;

    // fill the first word
    word = digit_value(*digitp++);
    for (i = 1; i < ndigits_word0; i++)
    {
        word <<= nbits_per_digit;
        word |= digit_value(*digitp++);
    }
    words[word_i++] = word;

    // fill the rest of "words"
    while (word_i < nwords)
    {
        word = digit_value(*digitp++);
        for (i = 1; i < ndigits_per_word; i++)
        {
            word <<= nbits_per_digit;
            word |= digit_value(*digitp++);
        }
        words[word_i++] = word;
    }

    // At most "ndivisions" divisions are needed to eliminate 1 word.
    remainders = palloc(sizeof(*remainders) * (ndivisions * nwords));

    nremainders = 0;
    word_i = 0;
    // repeat dividing "words" by "divisor" until the quotient becomes 0
    while (word_i < nwords)
    {
        uint64 r;

        r = 0;
        // divide "words" by "divisor"
        // (long division: one word at a time, from the first effective word)
        for (i = word_i; i < nwords; i++)
        {
            uint64 d;
            uint64 q;

            // the remainder of the previous word becomes the upper part of
            // the dividend for this word
            d = (uint64)words[i];
            d |= r << (nbits_per_digit * ndigits_per_word);

            q = d / divisor;
            r = d % divisor;

            // the quotient replaces the word in place
            words[i] = (uint32)q;
        }

        // collect the remainder to build the result
        remainders[nremainders++] = (uint32)r;

        /*
         * Divisions over the first effective word is done
         * and "words" is getting closer to 0.
         */
        if (words[word_i] == 0)
            word_i++;
    }

    // convert the collected remainders to a string, starting from the last one
    // (the last remainder holds the most significant digits)
    for (i = nremainders - 1; i >= 0; i--)
    {
        char buf[NDIGITS_PER_REMAINDER];
        int buf_i;
        uint32 tmp;

        // "buf" is filled from its end toward its beginning
        buf_i = NDIGITS_PER_REMAINDER;

        for (tmp = remainders[i]; tmp > 0; tmp /= 10)
            buf[--buf_i] = '0' + (char)(tmp % 10);

        // leading zeros for intermediate digits
        // (every group but the most significant one is exactly 9 digits wide)
        if (i < nremainders - 1)
        {
            while (buf_i > 0)
                buf[--buf_i] = '0';
        }

        strbuf_append_buf(sb, &buf[buf_i], NDIGITS_PER_REMAINDER - buf_i);
    }

    pfree(remainders);
    pfree(words);
}

// numeric value (0-15) of the hexadecimal digit "c", in either letter case
static uint32 hexdigit_value(const char c)
{
    if ('0' <= c && c <= '9')
        return c - '0';
    else if ('A' <= c && c <= 'F')
        return (c - 'A') + 0xA;

    // only a lower-case letter is left
    Assert('a' <= c && c <= 'f');
    return (c - 'a') + 0xA;
}

// numeric value (0-7) of the octal digit "c"
static uint32 octdigit_value(const char c)
{
    Assert('0' <= c && c <= '7');

    return c - '0';
}

// true if "c" is in the high surrogate range, U+D800 to U+DBFF
static bool is_high_surrogate(const pg_wchar c)
{
    return !(c < 0xD800 || c > 0xDBFF);
}

// true if "c" is in the low surrogate range, U+DC00 to U+DFFF
static bool is_low_surrogate(const pg_wchar c)
{
    return !(c < 0xDC00 || c > 0xDFFF);
}

/*
 * Build the primary error message from "msg" and the input text that begins
 * at the last recorded location, or note that the end of input was reached.
 */
static int _scan_errmsg(const char *msg, const ag_yy_extra *extra)
{
    const char *where = &extra->scan_buf[extra->last_loc];

    if (*where != YY_END_OF_BUFFER_CHAR)
        return errmsg("%s at or near \"%s\"", msg, where);

    return errmsg("%s at end of input", msg);
}

/*
 * Set the error cursor position from "location", which is a byte offset in
 * the scan buffer. A negative location means that it is unknown.
 */
static int _scan_errposition(const int location, const ag_yy_extra *extra)
{
    if (location >= 0)
    {
        // count characters rather than bytes; the position passed on is the
        // character count plus one
        int nchars = pg_mbstrlen_with_len(extra->scan_buf, location);

        return errposition(nchars + 1);
    }

    // no-op if location is unknown
    return 0;
}

ag_scanner_t ag_scanner_create(const char *s)
{
    Size len;
    char *buf;
    yyscan_t yyscanner;
    ag_yy_extra extra;
    int ret;

    // The last two YY_END_OF_BUFFER_CHAR are required by flex.
    len = strlen(s);
    buf = palloc(len + 2);
    memcpy(buf, s, len);
    buf[len] = YY_END_OF_BUFFER_CHAR;
    buf[len + 1] = YY_END_OF_BUFFER_CHAR;

    ret = ag_yylex_init(&yyscanner);
    if (ret)
        elog(ERROR, "ag_yylex_init() failed: %m");

    strbuf_init(&extra.literal_buf, 1024);
    extra.high_surrogate = 0;
    extra.start_cond = INITIAL;
    extra.scan_buf = buf;
    extra.last_loc = 0;
    ag_yyset_extra(extra, yyscanner);

    ag_yy_scan_buffer(buf, len + 2, yyscanner);

    return yyscanner;
}

// release the literal buffer of "scanner" and then the scanner itself
void ag_scanner_destroy(ag_scanner_t scanner)
{
    // a copy of the state, which still carries the pointer to the storage
    ag_yy_extra state = ag_yyget_extra(scanner);

    strbuf_cleanup(&state.literal_buf);
    ag_yylex_destroy(scanner);
}

/*
 * Build the primary error message from "msg" and the text around the last
 * location that the scanner recorded.
 *
 * msg:     the message to be completed with the position information
 * scanner: pointer to the scanner handle
 *
 * Returns the result of errmsg(), so it is meant to be used as an argument of
 * ereport().
 */
int ag_scanner_errmsg(const char *msg, ag_scanner_t *scanner)
{
    ag_yy_extra extra;

    /*
     * "scanner" points to the handle, and ag_yyget_extra() needs the handle
     * itself. Passing the pointer as it is compiles silently because the
     * handle type is "void *", but the state would then be read from the
     * wrong memory.
     */
    extra = ag_yyget_extra(*scanner);

    return _scan_errmsg(msg, &extra);
}

/*
 * Set the error cursor position from "location".
 *
 * location: byte offset in the text being scanned; a negative value means
 *           that the location is unknown, and nothing is set in that case
 * scanner:  pointer to the scanner handle
 *
 * Returns the result of errposition() (0 if the location is unknown), so it
 * is meant to be used as an argument of ereport().
 */
int ag_scanner_errposition(const int location, ag_scanner_t *scanner)
{
    ag_yy_extra extra;

    /*
     * "scanner" points to the handle, and ag_yyget_extra() needs the handle
     * itself. Passing the pointer as it is compiles silently because the
     * handle type is "void *", but the state would then be read from the
     * wrong memory.
     */
    extra = ag_yyget_extra(*scanner);

    return _scan_errposition(location, &extra);
}