%top{
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

/*
 * The rules in this scanner implementation are based on the following.
 *
 *     - openCypher
 *         - Cypher Query Language Reference (Version 9)
 *         - Grammar Specification (M13)
 *         - ANTLR Grammar (M13)
 *     - JSON (RFC 8259)
 */

#include "postgres.h"

#include "common/string.h"
#include "mb/pg_wchar.h"

#include "parser/ag_scanner.h"
}

/*
 * Scanner generation options.
 *
 * The scanner is reentrant, carries an "ag_yy_extra" value as its extra data,
 * and all generated symbols use the "ag_yy" prefix (hence ag_yylex_init(),
 * ag_yyset_extra(), ... in the user code section).
 */
%option 8bit
%option never-interactive
%option noyywrap
%option reentrant
%option extra-type="ag_yy_extra"
%option prefix="ag_yy"
%option nounistd
%option fast noread
%option backup
%option perf-report perf-report
%option nodefault
%option warn

/* to override the default memory management */
%option noyyalloc noyyrealloc noyyfree

/* remove warnings */
%option noinput nounput

/* remove unneeded routines */
%option noyy_scan_bytes noyy_scan_string
%option noyyget_leng noyyget_text
%option noyyget_lineno noyyset_lineno
%option noyyget_in noyyset_in noyyget_out noyyset_out
%option noyyget_lval noyyset_lval noyyget_lloc noyyset_lloc
%option noyyget_debug noyyset_debug

/*
 * The whitespace rule in Cypher handles twenty-four of the twenty-five
 * characters defined as whitespace characters in Unicode, four extra control
 * characters (FS, GS, RS, and US), and the Mongolian vowel separator.
 *
 * Only the six characters below are treated as whitespace here.
 * This character set is a superset of the whitespace characters in JSON.
 *
 *     [\t\n\v\f\r ]
 *         U+0009 CHARACTER TABULATION (HT, Horizontal Tab)
 *         U+000A LINE FEED (LF)
 *         U+000B LINE TABULATION (VT, Vertical Tab)
 *         U+000C FORM FEED (FF)
 *         U+000D CARRIAGE RETURN (CR)
 *         U+0020 SPACE
 *
 * The other characters are listed below for future reference. To handle them,
 * you may use the patterns that match their UTF-8 encoded code points.
 *
 *     \xC2[\x85\xA0]
 *         U+0085 NEXT LINE (NEL) -- not in Cypher
 *         U+00A0 NO-BREAK SPACE
 *     \xE1\x9A\x80
 *         U+1680 OGHAM SPACE MARK
 *     \xE2\x80[\x80-\x8A\xA8\xA9\xAF]
 *         U+2000 EN QUAD
 *         U+2001 EM QUAD
 *         U+2002 EN SPACE
 *         U+2003 EM SPACE
 *         U+2004 THREE-PER-EM SPACE
 *         U+2005 FOUR-PER-EM SPACE
 *         U+2006 SIX-PER-EM SPACE
 *         U+2007 FIGURE SPACE
 *         U+2008 PUNCTUATION SPACE
 *         U+2009 THIN SPACE
 *         U+200A HAIR SPACE
 *         U+2028 LINE SEPARATOR
 *         U+2029 PARAGRAPH SEPARATOR
 *         U+202F NARROW NO-BREAK SPACE
 *     \xE2\x81\x9F
 *         U+205F MEDIUM MATHEMATICAL SPACE
 *     \xE3\x80\x80
 *         U+3000 IDEOGRAPHIC SPACE
 *
 *     [\x1C-\x1F]
 *         U+001C INFORMATION SEPARATOR FOUR (FS, File Separator)
 *         U+001D INFORMATION SEPARATOR THREE (GS, Group Separator)
 *         U+001E INFORMATION SEPARATOR TWO (RS, Record Separator)
 *         U+001F INFORMATION SEPARATOR ONE (US, Unit Separator)
 *
 *     \xE1\xA0\x8E
 *         U+180E MONGOLIAN VOWEL SEPARATOR -- not a whitespace anymore
 */
whitespace [\t\n\v\f\r ]+

/*
 * The Cypher rule for multi-line comments does not match comments that end
 * with an odd number of "*"s before the closing sequence.
 * Therefore, the rule has been modified so that it can match such comments.
 *
 * mlcomment is an exclusive start condition that is entered when mlcstart is
 * matched.
 */
%x mlcomment
mlcstart "/*"
mlcchars [^*]+|\*+
mlcstop \*+\/

/* A single-line comment runs up to, but not including, the end of the line. */
slcomment "//"[^\n\r]*

/*
 * For numbers, unary plus and minus are handled as operators later in the
 * Cypher grammar although JSON numbers may be prefixed with an optional minus
 * sign.
 *
 * JSON does not support octal and hexadecimal integer literals.
 */
digit [0-9]
hexdigit [0-9A-Fa-f]

/*
 * digitseq pattern covers DecimalInteger and OctalInteger rules in Cypher.
 * Integer in JSON is represented in "0|[1-9][0-9]*" pattern that is covered by
 * digitseq pattern.
 */
digitseq {digit}+

/*
 * hexint pattern covers HexInteger rule in Cypher and also accepts "0X" prefix
 * for convenience.
 *
 * hexintfail matches a prefix that is not followed by any hexadecimal digit;
 * the rules section reports it as an error.
 */
hexint 0[Xx]{hexdigit}+
hexintfail 0[Xx]

/*
 * decimal pattern covers RegularDecimalReal rule in Cypher and also accepts
 * "{digitseq}\." pattern (e.g. "1.") which RegularDecimalReal rule doesn't.
 * Decimal in JSON is represented in "(0|[1-9][0-9]*)\.[0-9]+" pattern that is
 * covered by decimal pattern.
 *
 * decimalfail pattern is for ranges (e.g. "0..1"). The action for the pattern
 * consumes digitseq and returns dot_dot back to the input stream so that
 * dot_dot can be matched next.
 */
decimal {digitseq}\.{digit}*|\.{digitseq}
decimalfail {digitseq}\.\.

/*
 * decimalsci pattern covers ExponentDecimalReal rule in Cypher. It also
 * accepts coefficients in "{digitseq}\." pattern and explicit positive
 * exponents ("+") which ExponentDecimalReal rule doesn't.
 * Scientific notation in JSON is represented in
 * "(0|[1-9][0-9]*)(\.[0-9]+)?[Ee][+-]?[0-9]+" pattern that is covered by
 * decimalsci pattern.
 *
 * decimalscifail1 and decimalscifail2 match literals whose exponent part has
 * no digits; the rules section reports them as errors.
 */
decimalsci ({digitseq}|{decimal})[Ee][+-]?{digitseq}
decimalscifail1 ({digitseq}|{decimal})[Ee]
decimalscifail2 ({digitseq}|{decimal})[Ee][+-]

/*
 * These patterns cover StringLiteral rule in Cypher and JSON strings.
 * The escape sequence "\/" has been added for JSON strings.
 *
 * esasciifail and esunicodefail patterns handle escape sequences that are not
 * accepted by esascii and esunicode patterns respectively.
 *
 * Since esasciifail pattern can match anything that esascii pattern can,
 * esascii must appear first before esasciifail in the rules section.
 *
 * dqstr and sqstr start conditions are for double-quoted and single-quoted
 * strings. qstru start condition is for Unicode low surrogates.
 */
%x dqstr sqstr qstru
dquote \"
dqchars [^"\\]+
squote '
sqchars [^'\\]+
esascii \\["'/\\bfnrt]
esasciifail \\[^Uu]?
/*
 * esunicode accepts "\U" followed by exactly 8 hexadecimal digits or "\u"
 * followed by exactly 4. esunicodefail accepts the same prefixes followed by
 * fewer digits so that the rules section can report a malformed escape.
 */
esunicode \\(U{hexdigit}{8}|u{hexdigit}{4})
esunicodefail \\(U{hexdigit}{0,7}|u{hexdigit}{0,3})

/* any single byte, including a newline ("s" flag) */
any (?s:.)

/*
 * id pattern is for UnescapedSymbolicName rule in Cypher.
 *
 * Bytes in \x80-\xFF are accepted in identifiers -- presumably so that
 * multibyte-encoded characters can be used; confirm against the grammar.
 */
id {idstart}{idcont}*
idstart [A-Z_a-z\x80-\xFF]
idcont [$0-9A-Z_a-z\x80-\xFF]

/*
 * These are for EscapedSymbolicName rule in Cypher.
 *
 * bqid is the start condition for the body of a backquoted identifier.
 * esbquote (two consecutive backquotes) stands for one literal backquote.
 */
%x bqid
bquote `
bqchars [^`]+
esbquote {bquote}{bquote}

/*
 * Parameter rule in Cypher is "$" followed by SymbolicName or DecimalInteger
 * rule. However, according to "Cypher Query Language Reference",
 *
 *     Parameters may consist of letters and numbers, and any combination of
 *     these, but cannot start with a number or a currency symbol.
 *
 * So, a modified version of Parameter rule that follows the above explanation
 * has been used.
 */
param \${id}

/*
 * These are tokens that are used as operators and language constructs in
 * Cypher, and some of them are structural characters in JSON.
 */
left_contains "<@"
right_contains "@>"
any_exists "?|"
all_exists "?&"
concat "||"
access_path "#>"
lt_gt "<>"
lt_eq "<="
gt_eq ">="
dot_dot ".."
plus_eq "+="
eq_tilde "=~"
typecast "::"

/* single-character tokens that are returned as they are */
self [?%()*+,\-./:;<=>[\]^{|}]

/* any other single character (except a newline); reported as an error */
other .
%{
/*
 * Growable byte buffer used to assemble a literal from several matches.
 * "length" is the number of bytes in use; "capacity" is the allocated size.
 */
typedef struct strbuf
{
    char *buffer;
    int capacity;
    int length;
} strbuf;

static void strbuf_init(strbuf *sb, int capacity);
static void strbuf_cleanup(strbuf *sb);
static void strbuf_append_buf(strbuf *sb, const char *b, const int len);
static void strbuf_append_char(strbuf *sb, const char c);
static void strbuf_append_codepoint(strbuf *sb, const pg_wchar c);
static void strbuf_ensure_capacity(strbuf *sb, int len);
static const char *strbuf_get_str(strbuf *sb);
static void strbuf_reset(strbuf *sb);

/* per-scanner state, stored by value as the extra data of the scanner */
typedef struct ag_yy_extra
{
    /*
     * accumulate matched strings to build a complete literal if multiple rules
     * are needed to scan it, or keep a decimal integer literal that is
     * converted from a hexadecimal or an octal integer literal if it is too
     * large to fit in "int" type
     */
    strbuf literal_buf;

    // for Unicode surrogate pair
    pg_wchar high_surrogate;
    int start_cond;

    // for the location of the current token and the actual position of it
    const char *scan_buf;
    int last_loc;
} ag_yy_extra;

static void integer_literal_to_token(const char *s, ag_token *token,
                                     ag_yy_extra *extra);
#define hexadecimal_to_decimal(numstr, sb) _numstr_to_decimal(numstr, 16, sb)
#define octal_to_decimal(numstr, sb) _numstr_to_decimal(numstr, 8, sb)
static void _numstr_to_decimal(const char *numstr, const int base, strbuf *sb);
static uint32 hexdigit_value(const char c);
static uint32 octdigit_value(const char c);
static bool is_high_surrogate(const pg_wchar c);
static bool is_low_surrogate(const pg_wchar c);

/* remember the byte offset of the current match within the scan buffer */
#define update_location() \
    do \
    { \
        yyextra.last_loc = yytext - yyextra.scan_buf; \
    } while (0)
#define get_location() (yyextra.last_loc)

#define scan_errmsg(msg) _scan_errmsg(msg, &yyextra)
static int _scan_errmsg(const char *msg, const ag_yy_extra *extra);
#define scan_errposition() _scan_errposition(yyextra.last_loc, &yyextra)
static int _scan_errposition(const int location, const ag_yy_extra *extra);

/*
 * Avoid exit() on fatal scanner errors.
 * Call yy_fatal_error() just to keep compiler quiet.
 */
#define YY_FATAL_ERROR(msg) \
    do \
    { \
        ereport(ERROR, (errmsg_internal("%s", msg))); \
        yy_fatal_error(NULL, NULL); \
    } while (0)

/*
 * "yyscanner" must be used for the name of the parameter because it is
 * referenced internally. "yyscan_t" is OK because it is actually "void *"
 * and is the same with "ag_scanner_t".
 */
#define YY_DECL ag_token ag_scanner_next_token(yyscan_t yyscanner)

/* number of decimal digits produced by one division by 10^9 */
#define NDIGITS_PER_REMAINDER 9
%}

%%

%{
// This is used in the actions below.
ag_token token;
%}

 /*
  * Rules without a start condition prefix are active in INITIAL only, because
  * every start condition in this scanner is exclusive (%x). Rules that belong
  * to a multi-line comment, a quoted string, or a backquoted identifier must
  * carry the matching <condition> prefix.
  */

{whitespace} {
    // ignore
}

{mlcstart} {
    // update location in case of unterminated comment
    update_location();
    BEGIN(mlcomment);
}

<mlcomment>{mlcchars} {
    // ignore
}

<mlcomment>{mlcstop} {
    BEGIN(INITIAL);
}

<mlcomment><<EOF>> {
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated /* comment"),
                    scan_errposition()));
}

{slcomment} {
    // ignore
}

{digitseq} |
{hexint} {
    update_location();
    integer_literal_to_token(yytext, &token, &yyextra);
    token.location = get_location();
    return token;
}

{hexintfail} {
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("invalid hexadecimal integer literal"),
                    scan_errposition()));
}

{decimal} |
{decimalsci} {
    update_location();
    token.type = AG_TOKEN_DECIMAL;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{decimalfail} {
    // return dot_dot back to the input stream
    yyless(yyleng - 2);

    update_location();

    // consume digitseq
    integer_literal_to_token(yytext, &token, &yyextra);
    token.location = get_location();
    return token;
}

{decimalscifail1} |
{decimalscifail2} {
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("invalid scientific notation literal"),
                    scan_errposition()));
}

{dquote} {
    // the location of a string token is its opening quote
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(dqstr);
}

{squote} {
    // the location of a string token is its opening quote
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(sqstr);
}

<dqstr>{dqchars} |
<sqstr>{sqchars} {
    strbuf_append_buf(&yyextra.literal_buf, yytext, yyleng);
}

<dqstr,sqstr>{esascii} {
    char c;

    switch (yytext[1])
    {
    case 'b':
        c = '\b';
        break;
    case 'f':
        c = '\f';
        break;
    case 'n':
        c = '\n';
        break;
    case 'r':
        c = '\r';
        break;
    case 't':
        c = '\t';
        break;
    default:
        // '"', '\'', '/', and '\\'
        c = yytext[1];
        break;
    }

    strbuf_append_char(&yyextra.literal_buf, c);
}

<dqstr,sqstr>{esasciifail} {
    if (yyleng == 1)
    {
        /*
         * This happens when the scanner meets "\"<<EOF>>. Just consume "\"
         * so that the <<EOF>> rule for quoted strings can do the rest.
         */
        strbuf_append_char(&yyextra.literal_buf, '\\');
    }
    else
    {
        update_location();
        ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                        scan_errmsg("invalid escape sequence"),
                        errdetail("Valid escape sequences are \\\", \\', \\/, \\\\, \\b, \\f, \\n, \\r, \\t, \\uXXXX, and \\UXXXXXXXX."),
                        scan_errposition()));
    }
}

<dqstr,sqstr>{esunicode} {
    pg_wchar c;

    // It is unnecessary to check endptr and errno here.
    c = strtoul(yytext + 2, NULL, 16);
    if (c > 0x10FFFF)
    {
        // c is greater than the maximum value of a Unicode code point.
        update_location();
        ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                        scan_errmsg("invalid Unicode escape value"),
                        errdetail("Unicode escape values cannot be greater than 10FFFF, which is the maximum value of a code point."),
                        scan_errposition()));
    }
    else if (c > 0x7F)
    {
        if (GetDatabaseEncoding() != PG_UTF8)
        {
            update_location();
            ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                            scan_errmsg("unsupported Unicode escape value"),
                            errdetail("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8."),
                            scan_errposition()));
        }

        if (is_high_surrogate(c))
        {
            // remember where to return to once the low surrogate is scanned
            yyextra.high_surrogate = c;
            yyextra.start_cond = YY_START;
            BEGIN(qstru);
        }
        else if (is_low_surrogate(c))
        {
            update_location();
            ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                            scan_errmsg("invalid Unicode surrogate pair"),
                            errdetail("A low surrogate must follow a high surrogate."),
                            scan_errposition()));
        }
        else
        {
            strbuf_append_codepoint(&yyextra.literal_buf, c);
        }
    }
    else if (c > 0)
    {
        // c is an ASCII character.
        strbuf_append_char(&yyextra.literal_buf, (char)c);
    }
    else
    {
        /*
         * U+0000 NUL is the minimum value of a Unicode code point.
         * However, it is invalid in quoted strings as well as query strings.
         */
        update_location();
        ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                        scan_errmsg("unsupported Unicode escape value"),
                        errdetail("Unicode code point value 0000 is not allowed in quoted strings."),
                        scan_errposition()));
    }
}

<qstru>{esunicode} {
    pg_wchar c;

    c = strtoul(yytext + 2, NULL, 16);
    if (is_low_surrogate(c))
    {
        c = surrogate_pair_to_codepoint(yyextra.high_surrogate, c);
        // 0x010000 <= c <= 0x10FFFF always holds for surrogate pairs.
        strbuf_append_codepoint(&yyextra.literal_buf, c);
        // go back to the string condition that was active before qstru
        BEGIN(yyextra.start_cond);
    }
    else
    {
        update_location();
        ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                        scan_errmsg("invalid Unicode surrogate pair"),
                        errdetail("A low surrogate must follow a high surrogate."),
                        scan_errposition()));
    }
}

<dqstr,sqstr,qstru>{esunicodefail} {
    update_location();
    ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                    scan_errmsg("invalid Unicode escape sequence"),
                    errhint("Unicode escape sequences must be \\uXXXX or \\UXXXXXXXX."),
                    scan_errposition()));
}

<qstru>{any} {
    // anything other than a Unicode escape after a high surrogate
    update_location();
    ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
                    scan_errmsg("invalid Unicode surrogate pair"),
                    errdetail("A low surrogate must follow a high surrogate."),
                    scan_errposition()));
}

<dqstr>{dquote} |
<sqstr>{squote} {
    BEGIN(INITIAL);

    /*
     * In quoted strings, only Unicode escape sequences need to be verified,
     * and the actions for <dqstr,sqstr>{esunicode} and <qstru>{esunicode}
     * rules verify the code point values. So, quoted strings are always valid.
     */

    token.type = AG_TOKEN_STRING;
    token.value.s = strbuf_get_str(&yyextra.literal_buf);
    token.location = get_location();
    return token;
}

<dqstr,sqstr,qstru><<EOF>> {
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated quoted string"),
                    scan_errposition()));
}

{id} {
    update_location();
    token.type = AG_TOKEN_IDENTIFIER;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{bquote} {
    // the location of a quoted identifier is its opening backquote
    update_location();
    strbuf_reset(&yyextra.literal_buf);
    BEGIN(bqid);
}

<bqid>{bqchars} {
    strbuf_append_buf(&yyextra.literal_buf, yytext, yyleng);
}

<bqid>{esbquote} {
    strbuf_append_char(&yyextra.literal_buf, '`');
}

<bqid>{bquote} {
    BEGIN(INITIAL);

    if (yyextra.literal_buf.length == 0)
    {
        ereport(ERROR, (errcode(ERRCODE_INVALID_NAME),
                        scan_errmsg("zero-length quoted identifier"),
                        scan_errposition()));
    }

    token.type = AG_TOKEN_IDENTIFIER;
    token.value.s = strbuf_get_str(&yyextra.literal_buf);
    token.location = get_location();
    return token;
}

<bqid><<EOF>> {
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unterminated quoted identifier"),
                    scan_errposition()));
}

{param} {
    update_location();
    token.type = AG_TOKEN_PARAMETER;
    // skip the leading "$"
    token.value.s = yytext + 1;
    token.location = get_location();
    return token;
}

{concat} {
    update_location();
    token.type = AG_TOKEN_CONCAT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{access_path} {
    update_location();
    token.type = AG_TOKEN_ACCESS_PATH;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{any_exists} {
    update_location();
    token.type = AG_TOKEN_ANY_EXISTS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{left_contains} {
    update_location();
    token.type = AG_TOKEN_LEFT_CONTAINS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{right_contains} {
    update_location();
    token.type = AG_TOKEN_RIGHT_CONTAINS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{all_exists} {
    update_location();
    token.type = AG_TOKEN_ALL_EXISTS;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{lt_gt} {
    update_location();
    token.type = AG_TOKEN_LT_GT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{lt_eq} {
    update_location();
    token.type = AG_TOKEN_LT_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{gt_eq} {
    update_location();
    token.type = AG_TOKEN_GT_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{dot_dot} {
    update_location();
    token.type = AG_TOKEN_DOT_DOT;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{plus_eq} {
    update_location();
    token.type = AG_TOKEN_PLUS_EQ;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{eq_tilde} {
    update_location();
    token.type = AG_TOKEN_EQ_TILDE;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{typecast} {
    update_location();
    token.type = AG_TOKEN_TYPECAST;
    token.value.s = yytext;
    token.location = get_location();
    return token;
}

{self} {
    update_location();
    token.type = AG_TOKEN_CHAR;
    token.value.c = yytext[0];
    token.location = get_location();
    return token;
}

{other} {
    update_location();
    ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR),
                    scan_errmsg("unexpected character"),
                    scan_errposition()));
}

<<EOF>> {
    update_location();
    token.type = AG_TOKEN_NULL;
    token.value.c = '\0';
    token.location = get_location();
    return token;
}

%%

/*
 * Override the default memory management to make flex use palloc() instead of
 * malloc().
*/ void *ag_yyalloc(yy_size_t size, yyscan_t yyscanner) { return palloc(size); } void *ag_yyrealloc(void *ptr, yy_size_t size, yyscan_t yyscanner) { // see realloc(3) if (ptr) { if (size == 0) { pfree(ptr); return NULL; } else { return repalloc(ptr, size); } } else { return palloc(size); } } void ag_yyfree(void *ptr, yyscan_t yyscanner) { if (ptr) pfree(ptr); } static void strbuf_init(strbuf *sb, int capacity) { sb->buffer = palloc(capacity); sb->capacity = capacity; sb->length = 0; } static void strbuf_cleanup(strbuf *sb) { if (sb->buffer) pfree(sb->buffer); } static void strbuf_append_buf(strbuf *sb, const char *b, const int len) { strbuf_ensure_capacity(sb, sb->length + len); memcpy(sb->buffer + sb->length, b, len); sb->length += len; } static void strbuf_append_char(strbuf *sb, const char c) { strbuf_ensure_capacity(sb, sb->length + 1); sb->buffer[sb->length] = c; sb->length += 1; } static void strbuf_append_codepoint(strbuf *sb, const pg_wchar c) { unsigned char buf[6]; unicode_to_utf8(c, buf); strbuf_append_buf(sb, (char *)buf, pg_utf_mblen(buf)); } /* * len cannot be greater than MaxAllocSize because ReadCommand() reads * a message and places the message body in StringInfo. */ static void strbuf_ensure_capacity(strbuf *sb, int len) { // consider additional 1 byte for the last '\0' character if (len < sb->capacity) return; do { sb->capacity *= 2; } while (sb->capacity <= len); sb->buffer = repalloc(sb->buffer, sb->capacity); } static const char *strbuf_get_str(strbuf *sb) { sb->buffer[sb->length] = '\0'; return sb->buffer; } static void strbuf_reset(strbuf *sb) { sb->length = 0; } static void integer_literal_to_token(const char *s, ag_token *token, ag_yy_extra *extra) { char *endptr; int i; errno = 0; i = strtoint(s, &endptr, 0); /* * This is only needed for invalid octal integer literals. (e.g. "08") * Other cases cannot happen because of digitseq and hexint rules. 
*/ if (*endptr != '\0') { ereport(ERROR, (errcode(ERRCODE_SYNTAX_ERROR), _scan_errmsg("invalid octal integer literal", extra), _scan_errposition(extra->last_loc, extra))); } // Treat it as a decimal if it is too large to be an "int" value. if (errno == ERANGE) { /* * Accessing s[0] and s[1] is safe because ERANGE is returned only if * there are 10 or more characters in s. In this case, the shortest * integer literals for decimal, hexadecimal, and octal integers are * "2147483648", "0x80000000", and "020000000000" respectively. */ if (s[0] == '0') { strbuf_reset(&extra->literal_buf); /* * No matter how many characters s has, if all digits in s are * zeros, strtoint() returns 0 without an error. * So, _numstr_to_decimal() assumes that there is at least one * non-zero digit in s. */ if (s[1] == 'X' || s[1] == 'x') hexadecimal_to_decimal(s + 2, &extra->literal_buf); else octal_to_decimal(s + 1, &extra->literal_buf); s = strbuf_get_str(&extra->literal_buf); } token->type = AG_TOKEN_DECIMAL; token->value.s = s; return; } token->type = AG_TOKEN_INTEGER; token->value.i = i; } /* * convert a string of a hexadecimal or an octal integer to a string of the * corresponding decimal integer */ static void _numstr_to_decimal(const char *numstr, const int base, strbuf *sb) { // constants for each base int ndigits_per_word; int nbits_per_digit; uint32 (*digit_value)(const char); /* * constants for the conversion * * "divisor" is 10^9. * * At most 3 divisions are needed to eliminate 1 word. 
* hex: 4294967295999999999 -> 4294967295 -> 4 -> 0 * oct: 1073741823999999999 -> 1073741823 -> 1 -> 0 */ const uint64 divisor = 1000000000; const int ndivisions = 3; int ndigits; int nwords; uint32 *words; const char *digitp; int word_i; int ndigits_word0; uint32 word; uint32 *remainders; int nremainders; int i; // set constants for each base switch (base) { case 16: /* * Hexadecimal * * Maximum value for each word * 0xFFFFFFFF = 4294967295 * Divisor * 0x3B9ACA00 = 1000000000 * Maximum remainder * 0x3B9AC9FF = 999999999 * * Maximum dividend * 0x3B9AC9FFFFFFFFFF = 4294967295999999999 * Quotient of the maximum dividend and the divisor * 0xFFFFFFFF = 4294967295 * Remainer of the above division * 0x3B9AC9FF = 999999999 */ ndigits_per_word = 8; nbits_per_digit = 4; digit_value = hexdigit_value; break; case 8: /* * Octal * * Maximum value for each word * 07777777777 = 1073741823 * Divisor * 07346545000 = 1000000000 * Maximum remainder * 07346544777 = 999999999 * * Maximum dividend * 073465447777777777777 = 1073741823999999999 * Quotient of the maximum dividend and the divisor * 07777777777 = 1073741823 * Remainer of the above division * 07346544777 = 999999999 */ ndigits_per_word = 10; nbits_per_digit = 3; digit_value = octdigit_value; break; default: Assert(!"invalid base"); return; } // skip leading zeros while (*numstr == '0') numstr++; // number of digits in "numstr" ndigits = strlen(numstr); Assert(ndigits > 0); // prepare "words" to store "numstr" in two's complement representation nwords = (ndigits + (ndigits_per_word - 1)) / ndigits_per_word; words = palloc(sizeof(*words) * nwords); digitp = numstr; word_i = 0; // number of digits for the first word ndigits_word0 = ndigits % ndigits_per_word; if (ndigits_word0 == 0) ndigits_word0 = ndigits_per_word; // fill the first word word = digit_value(*digitp++); for (i = 1; i < ndigits_word0; i++) { word <<= nbits_per_digit; word |= digit_value(*digitp++); } words[word_i++] = word; // fill the rest of "words" while (word_i 
< nwords) { word = digit_value(*digitp++); for (i = 1; i < ndigits_per_word; i++) { word <<= nbits_per_digit; word |= digit_value(*digitp++); } words[word_i++] = word; } // At most "ndivisions" divisions are needed to eliminate 1 word. remainders = palloc(sizeof(*remainders) * (ndivisions * nwords)); nremainders = 0; word_i = 0; // repeat dividing "words" by "divisor" until the quotient becomes 0 while (word_i < nwords) { uint64 r; r = 0; // divide "words" by "divisor" for (i = word_i; i < nwords; i++) { uint64 d; uint64 q; d = (uint64)words[i]; d |= r << (nbits_per_digit * ndigits_per_word); q = d / divisor; r = d % divisor; words[i] = (uint32)q; } // collect the remainder to build the result remainders[nremainders++] = (uint32)r; /* * Divisions over the first effective word is done * and "words" is getting closer to 0. */ if (words[word_i] == 0) word_i++; } // convert the collected remainders to a string, starting from the last one for (i = nremainders - 1; i >= 0; i--) { char buf[NDIGITS_PER_REMAINDER]; int buf_i; uint32 tmp; buf_i = NDIGITS_PER_REMAINDER; for (tmp = remainders[i]; tmp > 0; tmp /= 10) buf[--buf_i] = '0' + (char)(tmp % 10); // leading zeros for intermediate digits if (i < nremainders - 1) { while (buf_i > 0) buf[--buf_i] = '0'; } strbuf_append_buf(sb, &buf[buf_i], NDIGITS_PER_REMAINDER - buf_i); } pfree(remainders); pfree(words); } static uint32 hexdigit_value(const char c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'A' && c <= 'F') return 0xA + (c - 'A'); Assert(c >= 'a' && c <= 'f'); return 0xA + (c - 'a'); } static uint32 octdigit_value(const char c) { Assert(c >= '0' && c <= '7'); return c - '0'; } static bool is_high_surrogate(const pg_wchar c) { return (c >= 0xD800 && c <= 0xDBFF); } static bool is_low_surrogate(const pg_wchar c) { return (c >= 0xDC00 && c <= 0xDFFF); } static int _scan_errmsg(const char *msg, const ag_yy_extra *extra) { const char *t = extra->scan_buf + extra->last_loc; if (t[0] == YY_END_OF_BUFFER_CHAR) return 
errmsg("%s at end of input", msg); else return errmsg("%s at or near \"%s\"", msg, t); } static int _scan_errposition(const int location, const ag_yy_extra *extra) { int pos; // no-op if location is unknown if (location < 0) return 0; // convert byte offset to number of characters pos = pg_mbstrlen_with_len(extra->scan_buf, location) + 1; return errposition(pos); } ag_scanner_t ag_scanner_create(const char *s) { Size len; char *buf; yyscan_t yyscanner; ag_yy_extra extra; int ret; // The last two YY_END_OF_BUFFER_CHAR are required by flex. len = strlen(s); buf = palloc(len + 2); memcpy(buf, s, len); buf[len] = YY_END_OF_BUFFER_CHAR; buf[len + 1] = YY_END_OF_BUFFER_CHAR; ret = ag_yylex_init(&yyscanner); if (ret) elog(ERROR, "ag_yylex_init() failed: %m"); strbuf_init(&extra.literal_buf, 1024); extra.high_surrogate = 0; extra.start_cond = INITIAL; extra.scan_buf = buf; extra.last_loc = 0; ag_yyset_extra(extra, yyscanner); ag_yy_scan_buffer(buf, len + 2, yyscanner); return yyscanner; } void ag_scanner_destroy(ag_scanner_t scanner) { ag_yy_extra extra; extra = ag_yyget_extra(scanner); strbuf_cleanup(&extra.literal_buf); ag_yylex_destroy(scanner); } int ag_scanner_errmsg(const char *msg, ag_scanner_t *scanner) { ag_yy_extra extra; extra = ag_yyget_extra(scanner); return _scan_errmsg(msg, &extra); } int ag_scanner_errposition(const int location, ag_scanner_t *scanner) { ag_yy_extra extra; extra = ag_yyget_extra(scanner); return _scan_errposition(location, &extra); }