/* * For PostgreSQL Database Management System: * (formerly known as Postgres, then as Postgres95) * * Portions Copyright (c) 1996-2010, The PostgreSQL Global Development Group * * Portions Copyright (c) 1994, The Regents of the University of California * * Permission to use, copy, modify, and distribute this software and its * documentation for any purpose, without fee, and without a written agreement * is hereby granted, provided that the above copyright notice and this * paragraph and the following two paragraphs appear in all copies. * * IN NO EVENT SHALL THE UNIVERSITY OF CALIFORNIA BE LIABLE TO ANY PARTY FOR * DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, INCLUDING * LOST PROFITS, ARISING OUT OF THE USE OF THIS SOFTWARE AND ITS DOCUMENTATION, * EVEN IF THE UNIVERSITY OF CALIFORNIA HAS BEEN ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * THE UNIVERSITY OF CALIFORNIA SPECIFICALLY DISCLAIMS ANY WARRANTIES, * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND * FITNESS FOR A PARTICULAR PURPOSE. * * THE SOFTWARE PROVIDED HEREUNDER IS ON AN "AS IS" BASIS, AND THE UNIVERSITY OF * CALIFORNIA HAS NO OBLIGATIONS TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, * ENHANCEMENTS, OR MODIFICATIONS. */ #include "postgres.h" #include "access/gin.h" #include "access/hash.h" #include "catalog/pg_collation.h" #include "utils/agtype.h" #include "utils/float.h" #include "utils/builtins.h" #include "utils/varlena.h" #include "pg_fix.h" typedef struct PathHashStack { uint32 hash; struct PathHashStack *parent; } PathHashStack; static Datum make_text_key(char flag, const char *str, int len); static Datum make_scalar_key(const agtype_value *scalar_val, bool is_key); /* * * agtype_ops GIN opclass support functions * */ /* * Compares two keys (not indexed items!) and returns an integer less than zero, * zero, or greater than zero, indicating whether the first key is less than, * equal to, or greater than the second. NULL keys are never passed to this * function. */ PG_FUNCTION_INFO_V1(gin_compare_agtype); Datum gin_compare_agtype(PG_FUNCTION_ARGS) { text *arg1, *arg2; int32 result; char *a1p, *a2p; int len1, len2; if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) { PG_RETURN_NULL(); } arg1 = PG_GETARG_TEXT_PP(0); arg2 = PG_GETARG_TEXT_PP(1); a1p = VARDATA_ANY(arg1); a2p = VARDATA_ANY(arg2); len1 = VARSIZE_ANY_EXHDR(arg1); len2 = VARSIZE_ANY_EXHDR(arg2); /* Compare text as bttextcmp does, but always using C collation */ result = varstr_cmp(a1p, len1, a2p, len2, C_COLLATION_OID); PG_FREE_IF_COPY(arg1, 0); PG_FREE_IF_COPY(arg2, 1); PG_RETURN_INT32(result); } /* * Returns a palloc'd array of keys given an item to be indexed. The number of * returned keys must be stored into *nkeys. The return value can be NULL if the * item contains no keys. */ PG_FUNCTION_INFO_V1(gin_extract_agtype); Datum gin_extract_agtype(PG_FUNCTION_ARGS) { agtype *agt; int32 *nentries; int total; agtype_iterator *it; agtype_value v; agtype_iterator_token r; int i = 0; Datum *entries; if (PG_ARGISNULL(0) || PG_ARGISNULL(1)) { PG_RETURN_POINTER(NULL); } agt = (agtype *) AG_GET_ARG_AGTYPE_P(0); nentries = (int32 *) PG_GETARG_POINTER(1); total = 2 * AGT_ROOT_COUNT(agt); /* If the root level is empty, we certainly have no keys */ if (total == 0) { *nentries = 0; PG_RETURN_POINTER(NULL); } /* Otherwise, use 2 * root count as initial estimate of result size */ entries = (Datum *) palloc(sizeof(Datum) * total); it = agtype_iterator_init(&agt->root); while ((r = agtype_iterator_next(&it, &v, false)) != WAGT_DONE) { /* Since we recurse into the object, we might need more space */ if (i >= total) { total *= 2; entries = (Datum *) repalloc(entries, sizeof(Datum) * total); } switch (r) { case WAGT_KEY: entries[i++] = make_scalar_key(&v, true); break; case WAGT_ELEM: /* Pretend string array elements are keys */ entries[i++] = make_scalar_key(&v, (v.type == AGTV_STRING)); break; case WAGT_VALUE: entries[i++] = make_scalar_key(&v, false); break; default: /* we can ignore structural items */ break; } } *nentries = i; PG_RETURN_POINTER(entries); } /* * Returns a palloc'd array of keys given a value to be queried; that is, query * is the value on the right-hand side of an indexable operator whose left-hand * side is the indexed column. The number of returned keys must be stored into * *nkeys. If any of the keys can be null, also palloc an array of *nkeys bool * fields, store its address at *nullFlags, and set these null flags as needed. * *nullFlags can be left NULL (its initial value) if all keys are non-null. * The return value can be NULL if the query contains no keys. * * searchMode is an output argument that allows extractQuery to specify details * about how the search will be done. If *searchMode is set to * GIN_SEARCH_MODE_DEFAULT (which is the value it is initialized to before * call), only items that match at least one of the returned keys are considered * candidate matches. If *searchMode is set to GIN_SEARCH_MODE_ALL, then all * non-null items in the index are considered candidate matches, whether they * match any of the returned keys or not. This is only done when the contains * or exists all strategy are used and the passed map is empty. */ PG_FUNCTION_INFO_V1(gin_extract_agtype_query); Datum gin_extract_agtype_query(PG_FUNCTION_ARGS) { int32 *nentries; StrategyNumber strategy; int32 *searchMode; Datum *entries; if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || PG_ARGISNULL(6)) { PG_RETURN_NULL(); } nentries = (int32 *) PG_GETARG_POINTER(1); strategy = PG_GETARG_UINT16(2); searchMode = (int32 *) PG_GETARG_POINTER(6); if (strategy == AGTYPE_CONTAINS_STRATEGY_NUMBER) { /* Query is a agtype, so just apply gin_extract_agtype... */ entries = (Datum *) DatumGetPointer(DirectFunctionCall2(gin_extract_agtype, PG_GETARG_DATUM(0), PointerGetDatum(nentries))); /* ...although "contains {}" requires a full index scan */ if (*nentries == 0) { *searchMode = GIN_SEARCH_MODE_ALL; } } else if (strategy == AGTYPE_EXISTS_STRATEGY_NUMBER) { /* Query is a text string, which we treat as a key */ text *query = PG_GETARG_TEXT_PP(0); *nentries = 1; entries = (Datum *)palloc(sizeof(Datum)); entries[0] = make_text_key(AGT_GIN_FLAG_KEY, VARDATA_ANY(query), VARSIZE_ANY_EXHDR(query)); } else if (strategy == AGTYPE_EXISTS_ANY_STRATEGY_NUMBER || strategy == AGTYPE_EXISTS_ALL_STRATEGY_NUMBER) { /* Query is a text array; each element is treated as a key */ agtype *agt = AG_GET_ARG_AGTYPE_P(0); agtype_iterator *it = NULL; agtype_value elem; agtype_iterator_token itok; int key_count = AGTYPE_CONTAINER_SIZE(&agt->root); int index = 0; if (AGTYPE_CONTAINER_IS_SCALAR(&agt->root) || !AGTYPE_CONTAINER_IS_ARRAY(&agt->root)) { elog(ERROR, "GIN query requires an agtype array"); } entries = (Datum *) palloc(sizeof(Datum) * key_count); it = agtype_iterator_init(&agt->root); /* it should be WAGT_BEGIN_ARRAY */ itok = agtype_iterator_next(&it, &elem, true); if (itok != WAGT_BEGIN_ARRAY) { elog(ERROR, "unexpected iterator token: %d", itok); } while (WAGT_END_ARRAY != agtype_iterator_next(&it, &elem, true)) { if (elem.type != AGTV_STRING) { elog(ERROR, "unsupport agtype for GIN lookup: %d", elem.type); } entries[index++] = make_text_key(AGT_GIN_FLAG_KEY, elem.val.string.val, elem.val.string.len); } *nentries = index; /* ExistsAll with no keys should match everything */ if (index == 0 && strategy == AGTYPE_EXISTS_ALL_STRATEGY_NUMBER) { *searchMode = GIN_SEARCH_MODE_ALL; } } else { elog(ERROR, "unrecognized strategy number: %d", strategy); entries = NULL; /* keep compiler quiet */ } PG_RETURN_POINTER(entries); } /* * Returns true if an indexed item satisfies the query operator for the given * strategy (or might satisfy it, if the recheck indication is returned). This * function does not have direct access to the indexed item's value, since GIN * does not store items explicitly. Rather, what is available is knowledge about * which key values extracted from the query appear in a given indexed item. The * check array has length nkeys, which is the same as the number of keys * previously returned by gin_extract_agtype_query for this query datum. Each * element of the check array is true if the indexed item contains the * corresponding query key, i.e., if (check[i] == true) the i-th key of the * gin_extract_agtype_query result array is present in the indexed item. The * original query datum is passed in case the consistent method needs to consult * it, and so are the queryKeys[] and nullFlags[] arrays previously returned by * gin_extract_agtype_query. * * When extractQuery returns a null key in queryKeys[], the corresponding * check[] element is true if the indexed item contains a null key; that is, the * semantics of check[] are like IS NOT DISTINCT FROM. The consistent function * can examine the corresponding nullFlags[] element if it needs to tell the * difference between a regular value match and a null match. * * On success, *recheck should be set to true if the heap tuple needs to be * rechecked against the query operator, or false if the index test is exact. * That is, a false return value guarantees that the heap tuple does not match * the query; a true return value with *recheck set to false guarantees that the * heap tuple does match the query; and a true return value with *recheck set to * true means that the heap tuple might match the query, so it needs to be * fetched and rechecked by evaluating the query operator directly against the * originally indexed item. */ PG_FUNCTION_INFO_V1(gin_consistent_agtype); Datum gin_consistent_agtype(PG_FUNCTION_ARGS) { bool *check; StrategyNumber strategy; int32 nkeys; bool *recheck; bool res = true; int32 i; if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(3) || PG_ARGISNULL(5)) { PG_RETURN_NULL(); } check = (bool *) PG_GETARG_POINTER(0); strategy = PG_GETARG_UINT16(1); nkeys = PG_GETARG_INT32(3); recheck = (bool *) PG_GETARG_POINTER(5); if (strategy == AGTYPE_CONTAINS_STRATEGY_NUMBER) { /* * We must always recheck, since we can't tell from the index whether * the positions of the matched items match the structure of the query * object. (Even if we could, we'd also have to worry about hashed * keys and the index's failure to distinguish keys from string array * elements.) However, the tuple certainly doesn't match unless it * contains all the query keys. */ *recheck = true; for (i = 0; i < nkeys; i++) { if (!check[i]) { res = false; break; } } } else if (strategy == AGTYPE_EXISTS_STRATEGY_NUMBER) { /* * Although the key is certainly present in the index, we must recheck * because (1) the key might be hashed, and (2) the index match might * be for a key that's not at top level of the JSON object. For (1), * we could look at the query key to see if it's hashed and not * recheck if not, but the index lacks enough info to tell about (2). */ *recheck = true; res = true; } else if (strategy == AGTYPE_EXISTS_ANY_STRATEGY_NUMBER) { /* As for plain exists, we must recheck */ *recheck = true; res = true; } else if (strategy == AGTYPE_EXISTS_ALL_STRATEGY_NUMBER) { /* As for plain exists, we must recheck */ *recheck = true; /* ... but unless all the keys are present, we can say "false" */ for (i = 0; i < nkeys; i++) { if (!check[i]) { res = false; break; } } } else { elog(ERROR, "unrecognized strategy number: %d", strategy); } PG_RETURN_BOOL(res); } /* * gin_triconsistent_agtype is similar to gin_consistent_agtype, but instead of * booleans in the check vector, there are three possible values for each key: * GIN_TRUE, GIN_FALSE and GIN_MAYBE. GIN_FALSE and GIN_TRUE have the same * meaning as regular boolean values, while GIN_MAYBE means that the presence of * that key is not known. When GIN_MAYBE values are present, the function should * only return GIN_TRUE if the item certainly matches whether or not the index * item contains the corresponding query keys. Likewise, the function must * return GIN_FALSE only if the item certainly does not match, whether or not it * contains the GIN_MAYBE keys. If the result depends on the GIN_MAYBE entries, * i.e., the match cannot be confirmed or refuted based on the known query keys, * the function must return GIN_MAYBE. * * When there are no GIN_MAYBE values in the check vector, a GIN_MAYBE return * value is the equivalent of setting the recheck flag in the boolean consistent * function. */ PG_FUNCTION_INFO_V1(gin_triconsistent_agtype); Datum gin_triconsistent_agtype(PG_FUNCTION_ARGS) { GinTernaryValue *check; StrategyNumber strategy; int32 nkeys; GinTernaryValue res = GIN_MAYBE; int32 i; if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(3)) { PG_RETURN_NULL(); } check = (GinTernaryValue *)PG_GETARG_POINTER(0); strategy = PG_GETARG_UINT16(1); nkeys = PG_GETARG_INT32(3); /* * Note that we never return GIN_TRUE, only GIN_MAYBE or GIN_FALSE; this * corresponds to always forcing recheck in the regular consistent * function, for the reasons listed there. */ if (strategy == AGTYPE_CONTAINS_STRATEGY_NUMBER || strategy == AGTYPE_EXISTS_ALL_STRATEGY_NUMBER) { /* All extracted keys must be present */ for (i = 0; i < nkeys; i++) { if (check[i] == GIN_FALSE) { res = GIN_FALSE; break; } } } else if (strategy == AGTYPE_EXISTS_STRATEGY_NUMBER || strategy == AGTYPE_EXISTS_ANY_STRATEGY_NUMBER) { /* At least one extracted key must be present */ res = GIN_FALSE; for (i = 0; i < nkeys; i++) { if (check[i] == GIN_TRUE || check[i] == GIN_MAYBE) { res = GIN_MAYBE; break; } } } else { elog(ERROR, "unrecognized strategy number: %d", strategy); } PG_RETURN_GIN_TERNARY_VALUE(res); } /* * Construct a agtype_ops GIN key from a flag byte and a textual representation * (which need not be null-terminated). This function is responsible * for hashing overlength text representations; it will add the * AGT_GIN_FLAG_HASHED bit to the flag value if it does that. */ static Datum make_text_key(char flag, const char *str, int len) { text *item; char hashbuf[10]; if (len > AGT_GIN_MAX_LENGTH) { uint32 hashval; hashval = DatumGetUInt32(hash_any((const unsigned char *) str, len)); snprintf(hashbuf, sizeof(hashbuf), "%08x", hashval); str = hashbuf; len = 8; flag |= AGT_GIN_FLAG_HASHED; } /* * Now build the text Datum. For simplicity we build a 4-byte-header * varlena text Datum here, but we expect it will get converted to short * header format when stored in the index. */ item = (text *)palloc(VARHDRSZ + len + 1); SET_VARSIZE(item, VARHDRSZ + len + 1); *VARDATA(item) = flag; memcpy(VARDATA(item) + 1, str, len); return PointerGetDatum(item); } /* * Create a textual representation of a agtype_value that will serve as a GIN * key in a agtype_ops index. is_key is true if the JsonbValue is a key, * or if it is a string array element (since we pretend those are keys, * see jsonb.h). */ static Datum make_scalar_key(const agtype_value *scalarVal, bool is_key) { Datum item = 0; char *cstr = NULL; char buf[MAXINT8LEN + 1]; switch (scalarVal->type) { case AGTV_NULL: Assert(!is_key); item = make_text_key(AGT_GIN_FLAG_NULL, "", 0); break; case AGTV_INTEGER: { char *result; Assert(!is_key); pg_lltoa(scalarVal->val.int_value, buf); result = pstrdup(buf); item = make_text_key(AGT_GIN_FLAG_NUM, result, strlen(result)); break; } case AGTV_FLOAT: Assert(!is_key); cstr = float8out_internal(scalarVal->val.float_value); item = make_text_key(AGT_GIN_FLAG_NUM, cstr, strlen(cstr)); break; case AGTV_BOOL: Assert(!is_key); item = make_text_key(AGT_GIN_FLAG_BOOL, scalarVal->val.boolean ? "t" : "f", 1); break; case AGTV_NUMERIC: Assert(!is_key); /* * A normalized textual representation, free of trailing zeroes, * is required so that numerically equal values will produce equal * strings. * * It isn't ideal that numerics are stored in a relatively bulky * textual format. However, it's a notationally convenient way of * storing a "union" type in the GIN B-Tree, and indexing Jsonb * strings takes precedence. */ cstr = numeric_normalize(scalarVal->val.numeric); item = make_text_key(AGT_GIN_FLAG_NUM, cstr, strlen(cstr)); pfree(cstr); break; case AGTV_STRING: item = make_text_key(is_key ? AGT_GIN_FLAG_KEY : AGT_GIN_FLAG_STR, scalarVal->val.string.val, scalarVal->val.string.len); break; case AGTV_VERTEX: case AGTV_EDGE: case AGTV_PATH: elog(ERROR, "agtype type: %d is not a scalar", scalarVal->type); break; default: elog(ERROR, "unrecognized agtype type: %d", scalarVal->type); break; } return item; }