From 1647cae825d5ffb6ad2acbabc617d92b8cf64605 Mon Sep 17 00:00:00 2001 From: Peter Smith Date: Tue, 14 Oct 2025 16:22:21 +1100 Subject: [PATCH 1/3] VCI - changes to postgres core --- src/backend/access/heap/heapam.c | 18 +++ src/backend/access/heap/heapam_handler.c | 4 + src/backend/access/heap/heapam_visibility.c | 6 + src/backend/access/transam/xlogfuncs.c | 1 + src/backend/access/transam/xlogrecovery.c | 18 ++- src/backend/catalog/dependency.c | 7 ++ src/backend/catalog/index.c | 33 +++++- src/backend/commands/explain.c | 24 +++- src/backend/commands/indexcmds.c | 109 ++++++++--------- src/backend/commands/tablecmds.c | 125 ++++++++++++++++++++ src/backend/commands/vacuum.c | 16 +++ src/backend/executor/execAmi.c | 2 + src/backend/executor/execExpr.c | 6 +- src/backend/executor/execExprInterp.c | 36 ++++++ src/backend/executor/execProcnode.c | 1 + src/backend/executor/execScan.c | 2 +- src/backend/executor/nodeCustom.c | 16 +++ src/backend/executor/nodeModifyTable.c | 10 ++ src/backend/jit/llvm/llvmjit_expr.c | 12 ++ src/backend/jit/llvm/llvmjit_types.c | 2 + src/backend/nodes/gen_node_support.pl | 56 +++++++-- src/backend/optimizer/path/allpaths.c | 18 ++- src/backend/optimizer/plan/createplan.c | 4 +- src/backend/storage/ipc/procarray.c | 3 +- src/backend/storage/lmgr/lock.c | 26 +++- src/backend/utils/adt/timestamp.c | 18 +-- src/backend/utils/cache/relcache.c | 66 +++++++++++ src/bin/pg_dump/common.c | 1 + src/bin/pg_dump/pg_dump.c | 35 +++++- src/bin/pg_dump/pg_dump.h | 1 + src/include/access/heapam.h | 4 + src/include/access/xlogrecovery.h | 3 + src/include/catalog/dependency.h | 3 + src/include/catalog/index.h | 2 + src/include/commands/explain.h | 4 + src/include/commands/tablecmds.h | 6 + src/include/datatype/timestamp.h | 22 ++++ src/include/executor/execExpr.h | 16 +++ src/include/executor/nodeModifyTable.h | 2 + src/include/nodes/extensible.h | 4 + src/include/nodes/params.h | 2 + src/include/nodes/plannodes.h | 3 + src/include/optimizer/planner.h | 
2 + src/include/utils/relcache.h | 5 + src/include/utils/snapshot.h | 10 ++ 45 files changed, 664 insertions(+), 100 deletions(-) diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 568696333c25..40b18fa909d2 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -54,6 +54,7 @@ #include "utils/spccache.h" #include "utils/syscache.h" +void (*add_index_delete_hook) (Relation indexRelation, ItemPointer heap_tid, TransactionId xmin) = NULL; static HeapTuple heap_prepare_insert(Relation relation, HeapTuple tup, TransactionId xid, CommandId cid, int options); @@ -364,6 +365,9 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) * results for a non-MVCC snapshot, the caller must hold some higher-level * lock that ensures the interesting tuple(s) won't change.) */ + if (keep_startblock) + goto skip_get_number_of_blocks; + if (scan->rs_base.rs_parallel != NULL) { bpscan = (ParallelBlockTableScanDesc) scan->rs_base.rs_parallel; @@ -372,6 +376,8 @@ initscan(HeapScanDesc scan, ScanKey key, bool keep_startblock) else scan->rs_nblocks = RelationGetNumberOfBlocks(scan->rs_base.rs_rd); +skip_get_number_of_blocks: + /* * If the table is large relative to NBuffers, use a bulk-read access * strategy and enable synchronized scanning (see syncscan.c). 
Although @@ -2806,6 +2812,7 @@ heap_delete(Relation relation, ItemPointer tid, bool all_visible_cleared = false; HeapTuple old_key_tuple = NULL; /* replica identity of the tuple */ bool old_key_copied = false; + TransactionId old_xmin; Assert(ItemPointerIsValid(tid)); @@ -3052,6 +3059,8 @@ heap_delete(Relation relation, ItemPointer tid, xid, LockTupleExclusive, true, &new_xmax, &new_infomask, &new_infomask2); + old_xmin = HeapTupleHeaderGetXmin(tp.t_data); + START_CRIT_SECTION(); /* @@ -3197,6 +3206,9 @@ heap_delete(Relation relation, ItemPointer tid, if (old_key_tuple != NULL && old_key_copied) heap_freetuple(old_key_tuple); + if (add_index_delete_hook) + add_index_delete_hook(relation, tid, old_xmin); + return TM_Ok; } @@ -3299,6 +3311,7 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, infomask2_old_tuple, infomask_new_tuple, infomask2_new_tuple; + TransactionId old_xmin; Assert(ItemPointerIsValid(otid)); @@ -3745,6 +3758,8 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, &xmax_old_tuple, &infomask_old_tuple, &infomask2_old_tuple); + old_xmin = HeapTupleHeaderGetRawXmin(oldtup.t_data); + /* * And also prepare an Xmax value for the new copy of the tuple. 
If there * was no xmax previously, or there was one but all lockers are now gone, @@ -4228,6 +4243,9 @@ heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, bms_free(modified_attrs); bms_free(interesting_attrs); + if (add_index_delete_hook) + add_index_delete_hook(relation, otid, old_xmin); + return TM_Ok; } diff --git a/src/backend/access/heap/heapam_handler.c b/src/backend/access/heap/heapam_handler.c index bcbac844bb66..f4bba2f3d5e1 100644 --- a/src/backend/access/heap/heapam_handler.c +++ b/src/backend/access/heap/heapam_handler.c @@ -46,6 +46,9 @@ #include "utils/builtins.h" #include "utils/rel.h" +/* Preserve the original heap tuple that is passed to callback in heapam_index_build_range_scan() */ +HeapTuple IndexHeapTuple; + static void reform_and_rewrite_tuple(HeapTuple tuple, Relation OldHeap, Relation NewHeap, Datum *values, bool *isnull, RewriteState rwstate); @@ -1658,6 +1661,7 @@ heapam_index_build_range_scan(Relation heapRelation, * some index AMs want to do further processing on the data first. So * pass the values[] and isnull[] arrays, instead. 
*/ + IndexHeapTuple = heapTuple; if (HeapTupleIsHeapOnly(heapTuple)) { diff --git a/src/backend/access/heap/heapam_visibility.c b/src/backend/access/heap/heapam_visibility.c index 05f6946fe60d..abb12264c62b 100644 --- a/src/backend/access/heap/heapam_visibility.c +++ b/src/backend/access/heap/heapam_visibility.c @@ -78,6 +78,8 @@ #include "utils/builtins.h" #include "utils/snapmgr.h" +bool (*add_snapshot_satisfies_hook) (HeapTuple tup, Snapshot snapshot, Buffer buffer); + /* * SetHintBits() @@ -1791,6 +1793,10 @@ HeapTupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) return HeapTupleSatisfiesHistoricMVCC(htup, snapshot, buffer); case SNAPSHOT_NON_VACUUMABLE: return HeapTupleSatisfiesNonVacuumable(htup, snapshot, buffer); + case SNAPSHOT_VCI_WOS2ROS: + case SNAPSHOT_VCI_LOCALROS: + if (add_snapshot_satisfies_hook) + return add_snapshot_satisfies_hook(htup, snapshot, buffer); } return false; /* keep compiler quiet */ diff --git a/src/backend/access/transam/xlogfuncs.c b/src/backend/access/transam/xlogfuncs.c index 8c3090165f00..b0b02d7ac9d0 100644 --- a/src/backend/access/transam/xlogfuncs.c +++ b/src/backend/access/transam/xlogfuncs.c @@ -607,6 +607,7 @@ pg_get_wal_replay_pause_state(PG_FUNCTION_ARGS) statestr = "not paused"; break; case RECOVERY_PAUSE_REQUESTED: + case RECOVERY_VCI_PAUSE_REQUESTED: statestr = "pause requested"; break; case RECOVERY_PAUSED: diff --git a/src/backend/access/transam/xlogrecovery.c b/src/backend/access/transam/xlogrecovery.c index 3e3c4da01a24..597bb815455b 100644 --- a/src/backend/access/transam/xlogrecovery.c +++ b/src/backend/access/transam/xlogrecovery.c @@ -2949,7 +2949,8 @@ recoveryPausesHere(bool endOfRecovery) ereport(LOG, (errmsg("pausing at the end of recovery"), errhint("Execute pg_wal_replay_resume() to promote."))); - else + /* If pause requested by VCI, the log is not output. 
*/ + else if (GetRecoveryPauseState() != RECOVERY_VCI_PAUSE_REQUESTED) ereport(LOG, (errmsg("recovery has paused"), errhint("Execute pg_wal_replay_resume() to continue."))); @@ -3115,6 +3116,18 @@ SetRecoveryPause(bool recoveryPause) ConditionVariableBroadcast(&XLogRecoveryCtl->recoveryNotPausedCV); } +/* Set the recovery pause requested for VCI. */ +void +SetVciRecoveryPause(void) +{ + SpinLockAcquire(&XLogRecoveryCtl->info_lck); + + if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_NOT_PAUSED) + XLogRecoveryCtl->recoveryPauseState = RECOVERY_VCI_PAUSE_REQUESTED; + + SpinLockRelease(&XLogRecoveryCtl->info_lck); +} + /* * Confirm the recovery pause by setting the recovery pause state to * RECOVERY_PAUSED. @@ -3124,7 +3137,8 @@ ConfirmRecoveryPaused(void) { /* If recovery pause is requested then set it paused */ SpinLockAcquire(&XLogRecoveryCtl->info_lck); - if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED) + if (XLogRecoveryCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED || + XLogRecoveryCtl->recoveryPauseState == RECOVERY_VCI_PAUSE_REQUESTED) XLogRecoveryCtl->recoveryPauseState = RECOVERY_PAUSED; SpinLockRelease(&XLogRecoveryCtl->info_lck); } diff --git a/src/backend/catalog/dependency.c b/src/backend/catalog/dependency.c index 7dded634eb81..bce301dc35a4 100644 --- a/src/backend/catalog/dependency.c +++ b/src/backend/catalog/dependency.c @@ -85,6 +85,7 @@ #include "utils/lsyscache.h" #include "utils/syscache.h" +bool (*add_drop_relation_hook) (const ObjectAddress *object, int flags) = NULL; /* * Deletion processing requires additional state for each ObjectAddress that @@ -1357,6 +1358,12 @@ doDeletion(const ObjectAddress *object, int flags) { char relKind = get_rel_relkind(object->objectId); + if (add_drop_relation_hook) + { + if (add_drop_relation_hook(object, flags)) + break; + } + if (relKind == RELKIND_INDEX || relKind == RELKIND_PARTITIONED_INDEX) { diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c index 
5d9db167e595..a53cb0db3677 100644 --- a/src/backend/catalog/index.c +++ b/src/backend/catalog/index.c @@ -133,6 +133,7 @@ static void ResetReindexProcessing(void); static void SetReindexPending(List *indexes); static void RemoveReindexPending(Oid indexOid); +bool (*add_reindex_index_hook) (Relation) = NULL; /* * relationHasPrimaryKey @@ -342,12 +343,17 @@ ConstructTupleDescriptor(Relation heapRelation, /* Simple index column */ const FormData_pg_attribute *from; - Assert(atnum > 0); /* should've been caught above */ - if (atnum > natts) /* safety check */ elog(ERROR, "invalid column number %d", atnum); - from = TupleDescAttr(heapTupDesc, - AttrNumberGetAttrOffset(atnum)); + if (atnum > 0) + { + from = TupleDescAttr(heapTupDesc, + AttrNumberGetAttrOffset(atnum)); + } + else + { + from = SystemAttributeDefinition(atnum); + } to->atttypid = from->atttypid; to->attlen = from->attlen; @@ -3692,6 +3698,25 @@ reindex_index(const ReindexStmt *stmt, Oid indexId, return; } + if (add_reindex_index_hook) + { + if (!add_reindex_index_hook(iRel)) + { + RemoveReindexPending(RelationGetRelid(iRel)); + + /* Roll back any GUC changes */ + AtEOXact_GUC(false, save_nestlevel); + + /* Restore userid and security context */ + SetUserIdAndSecContext(save_userid, save_sec_context); + + /* Close rels, but keep locks */ + index_close(iRel, NoLock); + table_close(heapRelation, NoLock); + return; + } + } + if (progress) pgstat_progress_update_param(PROGRESS_CREATEIDX_ACCESS_METHOD_OID, iRel->rd_rel->relam); diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c index e6edae0845cb..2635fc06d526 100644 --- a/src/backend/commands/explain.c +++ b/src/backend/commands/explain.c @@ -1208,6 +1208,7 @@ ExplainPreScanNode(PlanState *planstate, Bitmapset **rels_used) ((ForeignScan *) plan)->fs_base_relids); break; case T_CustomScan: + case T_CustomPlanMarkPos: *rels_used = bms_add_members(*rels_used, ((CustomScan *) plan)->custom_relids); break; @@ -1511,6 +1512,7 @@ 
ExplainNode(PlanState *planstate, List *ancestors, } break; case T_CustomScan: + case T_CustomPlanMarkPos: sname = "Custom Scan"; custom_name = ((CustomScan *) plan)->methods->CustomName; if (custom_name) @@ -1674,10 +1676,18 @@ ExplainNode(PlanState *planstate, List *ancestors, ExplainScanTarget((Scan *) plan, es); break; case T_ForeignScan: - case T_CustomScan: if (((Scan *) plan)->scanrelid > 0) ExplainScanTarget((Scan *) plan, es); break; + case T_CustomScan: + case T_CustomPlanMarkPos: + { + CustomScanState *css = (CustomScanState *) planstate; + + if (css->methods->ExplainCustomPlanTargetRel) + css->methods->ExplainCustomPlanTargetRel(css, es); + } + break; case T_IndexScan: { IndexScan *indexscan = (IndexScan *) plan; @@ -2149,6 +2159,7 @@ ExplainNode(PlanState *planstate, List *ancestors, show_foreignscan_info((ForeignScanState *) planstate, es); break; case T_CustomScan: + case T_CustomPlanMarkPos: { CustomScanState *css = (CustomScanState *) planstate; @@ -2410,6 +2421,7 @@ ExplainNode(PlanState *planstate, List *ancestors, "Subquery", NULL, es); break; case T_CustomScan: + case T_CustomPlanMarkPos: ExplainCustomChildren((CustomScanState *) planstate, ancestors, es); break; @@ -4428,6 +4440,7 @@ ExplainTargetRel(Plan *plan, Index rti, ExplainState *es) case T_TidRangeScan: case T_ForeignScan: case T_CustomScan: + case T_CustomPlanMarkPos: case T_ModifyTable: /* Assert it's on a real relation */ Assert(rte->rtekind == RTE_RELATION); @@ -5109,3 +5122,12 @@ ExplainFlushWorkersState(ExplainState *es) pfree(wstate->worker_state_save); pfree(wstate); } + +void +ExplainPropertySortGroupKeys(PlanState *planstate, const char *qlabel, + int nkeys, AttrNumber *keycols, + List *ancestors, ExplainState *es) +{ + show_sort_group_keys(planstate, qlabel, nkeys, 0, keycols, + NULL, NULL, NULL, ancestors, es); +} diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c index 5712fac36971..8dc2e547d4b3 100644 --- a/src/backend/commands/indexcmds.c 
+++ b/src/backend/commands/indexcmds.c @@ -1097,71 +1097,74 @@ DefineIndex(Oid tableId, } } - - /* - * We disallow indexes on system columns. They would not necessarily get - * updated correctly, and they don't seem useful anyway. - * - * Also disallow virtual generated columns in indexes (use expression - * index instead). - */ - for (int i = 0; i < indexInfo->ii_NumIndexAttrs; i++) - { - AttrNumber attno = indexInfo->ii_IndexAttrNumbers[i]; - - if (attno < 0) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("index creation on system columns is not supported"))); - - - if (TupleDescAttr(RelationGetDescr(rel), attno - 1)->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) - ereport(ERROR, - errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - stmt->primary ? - errmsg("primary keys on virtual generated columns are not supported") : - stmt->isconstraint ? - errmsg("unique constraints on virtual generated columns are not supported") : - errmsg("indexes on virtual generated columns are not supported")); - } - - /* - * Also check for system and generated columns used in expressions or - * predicates. - */ - if (indexInfo->ii_Expressions || indexInfo->ii_Predicate) + /* The VCI access method is exempt from the system-column and virtual generated column checks below */ + if (strcmp(accessMethodName, "vci") != 0) { - Bitmapset *indexattrs = NULL; - int j; - pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); - pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); - - for (int i = FirstLowInvalidHeapAttributeNumber + 1; i < 0; i++) + /* + * We disallow indexes on system columns. They would not necessarily + * get updated correctly, and they don't seem useful anyway. + * + * Also disallow virtual generated columns in indexes (use expression + * index instead). 
+ */ + for (int i = 0; i < indexInfo->ii_NumIndexAttrs; i++) { - if (bms_is_member(i - FirstLowInvalidHeapAttributeNumber, - indexattrs)) + AttrNumber attno = indexInfo->ii_IndexAttrNumbers[i]; + + if (attno < 0) ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), errmsg("index creation on system columns is not supported"))); + + if (TupleDescAttr(RelationGetDescr(rel), attno - 1)->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + ereport(ERROR, + errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + stmt->primary ? + errmsg("primary keys on virtual generated columns are not supported") : + stmt->isconstraint ? + errmsg("unique constraints on virtual generated columns are not supported") : + errmsg("indexes on virtual generated columns are not supported")); } /* - * XXX Virtual generated columns in index expressions or predicates - * could be supported, but it needs support in - * RelationGetIndexExpressions() and RelationGetIndexPredicate(). + * Also check for system and generated columns used in expressions or + * predicates. */ - j = -1; - while ((j = bms_next_member(indexattrs, j)) >= 0) + if (indexInfo->ii_Expressions || indexInfo->ii_Predicate) { - AttrNumber attno = j + FirstLowInvalidHeapAttributeNumber; + Bitmapset *indexattrs = NULL; + int j; - if (TupleDescAttr(RelationGetDescr(rel), attno - 1)->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) - ereport(ERROR, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - stmt->isconstraint ? 
- errmsg("unique constraints on virtual generated columns are not supported") : - errmsg("indexes on virtual generated columns are not supported"))); + pull_varattnos((Node *) indexInfo->ii_Expressions, 1, &indexattrs); + pull_varattnos((Node *) indexInfo->ii_Predicate, 1, &indexattrs); + + for (int i = FirstLowInvalidHeapAttributeNumber + 1; i < 0; i++) + { + if (bms_is_member(i - FirstLowInvalidHeapAttributeNumber, + indexattrs)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("index creation on system columns is not supported"))); + } + + /* + * XXX Virtual generated columns in index expressions or + * predicates could be supported, but it needs support in + * RelationGetIndexExpressions() and RelationGetIndexPredicate(). + */ + j = -1; + while ((j = bms_next_member(indexattrs, j)) >= 0) + { + AttrNumber attno = j + FirstLowInvalidHeapAttributeNumber; + + if (TupleDescAttr(RelationGetDescr(rel), attno - 1)->attgenerated == ATTRIBUTE_GENERATED_VIRTUAL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + stmt->isconstraint ? 
+ errmsg("unique constraints on virtual generated columns are not supported") : + errmsg("indexes on virtual generated columns are not supported"))); + } } } diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 5fd8b51312c8..84ad8ee7e073 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -110,6 +110,10 @@ #include "utils/typcache.h" #include "utils/usercontext.h" +bool (*add_alter_tablespace_hook) (Relation rel) = NULL; +void (*add_alter_table_change_owner_hook) (Oid relOid, char relKind, Oid newOwnerId) = NULL; +void (*add_alter_table_change_schema_hook) (Oid relOid, char relKind, Oid newNspOid) = NULL; + /* * ON COMMIT action list */ @@ -16261,6 +16265,9 @@ ATExecChangeOwner(Oid relationOid, Oid newOwnerId, bool recursing, LOCKMODE lock /* If it has dependent sequences, recurse to change them too */ change_owner_recurse_to_sequences(relationOid, newOwnerId, lockmode); + + if (add_alter_table_change_owner_hook) + add_alter_table_change_owner_hook(relationOid, tuple_class->relkind, newOwnerId); } InvokeObjectPostAlterHook(RelationRelationId, relationOid, 0); @@ -16810,6 +16817,112 @@ ATExecSetRelOptions(Relation rel, List *defList, AlterTableType operation, } table_close(pgclass, RowExclusiveLock); + + /* + * Look up the relation's access method so that a VCI index can be recognized by its AM name + */ + if (rel->rd_rel->relam) + { + Form_pg_am aform; + HeapTuple amtuple; + + amtuple = SearchSysCache1(AMOID, ObjectIdGetDatum(rel->rd_rel->relam)); + if (!HeapTupleIsValid(amtuple)) + elog(ERROR, "cache lookup failed for access method %u", + rel->rd_rel->relam); + aform = (Form_pg_am) GETSTRUCT(amtuple); + + if (strcmp(NameStr(aform->amname), "vci") == 0) + + /* + * if((rel->rd_am) && (strcmp(NameStr(rel->rd_am->amname), "vci") + * == 0)) + */ + { + Relation depRel, + viewRel; + Oid vci_relid, + viewrelid; + ScanKeyData key; + SysScanDesc scan; + HeapTuple tup; + + depRel = table_open(DependRelationId, 
AccessShareLock); + vci_relid = RelationGetRelid(rel); + ScanKeyInit(&key, + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(vci_relid)); + scan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 1, &key); + while (HeapTupleIsValid((tup = systable_getnext(scan)))) + { + Form_pg_depend depform = (Form_pg_depend) GETSTRUCT(tup); + + /* Only dependent objects that are relations (pg_class entries) are processed */ + if (depform->classid == RelationRelationId) + { + viewrelid = depform->objid; + viewRel = table_open(viewrelid, AccessExclusiveLock); + pgclass = table_open(RelationRelationId, RowExclusiveLock); + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(viewrelid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", viewrelid); + + datum = (Datum) 0; + + /* Generate new proposed reloptions (text array) */ + newOptions = transformRelOptions(datum, defList, NULL, validnsps, false, + operation == AT_ResetRelOptions); + + (void) view_reloptions(newOptions, true); + + /* + * All we need do here is update the pg_class row; the new + * options will be propagated into relcaches during + * post-commit cache inval. 
+ */ + memset(repl_val, 0, sizeof(repl_val)); + memset(repl_null, false, sizeof(repl_null)); + memset(repl_repl, false, sizeof(repl_repl)); + + if (newOptions != (Datum) 0) + repl_val[Anum_pg_class_reloptions - 1] = newOptions; + else + repl_null[Anum_pg_class_reloptions - 1] = true; + + repl_repl[Anum_pg_class_reloptions - 1] = true; + + newtuple = heap_modify_tuple(tuple, RelationGetDescr(pgclass), + repl_val, repl_null, repl_repl); + + /* + * simple_heap_update(pgclass, &newtuple->t_self, + * newtuple); + * + * CatalogUpdateIndexes(pgclass, newtuple); + */ + /* Perform actual update */ + CatalogTupleUpdate(pgclass, &newtuple->t_self, newtuple); + InvokeObjectPostAlterHook(RelationRelationId, viewrelid, 0); + + heap_freetuple(newtuple); + + ReleaseSysCache(tuple); + + table_close(pgclass, RowExclusiveLock); + table_close(viewRel, AccessExclusiveLock); + + } + } + + systable_endscan(scan); + + table_close(depRel, AccessShareLock); + } + + ReleaseSysCache(amtuple); + } } /* @@ -16831,6 +16944,15 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) */ rel = relation_open(tableOid, lockmode); + if (add_alter_tablespace_hook) + { + if (add_alter_tablespace_hook(rel)) + { + relation_close(rel, NoLock); + return; + } + } + /* Check first if relation can be moved to new tablespace */ if (!CheckRelationTableSpaceMove(rel, newTableSpace)) { @@ -19080,6 +19202,9 @@ AlterRelationNamespaceInternal(Relation classRel, Oid relOid, { add_exact_object_address(&thisobj, objsMoved); + if (add_alter_table_change_schema_hook) + add_alter_table_change_schema_hook(relOid, classForm->relkind, newNspOid); + InvokeObjectPostAlterHook(RelationRelationId, relOid, 0); } diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index ed03e3bd50d8..075b4a14c631 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -130,6 +130,8 @@ static double compute_parallel_delay(void); static VacOptValue get_vacoptval_from_boolean(DefElem 
*def); static bool vac_tid_reaped(ItemPointer itemptr, void *state); +bool (*add_skip_vacuum_hook) (Relation rel); + /* * GUC check function to ensure GUC value specified is within the allowable * range. @@ -2121,6 +2123,20 @@ vacuum_rel(Oid relid, RangeVar *relation, VacuumParams params, return false; } + /* + * Silently ignore if it's a VCI internal table. + */ + if (add_skip_vacuum_hook) + { + if (add_skip_vacuum_hook(rel)) + { + relation_close(rel, lmode); + PopActiveSnapshot(); + CommitTransactionCommand(); + return false; + } + } + /* * Silently ignore tables that are temp tables of other backends --- * trying to vacuum these will lead to great unhappiness, since their diff --git a/src/backend/executor/execAmi.c b/src/backend/executor/execAmi.c index 1d0e8ad57b4a..fff1b85ff593 100644 --- a/src/backend/executor/execAmi.c +++ b/src/backend/executor/execAmi.c @@ -437,6 +437,7 @@ ExecSupportsMarkRestore(Path *pathnode) return true; case T_CustomScan: + case T_CustomPlanMarkPos: if (castNode(CustomPath, pathnode)->flags & CUSTOMPATH_SUPPORT_MARK_RESTORE) return true; return false; @@ -563,6 +564,7 @@ ExecSupportsBackwardScan(Plan *node) return ExecSupportsBackwardScan(((SubqueryScan *) node)->subplan); case T_CustomScan: + case T_CustomPlanMarkPos: if (((CustomScan *) node)->flags & CUSTOMPATH_SUPPORT_BACKWARD_SCAN) return true; return false; diff --git a/src/backend/executor/execExpr.c b/src/backend/executor/execExpr.c index f1569879b529..5c29ab337afa 100644 --- a/src/backend/executor/execExpr.c +++ b/src/backend/executor/execExpr.c @@ -68,7 +68,6 @@ typedef struct ExprSetupInfo List *multiexpr_subplans; } ExprSetupInfo; -static void ExecReadyExpr(ExprState *state); static void ExecInitExprRec(Expr *node, ExprState *state, Datum *resv, bool *resnull); static void ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, @@ -77,7 +76,6 @@ static void ExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, static void ExecInitSubPlanExpr(SubPlan *subplan, 
ExprState *state, Datum *resv, bool *resnull); -static void ExecCreateExprSetupSteps(ExprState *state, Node *node); static void ExecPushExprSetupSteps(ExprState *state, ExprSetupInfo *info); static bool expr_setup_walker(Node *node, ExprSetupInfo *info); static bool ExecComputeSlotInfo(ExprState *state, ExprEvalStep *op); @@ -898,7 +896,7 @@ ExecCheck(ExprState *state, ExprContext *econtext) * Therefore this should be used instead of directly calling * ExecReadyInterpretedExpr(). */ -static void +void ExecReadyExpr(ExprState *state) { if (jit_compile_expr(state)) @@ -2877,7 +2875,7 @@ ExecInitSubPlanExpr(SubPlan *subplan, * Add expression steps performing setup that's needed before any of the * main execution of the expression. */ -static void +void ExecCreateExprSetupSteps(ExprState *state, Node *node) { ExprSetupInfo info = {0, 0, 0, 0, 0, NIL}; diff --git a/src/backend/executor/execExprInterp.c b/src/backend/executor/execExprInterp.c index 0e1a74976f7d..6bc0de36b8d0 100644 --- a/src/backend/executor/execExprInterp.c +++ b/src/backend/executor/execExprInterp.c @@ -188,6 +188,21 @@ static pg_attribute_always_inline void ExecAggPlainTransByRef(AggState *aggstate int setno); static char *ExecGetJsonValueItemString(JsonbValue *item, bool *resnull); +ExprEvalVar_hook_type ExprEvalVar_hook = NULL; +ExprEvalParam_hook_type ExprEvalParam_hook = NULL; +void +VciExprEvalVarHook(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + Assert(ExprEvalVar_hook != NULL); + (*ExprEvalVar_hook) (state, op, econtext); +} +void +VciExprEvalParamHook(ExprState *state, ExprEvalStep *op, ExprContext *econtext) +{ + Assert(ExprEvalParam_hook != NULL); + (*ExprEvalParam_hook) (state, op, econtext); +} + /* * ScalarArrayOpExprHashEntry * Hash table entry type used during EEOP_HASHED_SCALARARRAYOP @@ -592,6 +607,8 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) &&CASE_EEOP_AGG_PRESORTED_DISTINCT_MULTI, &&CASE_EEOP_AGG_ORDERED_TRANS_DATUM, 
&&CASE_EEOP_AGG_ORDERED_TRANS_TUPLE, + &&CASE_EEOP_VCI_VAR, + &&CASE_EEOP_VCI_PARAM_EXEC, &&CASE_EEOP_LAST }; @@ -2265,6 +2282,25 @@ ExecInterpExpr(ExprState *state, ExprContext *econtext, bool *isnull) EEO_NEXT(); } + EEO_CASE(EEOP_VCI_VAR) + { + /* TODO: evaluation is delegated to ExprEvalVar_hook */ + Assert(ExprEvalVar_hook != NULL); + (*ExprEvalVar_hook) (state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_VCI_PARAM_EXEC) + { + /* TODO: evaluation is delegated to ExprEvalParam_hook */ + Assert(ExprEvalParam_hook != NULL); + (*ExprEvalParam_hook) (state, op, econtext); + + EEO_NEXT(); + } + + EEO_CASE(EEOP_LAST) { /* unreachable */ diff --git a/src/backend/executor/execProcnode.c b/src/backend/executor/execProcnode.c index f5f9cfbeeada..60fd2a4e46a9 100644 --- a/src/backend/executor/execProcnode.c +++ b/src/backend/executor/execProcnode.c @@ -287,6 +287,7 @@ ExecInitNode(Plan *node, EState *estate, int eflags) break; case T_CustomScan: + case T_CustomPlanMarkPos: result = (PlanState *) ExecInitCustomScan((CustomScan *) node, estate, eflags); break; diff --git a/src/backend/executor/execScan.c b/src/backend/executor/execScan.c index 90726949a870..99f7c6005206 100644 --- a/src/backend/executor/execScan.c +++ b/src/backend/executor/execScan.c @@ -139,7 +139,7 @@ ExecScanReScan(ScanState *node) */ if (IsA(node->ps.plan, ForeignScan)) relids = ((ForeignScan *) node->ps.plan)->fs_base_relids; - else if (IsA(node->ps.plan, CustomScan)) + else if (IsA(node->ps.plan, CustomScan) || IsA(node->ps.plan, CustomPlanMarkPos)) relids = ((CustomScan *) node->ps.plan)->custom_relids; else elog(ERROR, "unexpected scan node: %d", diff --git a/src/backend/executor/nodeCustom.c b/src/backend/executor/nodeCustom.c index ac2196b64c7a..2eaef2f2d864 100644 --- a/src/backend/executor/nodeCustom.c +++ b/src/backend/executor/nodeCustom.c @@ -49,6 +49,22 @@ ExecInitCustomScan(CustomScan *cscan, EState *estate, int eflags) css->ss.ps.state = estate; css->ss.ps.ExecProcNode = ExecCustomScan; + if (strcmp(cscan->methods->CustomName, "VCI Scan") == 0 || + 
strcmp(cscan->methods->CustomName, "VCI Sort") == 0 || + strcmp(cscan->methods->CustomName, "VCI Aggregate") == 0 || + strcmp(cscan->methods->CustomName, "VCI HashAggregate") == 0 || + strcmp(cscan->methods->CustomName, "VCI GroupAggregate") == 0 || + strcmp(cscan->methods->CustomName, "VCI Gather") == 0) + { + /* + * The callback of custom-scan provider applies the final + * initialization of the custom-scan-state node according to its + * logic. + */ + css->methods->BeginCustomScan(css, estate, eflags); + return css; + } + /* create expression context for node */ ExecAssignExprContext(estate, &css->ss.ps); diff --git a/src/backend/executor/nodeModifyTable.c b/src/backend/executor/nodeModifyTable.c index 4c5647ac38a1..92e0815a52d5 100644 --- a/src/backend/executor/nodeModifyTable.c +++ b/src/backend/executor/nodeModifyTable.c @@ -71,6 +71,7 @@ #include "utils/rel.h" #include "utils/snapmgr.h" +List *(*add_should_index_insert_hook) (ResultRelInfo *, TupleTableSlot *, ItemPointer, EState *) = NULL; typedef struct MTTargetRelLookup { @@ -2333,6 +2334,15 @@ ExecUpdateEpilogue(ModifyTableContext *context, UpdateContext *updateCxt, NULL, NIL, (updateCxt->updateIndexes == TU_Summarizing)); + /* + * VCI update hook + */ + else if (resultRelInfo->ri_NumIndices > 0 && !updateCxt->updateIndexes) + { + if (add_should_index_insert_hook) + recheckIndexes = add_should_index_insert_hook(resultRelInfo, slot, &slot->tts_tid, context->estate); + } + /* AFTER ROW UPDATE Triggers */ ExecARUpdateTriggers(context->estate, resultRelInfo, NULL, NULL, diff --git a/src/backend/jit/llvm/llvmjit_expr.c b/src/backend/jit/llvm/llvmjit_expr.c index 712b35df7e58..4d3cf8d46cb1 100644 --- a/src/backend/jit/llvm/llvmjit_expr.c +++ b/src/backend/jit/llvm/llvmjit_expr.c @@ -2940,6 +2940,18 @@ llvm_compile_expr(ExprState *state) LLVMBuildBr(b, opblocks[opno + 1]); break; + case EEOP_VCI_VAR: + build_EvalXFunc(b, mod, "VciExprEvalVarHook", + v_state, op, v_econtext); + LLVMBuildBr(b, opblocks[opno + 
1]); + break; + + case EEOP_VCI_PARAM_EXEC: + build_EvalXFunc(b, mod, "VciExprEvalParamHook", + v_state, op, v_econtext); + LLVMBuildBr(b, opblocks[opno + 1]); + break; + case EEOP_LAST: Assert(false); break; diff --git a/src/backend/jit/llvm/llvmjit_types.c b/src/backend/jit/llvm/llvmjit_types.c index 167cd554b9c0..4f4a2a91a0b3 100644 --- a/src/backend/jit/llvm/llvmjit_types.c +++ b/src/backend/jit/llvm/llvmjit_types.c @@ -182,4 +182,6 @@ void *referenced_functions[] = strlen, varsize_any, ExecInterpExprStillValid, + VciExprEvalParamHook, + VciExprEvalVarHook, }; diff --git a/src/backend/nodes/gen_node_support.pl b/src/backend/nodes/gen_node_support.pl index 9ecddb142314..6bd0008072b6 100644 --- a/src/backend/nodes/gen_node_support.pl +++ b/src/backend/nodes/gen_node_support.pl @@ -153,6 +153,7 @@ sub elem AllocSetContext GenerationContext SlabContext BumpContext TIDBitmap WindowObjectData + CustomPlanMarkPos ); # This is a regular node, but we skip parsing it from its header file @@ -679,10 +680,21 @@ sub elem my $struct_no_equal = (elem $n, @no_equal); next if $struct_no_copy && $struct_no_equal; - print $cfs "\t\tcase T_${n}:\n" - . "\t\t\tretval = _copy${n}(from);\n" - . "\t\t\tbreak;\n" - unless $struct_no_copy; + if($n eq 'CustomScan') + { + print $cfs "\t\tcase T_${n}:\n" + . "\t\tcase T_CustomPlanMarkPos:\n" + . "\t\t\tretval = _copy${n}(from);\n" + . "\t\t\tbreak;\n" + unless $struct_no_copy; + } + else + { + print $cfs "\t\tcase T_${n}:\n" + . "\t\t\tretval = _copy${n}(from);\n" + . "\t\t\tbreak;\n" + unless $struct_no_copy; + } print $efs "\t\tcase T_${n}:\n" . "\t\t\tretval = _equal${n}(a, b);\n" @@ -691,13 +703,35 @@ sub elem next if elem $n, @custom_copy_equal; - print $cff " -static $n * -_copy${n}(const $n *from) -{ -\t${n} *newnode = makeNode($n); - -" unless $struct_no_copy; + if ($n eq 'CustomScan') + { + print $cff "static $n *\n" + . "_copy${n}(const $n *from)\n" + . "{\n" + . "\tCustomScan *newnode;\n\n" + . 
"\tif (strcmp(from->methods->CustomName, \"VCI Scan\") == 0 || + strcmp(from->methods->CustomName, \"VCI Sort\") == 0 || + strcmp(from->methods->CustomName, \"VCI Aggregate\") == 0 || + strcmp(from->methods->CustomName, \"VCI HashAggregate\") == 0 || + strcmp(from->methods->CustomName, \"VCI GroupAggregate\") == 0 || + strcmp(from->methods->CustomName, \"VCI Gather\") == 0)\n" + . "\t{\n" + . "\t\tnewnode = from->methods->CopyCustomPlan(from);\n" + . "\t}\n" + . "\telse\n" + . "\t\tnewnode = makeNode(CustomScan);\n\n" + . "\t((Node *) newnode)->type = nodeTag(from);\n\n" unless $struct_no_copy; + } + else + { + print $cff + "static $n *\n" + . "_copy${n}(const $n *from)\n" + . "{\n" + ."\t${n} *newnode = makeNode($n);\n\n" + + unless $struct_no_copy; + } print $eff " static bool diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c index 9c6436eb72f2..5d4a2dc0880e 100644 --- a/src/backend/optimizer/path/allpaths.c +++ b/src/backend/optimizer/path/allpaths.c @@ -47,10 +47,10 @@ #include "partitioning/partbounds.h" #include "port/pg_bitutils.h" #include "rewrite/rewriteManip.h" +#include "utils/guc.h" #include "utils/lsyscache.h" #include "utils/selfuncs.h" - /* Bitmask flags for pushdown_safety_info.unsafeFlags */ #define UNSAFE_HAS_VOLATILE_FUNC (1 << 0) #define UNSAFE_HAS_SET_FUNC (1 << 1) @@ -822,6 +822,8 @@ static void set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) { Relids required_outer; + char *isVCIEnabled; + bool is_partition = false; /* * We don't support pushing join clauses into the quals of a seqscan, but @@ -847,6 +849,20 @@ set_plain_rel_pathlist(PlannerInfo *root, RelOptInfo *rel, RangeTblEntry *rte) /* If appropriate, consider parallel sequential scan */ if (rel->consider_parallel && required_outer == NULL) create_plain_partial_paths(root, rel); + /**** + * Putting the isRelHasVCIIndex after the create_plain_partial_paths because + * want to enable oss parallelscan working on VCI 
tables but disable other + * gather plan like parallel_loop,parallel_agg working on VCI tables. + * Don't do this for partitioned tables or partitions as parallelscans on partitioned + * tables require gather plans + */ + if (isRelHasVCIIndex(rte->relid, &is_partition) && (bms_membership(root->all_baserels) == BMS_SINGLETON) && + !is_partition) + { + isVCIEnabled = GetConfigOptionByName("vci.enable", NULL, true); /* NULL when GUC is undefined */ + if (isVCIEnabled != NULL && strcmp(isVCIEnabled, "on") == 0) + rel->consider_parallel = false; + } /* Consider index scans */ create_index_paths(root, rel); diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 63fe66371556..ddd3328af2c9 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -34,6 +34,7 @@ #include "optimizer/placeholder.h" #include "optimizer/plancat.h" #include "optimizer/planmain.h" +#include "optimizer/planner.h" #include "optimizer/prep.h" #include "optimizer/restrictinfo.h" #include "optimizer/subselect.h" @@ -174,7 +175,6 @@ static Node *fix_indexqual_operand(Node *node, IndexOptInfo *index, int indexcol static List *get_switched_clauses(List *clauses, Relids outerrelids); static List *order_qual_clauses(PlannerInfo *root, List *clauses); static void copy_generic_path_info(Plan *dest, Path *src); -static void copy_plan_costsize(Plan *dest, Plan *src); static void label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples); static void label_incrementalsort_with_costsize(PlannerInfo *root, IncrementalSort *plan, @@ -5372,7 +5372,7 @@ copy_generic_path_info(Plan *dest, Path *src) * Copy cost and size info from a lower plan node to an inserted node. * (Most callers alter the info after copying it.) 
*/ -static void +void copy_plan_costsize(Plan *dest, Plan *src) { dest->disabled_nodes = src->disabled_nodes; diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 200f72c6e256..d9ee639c868f 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -1925,7 +1925,8 @@ GlobalVisHorizonKindForRel(Relation rel) Assert(!rel || rel->rd_rel->relkind == RELKIND_RELATION || rel->rd_rel->relkind == RELKIND_MATVIEW || - rel->rd_rel->relkind == RELKIND_TOASTVALUE); + rel->rd_rel->relkind == RELKIND_TOASTVALUE || + rel->rd_rel->relkind == RELKIND_INDEX); if (rel == NULL || rel->rd_rel->relisshared || RecoveryInProgress()) return VISHORIZON_SHARED; diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c index 4cc7f645c317..b60af88203f8 100644 --- a/src/backend/storage/lmgr/lock.c +++ b/src/backend/storage/lmgr/lock.c @@ -39,6 +39,8 @@ #include "access/xlogutils.h" #include "miscadmin.h" #include "pg_trace.h" +#include "pgstat.h" +#include "postmaster/bgworker.h" #include "storage/lmgr.h" #include "storage/proc.h" #include "storage/procarray.h" @@ -811,8 +813,18 @@ LockAcquire(const LOCKTAG *locktag, bool sessionLock, bool dontWait) { - return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait, - true, NULL, false); + /* + * Don't lock for VCI parallel workers and other type of workers should go + * in normal flow, In case if there is any change in background worker + * name for VCI parallel workers, the following code also needs an update. + * FIXME: Try to use the community parallelism code, so that we don't need + * our own VCI parallel infrastructure. 
+ */ + if (AmBackgroundWorkerProcess() && strstr(MyBgworkerEntry->bgw_name, "backend=")) + return LOCKACQUIRE_OK; + else + return LockAcquireExtended(locktag, lockmode, sessionLock, dontWait, + true, NULL, false); } /* @@ -2139,6 +2151,16 @@ LockRelease(const LOCKTAG *locktag, LOCKMODE lockmode, bool sessionLock) */ if (!locallock || locallock->nLocks <= 0) { + /* + * VCI parallel workers return from LockAcquire() without taking the + * lock, so finding no local lock here is expected for them: report + * success instead of warning. They are recognized by "backend=" in the + * background worker name, so keep this test in sync with that name. + * FIXME: use the community parallelism code instead of VCI's own. + */ + if (AmBackgroundWorkerProcess() && strstr(MyBgworkerEntry->bgw_name, "backend=")) + return true; + elog(WARNING, "you don't own a lock of type %s", lockMethodTable->lockModeNames[lockmode]); return false; diff --git a/src/backend/utils/adt/timestamp.c b/src/backend/utils/adt/timestamp.c index 156a4830ffda..b45a21b5de8f 100644 --- a/src/backend/utils/adt/timestamp.c +++ b/src/backend/utils/adt/timestamp.c @@ -24,6 +24,7 @@ #include "catalog/pg_type.h" #include "common/int.h" #include "common/int128.h" +#include "datatype/timestamp.h" #include "funcapi.h" #include "libpq/pqformat.h" #include "miscadmin.h" @@ -73,19 +74,6 @@ typedef struct pg_tz *attimezone; } generate_series_timestamptz_fctx; -/* - * The transition datatype for interval aggregates is declared as internal. - * It's a pointer to an IntervalAggState allocated in the aggregate context. - */ -typedef struct IntervalAggState -{ - int64 N; /* count of finite intervals processed */ - Interval sumX; /* sum of finite intervals processed */ - /* These counts are *not* included in N! 
Use IA_TOTAL_COUNT() as needed */ - int64 pInfcount; /* count of +infinity intervals */ - int64 nInfcount; /* count of -infinity intervals */ -} IntervalAggState; - #define IA_TOTAL_COUNT(ia) \ ((ia)->N + (ia)->pInfcount + (ia)->nInfcount) @@ -3481,7 +3469,7 @@ interval_larger(PG_FUNCTION_ARGS) PG_RETURN_INTERVAL_P(result); } -static void +void finite_interval_pl(const Interval *span1, const Interval *span2, Interval *result) { Assert(!INTERVAL_NOT_FINITE(span1)); @@ -3960,7 +3948,7 @@ in_range_interval_interval(PG_FUNCTION_ARGS) * context. When the state data needs to be allocated in the current memory * context, we use palloc0 directly e.g. interval_avg_deserialize(). */ -static IntervalAggState * +IntervalAggState * makeIntervalAggState(FunctionCallInfo fcinfo) { IntervalAggState *state; diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index 2b798b823ea5..b92856f18d2c 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -92,6 +92,8 @@ #define RELCACHE_INIT_FILEMAGIC 0x573266 /* version ID value */ +bool (*add_skip_vci_index_hook) (Relation rel) = NULL; + /* * Whether to bother checking if relation cache memory needs to be freed * eagerly. See also RelationBuildDesc() and pg_config_manual.h. @@ -5389,6 +5391,16 @@ RelationGetIndexAttrBitmap(Relation relation, IndexAttrBitmapKind attrKind) indexDesc = index_open(indexOid, AccessShareLock); + if (add_skip_vci_index_hook) + { + if (add_skip_vci_index_hook(indexDesc)) + { + /* Skip if Index is VCI index */ + index_close(indexDesc, AccessShareLock); + continue; + } + } + /* * Extract index expressions and index predicate. 
Note: Don't use * RelationGetIndexExpressions()/RelationGetIndexPredicate(), because @@ -6967,6 +6979,60 @@ unlink_initfile(const char *initfilename, int elevel) } } +bool +isRelHasVCIIndex(Oid relid, bool *is_partition) +{ + ListCell *l; + Relation relation; + + bool hasVCI = false; + + *is_partition = false; + relation = table_open(relid, NoLock); + + if ((relation->rd_rel->relispartition == true) || relation->rd_rel->relkind == RELKIND_PARTITIONED_TABLE) + *is_partition = true; + + if (relation->rd_rel->relhasindex) + { + List *indexoidlist; + + indexoidlist = RelationGetIndexList(relation); + + foreach(l, indexoidlist) + { + Oid relam; + Oid indexoid = lfirst_oid(l); + Relation indexRelation; + Form_pg_am aform; + HeapTuple amtuple; + + indexRelation = index_open(indexoid, NoLock); + relam = indexRelation->rd_rel->relam; + + amtuple = SearchSysCache1(AMOID, ObjectIdGetDatum(relam)); + if (!HeapTupleIsValid(amtuple)) + elog(ERROR, "cache lookup failed for access method %u", + relam); + aform = (Form_pg_am) GETSTRUCT(amtuple); + + if (strcmp(NameStr(aform->amname), "vci") == 0) + { + hasVCI = true; + } + + ReleaseSysCache(amtuple); + index_close(indexRelation, NoLock); + + if (hasVCI) + break; + } + } + + table_close(relation, NoLock); + return hasVCI; +} + /* * ResourceOwner callbacks */ diff --git a/src/bin/pg_dump/common.c b/src/bin/pg_dump/common.c index 4e7303ea6317..bf4f4298b3a8 100644 --- a/src/bin/pg_dump/common.c +++ b/src/bin/pg_dump/common.c @@ -360,6 +360,7 @@ flagInhTables(Archive *fout, TableInfo *tblinfo, int numTables, AssignDumpId(&attachinfo->dobj); attachinfo->dobj.name = pg_strdup(tblinfo[i].dobj.name); attachinfo->dobj.namespace = tblinfo[i].dobj.namespace; + attachinfo->dobj.isvciview = false; attachinfo->parentTbl = tblinfo[i].parents[0]; attachinfo->partitionTbl = &tblinfo[i]; diff --git a/src/bin/pg_dump/pg_dump.c b/src/bin/pg_dump/pg_dump.c index 47913178a93b..b47ae22655cc 100644 --- a/src/bin/pg_dump/pg_dump.c +++ 
b/src/bin/pg_dump/pg_dump.c @@ -7552,6 +7552,8 @@ getTables(Archive *fout, int *numTables) tblinfo[i].dummy_view = false; /* might get set during sort */ tblinfo[i].postponed_def = false; /* might get set during sort */ + tblinfo[i].dobj.isvciview = false; + /* Tables have data */ tblinfo[i].dobj.components |= DUMP_COMPONENT_DATA; @@ -8625,6 +8627,17 @@ getRules(Archive *fout) ruleinfo[i].ev_enabled = *(PQgetvalue(res, i, i_ev_enabled)); if (ruleinfo[i].ruletable) { + /* + * isvciview is set to true when the table is a VCI internal relation. The + * judgement is done by the logic in vci_isVciRelation function + */ + if ((ruleinfo[i].ruletable->relkind == RELKIND_MATVIEW || + ruleinfo[i].ruletable->relkind == RELKIND_VIEW) && + !strcmp(ruleinfo[i].ruletable->dobj.name, ruleinfo[i].dobj.name)) + { + ruleinfo[i].ruletable->dobj.isvciview = true; + } + /* * If the table is a view or materialized view, force its ON * SELECT rule to be sorted before the view itself --- this @@ -17087,8 +17100,9 @@ dumpTableSchema(Archive *fout, const TableInfo *tbinfo) int j, k; - /* We had better have loaded per-column details about this table */ - Assert(tbinfo->interesting); + /* Do not dump VCI internal relations */ + if (tbinfo->dobj.isvciview) + return; qrelname = pg_strdup(fmtId(tbinfo->dobj.name)); qualrelname = pg_strdup(fmtQualifiedDumpable(tbinfo)); @@ -20158,12 +20172,27 @@ addBoundaryDependencies(DumpableObject **dobjs, int numObjs, case DO_FDW: case DO_FOREIGN_SERVER: case DO_TRANSFORM: + + /* + * Do not add dependency for VCI internal relations to suppress dependency + * loop message + */ + if (dobj->isvciview) + break; /* Pre-data objects: must come before the pre-data boundary */ addObjectDependency(preDataBound, dobj->dumpId); break; + case DO_LARGE_OBJECT: + + /* + * Do not add dependency for VCI internal relations to suppress dependency + * loop message + */ + if (dobj->isvciview) + break; + /* fallthrough */ case DO_TABLE_DATA: case DO_SEQUENCE_SET: - case 
DO_LARGE_OBJECT: case DO_LARGE_OBJECT_DATA: /* Data objects: must come between the boundaries */ addObjectDependency(dobj, preDataBound->dumpId); diff --git a/src/bin/pg_dump/pg_dump.h b/src/bin/pg_dump/pg_dump.h index 72a00e1bc202..61a16ffe109b 100644 --- a/src/bin/pg_dump/pg_dump.h +++ b/src/bin/pg_dump/pg_dump.h @@ -159,6 +159,7 @@ typedef struct _dumpableObject DumpId *dependencies; /* dumpIds of objects this one depends on */ int nDeps; /* number of valid dependencies */ int allocDeps; /* allocated size of dependencies[] */ + bool isvciview; /* this table is a VCI internal relation */ } DumpableObject; /* diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 8cbff6ab0eb1..7c93dbba4058 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -391,6 +391,10 @@ extern void log_heap_prune_and_freeze(Relation relation, Buffer buffer, OffsetNumber *dead, int ndead, OffsetNumber *unused, int nunused); +extern PGDLLIMPORT void (*add_index_delete_hook) (Relation indexRelation, ItemPointer heap_tid, TransactionId xmin); +extern PGDLLIMPORT bool (*add_snapshot_satisfies_hook) (HeapTuple htup, Snapshot snapshot, Buffer buffer); +extern PGDLLIMPORT bool (*add_skip_vacuum_hook) (Relation rel); + /* in heap/vacuumlazy.c */ extern void heap_vacuum_rel(Relation rel, const VacuumParams params, BufferAccessStrategy bstrategy); diff --git a/src/include/access/xlogrecovery.h b/src/include/access/xlogrecovery.h index 8e475e266d18..c4feb0b4b458 100644 --- a/src/include/access/xlogrecovery.h +++ b/src/include/access/xlogrecovery.h @@ -56,6 +56,7 @@ typedef enum RecoveryPauseState RECOVERY_NOT_PAUSED, /* pause not requested */ RECOVERY_PAUSE_REQUESTED, /* pause requested, but not yet paused */ RECOVERY_PAUSED, /* recovery is paused */ + RECOVERY_VCI_PAUSE_REQUESTED, /* pause requested for VCI query */ } RecoveryPauseState; /* User-settable GUC parameters */ @@ -161,6 +162,8 @@ extern void WakeupRecovery(void); extern void 
StartupRequestWalReceiverRestart(void); extern void XLogRequestWalReceiverReply(void); +extern void SetVciRecoveryPause(void); + extern void RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue); extern void xlog_outdesc(StringInfo buf, XLogReaderState *record); diff --git a/src/include/catalog/dependency.h b/src/include/catalog/dependency.h index 0ea7ccf52430..768057a395fd 100644 --- a/src/include/catalog/dependency.h +++ b/src/include/catalog/dependency.h @@ -225,4 +225,7 @@ extern void shdepDropOwned(List *roleids, DropBehavior behavior); extern void shdepReassignOwned(List *roleids, Oid newrole); +/* vci index original hook*/ +extern PGDLLIMPORT bool (*add_drop_relation_hook) (const ObjectAddress *object, int flags); + #endif /* DEPENDENCY_H */ diff --git a/src/include/catalog/index.h b/src/include/catalog/index.h index 4daa8bef5eea..2b87e629fb56 100644 --- a/src/include/catalog/index.h +++ b/src/include/catalog/index.h @@ -175,6 +175,8 @@ extern void RestoreReindexState(const void *reindexstate); extern void IndexSetParentIndex(Relation partitionIdx, Oid parentOid); +extern PGDLLIMPORT bool (*add_reindex_index_hook) (Relation); +extern PGDLLIMPORT HeapTuple IndexHeapTuple; /* * itemptr_encode - Encode ItemPointer as int64/int8 diff --git a/src/include/commands/explain.h b/src/include/commands/explain.h index 6e51d50efc73..69641b8bb4d9 100644 --- a/src/include/commands/explain.h +++ b/src/include/commands/explain.h @@ -81,4 +81,8 @@ extern void ExplainQueryText(ExplainState *es, QueryDesc *queryDesc); extern void ExplainQueryParameters(ExplainState *es, ParamListInfo params, int maxlen); +extern void ExplainPropertySortGroupKeys(PlanState *planstate, const char *qlabel, + int nkeys, AttrNumber *keycols, + List *ancestors, ExplainState *es); + #endif /* EXPLAIN_H */ diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h index e9b0fab0767b..3df8070c784e 100644 --- a/src/include/commands/tablecmds.h +++ 
b/src/include/commands/tablecmds.h @@ -108,4 +108,10 @@ extern void RangeVarCallbackOwnsRelation(const RangeVar *relation, extern bool PartConstraintImpliedByRelConstraint(Relation scanrel, List *partConstraint); +extern PGDLLIMPORT bool (*add_alter_tablespace_hook) (Relation rel); +extern PGDLLIMPORT void (*add_alter_table_change_owner_hook) (Oid relOid, + char relKind, Oid newOwnerId); +extern PGDLLIMPORT void (*add_alter_table_change_schema_hook) (Oid relOid, + char relKind, Oid newNspOid); + #endif /* TABLECMDS_H */ diff --git a/src/include/datatype/timestamp.h b/src/include/datatype/timestamp.h index a924d58aabe1..44f8bf7106a3 100644 --- a/src/include/datatype/timestamp.h +++ b/src/include/datatype/timestamp.h @@ -15,6 +15,11 @@ #ifndef DATATYPE_TIMESTAMP_H #define DATATYPE_TIMESTAMP_H +#ifndef FRONTEND +#include "postgres.h" +#include "fmgr.h" +#endif + /* * Timestamp represents absolute time. * @@ -87,6 +92,23 @@ struct pg_itm_in int tm_year; }; +#ifndef FRONTEND +/* + * The transition datatype for interval aggregates is declared as internal. + * It's a pointer to an IntervalAggState allocated in the aggregate context. + */ +typedef struct IntervalAggState +{ + int64 N; /* count of finite intervals processed */ + Interval sumX; /* sum of finite intervals processed */ + /* These counts are *not* included in N! 
Use IA_TOTAL_COUNT() as needed */ + int64 pInfcount; /* count of +infinity intervals */ + int64 nInfcount; /* count of -infinity intervals */ +} IntervalAggState; + +extern IntervalAggState *makeIntervalAggState(FunctionCallInfo fcinfo); +extern void finite_interval_pl(const Interval *span1, const Interval *span2, Interval *result); +#endif /* Limits on the "precision" option (typmod) for these data types */ #define MAX_TIMESTAMP_PRECISION 6 diff --git a/src/include/executor/execExpr.h b/src/include/executor/execExpr.h index 75366203706c..e137b3f985d8 100644 --- a/src/include/executor/execExpr.h +++ b/src/include/executor/execExpr.h @@ -292,6 +292,8 @@ typedef enum ExprEvalOp EEOP_AGG_ORDERED_TRANS_DATUM, EEOP_AGG_ORDERED_TRANS_TUPLE, + EEOP_VCI_VAR, + EEOP_VCI_PARAM_EXEC, /* non-existent operation, used e.g. to check array lengths */ EEOP_LAST } ExprEvalOp; @@ -338,6 +340,7 @@ typedef struct ExprEvalStep /* but it's just the normal (negative) attr number for SYSVAR */ int attnum; Oid vartype; /* type OID of variable */ + PlanState *vci_parent_planstate; VarReturningType varreturningtype; /* return old/new/default */ } var; @@ -424,6 +427,7 @@ typedef struct ExprEvalStep { int paramid; /* numeric ID for parameter */ Oid paramtype; /* OID of parameter's datatype */ + Plan *vci_parent_plan; } param; /* for EEOP_PARAM_CALLBACK */ @@ -902,6 +906,18 @@ extern void ExecEvalWholeRowVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext); extern void ExecEvalSysVar(ExprState *state, ExprEvalStep *op, ExprContext *econtext, TupleTableSlot *slot); +extern void ExecCreateExprSetupSteps(ExprState *state, Node *node); +extern void ExecReadyExpr(ExprState *state); + +typedef void (*ExprEvalVar_hook_type) (ExprState *state, ExprEvalStep *op, + ExprContext *econtext); +extern PGDLLIMPORT ExprEvalVar_hook_type ExprEvalVar_hook; + +typedef void (*ExprEvalParam_hook_type) (ExprState *state, ExprEvalStep *op, + ExprContext *econtext); +extern PGDLLIMPORT ExprEvalParam_hook_type 
ExprEvalParam_hook; +extern void VciExprEvalVarHook(ExprState *state, ExprEvalStep *op, ExprContext *econtext); +extern void VciExprEvalParamHook(ExprState *state, ExprEvalStep *op, ExprContext *econtext); extern void ExecAggInitGroup(AggState *aggstate, AggStatePerTrans pertrans, AggStatePerGroup pergroup, ExprContext *aggcontext); diff --git a/src/include/executor/nodeModifyTable.h b/src/include/executor/nodeModifyTable.h index bf3b592e28fd..b49d55220ce5 100644 --- a/src/include/executor/nodeModifyTable.h +++ b/src/include/executor/nodeModifyTable.h @@ -27,6 +27,8 @@ extern ModifyTableState *ExecInitModifyTable(ModifyTable *node, EState *estate, extern void ExecEndModifyTable(ModifyTableState *node); extern void ExecReScanModifyTable(ModifyTableState *node); +extern PGDLLIMPORT List *(*add_should_index_insert_hook) (ResultRelInfo *, TupleTableSlot *, ItemPointer, EState *); + extern void ExecInitMergeTupleSlots(ModifyTableState *mtstate, ResultRelInfo *resultRelInfo); diff --git a/src/include/nodes/extensible.h b/src/include/nodes/extensible.h index 1129c4ba4b1b..c9a349ea2f08 100644 --- a/src/include/nodes/extensible.h +++ b/src/include/nodes/extensible.h @@ -115,6 +115,7 @@ typedef struct CustomScanMethods /* Create execution state (CustomScanState) from a CustomScan plan node */ Node *(*CreateCustomScanState) (CustomScan *cscan); + struct CustomScan *(*CopyCustomPlan) (const struct CustomScan *from); } CustomScanMethods; /* @@ -155,6 +156,9 @@ typedef struct CustomExecMethods void (*ExplainCustomScan) (CustomScanState *node, List *ancestors, ExplainState *es); + void (*SetBoundCustomScan) (const LimitState *limit, + CustomScanState *cps); + void (*ExplainCustomPlanTargetRel) (CustomScanState *node, ExplainState *es); } CustomExecMethods; extern void RegisterCustomScanMethods(const CustomScanMethods *methods); diff --git a/src/include/nodes/params.h b/src/include/nodes/params.h index ca4117b14496..6d7add360994 100644 --- a/src/include/nodes/params.h +++ 
b/src/include/nodes/params.h @@ -19,6 +19,7 @@ typedef struct ExprState ExprState; typedef struct Param Param; typedef struct ParseState ParseState; +struct PlanState; /* * ParamListInfo @@ -165,5 +166,6 @@ extern ParamListInfo RestoreParamList(char **start_address); extern char *BuildParamLogString(ParamListInfo params, char **knownTextValues, int maxlen); extern void ParamsErrorCallback(void *arg); +typedef struct ExprState *(*ExecInitParam_hook_type) (Param *param, struct PlanState *parent); #endif /* PARAMS_H */ diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 7cdd2b51c94e..84bbd9992409 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -214,6 +214,9 @@ typedef struct Plan /* OK to use as part of parallel plan? */ bool parallel_safe; + /* plan number (1-origin) in the Query */ + AttrNumber plan_no; + /* * information needed for asynchronous execution */ diff --git a/src/include/optimizer/planner.h b/src/include/optimizer/planner.h index 55d9b7940aa9..8b8c50c51c67 100644 --- a/src/include/optimizer/planner.h +++ b/src/include/optimizer/planner.h @@ -53,6 +53,8 @@ typedef void (*create_upper_paths_hook_type) (PlannerInfo *root, void *extra); extern PGDLLIMPORT create_upper_paths_hook_type create_upper_paths_hook; +extern void copy_plan_costsize(Plan *dest, Plan *src); + extern PlannedStmt *standard_planner(Query *parse, const char *query_string, int cursorOptions, diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h index 3561c6bef0bf..7c2bcdfbd5fe 100644 --- a/src/include/utils/relcache.h +++ b/src/include/utils/relcache.h @@ -61,6 +61,8 @@ extern List *RelationGetDummyIndexExpressions(Relation relation); extern List *RelationGetIndexPredicate(Relation relation); extern bytea **RelationGetIndexAttOptions(Relation relation, bool copy); +extern bool isRelHasVCIIndex(Oid relid, bool *is_partition); + /* * Which set of columns to return by RelationGetIndexAttrBitmap. 
*/ @@ -160,4 +162,7 @@ extern PGDLLIMPORT bool criticalRelcachesBuilt; /* should be used only by relcache.c and postinit.c */ extern PGDLLIMPORT bool criticalSharedRelcachesBuilt; +/* vci index original hook*/ +extern PGDLLIMPORT bool (*add_skip_vci_index_hook) (Relation rel); + #endif /* RELCACHE_H */ diff --git a/src/include/utils/snapshot.h b/src/include/utils/snapshot.h index 0e546ec14974..18154d7d90cf 100644 --- a/src/include/utils/snapshot.h +++ b/src/include/utils/snapshot.h @@ -112,6 +112,16 @@ typedef enum SnapshotType * horizon to use. */ SNAPSHOT_NON_VACUUMABLE, + + /* + * VCI WOS2ROS visible + */ + SNAPSHOT_VCI_WOS2ROS, + + /* + * VCI Local ROS visible + */ + SNAPSHOT_VCI_LOCALROS } SnapshotType; typedef struct SnapshotData *Snapshot; From 1fec0343aa93a25f1b9d16a9b741a26a9be9b7e6 Mon Sep 17 00:00:00 2001 From: Peter Smith Date: Tue, 14 Oct 2025 16:34:27 +1100 Subject: [PATCH 2/3] VCI module - main --- contrib/Makefile | 3 +- contrib/meson.build | 1 + contrib/vci/.gitignore | 4 + contrib/vci/Makefile | 40 + contrib/vci/README | 976 ++++ contrib/vci/executor/Makefile | 33 + contrib/vci/executor/meson.build | 18 + contrib/vci/executor/vci_agg.c | 1967 ++++++++ contrib/vci/executor/vci_aggmergetranstype.c | 133 + contrib/vci/executor/vci_aggref.c | 1287 +++++ contrib/vci/executor/vci_executor.c | 2116 +++++++++ contrib/vci/executor/vci_fetch_column_store.c | 1202 +++++ contrib/vci/executor/vci_gather.c | 157 + contrib/vci/executor/vci_param.c | 60 + contrib/vci/executor/vci_plan.c | 235 + contrib/vci/executor/vci_plan_func.c | 950 ++++ contrib/vci/executor/vci_planner.c | 1913 ++++++++ contrib/vci/executor/vci_planner_preanalyze.c | 415 ++ contrib/vci/executor/vci_scan.c | 632 +++ contrib/vci/executor/vci_sort.c | 415 ++ contrib/vci/executor/vci_vector_executor.c | 2338 +++++++++ contrib/vci/expected/bugs.out | 149 + contrib/vci/expected/vci.out | 128 + contrib/vci/include/postgresql_copy.h | 176 + contrib/vci/include/vci.h | 153 + 
contrib/vci/include/vci_aggref.h | 227 + contrib/vci/include/vci_aggref_impl.inc | 873 ++++ contrib/vci/include/vci_chunk.h | 114 + contrib/vci/include/vci_columns.h | 319 ++ contrib/vci/include/vci_columns_data.h | 33 + contrib/vci/include/vci_executor.h | 893 ++++ contrib/vci/include/vci_fetch.h | 1007 ++++ contrib/vci/include/vci_fetch_row_store.h | 22 + contrib/vci/include/vci_freelist.h | 75 + contrib/vci/include/vci_mem.h | 177 + contrib/vci/include/vci_memory_entry.h | 118 + contrib/vci/include/vci_planner.h | 151 + contrib/vci/include/vci_ros.h | 1085 +++++ contrib/vci/include/vci_ros_command.h | 214 + contrib/vci/include/vci_ros_daemon.h | 69 + contrib/vci/include/vci_supported_oid.h | 34 + contrib/vci/include/vci_tidcrid.h | 344 ++ contrib/vci/include/vci_utils.h | 238 + contrib/vci/include/vci_wos.h | 29 + contrib/vci/include/vci_xact.h | 39 + contrib/vci/meson.build | 67 + contrib/vci/sql/bugs.sql | 87 + contrib/vci/sql/vci.sql | 108 + contrib/vci/storage/Makefile | 34 + contrib/vci/storage/meson.build | 19 + contrib/vci/storage/vci_chunk.c | 616 +++ contrib/vci/storage/vci_columns.c | 1163 +++++ contrib/vci/storage/vci_columns_data.c | 232 + contrib/vci/storage/vci_fetch.c | 2497 ++++++++++ contrib/vci/storage/vci_freelist.c | 474 ++ contrib/vci/storage/vci_index.c | 2152 +++++++++ contrib/vci/storage/vci_internal_view.c | 663 +++ contrib/vci/storage/vci_low_utils.c | 90 + contrib/vci/storage/vci_memory_entry.c | 915 ++++ contrib/vci/storage/vci_ros.c | 1674 +++++++ contrib/vci/storage/vci_ros_command.c | 4165 +++++++++++++++++ contrib/vci/storage/vci_ros_daemon.c | 865 ++++ contrib/vci/storage/vci_tidcrid.c | 1778 +++++++ contrib/vci/storage/vci_wos.c | 265 ++ contrib/vci/storage/vci_xact.c | 146 + contrib/vci/utils/Makefile | 20 + contrib/vci/utils/meson.build | 10 + contrib/vci/utils/nodes.t | 448 ++ contrib/vci/utils/vci_symbols.c | 48 + contrib/vci/vci--1.0.sql | 76 + contrib/vci/vci.conf | 8 + contrib/vci/vci.control | 5 + contrib/vci/vci_main.c 
| 183 + contrib/vci/vci_read_guc.c | 422 ++ contrib/vci/vci_shmem.c | 206 + contrib/vci/vci_supported_funcs.c | 855 ++++ contrib/vci/vci_supported_funcs.sql | 114 + contrib/vci/vci_supported_types.c | 245 + src/tools/pgindent/typedefs.list | 133 + 79 files changed, 42344 insertions(+), 1 deletion(-) create mode 100644 contrib/vci/.gitignore create mode 100644 contrib/vci/Makefile create mode 100755 contrib/vci/README create mode 100644 contrib/vci/executor/Makefile create mode 100644 contrib/vci/executor/meson.build create mode 100644 contrib/vci/executor/vci_agg.c create mode 100644 contrib/vci/executor/vci_aggmergetranstype.c create mode 100644 contrib/vci/executor/vci_aggref.c create mode 100644 contrib/vci/executor/vci_executor.c create mode 100644 contrib/vci/executor/vci_fetch_column_store.c create mode 100644 contrib/vci/executor/vci_gather.c create mode 100644 contrib/vci/executor/vci_param.c create mode 100644 contrib/vci/executor/vci_plan.c create mode 100644 contrib/vci/executor/vci_plan_func.c create mode 100644 contrib/vci/executor/vci_planner.c create mode 100644 contrib/vci/executor/vci_planner_preanalyze.c create mode 100644 contrib/vci/executor/vci_scan.c create mode 100644 contrib/vci/executor/vci_sort.c create mode 100644 contrib/vci/executor/vci_vector_executor.c create mode 100644 contrib/vci/expected/bugs.out create mode 100644 contrib/vci/expected/vci.out create mode 100644 contrib/vci/include/postgresql_copy.h create mode 100644 contrib/vci/include/vci.h create mode 100644 contrib/vci/include/vci_aggref.h create mode 100644 contrib/vci/include/vci_aggref_impl.inc create mode 100644 contrib/vci/include/vci_chunk.h create mode 100644 contrib/vci/include/vci_columns.h create mode 100644 contrib/vci/include/vci_columns_data.h create mode 100644 contrib/vci/include/vci_executor.h create mode 100644 contrib/vci/include/vci_fetch.h create mode 100644 contrib/vci/include/vci_fetch_row_store.h create mode 100644 contrib/vci/include/vci_freelist.h 
create mode 100644 contrib/vci/include/vci_mem.h create mode 100644 contrib/vci/include/vci_memory_entry.h create mode 100644 contrib/vci/include/vci_planner.h create mode 100644 contrib/vci/include/vci_ros.h create mode 100644 contrib/vci/include/vci_ros_command.h create mode 100644 contrib/vci/include/vci_ros_daemon.h create mode 100644 contrib/vci/include/vci_supported_oid.h create mode 100644 contrib/vci/include/vci_tidcrid.h create mode 100644 contrib/vci/include/vci_utils.h create mode 100644 contrib/vci/include/vci_wos.h create mode 100644 contrib/vci/include/vci_xact.h create mode 100644 contrib/vci/meson.build create mode 100644 contrib/vci/sql/bugs.sql create mode 100644 contrib/vci/sql/vci.sql create mode 100644 contrib/vci/storage/Makefile create mode 100644 contrib/vci/storage/meson.build create mode 100644 contrib/vci/storage/vci_chunk.c create mode 100644 contrib/vci/storage/vci_columns.c create mode 100644 contrib/vci/storage/vci_columns_data.c create mode 100644 contrib/vci/storage/vci_fetch.c create mode 100644 contrib/vci/storage/vci_freelist.c create mode 100644 contrib/vci/storage/vci_index.c create mode 100644 contrib/vci/storage/vci_internal_view.c create mode 100644 contrib/vci/storage/vci_low_utils.c create mode 100644 contrib/vci/storage/vci_memory_entry.c create mode 100644 contrib/vci/storage/vci_ros.c create mode 100644 contrib/vci/storage/vci_ros_command.c create mode 100644 contrib/vci/storage/vci_ros_daemon.c create mode 100644 contrib/vci/storage/vci_tidcrid.c create mode 100644 contrib/vci/storage/vci_wos.c create mode 100644 contrib/vci/storage/vci_xact.c create mode 100644 contrib/vci/utils/Makefile create mode 100644 contrib/vci/utils/meson.build create mode 100644 contrib/vci/utils/nodes.t create mode 100644 contrib/vci/utils/vci_symbols.c create mode 100644 contrib/vci/vci--1.0.sql create mode 100644 contrib/vci/vci.conf create mode 100644 contrib/vci/vci.control create mode 100644 contrib/vci/vci_main.c create mode 100644 
contrib/vci/vci_read_guc.c create mode 100644 contrib/vci/vci_shmem.c create mode 100644 contrib/vci/vci_supported_funcs.c create mode 100644 contrib/vci/vci_supported_funcs.sql create mode 100644 contrib/vci/vci_supported_types.c diff --git a/contrib/Makefile b/contrib/Makefile index 2f0a88d3f774..c0c2f6df1410 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -51,7 +51,8 @@ SUBDIRS = \ tsm_system_rows \ tsm_system_time \ unaccent \ - vacuumlo + vacuumlo \ + vci ifeq ($(with_ssl),openssl) SUBDIRS += pgcrypto sslinfo diff --git a/contrib/meson.build b/contrib/meson.build index ed30ee7d639f..d8bd5c855a58 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -70,4 +70,5 @@ subdir('tsm_system_time') subdir('unaccent') subdir('uuid-ossp') subdir('vacuumlo') +subdir('vci') subdir('xml2') diff --git a/contrib/vci/.gitignore b/contrib/vci/.gitignore new file mode 100644 index 000000000000..5dcb3ff97235 --- /dev/null +++ b/contrib/vci/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/contrib/vci/Makefile b/contrib/vci/Makefile new file mode 100644 index 000000000000..1e02ebb90bbc --- /dev/null +++ b/contrib/vci/Makefile @@ -0,0 +1,40 @@ +# contrib/vci/Makefile + +MODULE_big = vci + +OBJS = \ + vci_main.o \ + vci_read_guc.o \ + vci_shmem.o \ + vci_supported_funcs.o \ + vci_supported_types.o +SUBDIRS = \ + executor \ + storage \ + utils + +OBJS += \ + $(patsubst $(top_srcdir)/contrib/vci/%.c,%.o,$(foreach dir,$(SUBDIRS), $(sort $(wildcard $(top_srcdir)/contrib/vci/$(dir)/*.c)))) + +EXTENSION = vci +DATA = vci--1.0.sql + +PG_CPPFLAGS = -I $(top_srcdir)/contrib/vci/include + +REGRESS = vci bugs +REGRESS_OPTS = --temp-config $(top_srcdir)/contrib/vci/vci.conf + +# Disabled because these tests require "shared_preload_libraries=vci", +# which typical installcheck users do not have (e.g. buildfarm clients). 
+NO_INSTALLCHECK = 1 + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/vci +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/vci/README b/contrib/vci/README new file mode 100755 index 000000000000..5f62a0f72169 --- /dev/null +++ b/contrib/vci/README @@ -0,0 +1,976 @@ +src/contrib/vci/README + +VCI (Vertical Clustered Indexing) + +1. Overview +2. Core Architecture + 2.1 Dual Storage - WOS and ROS + 2.2 WOS (Write-Optimized Storage) + 2.2.1 MVCC Handling + 2.2.2 Data WOS and Whiteout WOS + 2.3 ROS (Read-Optimized Storage) + 2.3.1 Column data + 2.3.2 TID-CRID mapping + 2.3.3 Delete Vector + 2.3.4 NULL information + 2.4. VCI "Internal Relations" + 2.5. Extent-based Storage Management + 2.5.1 Locating an extent + 2.5.2 Locating column data within an extent + 2.5.3 Garbage Collection + 2.6. Compression System + 2.6.1 Dictionaries +3. VCI Integration with PostgreSQL + 3.1 Standard hooks + 3.1.1 IndexAccessMethod hooks + 3.1.2 Other standard hooks + 3.2 Executor hooks + 3.3 Known problems + 3.3.1 Ad-hoc hooks + 3.3.2 Embedded code +4. Data Flow and Conversion + 4.1 WOS-to-ROS Background Process + 4.2 Overview Diagram + 4.3 Local ROS Creation + 4.4 Existing Table Data +5. MVCC and Transaction Handling +6. Query Execution + 6.1 Custom Plan Integration + 6.1.1 VCI mode requirements + 6.2 Custom Plan Execution Steps + 6.3 Is VCI getting used? +7. Configuration Parameters + 7.1 VCI-specific parameters + 7.1.1 Core Parameters + 7.1.2 Memory Management + 7.1.3 Background Worker Control + 7.1.4 Data Management Thresholds + 7.2 Affected PostgreSQL Parameters +8. 
Known Restrictions/Limitations/Differences + 8.1 Restrictions + 8.1.1 DROP EXTENSION vci + 8.1.2 Backup and Restore + 8.1.3 Version Upgrades + 8.2 Limitations + 8.2.1 CREATE INDEX + 8.2.2 Supported Relation Types + 8.2.3 Supported Data Types + 8.2.4 Performance Testing + 8.3 Differences + 8.3.1 Configuration (Planner GUCs) + 8.3.2 Disk Size and Estimation + + + +============================================================================== +1. Overview +============================================================================== + +VCI (Vertical Clustered Indexing) is a PostgreSQL extension that implements a +hybrid storage architecture combining row-oriented OLTP capabilities with +column-oriented OLAP performance. It provides an in-memory column store while +maintaining PostgreSQL's transactional guarantees and row-based architecture. + +The extension provides a new indexing method "vci", which can be specified as +the method for the CREATE INDEX statement, to create a VCI index for a +nominated set of columns: +┌─────────────────────────────────────────────────────────────────┐ +│ CREATE INDEX index ON table │ +│ USING vci (column [, ...]) │ +│ [WITH (storage_parameter = value, [...])] │ +│ [TABLESPACE tablespace] │ +└─────────────────────────────────────────────────────────────────┘ + + + +============================================================================== +2. Core Architecture +============================================================================== + +2.1 Dual storage - WOS and ROS +============================== + +VCI implements two storage areas (WOS and ROS) that work together. +- WOS: Write Optimized Storage +- ROS: Read Optimized Storage + +The purpose of VCI is to maintain the ROS as the column-oriented storage for +live table data. 
+ +┌─────────────────────────────────────────────────────────────────┐ +│ VCI Architecture │ +├─────────────────────────────────┬───────────────────────────────┤ +│ WOS │ ROS │ +│ (Write Optimized Storage) │ (Read Optimized Storage) │ +├─────────────────────────────────┼───────────────────────────────┤ +│ • Row-oriented format │ • Column-oriented format │ +│ • Handles INSERT/UPDATE/DELETE │ • Optimized for SELECT/OLAP │ +│ • MVCC transaction data │ • Compressed storage │ +│ • Temporary buffer │ • Frozen committed data │ +│ • Tuple IDs (TID) │ • Columnar Record IDs (CRID) │ +│ │ • Extent-based storage units │ +└─────────────────────────────────┴───────────────────────────────┘ + │ + Background Worker + (WOS → ROS Conversion) + + +The WOS is a row-oriented temporary buffer for incoming write operations. It +maintains MVCC consistency. + +The ROS provides column-oriented storage for efficient OLAP queries. + +At intervals, a Background Worker performs WOS-to-ROS conversion for any +frozen/committed WOS rows. + +Additional WOS/ROS related components, are included to defer the need for +WOS-to-ROS conversion for every query: +e.g. +- Whiteout WOS = TID records of WOS rows that are marked for deletion on ROS +- ROS delete vector = Records of ROS data marked for deletion +- Local ROS = Temporary ROS (scope of SELECT) for any unconverted WOS data + + +2.2 WOS (Write Optimized Storage) +================================= + +Purpose: Buffer incoming write operations and maintain MVCC consistency + +Data Structure: +- Row-oriented format same as PostgreSQL's native storage +- Contains both actual tuple data and MVCC metadata +- Stores transaction information (xmin, xmax, cmin, cmax) +- Maintains TID for each record + + +2.2.1 MVCC Handling: +-------------------- +Uses cmin,cmax,xmin,xmax + + +2.2.2 Data WOS and Whiteout WOS +------------------------------- +(internal relations) + +There are two kinds of data in the WOS: +a. 
Data WOS -- Actual tuple data from INSERT/UPDATE operations +b. Whiteout WOS -- TID records of WOS rows that are marked for deletion on ROS + + +Example: + +INSERT Operation (Data WOS): +┌────────────────────────────────────────────────────────────┐ +│ TID │ xmin │ xmax │ cmin │ cmax │ Column Data │ +├─────┼──────┼──────┼──────┼──────┼──────────────────────────┤ +│ 100 │ T1 │ - │ 1 │ 1 │ customer_id=123, amt=500 │ +└────────────────────────────────────────────────────────────┘ + +DELETE Operation (Whiteout WOS): +┌────────────────────────────────────┐ +│ TID │ xmax │ Status │ +├─────┼──────┼───────────────────────┤ +│ 100 │ T2 │ Marked for deletion │ +└────────────────────────────────────┘ + + +2.3 ROS (Read Optimized Storage) +================================ + +Purpose: Provide columnar storage for efficient analytical queries + +Data Organization: +- Column-oriented storage with independent compression per column +- Uses CRID (Columnar Record ID) instead of TID for internal addressing +- Organized into fixed-size "extents" (262,144 records each) +- Maintains TID-to-CRID mapping for consistency + +ROS Structure: +┌─────────────────────────────────────────────────────────────────┐ +│ ROS Components │ +├─────────────────────┬───────────────────┬───────────────────────┤ +│ Management Info │ Data Storage │ Support Structures │ +├─────────────────────┼───────────────────┼───────────────────────┤ +│ • TID-CRID mapping │ • Column data │ • Delete vector │ +│ • Extent metadata │ • Compression │ • NULL information │ +│ • Dictionary info │ • TOAST links │ • TID relation │ +└─────────────────────┴───────────────────┴───────────────────────┘ + + +2.3.1. Column data +------------------ +(multiple internal relations) + +Each VCI indexed column is stored as an internal relation. Records are +addresses by CRID (Columnar Record ID) instead of by TID. The CRID gives +the logical position of the columnar data, and is generated in increasing +order of record registration. 
+ +CRID is used to address the data (from each different column-data relation) +that comprised the whole record. + +ROS column data Relations: +┌────────────────────────────────────────────────────────────────┐ +│ ROS column data │ +├────────────────────────────────────────────────────────────────┤ +│ ┌──────┐ ┌──────┐ ┌──────┐ ┌──────┐ │ +│ [CRID=0] │ col1 │ │ col2 │ │ col3 │ │ colN │ │ +│ [CRID=1] │ col1 │ │ col2 │ │ col3 │ │ colN │ │ +│ [CRID=2] │ col1 │ │ col2 │ │ col3 │ ... │ colN │ │ +│ [CRID=3] │ col1 │ │ col2 │ │ col3 │ │ colN │ │ +│ [CRID=4] │ col1 │ │ col2 │ │ col3 │ │ colN │ │ +│ [CRID=5] │ col1 │ │ col2 │ │ col3 │ │ colN │ │ +│ ... └──────┘ └──────┘ └──────┘ └──────┘ │ +│ │ +└────────────────────────────────────────────────────────────────┘ + + +2.3.2. TID-CRID mapping +----------------------- +(internal relation) + +This maps TID to the CRID. When a WOS record (identified by TID) is deleted, +this mapping is needed to identify the matching records from the ROS + +There is also a TID relation that maps a CRID back to the original TID + + +2.3.3. Delete Vector +-------------------- +(internal relation) + +Instead of immediately removing deleted records, VCI uses a bit vector for +efficient tracking of CRIDs of deleted records. + +Delete Vector (Bit Array): +┌────────────────────────────────────────────────────────────────┐ +│ CRID: 0 1 2 3 4 5 6 7 8 9 ... │ +│ Status: [0] [1] [0] [0] [1] [0] [0] [1] [0] [0] ... │ +│ Live Del Live Live Del Live Live Del Live Live │ +└────────────────────────────────────────────────────────────────┘ + +Actual deletion of the ROS records (called "deleted-rows-collection") +happens during the WOS-to-ROS conversion, but it is triggered only when the +deleted records exceeds some configurable threshold. + + +2.3.4. NULL information +------------------------ +(internal relation) + +Bit vector implemented as a fixed-length column element of an internal table. 
+This indicates (with 1 bit) whether the column element at this CRID is null or +not null. + + +2.4. VCI "Internal Relations" +============================= + +Each VCI index results in the creation of multiple internal relations. + +Notice that most of the VCI data structures of the WOS and ROS etc are stored +within (e.g. binary data columns of) these internal relations. + +These internal relations have a common name pattern "pg_vci_%010d_%05d_%c". + +Where: +┌────────────────────────────────────────────────────────────────┐ +│ %010d: Original table OID (the table with the VCI index) │ +│ %05d: Internal relation type identifier │ +│ %c: Special meanings -- e.g. Metadata vs Data indicator │ +└────────────────────────────────────────────────────────────────┘ + +Internal Relation Types: +- -1: TID relation (maps CRID to original TID) +- -2: NULL vector (bit array for NULL values) +- -3: Delete vector (bit array for deleted records) +- -5: TID-CRID mappings +- -6: TID-CRID mappings (update list) +- -9: Data WOS (buffered row data) +- -10: Whiteout WOS (deletion markers) +- 0-N: ROS column data relations (one per indexed column) + +Example: +For a VCI index on sales(customer_id, amount, date): + +Generated relations include: +pg_vci_0000012345_00000_d → Column 0 data (customer_id) +pg_vci_0000012345_00000_m ... and metadata +pg_vci_0000012345_00001_d → Column 1 data (amount) +pg_vci_0000012345_00001_m ... and metadata +pg_vci_0000012345_00002_d → Column 2 data (date) +pg_vci_0000012345_00002_m ... and metadata +pg_vci_0000012345_65526_d → Whiteout WOS +pg_vci_0000012345_65527_d → Data WOS +pg_vci_0000012345_65531_d → TID-CRID mappings +pg_vci_0000012345_65531_m ... and metadata +pg_vci_0000012345_65530_0 ... and update list #0 +pg_vci_0000012345_65530_1 ... and update list #1 +pg_vci_0000012345_65533_d → Delete vector +pg_vci_0000012345_65533_m ... and metadata +pg_vci_0000012345_65534_d → NULL vector +pg_vci_0000012345_65534_m ... 
and metadata +pg_vci_0000012345_65535_d → TID relation +pg_vci_0000012345_65535_m ... and metadata + +These relations (implemented as materialized views) are for internal VCI use +only. Normal users do not need to be aware of them and are not allowed to +tamper with them. + + +2.5. Extent-Based Storage Management +==================================== + +VCI introduces the concept of "extents". Extents are logical units of data +management used by the ROS. + +Each extent contains a fixed number of consecutive CRIDs/data; there are +always exactly 262,144 (= 256 * 1024) records per extent, including used and +unused CRIDs. + +Notice that even though the number of records per extent is fixed, the size of +extents might vary according to the VCI index column data type sizes and +compression. + +When a large number of records is transferred during WOS-to-ROS conversion +that work is divided into units of extents. The NULL information and +compression is also executed in units of extents. + +Extent Layout: +┌────────────────────────────────────────────────────────────────┐ +│ Extent N │ +├────────────────────────────────────────────────────────────────┤ +│ Header: │ +│ • Extent ID │ +│ • Compression dictionary │ +│ • Offset information (for variable-length data) │ +│ • Record count and capacity │ +├────────────────────────────────────────────────────────────────┤ +│ Data Section: │ +│ ┌─────────────────────────────────────────────┐ │ +│ │ │ Data0 │ Data1 │ ... │ DataN │ │ │ +│ └─────────────────────────────────────────────┘ │ +└────────────────────────────────────────────────────────────────┘ + + +2.5.1 Locating an Extent +------------------------ + +Extents can be different sizes. Also, due to the garbage collection, they can +become shuffled and fragmented. + +Extents can be located by offset. The approriate offset is found using +extent-ID as an index, as shown below. 
+ +┌────────────────────────────────────────────────────────────────┐ +│ Locating where is Extent N in memory? │ +├────────────────────────────────────────────────────────────────┤ +│ │ +│ Relation (meta) Relation (data) │ +│ ┌───────────┐ ┌───────────┐ │ +│ │ Offsets: │ │ Extent 0 │ │ +│ │ │ │ │ │ +│ │ [Extent0] │ ├───────────┤ │ +│ │ [Extent1] │ │ Extent 1 │ │ +│ │ [Extent2] │ ├───────────┤ │ +│ │ [Extent3] │ │ Extent 5 │ │ +│ │ [Extent4] │ ├───────────┤ │ +│ │ [Extent5] │ │///////////│ │ +│ └───────────┘ │/ gap /│ │ +│ │///////////│ │ +│ ├───────────┤ │ +│ │ Extent 3 │ │ +│ ├───────────┤ │ +│ │ Extent 4 │ │ +│ │ │ │ +│ │ │ │ +│ │ │ │ +│ ├───────────┤ │ +│ │ Extent 2 │ │ +│ └───────────┘ │ +│ │ +└────────────────────────────────────────────────────────────────┘ + + +2.5.2 Locating column data within an extent +------------------------------------------- + +Fixed-Length Data Access: + +Since the extent ID is known and the extent always has a fixed number of +records, the column data position of fixed-length data can be directly +calculated. + +e.g. Position = Extent_Base + (CRID % 262144) * Element_Size + +┌────────────────────────────────────────────────────────────────┐ +│ Addressing fixed-length data: Direct CRID-based addressing │ +├────────────────────────────────────────────────────────────────┤ +│ Extent │ +│ ┌─────────────────────────────────────────────────────────┐ │ +│ │ Header │ [pos0] │ [pos1] │ │ [posN] ││ │ +│ │ │ Data0 │ Data1 │ ... 
│ DataN ││ │ +│ └─────────────────────────────────────────────────────────┘ │ +└────────────────────────────────────────────────────────────────┘ + +Variable-Length Data Access: + +- Data offsets recorded in the extent header +- TOAST links stored for very large data +- TOAST link vs normal data is indicated by reserved bits in the offset + +┌────────────────────────────────────────────────────────────────┐ +│ Addressing variable-length data: Offset array + data │ +├────────────────────────────────────────────────────────────────┤ +│ Extent │ +│ ┌────────────────────────────────────────────────────────────┐ │ +│ │ Header │ │ │ +│ │ Offset array │ [pos0] │ [pos1] │ ... │ [posN] │ │ │ +│ │ [pos1...posN] │ Data0 │ Data1 │ │ DataN │ │ │ +│ └────────────────────────────────────────────────────────────┘ │ +└────────────────────────────────────────────────────────────────┘ + + +2.5.3 Garbage collection +------------------------ + +During WOS-to-ROS conversion, when the delete vector is processed to delete the +ROS data (aka "deleted-rows-collection"), VCI makes a "copy" of the extent. +It writes the modified extent back to the ROS after the necessary data is +deleted, and removes the original extent. + +Copying like this allows VCI queries to run even during the +"deleted-rows-collection" phase, but it can lead to some fragmentation (i.e. +"gaps") between extents as they are copied/deleted. VCI includes logic to +relocate extents such that fragmentation is minimized. + + +2.6. Compression System +======================== + +// +// Note: compression logic is currently disabled in contrib/vci +// + +Column data compression occurs (if enabled) at the time of WOS-to-ROS +conversion. + +The only compression method currently implemented is run-length encoding, +which is used for integer columns with low cardinality. + +e.g. 
+Original Data: [5, 5, 5, 7, 7, 2, 2, 2, 2, 5, 5] +Compressed: [(5,3), (7,2), (2,4), (5,2)] +Dictionary: {[0]=5, [1]=7, [2]=2} +Final Encoding: [(0,3), (1,2), (2,4), (0,2)] + + +2.6.1 Dictionaries +------------------ + +- Each extent of each column is compressed independently. +- Each extent can have its own independent compression dictionary or all + extents can share a common dictionary + +Independent Dictionaries (per extent): +- Stored in the column-data internal relation in each extent header, along with +- size/position/count information. + +Common Dictionaries (shared across extents): +- Stored in the column-data internal relation in the first extent header +- The size/positions/counts etc needed for the "common" dictionaries are + stored in the column-metadata internal relation + + + +============================================================================== +3. VCI Integration with PostgreSQL +============================================================================== + +In general, the VCI extension integrates with the PostgreSQL core via hooks. + +e.g. Some primary hooks are for: +- adding data to the WOS (see 'add_should_index_insert_hook') +- deleting data from the WOS (see 'add_index_delete_hook') + + +3.1 Standard hooks +=========================== + +Mostly VCI is implemented as per other indexes; many of the hooks are +implementations of the Index Access Method (IAM) routine. 
+ + +3.1.1 IndexAccessMethod hooks +----------------------------- + +(see vci_handler) +• amroutine->ambuild = vci_build; +• amroutine->ambuildempty = vci_buildempty; +• amroutine->aminsert = vci_insert; +• amroutine->aminsertcleanup = NULL; +• amroutine->ambulkdelete = vci_bulkdelete; +• amroutine->amvacuumcleanup = vci_vacuumcleanup; +• amroutine->amcanreturn = NULL; +• amroutine->amcostestimate = vci_costestimate; +• amroutine->amgettreeheight = vci_gettreeheight; +• amroutine->amoptions = vci_options; +• amroutine->amvalidate = vci_validate; +• amroutine->amadjustmembers = NULL; +• amroutine->ambeginscan = vci_beginscan; +• amroutine->amrescan = vci_rescan; +• amroutine->amgettuple = NULL; +• amroutine->amgetbitmap = NULL; +• amroutine->amendscan = vci_endscan; +• amroutine->ammarkpos = vci_markpos; +• amroutine->amrestrpos = vci_restrpos; + + +3.1.2 Other standard hooks +--------------------------- + +(see _PG_init) +• ProcessUtility_hook = vci_process_utility; +• shmem_request_hook = vci_shmem_request; + +(see vci_setup_shmem) +• shmem_startup_hook = vci_shmem_startup_routine; + + +3.2 Executor hooks +================== +VCI also implements Executor hooks. These enable the Query planner to identify +which queries are capable of using the ROS data. + +(see function vci_setup_executor_hook) +• ExecutorStart_hook = vci_executor_start_routine; +• ExecutorRun_hook = vci_executor_run_routine; +• ExecutorEnd_hook = vci_executor_end_routine; +• ExplainOneQuery_hook = vci_explain_one_query_routine; +• ExprEvalVar_hook = VciExecEvalScalarVarFromColumnStore; +• ExprEvalParam_hook = VciExecEvalParamExec; + + +3.3 Known problems +================== + +NOTE: +There are some artifacts in the current VCI implementation since these patches +historically were implemented in vendor-specific source code, forked from the +PostgreSQL master code. + +These are known problems that will need to be addressed for the VCI +implementation to be accepted by the OSS community. 
+ +3.3.1 Ad-hoc hooks +------------------ +There are places where VCI uses non-standard, hardwired non-extensible hooks, +instead of implementing callbacks from well documented APIs such as IAM. + +(see _PG_init) +• add_index_delete_hook = vci_add_index_delete; +• add_should_index_insert_hook = vci_add_should_index_insert; +• add_drop_relation_hook = vci_add_drop_relation; +• add_reindex_index_hook = vci_add_reindex_index; +• add_skip_vci_index_hook = vci_add_skip_vci_index; +• add_alter_tablespace_hook = vci_add_alter_tablespace; +• add_alter_table_change_owner_hook = vci_alter_table_change_owner; +• add_alter_table_change_schema_hook = vci_alter_table_change_schema; +• add_snapshot_satisfies_hook = VCITupleSatisfiesVisibility; +• add_skip_vacuum_hook = vci_isVciAdditionalRelation; + + +3.3.2 Embedded code +-------------------- +There are some places where VCI code is simply embedded in the PostgreSQL core. + + + +============================================================================== +4. Data Flow and Conversion +============================================================================== + + +4.1. WOS-to-ROS Background Process +================================== + +A dedicated background worker continuously converts "freezable" data from WOS +to ROS: + +Conversion Process: +┌-───────────────────────────────────────────────────────────────┐ +│ Background Worker Cycle │ +├────────────────────────────────────────────────────────────────┤ +│ 1. Check Whiteout WOS → Update ROS delete vector │ +│ 2. Execute "deleted-rows-collection" if threshold exceeded │ +│ 3. Identify freezable tuples in WOS │ +│ 4. Convert freezable data to columnar format │ +│ 5. Apply compression algorithms │ +│ 6. Update TID-CRID mapping │ +│ 7. 
Truncate processed WOS data │ +└────────────────────────────────────────────────────────────────┘ + +Freezable Data Criteria: +- Transaction must be committed +- No active transactions started before the commit timestamp +- Ensures MVCC consistency during WOS-to-ROS conversion + + +4.2 Overview Diagram +==================== +Details of some of these concepts (e.g. extents) are given later. + +┌-───────────────────────────────────────────────────────────────┐ +│ WOS-to-ROS conversion │ +├────────────────────────────────────────────────────────────────┤ +│BEFORE: │ +│ │ +│ Data WOS (new rows) ROS extent (before) │ +│ ┌────────────────────────┐ ┌──────────────────────────┐ │ +│ │ newA-5 │ newB-5 │ ... │ │ delete vector │ │ +│ │ newA-6 │ newB-6 │ ... │ │CRID ▼ column data: │ │ +│ └────────────────────────┘ │ 1 │ 0 │ ColA-1 │ ColB-1 │ │ +│ │ 2 │ 1 │ ColA-2 │ ColB-2 │ │ +│ Whiteout WOS: │ 3 │ 0 │ ColA-3 │ ColB-3 │ │ +│ ┌────────────────────────┐ │ 4 │ 0 │ ColA-4 │ ColB-4 │ │ +│ │ delete rec with CRID 4 │ └──────────────────────────┘ │ +│ └────────────────────────┘ │ │ +│ • Remove WOS Whiteout records │ +│ • Remove delete vector rows │ +│ • Add new records from Data WOS │ +│ • Renumber CRIDs │ +│ • Compress data │ +│ │ │ +│ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ │ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─│ +│AFTER: │ │ +│ ▼ │ +│ Data WOS: ROS extent (after) │ +│ ┌─────────────────────────┐ ┌──────────────────────────┐ │ +│ └─────────────────────────┘ │ 1 │ │ ColA-1 │ ColB-1 │ │ +│ │ 2 │ │ ColA-3 │ ColB-3 │ │ +│ Whiteout WOS: │ 3 │ │ new5-1 │ new5-1 │ │ +│ ┌─────────────────────────┐ │ 4 │ │ new6-1 │ new6-1 │ │ +│ └─────────────────────────┘ └──────────────────────────┘ │ +│ │ +└────────────────────────────────────────────────────────────────┘ + + +4.3 Local ROS Creation +====================== +During SELECT operations, if WOS contains unconverted data, VCI creates a +temporary Local ROS. This is combined with the persisted ROS during query +execution. 
At the end of the query the Local ROS is discarded. + +Later, a full WOS-to-ROS conversion will be performed, so any unconverted WOS +data will be permanently converted to the ROS. + +Query Execution with Local ROS: +┌────────────────────────────────────────────────────────────────┐ +│ SELECT Operation │ +├────────────────────────────────────────────────────────────────┤ +│ │ +│ ROS │ +│ /-------------------------\ │ +│ ┌─────────┐ ┌────────────────────────┐ │ +│ │ WOS │───▶│ Local ROS │ } │ +│ │(Partial)│ │(Temporary) │ } Combined ROS │ +│ └─────────┘ └────────────────────────┘ } Columnar │ +│ ┌────────────────────────┐ } Processing │ +│ │ Persistent ROS │ } for SELECT │ +│ │ (Previously Converted) │ } │ +│ └────────────────────────┘ │ +│ │ +└────────────────────────────────────────────────────────────────┘ + + +4.4 Existing table data +======================= +The whole point of the asynchronous WOS-to-ROS conversion is to maintain the +column-oriented data in sync with row-based table data on-the-fly. + +Be aware that creating a VCI index for a table with lots of existing data is +costly because VCI has to initialise the ROS for all that data up-front. + + + +============================================================================== +5. MVCC and Transaction Handling +============================================================================== + +Transaction Visibility + +VCI maintains MVCC consistency across both WOS and ROS: + +e.g. Transaction Timeline: +┌────────────────────────────────────────────────────────────────┐ +│ T1: BEGIN → INSERT → COMMIT (timestamp: 100) │ +│ T2: BEGIN (timestamp: 99) → SELECT → ... │ +│ T3: BEGIN (timestamp: 101) → SELECT → ... │ +└────────────────────────────────────────────────────────────────┘ +- T2 cannot see T1's insert (it started before T1 committed) +- T3 can see T1's insert (it started after T1 committed) + +Visibility Rules: +Data remains in WOS until no transaction needs old versions -- see +"freezable" criteria. 
+ +WOS: Handles active transaction visibility using xmin/xmax +ROS: Contains only frozen data visible to all current transactions +Local ROS: Applies snapshot visibility rules during query execution + + + +============================================================================== +6. Query Execution +============================================================================== + +6.1 Custom Plan Integration +=========================== + +VCI integrates with PostgreSQL's query planner by replacing standard plan +nodes with custom plan nodes for VCI when possible. + +VCI registers a set of callbacks as custom plan providers. + +There are 4 types of operations that can potentially be replaced: +table scan, aggregation (e.g. SUM, COUNT, AVG), sort, join. + +Furthermore, instead of replacing one plan node with one custom plan node, +plan nodes are collectively replaced. + +┌─────────────────────────────────────────────────────────────────┐ +│ Replacing a Plan Tree │ +├─────────────────────────────────────────────────────────────────┤ +│ Standard PostgreSQL Plan │ VCI Optimized Plan │ +│ │ (e.g. 
VCI judges Agg and │ +│ │ SeqScan are replaceable) │ +├─────────────────────────────────────────────────────────────────┤ +│ ┌─────────────┐ │ ┌─────────────┐ │ +│ │ │ │ │ │ │ +│ └─────┬───────┘ │ └─────┬───────┘ │ +│ │ │ │ │ +│ ┌─────▼───────┐ │ ┌─────▼───────┐ │ +│ │ Sort │ │ │ Sort │ │ +│ └─────┬───────┘ │ └─────┬───────┘ │ +│ │ │ │ │ +│ ┌─────▼───────┐ │ ┌─────▼───────────┐ │ +│ │ Agg │ │ │ VCI CustomPlan │ │ +│ └─────┬───────┘ │ │ │ │ +│ │ │ │ Agg and SeqScan │ │ +│ ┌─────▼───────┐ │ │ combined │ │ +│ │ SeqScan │ │ └─────────────────┘ │ +│ └─────────────┘ │ │ +└─────────────────────────────────────────────────────────────────┘ + + +6.1.1 VCI mode requirements +---------------------------- + +There are a number of conditions that must be met for nodes to be replaced: + +Scan node - must be for a relation (table) with a VCI index +Agg node - plan tree must be for scan node as above +Sort node - plan tree must be for scan node as above +Join node - plan tree must be for scan node as above + +Expression node: +- All columns in the expression node tree must be indexed by VCI +- Cannot have any SubPlan in the expression node tree +- If an expression has functions then there are other restrictions: + - not all functions are supported + - function data types must be supported by VCI + - user-defined functions not supported + - user-defined aggregate functions not supported + + +6.2 Custom Plan Execution Steps +=============================== + +Preparation Phase: +- Create Local ROS from visible WOS data +- Build delete TID list from Whiteout WOS +- Prepare extent access structures + +Execution Phase: +- Process ROS extents using columnar operations +- Apply delete vector filtering +- Combine results with Local ROS data +- Execute aggregations and sorting in columnar format + +VCI implements optimized hash joins for columnar data. + + +6.3 Is VCI getting used? +======================== + +Use EXPLAIN ANALYZE to see if VCI custom nodes are in the query plan. 
+ +Boolean function vci_runs_in_query() returns true if a VCI index and custom +scan are used by the current query execution. + +e.g. +┌────────────────────────────────────────────────────────────────┐ +│ SELECT │ +│ vci_runs_in_query() AS vci_runs_in_query, key, count(*) │ +│ FROM test_table; │ +└────────────────────────────────────────────────────────────────┘ + + + +============================================================================== +7. Configuration Parameters +============================================================================== + +7.1 VCI-specific Parameters +============================= +VCI provides numerous configuration parameters that can be set in +postgresql.conf. These parameters control various aspects of VCI behavior, +from basic functionality to advanced performance tuning. + +There are also many DEVELOPER_OPTIONS (see code contrib/vci/vci_read_guc.c). + + +7.1.1 Core Parameters +---------------------- + +- vci.enable + Controls whether VCI functionality is active. + +- vci.enable_compression + Enables compression of column data in ROS storage. + +- vci.log_query + Logs detailed information when queries fail to execute through VCI's + columnar processing path, useful for debugging query execution issues. + + +7.1.2 Memory Management +------------------------ + +- vci.maintenance_work_mem + Memory limit for VCI background operations, including WOS-to-ROS conversions + and garbage collection processes. + +- vci.max_local_ros + Maximum memory allowed for temporary Local ROS creation during query + execution when WOS contains unconverted data. + + +7.1.3 Background Worker Control +-------------------------------- + +- vci.enable_ros_control_daemon + Enables the background worker responsible for WOS-to-ROS conversion. + Essential for maintaining columnar storage efficiency. + +- vci.control_max_workers + Maximum number of concurrent VCI background workers. 
Should be balanced + with system resources and PostgreSQL's max_worker_processes setting. + +- vci.control_naptime + Sleep interval between background worker cycles. Lower values provide + more responsive WOS-to-ROS conversion but increase system overhead. + +- vci.cost_threshold + CPU load threshold above which VCI background workers are paused to + avoid impacting foreground query performance. + + +7.1.4 Data Management Thresholds +--------------------------------- + +- vci.wosros_conv_threshold + Number of WOS rows that trigger automatic conversion to ROS format. + Lower values reduce WOS size but increase conversion overhead. + +- vci.cdr_threshold + Percentage of deleted rows in ROS that triggers garbage collection. + Typical values range from 20-40% depending on workload patterns. + + + +7.2 Affected PostgreSQL Parameters +=================================== + +- max_worker_processes + Must be increased from default values to accommodate VCI background + workers. Recommended minimum increase of 4-8 workers for VCI operation. + + + +============================================================================== +8. Known Restrictions/Limitations/Differences +============================================================================== + +8.1 Restrictions +================== + +8.1.1 DROP EXTENSION vci +------------------------- +Unloading the extension is not supported. + + +8.1.2 Backup and Restore +-------------------------- +Databases containing VCI indexes cannot be restored to PostgreSQL +installations without VCI extension. Cross-compatibility requires +dropping VCI indexes before backup or ensuring target system has +VCI available. + + +8.1.3 Version Upgrades +----------------------- +pg_upgrade is not currently supported for VCI-enabled databases. +Upgrades require dump/restore procedures with VCI-compatible target +systems. 
+ + +8.2 Limitations +================ + +8.2.1 CREATE INDEX +------------------- +Since the internal structure of VCI indexes is different from other indexes, +some of the CREATE INDEX options are not supported for VCI. +- expressions instead of columns +- UNIQUE, CONCURRENTLY, WHERE clauses +- ASC/DESC, NULLS FIRST/LAST operator class options + + +8.2.2 Supported Relation Types +------------------------------- +Cannot create a VCI index for a view. + + +8.2.3 Supported Data Types +--------------------------- +Not all data types are supported for VCI indexing. + + +8.2.4 Performance Testing +-------------------------- +Standard benchmarks like pgbench focus on OLTP workloads and will not +demonstrate VCI's analytical query performance benefits. Custom OLAP +benchmarks are needed to evaluate VCI effectiveness. + + +8.3 Differences +================ + +8.3.1 Configuration (Planner GUCs) +----------------------------------- +VCI may execute operations (such as hash joins) even when corresponding +PostgreSQL planner options are disabled, as VCI uses its own execution +strategies within custom plan nodes. + +8.3.2 Disk Size and Estimation +-------------------------------- +VCI maintains both WOS and ROS storage simultaneously, requiring +additional disk space during data conversion periods. + +Standard PostgreSQL index size functions (pg_relation_size) do not +account for VCI's distributed storage architecture. Use the provided +pg_vci_index_size() function for VCI storage measurements.
+ + +[END] diff --git a/contrib/vci/executor/Makefile b/contrib/vci/executor/Makefile new file mode 100644 index 000000000000..97b5c7b29467 --- /dev/null +++ b/contrib/vci/executor/Makefile @@ -0,0 +1,33 @@ +# contrib/vci/executor/Makefile + +SUBOBJS = \ + vci_agg.o \ + vci_aggmergetranstype.o \ + vci_aggref.o \ + vci_executor.o \ + vci_fetch_column_store.o \ + vci_gather.o \ + vci_param.o \ + vci_plan.o \ + vci_planner.o \ + vci_planner_preanalyze.o \ + vci_plan_func.o \ + vci_scan.o \ + vci_sort.o \ + vci_vector_executor.o + +EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) + +PG_CPPFLAGS = -I$(top_srcdir)/contrib/vci/include + +ifdef USE_PGXS +PGXS := $(shell pg_config --pgxs) +include $(PGXS) +else +subdir = contrib/vci/executor +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +override CFLAGS += $(CFLAGS_SL) diff --git a/contrib/vci/executor/meson.build b/contrib/vci/executor/meson.build new file mode 100644 index 000000000000..7f9fcc2a587f --- /dev/null +++ b/contrib/vci/executor/meson.build @@ -0,0 +1,18 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +vci_executor_sources = files( + 'vci_agg.c', + 'vci_aggmergetranstype.c', + 'vci_aggref.c', + 'vci_executor.c', + 'vci_fetch_column_store.c', + 'vci_gather.c', + 'vci_param.c', + 'vci_plan.c', + 'vci_planner.c', + 'vci_planner_preanalyze.c', + 'vci_plan_func.c', + 'vci_scan.c', + 'vci_sort.c', + 'vci_vector_executor.c', +) diff --git a/contrib/vci/executor/vci_agg.c b/contrib/vci/executor/vci_agg.c new file mode 100644 index 000000000000..be24bb0980e0 --- /dev/null +++ b/contrib/vci/executor/vci_agg.c @@ -0,0 +1,1967 @@ +/*------------------------------------------------------------------------- + * + * vci_agg.c + * Routines to handle VCI Agg nodes + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_agg.c + * + 
*------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "commands/explain.h" +#include "commands/explain_format.h" +#include "executor/execdebug.h" +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/optimizer.h" +#include "optimizer/tlist.h" +#include "parser/parse_agg.h" +#include "parser/parse_coerce.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/datum.h" +#include "utils/expandeddatum.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" +#include "utils/tuplesort.h" + +#include "vci.h" +#include "vci_executor.h" +#include "vci_utils.h" +#include "vci_aggref.h" + +static void advance_transition_function(VciAggState *aggstate, + VciAggStatePerAgg peraggstate, + VciAggStatePerGroup pergroupstate); +static void advance_aggregates_vector(VciAggState *aggstate, VciAggStatePerGroup *entries, int max_slots); +static void find_cols(VciAggState *aggstate, Bitmapset **unaggregated); +static bool find_cols_walker(Node *node, Bitmapset **colnos); +static void build_hash_table(VciAggState *aggstate); +static List *find_hash_columns(VciAggState *aggstate); +static void lookup_hash_entry_vector(VciAggState *aggstate, + VciAggStatePerGroup *entries, int max_slots); +static TupleTableSlot *agg_retrieve_direct(VciAggState *aggstate); +static void agg_fill_hash_table_vector(VciAggState *aggstate); +static Datum GetAggInitVal(Datum textInitVal, Oid transtype); + +static void vci_agg_BeginCustomPlan_preprocess(VciAggState *aggstate); +static void vci_agg_BeginCustomPlan_postprocess_for_advance_aggref(VciAggState *aggstate); +static void vci_agg_BeginCustomPlan_postprocess_for_vp(VciAggState *aggstate, ExprContext 
*econtext); +static void vci_ExecFreeExprContext(PlanState *planstate); + +/** + * Initialize all aggregates for a new group of input values. + * + * When called, CurrentMemoryContext should be the per-query context. + * + * copied from src/backend/executor/nodeAgg.c + */ +void +vci_initialize_aggregates(VciAggState *aggstate, + VciAggStatePerAgg peragg, + VciAggStatePerGroup pergroup) +{ + int aggno; + + for (aggno = 0; aggno < aggstate->numaggs; aggno++) + { + VciAggStatePerAgg peraggstate = &peragg[aggno]; + VciAggStatePerGroup pergroupstate = &pergroup[aggno]; + + Assert(peraggstate->numSortCols == 0); + + /* + * (Re)set transValue to the initial value. + * + * Note that when the initial value is pass-by-ref, we must copy it + * (into the aggcontext) since we will pfree the transValue later. + */ + if (peraggstate->initValueIsNull) + pergroupstate->transValue = peraggstate->initValue; + else + { + MemoryContext oldContext; + + oldContext = MemoryContextSwitchTo(aggstate->aggcontext); + pergroupstate->transValue = datumCopy(peraggstate->initValue, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + MemoryContextSwitchTo(oldContext); + } + pergroupstate->transValueIsNull = peraggstate->initValueIsNull; + + /* + * If the initial value for the transition state doesn't exist in the + * pg_aggregate table then we will let the first non-NULL value + * returned from the outer procNode become the initial value. (This is + * useful for aggregates like max() and min().) The noTransValue flag + * signals that we still need to do this. + */ + pergroupstate->noTransValue = peraggstate->initValueIsNull; + } +} + +/** + * Given new input value(s), advance the transition function of an aggregate. + * + * The new values (and null flags) have been preloaded into argument positions + * 1 and up in peraggstate->transfn_fcinfo, so that we needn't copy them again + * to pass to the transition function. 
We also expect that the static fields + * of the fcinfo are already initialized; that was done by ExecInitAgg(). + * + * It doesn't matter which memory context this is called in. + * + * copied from src/backend/executor/nodeAgg.c + */ +static void +advance_transition_function(VciAggState *aggstate, + VciAggStatePerAgg peraggstate, + VciAggStatePerGroup pergroupstate) +{ + FunctionCallInfo fcinfo = peraggstate->transfn_fcinfo; + MemoryContext oldContext; + Datum newVal; + + if (peraggstate->transfn.fn_strict) + { + /* + * For a strict transfn, nothing happens when there's a NULL input; we + * just keep the prior transValue. + */ + int numTransInputs = peraggstate->numTransInputs; + int i; + + for (i = 1; i <= numTransInputs; i++) + { + if (fcinfo->args[i].isnull) + return; + } + if (pergroupstate->noTransValue) + { + /* + * transValue has not been initialized. This is the first non-NULL + * input value. We use it as the initial value for transValue. (We + * already checked that the agg's input type is binary-compatible + * with its transtype, so straight copy here is OK.) + * + * We must copy the datum into aggcontext if it is pass-by-ref. We + * do not need to pfree the old transValue, since it's NULL. + */ + oldContext = MemoryContextSwitchTo(aggstate->aggcontext); + pergroupstate->transValue = datumCopy(fcinfo->args[1].value, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + pergroupstate->transValueIsNull = false; + pergroupstate->noTransValue = false; + MemoryContextSwitchTo(oldContext); + return; + } + if (pergroupstate->transValueIsNull) + { + /* + * Don't call a strict function with NULL inputs. Note it is + * possible to get here despite the above tests, if the transfn is + * strict *and* returned a NULL on a prior cycle. If that happens + * we will propagate the NULL all the way to the end. 
+ */ + return; + } + } + + /* We run the transition functions in per-input-tuple memory context */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); + + /* set up aggstate->curperagg for AggGetAggref() */ + aggstate->pseudo_aggstate->curperagg = (AggStatePerAgg) peraggstate; /* @remark */ + + /* + * OK to call the transition function + */ + fcinfo->args[0].value = pergroupstate->transValue; + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + fcinfo->isnull = false; /* just in case transfn doesn't set it */ + + newVal = FunctionCallInvoke(fcinfo); + + aggstate->pseudo_aggstate->curperagg = NULL; + + /* + * If pass-by-ref datatype, must copy the new value into aggcontext and + * pfree the prior transValue. But if transfn returned a pointer to its + * first input, we don't need to do anything. + */ + if (!peraggstate->transtypeByVal && + DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue)) + { + if (!fcinfo->isnull) + { + MemoryContextSwitchTo(aggstate->aggcontext); + newVal = datumCopy(newVal, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + } + else + { + /* + * Ensure that VciAggStatePerGroup->transValue ends up being 0, so + * callers can safely compare newValue/oldValue without having to + * check their respective nullness. 
+ */ + newVal = (Datum) 0; + } + if (!pergroupstate->transValueIsNull) + pfree(DatumGetPointer(pergroupstate->transValue)); + } + + pergroupstate->transValue = newVal; + pergroupstate->transValueIsNull = fcinfo->isnull; + + MemoryContextSwitchTo(oldContext); +} + +/** + * Perform aggregation processing for 1 input + * + * @param[in,out] aggstate VCI Agg State + * @param[in,out] pergroup Pointer to the VciAggStatePerGroup struct holding the Transition data + */ +void +vci_advance_aggregates(VciAggState *aggstate, VciAggStatePerGroup pergroup) +{ + int aggno; + + for (aggno = 0; aggno < aggstate->numaggs; aggno++) + { + VciAggStatePerAgg peraggstate = &aggstate->peragg[aggno]; + VciAggStatePerGroup pergroupstate = &pergroup[aggno]; + int numTransInputs = peraggstate->numTransInputs; + int i; + TupleTableSlot *slot; + + /* Evaluate the current input expressions for this aggregate */ + slot = VciExecProject(peraggstate->evalproj); + + Assert(peraggstate->numSortCols == 0); + + { + /* We can apply the transition function immediately */ + FunctionCallInfo fcinfo = peraggstate->transfn_fcinfo; + + /* Load values into fcinfo */ + /* Start from 1, since the 0th arg will be the transition value */ + Assert(slot->tts_nvalid >= numTransInputs); + for (i = 0; i < numTransInputs; i++) + { + fcinfo->args[i + 1].value = slot->tts_values[i]; + fcinfo->args[i + 1].isnull = slot->tts_isnull[i]; + } + + advance_transition_function(aggstate, peraggstate, pergroupstate); + } + } +} + +/** + * Perform aggregation processing for 1 vector + * + * @param[in,out] aggstate VCI Agg State + * @param[in,out] entries Pointer to VciAggHashEntry struct holding a pair of hash key and Transition data + * @param[in] max_slots Number of vector rows + */ +static void +advance_aggregates_vector(VciAggState *aggstate, VciAggStatePerGroup *entries, int max_slots) +{ + int aggno; + + aggstate->tmpcontext->ecxt_outertuple = NULL; + + for (aggno = 0; aggno < aggstate->numaggs; aggno++) + { + VciAggStatePerAgg 
peraggstate = &aggstate->peragg[aggno]; + + /* + * slot_getsomeattrs() is not required + */ + Assert(peraggstate->advance_aggref != NULL); + peraggstate->advance_aggref(aggstate, aggno, entries, max_slots); + } +} + +/** + * Compute the final value of one aggregate. + * + * The finalfunction will be run, and the result delivered, in the + * output-tuple context; caller's CurrentMemoryContext does not matter. + * + * copied from src/backend/executor/nodeAgg.c + */ +void +vci_finalize_aggregate(VciAggState *aggstate, + VciAggStatePerAgg peraggstate, + VciAggStatePerGroup pergroupstate, + Datum *resultVal, bool *resultIsNull) +{ + LOCAL_FCINFO(fcinfo, FUNC_MAX_ARGS); + bool anynull = false; + MemoryContext oldContext; + int i; + + oldContext = MemoryContextSwitchTo(aggstate->vci.css.ss.ps.ps_ExprContext->ecxt_per_tuple_memory); + + /* + * Evaluate any direct arguments. We do this even if there's no finalfn + * (which is unlikely anyway), so that side-effects happen as expected. + * The direct arguments go into arg positions 1 and up, leaving position 0 + * for the transition state value. + */ + i = 1; + + /* + * Apply the agg's finalfn if one is provided, else return transValue. 
+ */ + if (OidIsValid(peraggstate->finalfn_oid)) + { + int numFinalArgs = peraggstate->numFinalArgs; + + /* set up aggstate->curperagg for AggGetAggref() */ + aggstate->pseudo_aggstate->curperagg = (AggStatePerAgg) peraggstate; /* @remark */ + + InitFunctionCallInfoData(*fcinfo, &(peraggstate->finalfn), + numFinalArgs, + peraggstate->aggCollation, + (Node *) aggstate->pseudo_aggstate, NULL); + + /* Fill in the transition state value */ + fcinfo->args[0].value = + MakeExpandedObjectReadOnly(pergroupstate->transValue, + pergroupstate->transValueIsNull, + peraggstate->transtypeLen); + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + anynull |= pergroupstate->transValueIsNull; + + /* Fill any remaining argument positions with nulls */ + for (; i < numFinalArgs; i++) + { + fcinfo->args[i].value = (Datum) 0; + fcinfo->args[i].isnull = true; + anynull = true; + } + + if (fcinfo->flinfo->fn_strict && anynull) + { + /* don't call a strict function with NULL inputs */ + *resultVal = (Datum) 0; + *resultIsNull = true; + } + else + { + Datum result; + + result = FunctionCallInvoke(fcinfo); + *resultIsNull = fcinfo->isnull; + *resultVal = MakeExpandedObjectReadOnly(result, + fcinfo->isnull, + peraggstate->resulttypeLen); + } + aggstate->pseudo_aggstate->curperagg = NULL; + } + else + { + *resultVal = + MakeExpandedObjectReadOnly(pergroupstate->transValue, + pergroupstate->transValueIsNull, + peraggstate->transtypeLen); + *resultIsNull = pergroupstate->transValueIsNull; + } + + MemoryContextSwitchTo(oldContext); +} + +/** + * find_cols + * Construct a bitmapset of the column numbers of un-aggregated Vars + * appearing in our targetlist and qual (HAVING clause) + * + * copied from src/backend/executor/nodeAgg.c + */ +static void +find_cols(VciAggState *aggstate, Bitmapset **unaggregated) +{ + VciAgg *node = (VciAgg *) aggstate->vci.css.ss.ps.plan; + Bitmapset *colnos; + + colnos = NULL; + (void) find_cols_walker((Node *) node->vci.cscan.scan.plan.targetlist, + 
&colnos); + (void) find_cols_walker((Node *) node->vci.cscan.scan.plan.qual, + &colnos); + + *unaggregated = colnos; +} + +static bool +find_cols_walker(Node *node, Bitmapset **colnos) +{ + if (node == NULL) + return false; + + if (IsA(node, Var)) + { + Var *var = (Var *) node; + + /* setrefs.c should have set the varno to OUTER_VAR */ + Assert(var->varno == OUTER_VAR); + Assert(var->varlevelsup == 0); + *colnos = bms_add_member(*colnos, var->varattno); + return false; + } + + if (IsA(node, Aggref)) /* do not descend into aggregate exprs */ + return false; + + return expression_tree_walker(node, find_cols_walker, colnos); +} + +/** + * Initialize the hash table to empty. + * + * The hash table always lives in the aggcontext memory context. + * + * copied from src/backend/executor/nodeAgg.c + */ +static void +build_hash_table(VciAggState *aggstate) +{ + VciAgg *node = (VciAgg *) aggstate->vci.css.ss.ps.plan; + MemoryContext tmpmem = aggstate->tmpcontext->ecxt_per_tuple_memory; + Size additionalsize; + + Assert(node->aggstrategy == AGG_HASHED); + Assert(node->numGroups > 0); + + additionalsize = aggstate->numaggs * sizeof(VciAggStatePerGroupData); + + aggstate->hashtable = BuildTupleHashTable(&aggstate->vci.css.ss.ps, + aggstate->hashslot->tts_tupleDescriptor, + NULL, + node->numCols, + node->grpColIdx, + aggstate->eqfuncoids, + aggstate->hashfunctions, + node->grpCollations, + node->numGroups, + additionalsize, + aggstate->aggcontext, + aggstate->aggcontext, + tmpmem, false); +} + +/** + * Create a list of the tuple columns that actually need to be stored in + * hashtable entries. The incoming tuples from the child plan node will + * contain grouping columns, other columns referenced in our targetlist and + * qual, columns used to compute the aggregate functions, and perhaps just + * junk columns we don't use at all. 
Only columns of the first two types + * need to be stored in the hashtable, and getting rid of the others can + * make the table entries significantly smaller. To avoid messing up Var + * numbering, we keep the same tuple descriptor for hashtable entries as the + * incoming tuples have, but set unwanted columns to NULL in the tuples that + * go into the table. + * + * To eliminate duplicates, we build a bitmapset of the needed columns, then + * convert it to an integer list (cheaper to scan at runtime). The list is + * in decreasing order so that the first entry is the largest; + * lookup_hash_entry depends on this to use slot_getsomeattrs correctly. + * Note that the list is preserved over ExecReScanAgg, so we allocate it in + * the per-query context (unlike the hash table itself). + * + * Note: at present, searching the tlist/qual is not really necessary since + * the parser should disallow any unaggregated references to ungrouped + * columns. However, the search will be needed when we add support for + * SQL99 semantics that allow use of "functionally dependent" columns that + * haven't been explicitly grouped by. 
+ * + * copied from src/backend/executor/nodeAgg.c + */ +static List * +find_hash_columns(VciAggState *aggstate) +{ + VciAgg *node = (VciAgg *) aggstate->vci.css.ss.ps.plan; + Bitmapset *colnos; + List *collist; + int i; + + /* Find Vars that will be needed in tlist and qual */ + find_cols(aggstate, &colnos); + /* Add in all the grouping columns */ + for (i = 0; i < node->numCols; i++) + colnos = bms_add_member(colnos, node->grpColIdx[i]); + /* Convert to list, using lcons so largest element ends up first */ + collist = NIL; + i = -1; + while ((i = bms_next_member(colnos, i)) >= 0) + collist = lcons_int(i, collist); + bms_free(colnos); + + return collist; +} + +static void +lookup_hash_entry_vector(VciAggState *aggstate, + VciAggStatePerGroup *entries, int max_slots) +{ + VciScanState *scanstate = (VciScanState *) outerPlanState(aggstate); + TupleTableSlot *hashslot = aggstate->hashslot; + uint16 *skip_list; + int slot_index; + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + + skip_list = vci_CSGetSkipFromVirtualTuples(scanstate->vector_set); + + /* Clear the tuple */ + ExecClearTuple(hashslot); + + /* + * Fill all the columns of the virtual tuple with nulls + */ + MemSet(hashslot->tts_values, 0, + hashslot->tts_tupleDescriptor->natts * sizeof(Datum)); + memset(hashslot->tts_isnull, true, + hashslot->tts_tupleDescriptor->natts * sizeof(bool)); + + for (slot_index = skip_list[0]; slot_index < max_slots; slot_index += skip_list[slot_index + 1] + 1) + { + int i; + TupleHashEntry entry; + bool isnew; + VciAggStatePerGroup pergroup; + + ExecClearTuple(hashslot); + memset(hashslot->tts_isnull, true, + hashslot->tts_tupleDescriptor->natts * sizeof(bool)); + + for (i = 0; i < aggstate->num_hash_needed; i++) + { + int varNumber = aggstate->hash_needed[i] - 1; + + hashslot->tts_values[varNumber] = aggstate->hash_input_values[i][slot_index]; + hashslot->tts_isnull[varNumber] = aggstate->hash_input_isnull[i][slot_index]; + } + ExecStoreVirtualTuple(hashslot); + 
+ /* find or create the hashtable entry using the filtered tuple */ + entry = LookupTupleHashEntry(aggstate->hashtable, + hashslot, + &isnew, + NULL); + + pergroup = (VciAggStatePerGroup) TupleHashEntryGetAdditional(aggstate->hashtable, entry); + + if (isnew && aggstate->numaggs) + { + /* initialize aggregates for new tuple group */ + vci_initialize_aggregates(aggstate, aggstate->peragg, pergroup); + } + + entries[slot_index] = pergroup; + } +} + +/** + * ExecAgg for non-hashed case + * + * copied from src/backend/executor/nodeAgg.c + */ +static TupleTableSlot * +agg_retrieve_direct(VciAggState *aggstate) +{ + VciAgg *node = (VciAgg *) aggstate->vci.css.ss.ps.plan; + PlanState *outerPlan; + ExprContext *econtext; + ExprContext *tmpcontext; + Datum *aggvalues; + bool *aggnulls; + VciAggStatePerAgg peragg; + VciAggStatePerGroup pergroup; + TupleTableSlot *outerslot; + TupleTableSlot *firstSlot; + int aggno; + + /* + * get state info from node + */ + outerPlan = outerPlanState(aggstate); + /* econtext is the per-output-tuple expression context */ + econtext = aggstate->vci.css.ss.ps.ps_ExprContext; + aggvalues = econtext->ecxt_aggvalues; + aggnulls = econtext->ecxt_aggnulls; + /* tmpcontext is the per-input-tuple expression context */ + tmpcontext = aggstate->tmpcontext; + peragg = aggstate->peragg; + pergroup = aggstate->pergroup; + firstSlot = aggstate->vci.css.ss.ss_ScanTupleSlot; + + /* + * We loop retrieving groups until we find one matching + * aggstate->ss.ps.qual + */ + while (!aggstate->agg_done) + { + /* + * If we don't already have the first tuple of the new group, fetch it + * from the outer plan. + */ + if (aggstate->grp_firstTuple == NULL) + { + outerslot = ExecProcNode(outerPlan); + if (!TupIsNull(outerslot)) + { + /* + * Make a copy of the first input tuple; we will use this for + * comparisons (in group mode) and for projection. 
+ */ + aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + } + else + { + /* outer plan produced no tuples at all */ + aggstate->agg_done = true; + /* If we are grouping, we should produce no tuples too */ + if (node->aggstrategy != AGG_PLAIN) + return NULL; + } + } + + /* + * Clear the per-output-tuple context for each group, as well as + * aggcontext (which contains any pass-by-ref transvalues of the old + * group). We also clear any child contexts of the aggcontext; some + * aggregate functions store working state in such contexts. + * + * We use ReScanExprContext not just ResetExprContext because we want + * any registered shutdown callbacks to be called. That allows + * aggregate functions to ensure they've cleaned up any non-memory + * resources. + */ + ReScanExprContext(econtext); + + MemoryContextReset(aggstate->aggcontext); + + /* + * Initialize working state for a new input tuple group + */ + vci_initialize_aggregates(aggstate, peragg, pergroup); + + if (aggstate->grp_firstTuple != NULL) + { + /* + * Store the copied first input tuple in the tuple table slot + * reserved for it. The tuple will be deleted when it is cleared + * from the slot. + */ + ExecForceStoreHeapTuple(aggstate->grp_firstTuple, + firstSlot, + true); + aggstate->grp_firstTuple = NULL; /* don't keep two pointers */ + + /* set up for first advance_aggregates call */ + tmpcontext->ecxt_outertuple = firstSlot; + + /* + * Process each outer-plan tuple, and then fetch the next one, + * until we exhaust the outer plan or cross a group boundary. 
+ */ + for (;;) + { + vci_advance_aggregates(aggstate, pergroup); + + /* Reset per-input-tuple context after each tuple */ + ResetExprContext(tmpcontext); + + outerslot = ExecProcNode(outerPlan); + if (TupIsNull(outerslot)) + { + /* no more outer-plan tuples available */ + aggstate->agg_done = true; + break; + } + /* set up for next advance_aggregates call */ + tmpcontext->ecxt_outertuple = outerslot; + + /* + * If we are grouping, check whether we've crossed a group + * boundary. + */ + if (node->aggstrategy == AGG_SORTED) + { + tmpcontext->ecxt_innertuple = firstSlot; + if (!ExecQual(aggstate->eqfunctions[0], + tmpcontext)) + { + /* + * Save the first input tuple of the next group. + */ + aggstate->grp_firstTuple = ExecCopySlotHeapTuple(outerslot); + break; + } + } + } + } + + /* + * Use the representative input tuple for any references to + * non-aggregated input columns in aggregate direct args, the node + * qual, and the tlist. (If we are not grouping, and there are no + * input rows at all, we will come here with an empty firstSlot ... + * but if not grouping, there can't be any references to + * non-aggregated input columns, so no problem.) + */ + econtext->ecxt_outertuple = firstSlot; + + /* + * Done scanning input tuple group. Finalize each aggregate + * calculation, and stash results in the per-output-tuple context. + */ + for (aggno = 0; aggno < aggstate->numaggs; aggno++) + { + VciAggStatePerAgg peraggstate = &peragg[aggno]; + VciAggStatePerGroup pergroupstate = &pergroup[aggno]; + + Assert(peraggstate->numSortCols == 0); + + vci_finalize_aggregate(aggstate, peraggstate, pergroupstate, + &aggvalues[aggno], &aggnulls[aggno]); + } + + /* + * Check the qual (HAVING clause); if the group does not match, ignore + * it and loop back to try to process another group. + */ + if (ExecQual(aggstate->vci.css.ss.ps.qual, econtext)) + { + /* + * Form and return a projection tuple using the aggregate results + * and the representative input tuple. 
+ */ + TupleTableSlot *result; + + result = VciExecProject(aggstate->vps_ProjInfo); + + return result; + } + else + InstrCountFiltered1(aggstate, 1); + } + + /* No more groups */ + return NULL; +} + +/** + * When Hashed aggregation is selected, tuples are received from lower nodes, + * a hash table is constructed, and the tuples are aggregated. However, processing is performed in vector units. + * + * @param[in,out] aggstate VCI Agg State + */ +void +vci_agg_fill_hash_table(VciAggState *aggstate) +{ + agg_fill_hash_table_vector(aggstate); +} + +static void +agg_fill_hash_table_vector(VciAggState *aggstate) +{ + ExprContext *tmpcontext; + VciScanState *scanstate = (VciScanState *) outerPlanState(aggstate); + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + + /* + * get state info from node + */ + /* tmpcontext is the per-input-tuple expression context */ + tmpcontext = aggstate->tmpcontext; + + /* + * Process each outer-plan tuple, and then fetch the next one, until we + * exhaust the outer plan. + */ + for (;;) + { + int max_slots; + VciAggStatePerGroup entries[VCI_MAX_FETCHING_ROWS]; + + /* fetch VCI_MAX_FETCHING_ROWS rows from column store */ + max_slots = VciExecProcScanVector(scanstate); + + if (max_slots == 0) + break; + + tmpcontext->ecxt_outertuple = NULL; /* safety */ + + lookup_hash_entry_vector(aggstate, entries, max_slots); + + /* Advance the aggregates */ + advance_aggregates_vector(aggstate, entries, max_slots); + + /* Reset per-input-tuple context after each tuple */ + ResetExprContext(tmpcontext); + + /* Vector loading is complete */ + vci_finish_vector_set_from_column_store(scanstate); + } + + aggstate->table_filled = true; + /* Initialize to walk the hash table */ + ResetTupleHashIterator(aggstate->hashtable, &aggstate->hashiter); +} + +/** + * Retrieve 1 tuple at a time from the hash table + * + * @param[in,out] aggstate VCI Agg State + * @return Resulting output tuple + * + * @note This function is used after executing vci_agg_fill_hash_table().
+ */ +TupleTableSlot * +vci_agg_retrieve_hash_table(VciAggState *aggstate) +{ + ExprContext *econtext; + Datum *aggvalues; + bool *aggnulls; + VciAggStatePerAgg peragg; + VciAggStatePerGroup pergroup; + TupleHashEntry entry; + TupleTableSlot *firstSlot; + int aggno; + + /* + * get state info from node + */ + /* econtext is the per-output-tuple expression context */ + econtext = aggstate->vci.css.ss.ps.ps_ExprContext; + aggvalues = econtext->ecxt_aggvalues; + aggnulls = econtext->ecxt_aggnulls; + peragg = aggstate->peragg; + firstSlot = aggstate->vci.css.ss.ss_ScanTupleSlot; + + /* + * We loop retrieving groups until we find one satisfying + * aggstate->ss.ps.qual + */ + while (!aggstate->agg_done) + { + + /* + * Find the next entry in the hash table + */ + entry = vci_agg_find_group_from_hash_table(aggstate); + if (entry == NULL) + { + /* No more entries in hashtable, so done */ + aggstate->agg_done = true; + return NULL; + } + + /* + * Clear the per-output-tuple context for each group + * + * We intentionally don't use ReScanExprContext here; if any aggs have + * registered shutdown callbacks, they mustn't be called yet, since we + * might not be done with that agg. + */ + ResetExprContext(econtext); + + /* + * Store the copied first input tuple in the tuple table slot reserved + * for it, so that it can be used in ExecProject. + */ + ExecForceStoreMinimalTuple(entry->firstTuple, + firstSlot, + false); + + pergroup = (VciAggStatePerGroup) TupleHashEntryGetAdditional(aggstate->hashtable, entry); + + /* + * Finalize each aggregate calculation, and stash results in the + * per-output-tuple context. 
+ */ + for (aggno = 0; aggno < aggstate->numaggs; aggno++) + { + VciAggStatePerAgg peraggstate = &peragg[aggno]; + VciAggStatePerGroup pergroupstate = &pergroup[aggno]; + + Assert(peraggstate->numSortCols == 0); + vci_finalize_aggregate(aggstate, peraggstate, pergroupstate, + &aggvalues[aggno], &aggnulls[aggno]); + } + + /* + * Use the representative input tuple for any references to + * non-aggregated input columns in the qual and tlist. + */ + econtext->ecxt_outertuple = firstSlot; + + /* + * Check the qual (HAVING clause); if the group does not match, ignore + * it and loop back to try to process another group. + */ + if (ExecQual(aggstate->vci.css.ss.ps.qual, econtext)) + { + /* + * Form and return a projection tuple using the aggregate results + * and the representative input tuple. + */ + TupleTableSlot *result; + + result = VciExecProject(aggstate->vps_ProjInfo); + + return result; + } + else + InstrCountFiltered1(aggstate, 1); + } + + /* No more groups */ + return NULL; +} + +/** + * Retrieve only 1 entry from hash table + * + * @param[in,out] aggstate VCI Agg State + * @return One TupleHashEntry retrieved from hash table, or NULL when aggregation is already done + */ +TupleHashEntry +vci_agg_find_group_from_hash_table(VciAggState *aggstate) +{ + while (!aggstate->agg_done) + { + return (TupleHashEntry) ScanTupleHashTable(aggstate->hashtable, &aggstate->hashiter); + } + + /* No more groups */ + return NULL; +} + +static Datum +GetAggInitVal(Datum textInitVal, Oid transtype) +{ + Oid typinput, + typioparam; + char *strInitVal; + Datum initVal; + + getTypeInputInfo(transtype, &typinput, &typioparam); + strInitVal = TextDatumGetCString(textInitVal); + initVal = OidInputFunctionCall(typinput, strInitVal, + typioparam, -1); + pfree(strInitVal); + return initVal; +} + +/*********************************************************************** + * API exposed to aggregate functions + ***********************************************************************/ + +/* + * The following function is a callback
function from AggState, + * but there is no need to directly maintain it in VCI Agg. + * + * - AggCheckCallContext - test if a SQL function is being called as an aggregate + * - AggGetAggref - allow an aggregate support function to get its Aggref + * - AggGetTempMemoryContext - fetch short-term memory context for aggregates + * - AggRegisterCallback - register a cleanup callback for an aggregate + */ + +/* ---------------- + * VciAgg information + * ---------------- + */ +static Node * +vci_agg_CreateCustomScanState(CustomScan *cscan) +{ + VciAgg *vagg = (VciAgg *) cscan; + VciAggState *vas = (VciAggState *) palloc0(sizeof(VciAggState)); + + vas->vci.css.ss.ps.type = T_CustomScanState; + vas->vci.css.ss.ps.plan = (Plan *) vagg; + + vas->vci.css.flags = cscan->flags; + + switch (vagg->aggstrategy) + { + case AGG_HASHED: + vas->vci.css.methods = &vci_hashagg_exec_methods; + break; + + case AGG_SORTED: + vas->vci.css.methods = &vci_groupagg_exec_methods; + break; + + case AGG_PLAIN: + vas->vci.css.methods = &vci_agg_exec_methods; + break; + + default: + break; + } + + vas->aggs = NIL; + vas->numaggs = 0; + vas->eqfunctions = NULL; + vas->hashfunctions = NULL; + vas->peragg = NULL; + vas->agg_done = false; + vas->pergroup = NULL; + vas->grp_firstTuple = NULL; + vas->hashtable = NULL; + + return (Node *) vas; +} + +/** + * ExecCustomPlan callback called from CustomPlanState of VCI Agg + */ +static TupleTableSlot * +vci_agg_ExecCustomPlan(CustomScanState *node) +{ + VciAggState *aggstate; + + aggstate = (VciAggState *) node; + + /* + * Exit if nothing left to do. (We must do the ps_TupFromTlist check + * first, because in some cases agg_done gets set before we emit the final + * aggregate tuple, and we have to finish running SRFs for it.) 
+ */ + if (aggstate->agg_done) + return NULL; + + Assert(IsA(node->ss.ps.plan, CustomScan)); + + /* Dispatch based on strategy */ + if (((VciAgg *) node->ss.ps.plan)->aggstrategy == AGG_HASHED) + { + /* Fill the hash table if that has not happened yet, then return the next group */ + if (!aggstate->table_filled) + vci_agg_fill_hash_table(aggstate); + return vci_agg_retrieve_hash_table(aggstate); + } + + /* Every non-hashed strategy is handled by agg_retrieve_direct() */ + return agg_retrieve_direct(aggstate); +} + +/** + * Copy the contents of VCI Agg State to pseudo Agg state + */ +static void +copy_into_pseudo_aggstate(AggState *pseudo_aggstate, VciAggState *aggstate) +{ + pseudo_aggstate->ss.ps.plan = aggstate->vci.css.ss.ps.plan; + pseudo_aggstate->ss.ps.state = aggstate->vci.css.ss.ps.state; + pseudo_aggstate->ss.ps.instrument = aggstate->vci.css.ss.ps.instrument; + pseudo_aggstate->ss.ps.qual = aggstate->vci.css.ss.ps.qual; + pseudo_aggstate->ss.ps.lefttree = aggstate->vci.css.ss.ps.lefttree; + pseudo_aggstate->ss.ps.righttree = aggstate->vci.css.ss.ps.righttree; + pseudo_aggstate->ss.ps.initPlan = aggstate->vci.css.ss.ps.initPlan; + pseudo_aggstate->ss.ps.subPlan = aggstate->vci.css.ss.ps.subPlan; + pseudo_aggstate->ss.ps.chgParam = aggstate->vci.css.ss.ps.chgParam; + pseudo_aggstate->ss.ps.ps_ResultTupleSlot = aggstate->vci.css.ss.ps.ps_ResultTupleSlot; + pseudo_aggstate->ss.ps.ps_ExprContext = aggstate->vci.css.ss.ps.ps_ExprContext; + pseudo_aggstate->ss.ps.ps_ProjInfo = aggstate->vci.css.ss.ps.ps_ProjInfo; + + pseudo_aggstate->ss.ss_currentRelation = aggstate->vci.css.ss.ss_currentRelation; + pseudo_aggstate->ss.ss_currentScanDesc = aggstate->vci.css.ss.ss_currentScanDesc; + pseudo_aggstate->ss.ss_ScanTupleSlot = aggstate->vci.css.ss.ss_ScanTupleSlot; + + pseudo_aggstate->aggs = aggstate->aggs; + pseudo_aggstate->numaggs = aggstate->numaggs; + pseudo_aggstate->phases[0].eqfunctions = aggstate->eqfunctions; + pseudo_aggstate->perhash->hashfunctions = aggstate->hashfunctions; + pseudo_aggstate->peragg = (AggStatePerAgg) aggstate->peragg; + pseudo_aggstate->tmpcontext =
aggstate->tmpcontext; + pseudo_aggstate->curperagg = NULL; + pseudo_aggstate->agg_done = aggstate->agg_done; + pseudo_aggstate->pergroups = (AggStatePerGroup *) &aggstate->pergroup; + pseudo_aggstate->grp_firstTuple = aggstate->grp_firstTuple; + pseudo_aggstate->perhash->hashtable = aggstate->hashtable; + pseudo_aggstate->perhash->hashslot = NULL; + pseudo_aggstate->table_filled = aggstate->table_filled; + pseudo_aggstate->perhash->hashiter = aggstate->hashiter; +} + +/** + * BeginCustomPlan callback called from CustomPlan of VCI Agg + */ +static void +vci_agg_BeginCustomPlan(CustomScanState *node, EState *estate, int eflags) +{ + VciAgg *agg; + VciAggState *aggstate; + VciAggStatePerAgg peragg; + Plan *outerPlan; + ExprContext *econtext; + int max_aggno; + int numaggs; + ListCell *l; + vci_initexpr_t initexpr; + TupleDesc scanDesc; + + agg = (VciAgg *) node->ss.ps.plan; + + /* check for unsupported flags */ + Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK))); + + /* + * create state structure + */ + aggstate = (VciAggState *) node; + + aggstate->vci.css.ss.ps.state = estate; + + if (vci_get_vci_plan_type(outerPlan(agg)) == VCI_CUSTOMPLAN_SCAN) + { + aggstate->enable_vp = true; + } + + vci_agg_BeginCustomPlan_preprocess(aggstate); + + /* + * Create expression contexts. We need three or more, one for + * per-input-tuple processing, one for per-output-tuple processing, and + * one for each grouping set. The per-tuple memory context of the + * per-grouping-set ExprContexts (aggcontexts) replaces the standalone + * memory context formerly used to hold transition values. We cheat a + * little by using ExecAssignExprContext() to build all of them. + * + * NOTE: the details of what is stored in aggcontexts and what is stored + * in the regular per-query memory context are driven by a simple + * decision: we want to reset the aggcontext at group boundaries (if not + * hashing) and in ExecReScanAgg to recover no-longer-wanted space. 
+ */ + ExecAssignExprContext(estate, &aggstate->vci.css.ss.ps); + aggstate->tmpcontext = aggstate->vci.css.ss.ps.ps_ExprContext; + ExecAssignExprContext(estate, &aggstate->vci.css.ss.ps); + + aggstate->pseudo_aggstate->aggcontexts[0] = aggstate->vci.css.ss.ps.ps_ExprContext; + ExecAssignExprContext(estate, &aggstate->vci.css.ss.ps); + + aggstate->aggcontext = + AllocSetContextCreate(CurrentMemoryContext, + "VciAggContext", + ALLOCSET_DEFAULT_SIZES); + + /* + * The timing of ExecInitExpr() for targetlist and qual, and the timing of + * ExecInitNode() for outer node are reversed from the original. + * + * This is because we want VciScanState to exist when Var is evaluated. + */ + + /* + * initialize child nodes + * + * If we are doing a hashed aggregation then the child plan does not need + * to handle REWIND efficiently; see ExecReScanAgg. + */ + if (agg->aggstrategy == AGG_HASHED) + eflags &= ~EXEC_FLAG_REWIND; + outerPlan = outerPlan(node->ss.ps.plan); + + outerPlanState(aggstate) = ExecInitNode(outerPlan, estate, eflags); + + /* + * tuple table initialization + */ + aggstate->vci.css.ss.ps.outerops = + ExecGetResultSlotOps(outerPlanState(&aggstate->vci.css.ss), + &aggstate->vci.css.ss.ps.outeropsfixed); + aggstate->vci.css.ss.ps.outeropsset = true; + + ExecCreateScanSlotFromOuterPlan(estate, &aggstate->vci.css.ss, + aggstate->vci.css.ss.ps.outerops); + scanDesc = aggstate->vci.css.ss.ss_ScanTupleSlot->tts_tupleDescriptor; + + ExecInitResultTupleSlotTL(&aggstate->vci.css.ss.ps, &TTSOpsVirtual); + aggstate->hashslot = ExecInitExtraTupleSlot(estate, scanDesc, &TTSOpsMinimalTuple); + + /* + * In the case of hashed aggregation, Var in targetlist and qual are read + * using outer tuple, but targetlist under Aggref will fetch column store. + * (However if outer is other than VCI Scan, read from outer tuple) + * + * Sorted aggregation and plain aggregation are all read from outer tuple. 
+ */ + if (agg->aggstrategy == AGG_HASHED) + initexpr = VCI_INIT_EXPR_FETCHING_COLUMN_STORE; + else + initexpr = VCI_INIT_EXPR_NORMAL; + + /* + * initialize child expressions + * + * Note: ExecInitExpr finds Aggrefs for us, and also checks that no aggs + * contain other agg calls in their arguments. This would make no sense + * under SQL semantics anyway (and it's forbidden by the spec). Because + * that is true, we don't need to worry about evaluating the aggs in any + * particular order. + */ + aggstate->vci.css.ss.ps.qual = + VciExecInitQual(agg->vci.cscan.scan.plan.qual, (PlanState *) aggstate, initexpr); + + /* + * Initialize projection info. + */ + aggstate->vps_ProjInfo = + VciExecBuildProjectionInfo(aggstate->vci.css.ss.ps.plan->targetlist, + aggstate->vci.css.ss.ps.ps_ExprContext, + aggstate->vci.css.ss.ps.ps_ResultTupleSlot, + &aggstate->vci.css.ss.ps, + NULL); + + /* + * get the count of aggregates in targetlist and quals + */ + max_aggno = -1; + foreach(l, aggstate->aggs) + { + Aggref *aggref = (Aggref *) lfirst(l); + + max_aggno = Max(max_aggno, aggref->aggno); + } + aggstate->numaggs = numaggs = max_aggno + 1; + + /* + * If we are grouping, precompute fmgr lookup data for inner loop. We need + * both equality and hashing functions to do it by hashing, but only + * equality if not hashing. 
+ */ + if (agg->numCols > 0) + { + if (agg->aggstrategy == AGG_HASHED) + execTuplesHashPrepare(agg->numCols, + agg->grpOperators, + &aggstate->eqfuncoids, + &aggstate->hashfunctions); + else + { + aggstate->eqfunctions = + (ExprState **) palloc0(1 * sizeof(ExprState *)); + aggstate->eqfunctions[0] = + execTuplesMatchPrepare(scanDesc, + agg->numCols, + agg->grpColIdx, + agg->grpOperators, + agg->grpCollations, + (PlanState *) aggstate); + } + } + + /* + * Set up aggregate-result storage in the output expr context, and also + * allocate my private per-agg working storage + */ + econtext = aggstate->vci.css.ss.ps.ps_ExprContext; + econtext->ecxt_aggvalues = (Datum *) palloc0(sizeof(Datum) * numaggs); + econtext->ecxt_aggnulls = (bool *) palloc0(sizeof(bool) * numaggs); + + peragg = (VciAggStatePerAgg) palloc0(sizeof(VciAggStatePerAggData) * numaggs); + aggstate->peragg = peragg; + + if (agg->aggstrategy == AGG_HASHED) + { + int i; + List *hash_need; + ListCell *lc; + + /* Compute the columns we actually need to hash on */ + hash_need = find_hash_columns(aggstate); + aggstate->num_hash_needed = list_length(hash_need); + aggstate->hash_needed = palloc(sizeof(int) * aggstate->num_hash_needed); + + Assert(aggstate->num_hash_needed > 0); + + i = 0; + foreach(lc, hash_need) + { + aggstate->hash_needed[i++] = lfirst_int(lc); + + if (aggstate->last_hash_column < lfirst_int(lc)) + aggstate->last_hash_column = lfirst_int(lc); + } + + { + VciScanState *scanstate = (VciScanState *) outerPlanState(aggstate); + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + + aggstate->hash_input_values = palloc(sizeof(Datum *) * aggstate->num_hash_needed); + aggstate->hash_input_isnull = palloc(sizeof(bool *) * aggstate->num_hash_needed); + + for (i = 0; i < aggstate->num_hash_needed; i++) + { + int varNumber = aggstate->hash_needed[i] - 1; + + aggstate->hash_input_values[i] = + scanstate->result_values[varNumber]; + + aggstate->hash_input_isnull[i] = + 
scanstate->result_isnull[varNumber]; + } + } + + build_hash_table(aggstate); + aggstate->table_filled = false; + } + else + { + VciAggStatePerGroup pergroup; + + pergroup = (VciAggStatePerGroup) palloc0(sizeof(VciAggStatePerGroupData) * numaggs); + aggstate->pergroup = pergroup; + } + + /* + * Perform lookups of aggregate function info, and initialize the + * unchanging fields of the per-agg data. We also detect duplicate + * aggregates (for example, "SELECT sum(x) ... HAVING sum(x) > 0"). When + * duplicates are detected, we only make an AggStatePerAgg struct for the + * first one. The clones are simply pointed at the same result entry by + * giving them duplicate aggno values. + */ + foreach(l, aggstate->aggs) + { + Aggref *aggref = lfirst(l); + VciAggStatePerAgg peraggstate; + Oid inputTypes[FUNC_MAX_ARGS]; + int numArguments; + int numDirectArgs; + int numInputs; + int numSortCols; + int numDistinctCols; + List *sortlist; + HeapTuple aggTuple; + Form_pg_aggregate aggform; + Oid aggtranstype; + AclResult aclresult; + Oid transfn_oid, + finalfn_oid; + Expr *transfnexpr, + *finalfnexpr; + Datum textInitVal; + + /* Planner should have assigned aggregate to correct level */ + Assert(aggref->agglevelsup == 0); + + peraggstate = &peragg[aggref->aggno]; + + /* Check if we initialized the state for this aggregate already. 
*/ + if (peraggstate->aggref != NULL) + continue; + + peraggstate->aggref = aggref; + peraggstate->sortstate = NULL; + + /* Fetch the pg_aggregate row */ + aggTuple = SearchSysCache1(AGGFNOID, + ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", + aggref->aggfnoid); + aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple); + + /* Check permission to call aggregate function */ + aclresult = object_aclcheck(ProcedureRelationId, aggref->aggfnoid, GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_AGGREGATE, + get_func_name(aggref->aggfnoid)); + InvokeFunctionExecuteHook(aggref->aggfnoid); + + peraggstate->transfn_oid = transfn_oid = aggform->aggtransfn; + peraggstate->finalfn_oid = finalfn_oid = aggform->aggfinalfn; + + /* Check that aggregate owner has permission to call component fns */ + { + HeapTuple procTuple; + Oid aggOwner; + + procTuple = SearchSysCache1(PROCOID, + ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(procTuple)) + elog(ERROR, "cache lookup failed for function %u", + aggref->aggfnoid); + aggOwner = ((Form_pg_proc) GETSTRUCT(procTuple))->proowner; + ReleaseSysCache(procTuple); + + aclresult = object_aclcheck(ProcedureRelationId, transfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_AGGREGATE, + get_func_name(transfn_oid)); + InvokeFunctionExecuteHook(transfn_oid); + if (OidIsValid(finalfn_oid)) + { + aclresult = object_aclcheck(ProcedureRelationId, finalfn_oid, aggOwner, + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_AGGREGATE, + get_func_name(finalfn_oid)); + InvokeFunctionExecuteHook(finalfn_oid); + } + } + + /* + * Get actual datatypes of the (nominal) aggregate inputs. These + * could be different from the agg's declared input types, when the + * agg accepts ANY or a polymorphic type. 
+ */ + numArguments = get_aggregate_argtypes(aggref, inputTypes); + peraggstate->numArguments = numArguments; + + /* Count the "direct" arguments, if any */ + numDirectArgs = list_length(aggref->aggdirectargs); + + /* Count the number of aggregated input columns */ + numInputs = list_length(aggref->args); + peraggstate->numInputs = numInputs; + + Assert(!AGGKIND_IS_ORDERED_SET(aggref->aggkind)); + Assert(!aggform->aggfinalextra); + + peraggstate->numTransInputs = numArguments; + peraggstate->numFinalArgs = numDirectArgs + 1; + + /* resolve actual type of transition state, if polymorphic */ + aggtranstype = resolve_aggregate_transtype(aggref->aggfnoid, + aggform->aggtranstype, + inputTypes, + numArguments); + + /* build expression trees using actual argument & result types */ + build_aggregate_transfn_expr(inputTypes, + numArguments, + numDirectArgs, + aggref->aggvariadic, + aggtranstype, + aggref->inputcollid, + transfn_oid, + InvalidOid, /* invtrans is not needed + * here */ + &transfnexpr, + NULL); + + /* set up infrastructure for calling the transfn */ + fmgr_info(transfn_oid, &peraggstate->transfn); + fmgr_info_set_expr((Node *) transfnexpr, &peraggstate->transfn); + + if (OidIsValid(finalfn_oid)) + { + build_aggregate_finalfn_expr(inputTypes, + peraggstate->numFinalArgs, /* this aggregate's own count, not that of peragg[0] */ + aggtranstype, + aggref->aggtype, + aggref->inputcollid, + finalfn_oid, + &finalfnexpr); + + /* set up infrastructure for calling the finalfn */ + fmgr_info(finalfn_oid, &peraggstate->finalfn); + fmgr_info_set_expr((Node *) finalfnexpr, &peraggstate->finalfn); + } + + peraggstate->aggCollation = aggref->inputcollid; + + peraggstate->transfn_fcinfo = + (FunctionCallInfo) palloc(SizeForFunctionCallInfo(peraggstate->numTransInputs + 1)); + InitFunctionCallInfoData(*peraggstate->transfn_fcinfo, + &peraggstate->transfn, + peraggstate->numTransInputs + 1, + peraggstate->aggCollation, + (void *) aggstate->pseudo_aggstate, NULL); + + /* get info about relevant datatypes */ +
get_typlenbyval(aggref->aggtype, + &peraggstate->resulttypeLen, + &peraggstate->resulttypeByVal); + get_typlenbyval(aggtranstype, + &peraggstate->transtypeLen, + &peraggstate->transtypeByVal); + + /* + * initval is potentially null, so don't try to access it as a struct + * field. Must do it the hard way with SysCacheGetAttr. + */ + textInitVal = SysCacheGetAttr(AGGFNOID, aggTuple, + Anum_pg_aggregate_agginitval, + &peraggstate->initValueIsNull); + + if (peraggstate->initValueIsNull) + peraggstate->initValue = (Datum) 0; + else + peraggstate->initValue = GetAggInitVal(textInitVal, + aggtranstype); + + /* + * If the transfn is strict and the initval is NULL, make sure input + * type and transtype are the same (or at least binary-compatible), so + * that it's OK to use the first aggregated input value as the initial + * transValue. This should have been checked at agg definition time, + * but we must check again in case the transfn's strictness property + * has been changed. + */ + if (peraggstate->transfn.fn_strict && peraggstate->initValueIsNull) + { + if (numArguments <= numDirectArgs || + !IsBinaryCoercible(inputTypes[numDirectArgs], aggtranstype)) + ereport(ERROR, + (errcode(ERRCODE_INVALID_FUNCTION_DEFINITION), + errmsg("aggregate %u needs to have compatible input type and transition type", + aggref->aggfnoid))); + } + + /* + * Get a tupledesc corresponding to the aggregated inputs (including + * sort expressions) of the agg. 
+ */ + peraggstate->evaldesc = ExecTypeFromTL(aggref->args); + + /* Create slot we're going to do argument evaluation in */ + peraggstate->evalslot = ExecInitExtraTupleSlot(estate, peraggstate->evaldesc, &TTSOpsMinimalTuple); + + /* Set up projection info for evaluation */ + peraggstate->evalproj = VciExecBuildProjectionInfo(aggref->args, + aggstate->tmpcontext, + peraggstate->evalslot, + &aggstate->vci.css.ss.ps, + NULL); + + Assert(!AGGKIND_IS_ORDERED_SET(aggref->aggkind)); + Assert(!aggref->aggdistinct); + + sortlist = aggref->aggorder; + numSortCols = list_length(sortlist); + numDistinctCols = 0; + + peraggstate->numSortCols = numSortCols; + peraggstate->numDistinctCols = numDistinctCols; + + Assert(numSortCols == 0); + + Assert(aggref->aggdistinct == NIL); + + ReleaseSysCache(aggTuple); + } + + if (agg->aggstrategy == AGG_HASHED) + { + vci_agg_BeginCustomPlan_postprocess_for_advance_aggref(aggstate); + vci_agg_BeginCustomPlan_postprocess_for_vp(aggstate, econtext); + } + + /* Recopy dummy AggState */ + copy_into_pseudo_aggstate(aggstate->pseudo_aggstate, aggstate); +} + +/** + * Create and connect a pseudo Agg state to VCI Agg State + */ +static void +vci_agg_BeginCustomPlan_preprocess(VciAggState *aggstate) +{ + AggState *pseudo_aggstate; + + /* + * Create dummy AggState + * + * aggregation function registered in pg_proc system catalog checks if + * Execution Plan State Node is AggState or WindowsAggState. VciAggState + * is not considered an AggState because it is a CustomPlanState. The + * dummy AggState is used to fool Execution Plan State Node seen by + * aggregation function. + * + * Since it is necessary to set aggstate->pseudo_aggstate at the stage + * when the AggrefState is initialized, insert it before + * vci_agg_BeginCustomPlan. 
+ */ + pseudo_aggstate = makeNode(AggState); + + /* only one (no grouping sets allowed) */ + pseudo_aggstate->aggcontexts = (ExprContext **) + palloc0(sizeof(ExprContext *) * 1); + ExecAssignExprContext(aggstate->vci.css.ss.ps.state, &aggstate->vci.css.ss.ps); + pseudo_aggstate->aggcontexts[0] = aggstate->vci.css.ss.ps.ps_ExprContext; + pseudo_aggstate->curaggcontext = pseudo_aggstate->aggcontexts[0]; + + pseudo_aggstate->phases = palloc0(sizeof(AggStatePerPhaseData)); + pseudo_aggstate->phases[0].grouped_cols = NULL; + pseudo_aggstate->phases[0].sortnode = NULL; + pseudo_aggstate->phases[0].numsets = 0; + pseudo_aggstate->phases[0].gset_lengths = NULL; + + pseudo_aggstate->perhash = palloc0(sizeof(AggStatePerHashData)); + + copy_into_pseudo_aggstate(pseudo_aggstate, aggstate); + aggstate->pseudo_aggstate = pseudo_aggstate; +} + +/** + * Store the advance routine chosen by VciGetSpecialAdvanceAggrefFunc() in every per-agg state. + */ +static void +vci_agg_BeginCustomPlan_postprocess_for_advance_aggref(VciAggState *aggstate) +{ + VciAggStatePerAgg slots = aggstate->peragg; + + for (int idx = 0; idx < aggstate->numaggs; idx++) + { + VciAggStatePerAgg cur = &slots[idx]; + + cur->advance_aggref = VciGetSpecialAdvanceAggrefFunc(cur); + } +} + +/** + * Create vector processing context from targetlist to execute vector processing + */ +static void +vci_agg_BeginCustomPlan_postprocess_for_vp(VciAggState *aggstate, ExprContext *econtext) +{ + int aggno; + VciScanState *scansate = vci_search_scan_state(&aggstate->vci); + uint16 *skip_list; + + skip_list = vci_CSGetSkipAddrFromVirtualTuples(scansate->vector_set); + + for (aggno = 0; aggno < aggstate->numaggs; aggno++) + { + int i; + VciAggStatePerAgg peraggstate = &aggstate->peragg[aggno]; + VciProjectionInfo *proj = peraggstate->evalproj; + + if (proj->pi_tle_array_len > 0) + proj->pi_vp_tle_array = palloc0(sizeof(VciVPContext *) * proj->pi_tle_array_len); + + for (i = 0; i < proj->pi_tle_array_len; i++) + { + TargetEntry *tle; + + tle =
(TargetEntry *) proj->pi_tle_array[i]; + + proj->pi_vp_tle_array[i] = + VciBuildVectorProcessing(tle->expr, (PlanState *) aggstate, + econtext, skip_list); + } + } +} + +/* ---------------- + * vci_ExecFreeExprContext + * + * A plan node's ExprContext should be freed explicitly during executor + * shutdown because there may be shutdown callbacks to call. (Other resources + * made by the above routines, such as projection info, don't need to be freed + * explicitly because they're just memory in the per-query memory context.) + */ +static void +vci_ExecFreeExprContext(PlanState *planstate) +{ + /* + * Per above discussion, don't actually delete the ExprContext. We do + * unlink it from the plan node, though. + */ + planstate->ps_ExprContext = NULL; +} + +/** + * EndCustomPlan callback called from CustomPlanState of VCI Agg + */ +static void +vci_agg_EndCustomPlan(CustomScanState *node) +{ + VciAggState *aggstate; + PlanState *outerPlan; + + aggstate = (VciAggState *) node; + + node = (CustomScanState *) aggstate; + + /* And ensure any agg shutdown callbacks have been called */ + ReScanExprContext(aggstate->vci.css.ss.ps.ps_ExprContext); + + /* + * Free both the expr contexts. 
+ */ + vci_ExecFreeExprContext(&aggstate->vci.css.ss.ps); + node->ss.ps.ps_ExprContext = aggstate->tmpcontext; + vci_ExecFreeExprContext(&aggstate->vci.css.ss.ps); + + MemoryContextDelete(aggstate->aggcontext); + + outerPlan = outerPlanState(node); + + ExecEndNode(outerPlan); +} + +/** + * ReScanCustomPlan callback called from CustomPlanState of VCI Agg + */ +static void +vci_agg_ReScanCustomPlan(CustomScanState *node) +{ + VciAggState *aggstate; + ExprContext *econtext; + + aggstate = (VciAggState *) node; + + econtext = aggstate->vci.css.ss.ps.ps_ExprContext; + + aggstate->agg_done = false; + + if (((VciAgg *) aggstate->vci.css.ss.ps.plan)->aggstrategy == AGG_HASHED) + { + /* + * In the hashed case, if we haven't yet built the hash table then we + * can just return; nothing done yet, so nothing to undo. If subnode's + * chgParam is not NULL then it will be re-scanned by ExecProcNode, + * else no reason to re-scan it at all. + */ + if (!aggstate->table_filled) + return; + + /* + * If we do have the hash table and the subplan does not have any + * parameter changes, then we can just rescan the existing hash table; + * no need to build it again. + */ + if (aggstate->vci.css.ss.ps.lefttree->chgParam == NULL) + { + ResetTupleHashIterator(aggstate->hashtable, &aggstate->hashiter); + return; + } + } + + /* We don't need to ReScanExprContext here; ExecReScan already did it */ + + /* Release first tuple of group, if we have made a copy */ + if (aggstate->grp_firstTuple != NULL) + { + heap_freetuple(aggstate->grp_firstTuple); + aggstate->grp_firstTuple = NULL; + } + + /* Forget current agg values */ + MemSet(econtext->ecxt_aggvalues, 0, sizeof(Datum) * aggstate->numaggs); + MemSet(econtext->ecxt_aggnulls, 0, sizeof(bool) * aggstate->numaggs); + + /* + * Release all temp storage. Note that with AGG_HASHED, the hash table is + * allocated in a sub-context of the aggcontext. 
We're going to rebuild + * the hash table from scratch, so we need to use MemoryContextReset() to + * avoid leaking the old hash table's memory context header. + */ + MemoryContextReset(aggstate->aggcontext); + + Assert(IsA(aggstate->vci.css.ss.ps.plan, CustomScan)); + + if (((VciAgg *) aggstate->vci.css.ss.ps.plan)->aggstrategy == AGG_HASHED) + { + /* Rebuild an empty hash table */ + build_hash_table(aggstate); + aggstate->table_filled = false; + } + else + { + /* + * Reset the per-group state (in particular, mark transvalues null) + */ + MemSet(aggstate->pergroup, 0, + sizeof(VciAggStatePerGroupData) * aggstate->numaggs); + } + + /* + * if chgParam of subnode is not null then plan will be re-scanned by + * first ExecProcNode. + */ + if (aggstate->vci.css.ss.ps.lefttree->chgParam == NULL) + ExecReScan(aggstate->vci.css.ss.ps.lefttree); +} + +/* LCOV_EXCL_START */ + +/** + * MarkPosCustomPlan callback called by CustomPlanState of VCI Agg + */ +static void +vci_agg_MarkPosCustomPlan(CustomScanState *node) +{ + elog(PANIC, "VCI Agg does not support MarkPosCustomPlan call convention"); +} + +/** + * RestrPosCustomPlan callback called by CustomPlanState of VCI Agg + */ +static void +vci_agg_RestrPosCustomPlan(CustomScanState *node) +{ + elog(PANIC, "VCI Agg does not support RestrPosCustomPlan call convention"); +} + +/* LCOV_EXCL_STOP */ + +/** + * ExplainCustomPlan callback called by CustomPlanState of VCI Agg + */ +static void +vci_agg_ExplainCustomPlan(CustomScanState *cpstate, + List *ancestors, + ExplainState *es) +{ + VciAgg *agg = (VciAgg *) cpstate->ss.ps.plan; + + if (agg->numCols > 0) + { + /* The key columns refer to the tlist of the child plan */ + ancestors = lcons(&cpstate->ss.ps, ancestors); + + ExplainPropertySortGroupKeys(outerPlanState(&cpstate->ss.ps), "Group Key", + agg->numCols, agg->grpColIdx, + ancestors, es); + ancestors = list_delete_first(ancestors); + } +} + +/** + * CopyCustomPlan callback called by CustomPlan of VCI Agg + */ +static 
CustomScan * +vci_agg_CopyCustomPlan(const CustomScan *_from) +{ + const VciAgg *from = (const VciAgg *) _from; + VciAgg *newnode = (VciAgg *) newNode(sizeof(VciAgg), _from->scan.plan.type); + int numCols; + + vci_copy_plan(&newnode->vci, &from->vci); + + newnode->aggstrategy = from->aggstrategy; + + numCols = from->numCols; + newnode->numCols = numCols; + if (numCols > 0) + { + int i; + + newnode->grpColIdx = palloc(sizeof(AttrNumber) * numCols); + newnode->grpOperators = palloc(sizeof(Oid) * numCols); + newnode->grpCollations = palloc(sizeof(Oid) * numCols); + for (i = 0; i < numCols; i++) + { + newnode->grpColIdx[i] = from->grpColIdx[i]; + newnode->grpOperators[i] = from->grpOperators[i]; + newnode->grpCollations[i] = from->grpCollations[i]; + } + } + newnode->numGroups = from->numGroups; + + ((Node *) newnode)->type = nodeTag((Node *) from); + + return &newnode->vci.cscan; +} + +CustomScanMethods vci_agg_scan_methods = { + "VCI Aggregate", + vci_agg_CreateCustomScanState, + vci_agg_CopyCustomPlan +}; + +CustomScanMethods vci_hashagg_scan_methods = { + "VCI HashAggregate", + vci_agg_CreateCustomScanState, + vci_agg_CopyCustomPlan +}; + +CustomScanMethods vci_groupagg_scan_methods = { + "VCI GroupAggregate", + vci_agg_CreateCustomScanState, + vci_agg_CopyCustomPlan +}; + +/** + * VCI Agg's CustomPlanMethods callbacks + */ +CustomExecMethods vci_agg_exec_methods = { + "VCI Aggregate", + vci_agg_BeginCustomPlan, + vci_agg_ExecCustomPlan, + vci_agg_EndCustomPlan, + vci_agg_ReScanCustomPlan, + vci_agg_MarkPosCustomPlan, + vci_agg_RestrPosCustomPlan, + NULL, + NULL, + NULL, + NULL, + NULL, + vci_agg_ExplainCustomPlan, + NULL, + NULL +}; + +/** + * VCI Agg's CustomPlanMethods callbacks + */ +CustomExecMethods vci_hashagg_exec_methods = { + "VCI HashAggregate", + vci_agg_BeginCustomPlan, + vci_agg_ExecCustomPlan, + vci_agg_EndCustomPlan, + vci_agg_ReScanCustomPlan, + vci_agg_MarkPosCustomPlan, + vci_agg_RestrPosCustomPlan, + NULL, + NULL, + NULL, + NULL, + NULL, + 
vci_agg_ExplainCustomPlan, + NULL, + NULL +}; + +/** + * VCI Agg's CustomPlanMethods callbacks + */ +CustomExecMethods vci_groupagg_exec_methods = { + "VCI GroupAggregate", + vci_agg_BeginCustomPlan, + vci_agg_ExecCustomPlan, + vci_agg_EndCustomPlan, + vci_agg_ReScanCustomPlan, + vci_agg_MarkPosCustomPlan, + vci_agg_RestrPosCustomPlan, + NULL, + NULL, + NULL, + NULL, + NULL, + vci_agg_ExplainCustomPlan, + NULL, + NULL +}; diff --git a/contrib/vci/executor/vci_aggmergetranstype.c b/contrib/vci/executor/vci_aggmergetranstype.c new file mode 100644 index 000000000000..61a00a6b8dac --- /dev/null +++ b/contrib/vci/executor/vci_aggmergetranstype.c @@ -0,0 +1,133 @@ +/*------------------------------------------------------------------------- + * + * vci_aggmergetranstype.c + * Parallel merge utility routines to merge between aggregate function's + * internal transition (state) data. + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_aggmergetranstype.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup.h" +#include "access/htup_details.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_proc.h" +#include "catalog/pg_type.h" +#include "datatype/timestamp.h" +#include "lib/stringinfo.h" +#include "libpq/pqformat.h" +#include "utils/acl.h" +#include "utils/array.h" +#include "utils/datum.h" +#include "utils/fmgroids.h" +#include "utils/fmgrprotos.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/numeric.h" +#include "utils/syscache.h" +#include "utils/timestamp.h" + +#include "vci.h" + +#include "vci_executor.h" + +#include "postgresql_copy.h" + +/** + * Determine if the given aggregation function is a type that can be supported by VCI + * + * @param[in] aggref Pointer to Aggref that holds the aggregate function to be determined + * @return true if supportable, false if not + */ +bool 
+vci_is_supported_aggregation(Aggref *aggref) +{ + int numInputs; + HeapTuple aggTuple; + Form_pg_aggregate aggform; + AclResult aclresult; + Oid transfn_oid; + Oid rettype; + Oid *argtypes; + int nargs; + bool ret = false; + + /* Reject aggregates whose OID is at or above FirstNormalObjectId (user-defined) */ + if (FirstNormalObjectId <= aggref->aggfnoid) + { + elog(DEBUG1, "Aggref contains user-defined aggregation"); + return false; + } + + /* Only aggregates with zero or one argument are supported */ + numInputs = list_length(aggref->args); + if (1 < numInputs) + { + elog(DEBUG1, "Aggref contains an aggregation with 2 or more arguments"); + return false; + } + + /* Fetch the pg_aggregate row */ + aggTuple = SearchSysCache1(AGGFNOID, + ObjectIdGetDatum(aggref->aggfnoid)); + if (!HeapTupleIsValid(aggTuple)) + elog(ERROR, "cache lookup failed for aggregate %u", + aggref->aggfnoid); + + aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple); + + aclresult = object_aclcheck(ProcedureRelationId, aggref->aggfnoid, GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_AGGREGATE, + get_func_name(aggref->aggfnoid)); + + transfn_oid = aggform->aggtransfn; + + /* Look up the transition function's signature: a non-INTERNAL T f(T, T) is accepted as is, anything else must be listed in the switch below */ + + rettype = get_func_signature(transfn_oid, &argtypes, &nargs); + + if ((rettype != INTERNALOID) && + (nargs == 2) && (rettype == argtypes[0]) && (rettype == argtypes[1])) + { + ret = true; + } + else + { + switch (transfn_oid) + { + case F_FLOAT4_ACCUM: + case F_FLOAT8_ACCUM: + case F_INT8INC: + case F_NUMERIC_ACCUM: + case F_INT2_ACCUM: + case F_INT4_ACCUM: + case F_INT8_ACCUM: + case F_INT2_SUM: + case F_INT4_SUM: + case F_INT2_AVG_ACCUM: + case F_INT4_AVG_ACCUM: + case F_INT8_AVG_ACCUM: + case F_INT8INC_ANY: + case F_NUMERIC_AVG_ACCUM: + case F_INTERVAL_AVG_COMBINE: + ret = true; + break; + default: + break; + } + } + + if (!ret) + elog(DEBUG1, "Aggref contains unsupported aggregation function"); + + ReleaseSysCache(aggTuple); + + return ret; +} diff --git a/contrib/vci/executor/vci_aggref.c
b/contrib/vci/executor/vci_aggref.c new file mode 100644 index 000000000000..31d40677341c --- /dev/null +++ b/contrib/vci/executor/vci_aggref.c @@ -0,0 +1,1287 @@ +/*------------------------------------------------------------------------- + * + * vci_aggref.c + * Routine to inline transition functions for speeding up aggregate functions + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_aggref.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/htup_details.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_proc.h" +#include "commands/explain.h" +#include "executor/execdebug.h" +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#include "optimizer/tlist.h" +#include "parser/parse_agg.h" +#include "parser/parse_coerce.h" +#include "utils/acl.h" +#include "utils/array.h" +#include "utils/builtins.h" +#include "utils/cash.h" +#include "utils/date.h" +#include "utils/datum.h" +#include "utils/fmgroids.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/syscache.h" +#include "utils/timestamp.h" +#include "utils/tuplesort.h" + +#include "vci.h" +#include "vci_executor.h" +#include "vci_aggref.h" + +#define VCI_USE_CMP_FUNC +#include "postgresql_copy.h" +#undef VCI_USE_CMP_FUNC + +#define VCI_TRANS_INPUTS_0 (0) +#define VCI_TRANS_INPUTS_1_SIMPLEVAR (1) +#define VCI_TRANS_INPUTS_1_EVALEXPR (2) + +/* + * Default pattern + */ +#define VCI_ADVANCE_AGGREF_FUNC aggref_0input_default +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_0 +#define VCI_TRANFN_OID 0 +#define VCI_TRANS_FN_STRICT peraggstate->transfn.fn_strict +#define VCI_TRANS_TYPE_BYVAL -1 +#define VCI_TRANS_USE_CURPERAGG +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_USE_CURPERAGG 
+#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_default +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID 0 +#define VCI_TRANS_FN_STRICT peraggstate->transfn.fn_strict +#define VCI_TRANS_TYPE_BYVAL -1 +#define VCI_TRANS_USE_CURPERAGG +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_USE_CURPERAGG +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_default +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID 0 +#define VCI_TRANS_FN_STRICT peraggstate->transfn.fn_strict +#define VCI_TRANS_TYPE_BYVAL -1 +#define VCI_TRANS_USE_CURPERAGG +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_USE_CURPERAGG +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +/* + * Individual advance transition routine + */ +#define VCI_ADVANCE_AGGREF_FUNC aggref_0input_int8inc +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_0 +#define VCI_TRANFN_OID F_INT8INC +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int8inc_any +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT8INC_ANY +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC 
aggref_simple_var_float4_accum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_FLOAT4_ACCUM +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_float4pl +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_FLOAT4PL +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_float4larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_FLOAT4LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_float4smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_FLOAT4SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_float8pl +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_FLOAT8PL +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define 
VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int4larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT4LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int4smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT4SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_cash_pl +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_CASH_PL +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_cashlarger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_CASHLARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_cashsmaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_CASHSMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef 
VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_date_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_DATE_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_date_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_DATE_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_interval_pl +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INTERVAL_PL +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_timestamp_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_TIMESTAMP_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_timestamp_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_TIMESTAMP_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef 
VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_interval_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INTERVAL_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_interval_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INTERVAL_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_time_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_TIME_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_time_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_TIME_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_timetz_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_TIMETZ_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef 
VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_timetz_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_TIMETZ_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int2_sum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT2_SUM +#define VCI_TRANS_FN_STRICT 0 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int4_sum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT4_SUM +#define VCI_TRANS_FN_STRICT 0 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int4and +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT4AND +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int4or +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT4OR +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef 
VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int4_avg_accum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT4_AVG_ACCUM +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_booland_statefunc +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_BOOLAND_STATEFUNC +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_boolor_statefunc +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_BOOLOR_STATEFUNC +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int2and +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT2AND +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int2or +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT2OR +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef 
VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int2_avg_accum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT2_AVG_ACCUM +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int2larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT2LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int2smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT2SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int8and +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT8AND +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int8or +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT8OR +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" 
+#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int8larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT8LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_int8smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_INT8SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_float8larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_FLOAT8LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_float8smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_FLOAT8SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_simple_var_float8_accum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_SIMPLEVAR +#define VCI_TRANFN_OID F_FLOAT8_ACCUM +#define VCI_TRANS_FN_STRICT 1 
+#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +/* eval expr */ + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int8inc_any +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT8INC_ANY +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_float4_accum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_FLOAT4_ACCUM +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_float4pl +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_FLOAT4PL +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_float4larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_FLOAT4LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_float4smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_FLOAT4SMALLER +#define 
VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_float8pl +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_FLOAT8PL +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int4larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT4LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int4smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT4SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_cash_pl +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_CASH_PL +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_cashlarger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_CASHLARGER +#define 
VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_cashsmaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_CASHSMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_date_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_DATE_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_date_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_DATE_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_interval_pl +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INTERVAL_PL +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_timestamp_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID 
F_TIMESTAMP_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_timestamp_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_TIMESTAMP_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_interval_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INTERVAL_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_interval_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INTERVAL_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_time_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_TIME_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_time_smaller +#define 
VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_TIME_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_timetz_larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_TIMETZ_LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_timetz_smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_TIMETZ_SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int2_sum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT2_SUM +#define VCI_TRANS_FN_STRICT 0 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int4_sum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT4_SUM +#define VCI_TRANS_FN_STRICT 0 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC 
aggref_eval_expr_int4and +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT4AND +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int4or +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT4OR +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int4_avg_accum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT4_AVG_ACCUM +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_booland_statefunc +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_BOOLAND_STATEFUNC +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_boolor_statefunc +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_BOOLOR_STATEFUNC +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC 
aggref_eval_expr_int2and +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT2AND +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int2or +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT2OR +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int2_avg_accum +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT2_AVG_ACCUM +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 0 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int2larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT2LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int2smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT2SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL 1 +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int8and +#define 
VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT8AND +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int8or +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT8OR +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int8larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT8LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_int8smaller +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_INT8SMALLER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_float8larger +#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR +#define VCI_TRANFN_OID F_FLOAT8LARGER +#define VCI_TRANS_FN_STRICT 1 +#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL +#include "vci_aggref_impl.inc" +#undef VCI_TRANS_TYPE_BYVAL +#undef VCI_TRANS_FN_STRICT +#undef VCI_TRANFN_OID +#undef VCI_TRANS_INPUTS_ARG +#undef VCI_ADVANCE_AGGREF_FUNC + +#define 
VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_float8smaller
+#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR
+#define VCI_TRANFN_OID F_FLOAT8SMALLER
+#define VCI_TRANS_FN_STRICT 1
+#define VCI_TRANS_TYPE_BYVAL USE_FLOAT8_BYVAL
+#include "vci_aggref_impl.inc"
+#undef VCI_TRANS_TYPE_BYVAL
+#undef VCI_TRANS_FN_STRICT
+#undef VCI_TRANFN_OID
+#undef VCI_TRANS_INPUTS_ARG
+#undef VCI_ADVANCE_AGGREF_FUNC
+
+#define VCI_ADVANCE_AGGREF_FUNC aggref_eval_expr_float8_accum
+#define VCI_TRANS_INPUTS_ARG VCI_TRANS_INPUTS_1_EVALEXPR
+#define VCI_TRANFN_OID F_FLOAT8_ACCUM
+#define VCI_TRANS_FN_STRICT 1
+#define VCI_TRANS_TYPE_BYVAL 0
+#include "vci_aggref_impl.inc"
+#undef VCI_TRANS_TYPE_BYVAL
+#undef VCI_TRANS_FN_STRICT
+#undef VCI_TRANFN_OID
+#undef VCI_TRANS_INPUTS_ARG
+#undef VCI_ADVANCE_AGGREF_FUNC
+
+/*
+ * One row of function_table: describes a transition function for which a
+ * specialized (inlined) advance routine exists.
+ */
+typedef struct
+{
+	Oid			fn_oid;			/* pg_proc OID of the transition function;
+								 * bsearch key of function_table */
+	short		fn_nargs;		/* expected FmgrInfo.fn_nargs */
+	bool		fn_strict;		/* expected FmgrInfo.fn_strict */
+	bool		transtypeByVal; /* expected pass-by-value-ness of the
+								 * transition type */
+	bool		consumeMemory;	/* NOTE(review): not read in this part of the
+								 * file; meaning to be confirmed */
+	bool		useCurPerAgg;	/* NOTE(review): not read in this part of the
+								 * file; meaning to be confirmed */
+	VciAdvanceAggref_Func simple_var_func;	/* routine used when the single
+											 * input is a plain Var (or there
+											 * is no input at all) */
+	VciAdvanceAggref_Func eval_expr_func;	/* routine used when the single
+											 * input is an expression; may be
+											 * NULL */
+} AggrefTransInfo;
+
+/* Literal pg_proc OIDs used as keys for the 2035/2036 rows below */
+#define VCI_F_TIMESTAMP_SMALLER 2035
+#define VCI_F_TIMESTAMP_LARGER 2036
+
+static int compare_aggref_trans_info(const void *p1, const void *p2);
+static AggrefTransInfo *search_aggref_trans_info(Oid oid);
+
+/**
+ * Table mapping each supported transition function to its inlined advance
+ * routines.
+ *
+ * @note The array is searched with bsearch(), so it MUST stay sorted in
+ *       ascending fn_oid order.  The trailing comment on each row is the
+ *       numeric OID the row is expected to have.
+ */
+
+#ifdef USE_FLOAT8_BYVAL
+#define VCI_FLOAT8_TRANSTYPEBYVAL true
+#else
+#define VCI_FLOAT8_TRANSTYPEBYVAL false
+#endif
+
+static AggrefTransInfo function_table[] = {
+	{F_FLOAT4PL, 2, true, true, false, false, aggref_simple_var_float4pl, aggref_eval_expr_float4pl},	/* 204 */
+	{F_FLOAT4_ACCUM, 2, true, false, false, false, aggref_simple_var_float4_accum, aggref_eval_expr_float4_accum},	/* 208 */
+	{F_FLOAT4LARGER, 2, true, true, false, false, aggref_simple_var_float4larger, aggref_eval_expr_float4larger},	/* 209 */
+	{F_FLOAT4SMALLER, 2, true, true, false, false, aggref_simple_var_float4smaller, aggref_eval_expr_float4smaller},	/* 211 */
+	{F_FLOAT8PL, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_float8pl, aggref_eval_expr_float8pl},	/* 218 */
+	{F_FLOAT8_ACCUM, 2, true, false, false, false, aggref_simple_var_float8_accum, aggref_eval_expr_float8_accum},	/* 222 */
+	{F_FLOAT8LARGER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_float8larger, aggref_eval_expr_float8larger},	/* 223 */
+	{F_FLOAT8SMALLER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_float8smaller, aggref_eval_expr_float8smaller},	/* 224 */
+	{F_INT4LARGER, 2, true, true, false, false, aggref_simple_var_int4larger, aggref_eval_expr_int4larger},	/* 768 */
+	{F_INT4SMALLER, 2, true, true, false, false, aggref_simple_var_int4smaller, aggref_eval_expr_int4smaller},	/* 769 */
+	{F_INT2LARGER, 2, true, true, false, false, aggref_simple_var_int2larger, aggref_eval_expr_int2larger},	/* 770 */
+	{F_INT2SMALLER, 2, true, true, false, false, aggref_simple_var_int2smaller, aggref_eval_expr_int2smaller},	/* 771 */
+	{F_CASH_PL, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_cash_pl, aggref_eval_expr_cash_pl},	/* 894 */
+	{F_CASHLARGER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_cashlarger, aggref_eval_expr_cashlarger},	/* 898 */
+	{F_CASHSMALLER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_cashsmaller, aggref_eval_expr_cashsmaller},	/* 899 */
+	{F_DATE_LARGER, 2, true, true, false, false, aggref_simple_var_date_larger, aggref_eval_expr_date_larger},	/* 1138 */
+	{F_DATE_SMALLER, 2, true, true, false, false, aggref_simple_var_date_smaller, aggref_eval_expr_date_smaller},	/* 1139 */
+	{F_INTERVAL_PL, 2, true, false, true, false, aggref_simple_var_interval_pl, aggref_eval_expr_interval_pl},	/* 1169 */
+
+	/*
+	 * OIDs 1195/1196 are timestamptz_smaller()/timestamptz_larger().  The
+	 * fmgroids.h macros are named after proname, so F_TIMESTAMP_SMALLER and
+	 * F_TIMESTAMP_LARGER denote 2035/2036; using them here would break the
+	 * ascending order required by bsearch() and duplicate the 2035/2036 rows
+	 * below.  The timestamp routines are reused for these rows, as before.
+	 */
+	{F_TIMESTAMPTZ_SMALLER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_timestamp_smaller, aggref_eval_expr_timestamp_smaller},	/* 1195 */
+	{F_TIMESTAMPTZ_LARGER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_timestamp_larger, aggref_eval_expr_timestamp_larger},	/* 1196 */
+	{F_INTERVAL_SMALLER, 2, true, false, false, false, aggref_simple_var_interval_smaller, aggref_eval_expr_interval_smaller},	/* 1197 */
+	{F_INTERVAL_LARGER, 2, true, false, false, false, aggref_simple_var_interval_larger, aggref_eval_expr_interval_larger},	/* 1198 */
+	{F_INT8INC, 1, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_0input_int8inc, NULL},	/* 1219 */
+	{F_INT8LARGER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_int8larger, aggref_eval_expr_int8larger},	/* 1236 */
+	{F_INT8SMALLER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_int8smaller, aggref_eval_expr_int8smaller},	/* 1237 */
+	{F_TIME_LARGER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_time_larger, aggref_eval_expr_time_larger},	/* 1377 */
+	{F_TIME_SMALLER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_time_smaller, aggref_eval_expr_time_smaller},	/* 1378 */
+	{F_TIMETZ_LARGER, 2, true, false, false, false, aggref_simple_var_timetz_larger, aggref_eval_expr_timetz_larger},	/* 1379 */
+	{F_TIMETZ_SMALLER, 2, true, false, false, false, aggref_simple_var_timetz_smaller, aggref_eval_expr_timetz_smaller},	/* 1380 */
+	{F_INT2_SUM, 2, false, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_int2_sum, aggref_eval_expr_int2_sum},	/* 1840 */
+	{F_INT4_SUM, 2, false, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_int4_sum, aggref_eval_expr_int4_sum},	/* 1841 */
+	{F_INT2AND, 2, true, true, false, false, aggref_simple_var_int2and, aggref_eval_expr_int2and},	/* 1892 */
+	{F_INT2OR, 2, true, true, false, false, aggref_simple_var_int2or, aggref_eval_expr_int2or},	/* 1893 */
+	{F_INT4AND, 2, true, true, false, false, aggref_simple_var_int4and, aggref_eval_expr_int4and},	/* 1898 */
+	{F_INT4OR, 2, true, true, false, false, aggref_simple_var_int4or, aggref_eval_expr_int4or},	/* 1899 */
+	{F_INT8AND, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_int8and, aggref_eval_expr_int8and},	/* 1904 */
+	{F_INT8OR, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_int8or, aggref_eval_expr_int8or},	/* 1905 */
+	{F_INT2_AVG_ACCUM, 2, true, false, false, false, aggref_simple_var_int2_avg_accum, aggref_eval_expr_int2_avg_accum},	/* 1962 */
+	{F_INT4_AVG_ACCUM, 2, true, false, false, false, aggref_simple_var_int4_avg_accum, aggref_eval_expr_int4_avg_accum},	/* 1963 */
+	{VCI_F_TIMESTAMP_SMALLER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_timestamp_smaller, aggref_eval_expr_timestamp_smaller},	/* 2035 */
+	{VCI_F_TIMESTAMP_LARGER, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_timestamp_larger, aggref_eval_expr_timestamp_larger},	/* 2036 */
+	{F_BOOLAND_STATEFUNC, 2, true, true, false, false, aggref_simple_var_booland_statefunc, aggref_eval_expr_booland_statefunc},	/* 2515 */
+	{F_BOOLOR_STATEFUNC, 2, true, true, false, false, aggref_simple_var_boolor_statefunc, aggref_eval_expr_boolor_statefunc},	/* 2516 */
+	{F_INT8INC_ANY, 2, true, VCI_FLOAT8_TRANSTYPEBYVAL, false, false, aggref_simple_var_int8inc_any, aggref_eval_expr_int8inc_any},	/* 2804 */
+};
+
+/**
+ * Returns routine that individually inlines transition function for aggregate function
+ *
+ * When PostgreSQL performs avg aggregation on float4, the transition function that adds each input data (of float4 type)
+ * is float4_accum(). This function uses information from VciAggStatePerAgg to identify the transition function and
+ * returns a pointer to a fast routine that inlines the transition function, if any.
+ *
+ * @param[in] peraggstate Pointer to AggrefState information
+ * @return Returns pointer to transition routine for the aggregate function.
If not supported, returns NULL
+ */
+VciAdvanceAggref_Func
+VciGetSpecialAdvanceAggrefFunc(VciAggStatePerAgg peraggstate)
+{
+	VciProjectionInfo *projInfo = peraggstate->evalproj;
+	AggrefTransInfo *trans_info_p;
+	VciAdvanceAggref_Func special_func = NULL;
+	VciAdvanceAggref_Func default_func;
+	bool		simple_input;
+
+	/* Aggregates with FILTER or ORDER BY are not handled here */
+	if (peraggstate->aggref->aggfilter != NULL || peraggstate->numSortCols > 0)
+		return NULL;
+
+	/*
+	 * Classify the shape of the transition input.  projInfo is only
+	 * dereferenced when there is exactly one input.
+	 */
+	if (peraggstate->numTransInputs == 0)
+	{
+		/* no input at all */
+		simple_input = true;
+		default_func = aggref_0input_default;
+	}
+	else if (peraggstate->numTransInputs == 1 &&
+			 projInfo->pi_numSimpleVars == 1 && projInfo->pi_directMap && projInfo->pi_tle_array_len == 0)
+	{
+		/* single input that is a directly mapped simple Var */
+		simple_input = true;
+		default_func = aggref_simple_var_default;
+	}
+	else if (peraggstate->numTransInputs == 1 &&
+			 projInfo->pi_numSimpleVars == 0 && projInfo->pi_tle_array_len == 1)
+	{
+		/* single input that has to be evaluated as an expression */
+		simple_input = false;
+		default_func = aggref_eval_expr_default;
+	}
+	else
+		return NULL;
+
+	/* Look for a routine specialized for this transition function */
+	trans_info_p = search_aggref_trans_info(peraggstate->transfn_oid);
+	if (trans_info_p)
+		special_func = simple_input ? trans_info_p->simple_var_func
+			: trans_info_p->eval_expr_func;
+
+	if (special_func == NULL)
+		return default_func;
+
+	/*
+	 * The specialized routine was generated for fixed properties of the
+	 * transition function; refuse to use it if the catalog disagrees.  Oid is
+	 * unsigned, hence %u.
+	 */
+	if (peraggstate->transfn.fn_nargs != trans_info_p->fn_nargs)
+		elog(ERROR, "Oid %u fn_nargs = %d, trans_info.fn_nargs = %d",
+			 peraggstate->transfn_oid, peraggstate->transfn.fn_nargs, trans_info_p->fn_nargs);
+
+	if (peraggstate->transfn.fn_strict != trans_info_p->fn_strict)
+		elog(ERROR, "Oid %u peraggstate fn_strict = %d, trans_info.fn_strict = %d",
+			 peraggstate->transfn_oid, peraggstate->transfn.fn_strict, trans_info_p->fn_strict);
+
+	if (peraggstate->transtypeByVal != trans_info_p->transtypeByVal)
+		elog(ERROR, "Oid %u transtypeByVal peraggstate = %d, trans_info = %d",
+			 peraggstate->transfn_oid, peraggstate->transtypeByVal, trans_info_p->transtypeByVal);
+
+	return special_func;
+}
+
+/*
+ * Binary-search function_table for the given transition function OID.
+ *
+ * Returns a pointer to the matching row, or NULL if the function has no
+ * specialized routines.
+ */
+static AggrefTransInfo *
+search_aggref_trans_info(Oid oid)
+{
+	AggrefTransInfo key = {0};
+
+	/* only fn_oid is looked at by the comparator */
+	key.fn_oid = oid;
+
+	return (AggrefTransInfo *) bsearch(&key, function_table,
+									   lengthof(function_table), sizeof(function_table[0]),
+									   compare_aggref_trans_info);
+}
+
+/*
+ * bsearch() comparator: orders AggrefTransInfo rows by fn_oid.
+ */
+static int
+compare_aggref_trans_info(const void *p1, const void *p2)
+{
+	const AggrefTransInfo *info1 = (const AggrefTransInfo *) p1;
+	const AggrefTransInfo *info2 = (const AggrefTransInfo *) p2;
+
+	if (info1->fn_oid > info2->fn_oid)
+		return +1;
+	else if (info1->fn_oid < info2->fn_oid)
+		return -1;
+	else
+		return 0;
+}
diff --git a/contrib/vci/executor/vci_executor.c b/contrib/vci/executor/vci_executor.c
new file mode 100644
index 000000000000..d43355a3a815
--- /dev/null
+++ b/contrib/vci/executor/vci_executor.c
@@ -0,0 +1,2116 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_executor.c
+ *	  Miscellaneous executor utility routines
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/vci/executor/vci_executor.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "access/htup_details.h"
+#include "access/nbtree.h"
+#include "access/relscan.h"
+#include "access/transam.h"
+#include "access/tupconvert.h"
+#include "access/xact.h"		/* for XactEvent */
+#include "catalog/index.h"
+#include "catalog/objectaccess.h"
+#include "catalog/pg_proc.h"
+#include "catalog/pg_type.h"
+#include "commands/explain.h"
+#include "commands/typecmds.h"
+#include "executor/execdebug.h"
+#include "executor/execExpr.h"
+#include "executor/executor.h"
+#include "executor/nodeCustom.h"
+#include "executor/nodeSubplan.h"
+#include "fmgr.h"
+#include "funcapi.h"
+#include "miscadmin.h"
+#include "nodes/execnodes.h"
+#include "nodes/makefuncs.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/nodes.h"
+#include "optimizer/planner.h"
+#include "parser/parse_coerce.h"
+#include "parser/parsetree.h"
+#include "pgstat.h"
+#include "storage/lmgr.h"
+#include "utils/acl.h"
+#include "utils/builtins.h"
+#include "utils/jsonfuncs.h"
+#include "utils/jsonpath.h"
+#include "utils/lsyscache.h"
+#include "utils/memutils.h"
+#include "utils/typcache.h"
+#include "utils/xml.h"
+
+#include "vci.h"
+
+#include "vci_executor.h"
+#include "vci_utils.h"
+
+/**
+ * QueryDesc of the query currently being executed by VCI
+ *
+ * - NULL while no VCI query is running; set by the ExecutorStart hook when the
+ *   query is processed as a VCI custom plan
+ * - Only 1 VCI running query runs at a time (VCI does not run multiple queries in parallel)
+ * - Reset to NULL by the ExecutorEnd hook when the VCI query ends
+ * - In case of transaction error, force-reset to NULL by vci_xact_change_handler()
+ * - In case of error in subtransaction, vci_execution_subid is used to decide
+ *   whether the aborted subtransaction is the one that started the VCI query
+ *
+ * @note There are patterns in which Executor is recursively called, such as when stored procedure is called
+ * @note When FETCH-ing a DECLARE CURSOR, multiple Executor of queries are called in parallel.
+ */
+static QueryDesc *vci_execution_query_desc = NULL;
+static SubTransactionId vci_execution_subid = InvalidSubTransactionId;
+
+/**
+ * Record the first call of vci_executor_run_routine() of VCI execution
+ *
+ * - When using cursor, ExecutorRun() may be called multiple times for a query,
+ *   but this is used to limit the setup process to the first time.
+ * - Even if stored procedure calls executor at multiple stages, it does not change
+ *   at stages unrelated to VCI execution.
+ */
+static bool vci_executor_run_routine_once = false;
+
+/* static function decls */
+static bool should_fetch_column_store(Var *var, PlanState *planstate);
+
+static void vci_executor_start_routine(QueryDesc *queryDesc, int eflags);
+static void vci_executor_run_routine(QueryDesc *queryDesc, ScanDirection direction, uint64 count);
+static void vci_executor_end_routine(QueryDesc *queryDesc);
+static void vci_explain_one_query_routine(Query *query, int cursorOptions, IntoClause *into,
+										  ExplainState *es, const char *queryString, ParamListInfo params,
+										  QueryEnvironment *queryEnv);
+
+/* Previously installed hooks, chained to by the routines below */
+static ExecutorStart_hook_type executor_start_prev;
+static ExecutorRun_hook_type executor_run_prev;
+static ExecutorEnd_hook_type executor_end_prev;
+static ExplainOneQuery_hook_type explain_one_query_prev;
+
+static void VciExecInitExprRec(Expr *node, PlanState *parent, ExprState *state, Datum *resv, bool *resnull, vci_initexpr_t inittype);
+static void VciExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, Oid funcid, Oid inputcollid, PlanState *parent, ExprState *state, vci_initexpr_t inittype);
+static void VciExecInitJsonExpr(JsonExpr *jsexpr, PlanState *parent, ExprState *state,
+								Datum *resv, bool *resnull,
+								ExprEvalStep *scratch, vci_initexpr_t inittype);
+static void VciExecInitJsonCoercion(ExprState *state, JsonReturning *returning,
+									ErrorSaveContext *escontext, bool omit_quotes,
+									bool exists_coerce,
+									Datum *resv, bool *resnull);
+
+/**
+ * Registration of VCI's executor routine
+ *
+ * Saves the currently installed ExecutorStart/Run/End and ExplainOneQuery
+ * hooks and installs the VCI routines in their place.
+ */
+void
+vci_setup_executor_hook(void)
+{
+	executor_start_prev = ExecutorStart_hook;
+	ExecutorStart_hook = vci_executor_start_routine;
+
+	executor_run_prev = ExecutorRun_hook;
+	ExecutorRun_hook = vci_executor_run_routine;
+
+	executor_end_prev = ExecutorEnd_hook;
+	ExecutorEnd_hook = vci_executor_end_routine;
+
+	explain_one_query_prev = ExplainOneQuery_hook;
+	ExplainOneQuery_hook = vci_explain_one_query_routine;
+
+	/*
+	 * NOTE(review): unlike the hooks above, any previously installed value of
+	 * these two hooks is overwritten rather than chained - confirm that this
+	 * is intended.
+	 */
+	ExprEvalVar_hook = VciExecEvalScalarVarFromColumnStore;
+	ExprEvalParam_hook = VciExecEvalParamExec;
+}
+
+/**
+ * ExecutorStart hook callback
+ *
+ * Outside parallel workers, and only if no other VCI query is running,
+ * initializes the VCI query context and, when the query turns out to be
+ * processed as a VCI custom plan, records it as the running VCI query.
+ * The previous hook (or standard_ExecutorStart) is always called afterwards.
+ */
+static void
+vci_executor_start_routine(QueryDesc *queryDesc, int eflags)
+{
+	/* Start plan rewrite only if no other Executor is running */
+	if (!IsParallelWorker() && vci_execution_query_desc == NULL)
+	{
+		SubTransactionId mySubid = GetCurrentSubTransactionId();
+
+		vci_initialize_query_context(queryDesc, eflags);
+
+		if (vci_is_processing_custom_plan())
+		{
+			vci_execution_query_desc = queryDesc;
+			vci_execution_subid = mySubid;
+			vci_executor_run_routine_once = false;
+		}
+	}
+
+	if (executor_start_prev)
+		executor_start_prev(queryDesc, eflags);
+	else
+		standard_ExecutorStart(queryDesc, eflags);
+}
+
+/**
+ * ExecutorRun hook callback
+ *
+ * Currently a pure pass-through to the previous hook (or
+ * standard_ExecutorRun); no VCI-specific work is done here.
+ */
+static void
+vci_executor_run_routine(QueryDesc *queryDesc, ScanDirection direction, uint64 count)
+{
+	if (executor_run_prev)
+		executor_run_prev(queryDesc, direction, count);
+	else
+		standard_ExecutorRun(queryDesc, direction, count);
+}
+
+/**
+ * ExecutorEnd hook callback
+ *
+ * Runs the previous hook (or standard_ExecutorEnd) first, then, outside
+ * parallel workers, finalizes the VCI query context if the ending query is
+ * the recorded VCI query.
+ */
+static void
+vci_executor_end_routine(QueryDesc *queryDesc)
+{
+	if (executor_end_prev)
+		executor_end_prev(queryDesc);
+	else
+		standard_ExecutorEnd(queryDesc);
+
+	if (IsParallelWorker())
+		return;
+
+	if (vci_execution_query_desc == queryDesc)
+	{
+		/*
+		 * vci_free_query_context() is not called here: the call was moved
+		 * inside vci_finalize_query_context(), otherwise the SMC created for
+		 * parallelism would not be deleted.
+		 */
+		vci_finalize_query_context();
+
+		vci_execution_query_desc = NULL;
+		vci_execution_subid = InvalidSubTransactionId;
+		vci_executor_run_routine_once = false;
+	}
+}
+
+/**
+ * ExplainOneQuery hook callback
+ *
+ * Pass-through to the previous hook, or to standard_ExplainOneQuery() when
+ * no previous hook is installed.
+ */
+static void
+vci_explain_one_query_routine(Query *query, int cursorOptions, IntoClause *into,
+							  ExplainState *es, const char *queryString, ParamListInfo params,
+							  QueryEnvironment *queryEnv)
+{
+	if (explain_one_query_prev)
+		explain_one_query_prev(query, cursorOptions, into, es, queryString, params, queryEnv);
+	else
+	{
+		/*
+		 * copy from ExplainOneQuery() in src/backend/commands/explain.c
+		 */
+		standard_ExplainOneQuery(query, cursorOptions, into, es,
+								 queryString, params, queryEnv);
+	}
+}
+
+/**
+ * Stop VCI execution at transaction switch time
+ *
+ * On abort, frees the VCI query context if a VCI query was still recorded
+ * and resets the execution bookkeeping.  On (pre-)commit no VCI query may
+ * still be recorded.
+ */
+void
+vci_xact_change_handler(XactEvent event)
+{
+	switch (event)
+	{
+		case XACT_EVENT_ABORT:
+			if (vci_execution_query_desc != NULL)
+			{
+				elog(DEBUG1, "vci:executor caught any exception");
+				vci_free_query_context();
+			}
+			vci_execution_query_desc = NULL;
+			vci_execution_subid = InvalidSubTransactionId;
+			vci_executor_run_routine_once = false;
+			break;
+
+		case XACT_EVENT_PRE_COMMIT:
+		case XACT_EVENT_COMMIT:
+			Assert(vci_execution_query_desc == NULL);
+			break;
+
+		default:
+			/* All remaining events (e.g. XACT_EVENT_PREPARE, XACT_EVENT_PRE_PREPARE): nothing to do */
+			break;
+	}
+}
+
+/**
+ * Event Handler on subxact change.
+ */
+void
+vci_subxact_change_handler(SubXactEvent event, SubTransactionId mySubid)
+{
+	/*
+	 * Only the abort of the subtransaction that started the VCI query is of
+	 * interest; start and (pre-)commit events need no action.
+	 */
+	if (event != SUBXACT_EVENT_ABORT_SUB)
+		return;
+	if (mySubid != vci_execution_subid)
+		return;
+
+	elog(DEBUG1, "vci:executor caught any exception in sub transaction");
+	vci_free_query_context();
+
+	/* Forget the aborted VCI execution */
+	vci_execution_query_desc = NULL;
+	vci_execution_subid = InvalidSubTransactionId;
+	vci_executor_run_routine_once = false;
+}
+
+/**
+ * Tell whether a Var evaluated under the given plan state is to be read from
+ * the column store: true only for a CustomScanState whose flags mark it as a
+ * VCI scan, sort or agg plan.
+ */
+static bool
+should_fetch_column_store(Var *var, PlanState *planstate)
+{
+	uint32		vci_plan_kind;
+
+	Assert(var != NULL);
+	Assert(planstate != NULL);
+	Assert(nodeTag(planstate) != T_Invalid);
+
+	if (!IsA(planstate, CustomScanState))
+		return false;
+
+	vci_plan_kind = ((CustomScanState *) planstate)->flags & VCI_CUSTOMPLAN_MASK;
+
+	return vci_plan_kind == VCI_CUSTOMPLAN_SCAN ||
+		vci_plan_kind == VCI_CUSTOMPLAN_SORT ||
+		vci_plan_kind == VCI_CUSTOMPLAN_AGG;
+}
+
+/* ----------------------------------------------------------------
+ *		ExecEvalOper / ExecEvalFunc support routines
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * VciExecInitExprRec
+ * Append the steps necessary for the evaluation of node to ExprState->steps,
+ * possibly recursing into sub-expressions of node.
+ * + * node - expression to evaluate + * parent - parent executor node (or NULL if a standalone expression) + * state - ExprState to whose ->steps to append the necessary operations + * resv / resnull - where to store the result of the node into + * copied from src/backend/executor/execExpr.c + */ +static void +VciExecInitExprRec(Expr *node, PlanState *parent, ExprState *state, + Datum *resv, bool *resnull, vci_initexpr_t inittype) +{ + ExprEvalStep scratch = {0}; + + /* Guard against stack overflow due to overly complex expressions */ + check_stack_depth(); + + /* Step's output location is always what the caller gave us */ + Assert(resv != NULL && resnull != NULL); + scratch.resvalue = resv; + scratch.resnull = resnull; + + switch (nodeTag(node)) + { + case T_Var: + { + Var *variable = (Var *) node; + + Assert(((Var *) node)->varattno != InvalidAttrNumber); + + if ((inittype == VCI_INIT_EXPR_FETCHING_COLUMN_STORE) && + should_fetch_column_store((Var *) node, parent)) + { + /* + * CustomScanState *cstate; VciScanState *vci_scanstate; + * cstate = (CustomScanState *) parent; + * + * Assert(IsA(cstate, CustomScanState)); vci_scanstate = + * vci_search_scan_state((VciPlanState *) parent); + * scratch.opcode = EEOP_VCI_VAR; scratch.d.vci_scanstate + * = vci_scanstate; + * + * This is to make use of OSS structure ExprEvalStep + * rathen then copying it in VCI again for additional + * information on var and param nodes. 
Searching for + * underlying scan state is postponed to + * vciExecEvalScalarVarFromColumnStore() + */ + scratch.opcode = EEOP_VCI_VAR; + scratch.d.var.vci_parent_planstate = parent; + + } + else if (variable->varattno <= 0) + { + /* system column */ + scratch.d.var.attnum = variable->varattno; + scratch.d.var.vartype = variable->vartype; + scratch.d.var.varreturningtype = variable->varreturningtype; + switch (variable->varno) + { + case INNER_VAR: + scratch.opcode = EEOP_INNER_SYSVAR; + break; + case OUTER_VAR: + scratch.opcode = EEOP_OUTER_SYSVAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + switch (variable->varreturningtype) + { + case VAR_RETURNING_DEFAULT: + scratch.opcode = EEOP_SCAN_SYSVAR; + break; + case VAR_RETURNING_OLD: + scratch.opcode = EEOP_OLD_SYSVAR; + state->flags |= EEO_FLAG_HAS_OLD; + break; + case VAR_RETURNING_NEW: + scratch.opcode = EEOP_NEW_SYSVAR; + state->flags |= EEO_FLAG_HAS_NEW; + break; + } + break; + } + } + else + { + /* regular user column */ + scratch.d.var.attnum = variable->varattno - 1; + scratch.d.var.vartype = variable->vartype; + scratch.d.var.varreturningtype = variable->varreturningtype; + /* select EEOP_*_FIRST opcode to force one-time checks */ + switch (variable->varno) + { + case INNER_VAR: + scratch.opcode = EEOP_INNER_VAR; + break; + case OUTER_VAR: + scratch.opcode = EEOP_OUTER_VAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + switch (variable->varreturningtype) + { + case VAR_RETURNING_DEFAULT: + scratch.opcode = EEOP_SCAN_VAR; + break; + case VAR_RETURNING_OLD: + scratch.opcode = EEOP_OLD_VAR; + state->flags |= EEO_FLAG_HAS_OLD; + break; + case VAR_RETURNING_NEW: + scratch.opcode = EEOP_NEW_VAR; + state->flags |= EEO_FLAG_HAS_NEW; + break; + } + break; + } + } + + ExprEvalPushStep(state, &scratch); + break; + } + case T_Const: + { + Const *con = (Const *) node; + + scratch.opcode = EEOP_CONST; + scratch.d.constval.value = con->constvalue; + 
scratch.d.constval.isnull = con->constisnull; + + ExprEvalPushStep(state, &scratch); + break; + } + case T_Param: + { + Param *param = (Param *) node; + + Assert(param->paramkind == PARAM_EXEC); + scratch.d.param.vci_parent_plan = parent->plan; + scratch.opcode = EEOP_VCI_PARAM_EXEC; + scratch.d.param.paramid = param->paramid; + scratch.d.param.paramtype = param->paramtype; + + ExprEvalPushStep(state, &scratch); + break; + } + case T_CaseTestExpr: + + /* + * Read from location identified by innermost_caseval. Note that + * innermost_caseval could be NULL, if this node isn't actually + * within a CASE structure; some parts of the system abuse + * CaseTestExpr to cause a read of a value externally supplied in + * econtext->caseValue_datum. We'll take care of that by + * generating a specialized operation. + */ + if (state->innermost_caseval == NULL) + scratch.opcode = EEOP_CASE_TESTVAL_EXT; + else + { + scratch.opcode = EEOP_CASE_TESTVAL; + scratch.d.casetest.value = state->innermost_caseval; + scratch.d.casetest.isnull = state->innermost_casenull; + } + ExprEvalPushStep(state, &scratch); + break; + + case T_Aggref: + { + Aggref *aggref = (Aggref *) node; + + scratch.opcode = EEOP_AGGREF; + scratch.d.aggref.aggno = aggref->aggno; + + if (parent && IsA(parent, CustomScanState)) + { + VciAggState *aggstate = (VciAggState *) parent; + + aggstate->aggs = lappend(aggstate->aggs, aggref); + } + else + { + /* planner messed up */ + elog(ERROR, "Aggref found in non-Agg plan node"); + } + + ExprEvalPushStep(state, &scratch); + break; + } + break; + + case T_MergeSupportFunc: + { + /* must be in a MERGE, else something messed up */ + if (!state->parent || + !IsA(state->parent, ModifyTableState) || + ((ModifyTableState *) state->parent)->operation != CMD_MERGE) + elog(ERROR, "MergeSupportFunc found in non-merge plan node"); + scratch.opcode = EEOP_MERGE_SUPPORT_FUNC; + ExprEvalPushStep(state, &scratch); + break; + } + + case T_FuncExpr: + { + FuncExpr *func = (FuncExpr *) node; 
+ + VciExecInitFunc(&scratch, node, + func->args, func->funcid, func->inputcollid, + parent, state, inittype); + ExprEvalPushStep(state, &scratch); + break; + } + break; + case T_OpExpr: + { + OpExpr *op = (OpExpr *) node; + + VciExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + parent, state, inittype); + ExprEvalPushStep(state, &scratch); + break; + } + break; + case T_DistinctExpr: + { + DistinctExpr *op = (DistinctExpr *) node; + + VciExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + parent, state, inittype); + + /* + * Change opcode of call instruction to EEOP_DISTINCT. + * + * XXX: historically we've not called the function usage + * pgstat infrastructure - that seems inconsistent given that + * we do so for normal function *and* operator evaluation. If + * we decided to do that here, we'd probably want separate + * opcodes for FUSAGE or not. + */ + scratch.opcode = EEOP_DISTINCT; + ExprEvalPushStep(state, &scratch); + break; + } + break; + case T_NullIfExpr: + { + NullIfExpr *op = (NullIfExpr *) node; + + VciExecInitFunc(&scratch, node, + op->args, op->opfuncid, op->inputcollid, + parent, state, inittype); + + /* + * If first argument is of varlena type, we'll need to ensure + * that the value passed to the comparison function is a + * read-only pointer. + */ + scratch.d.func.make_ro = + (get_typlen(exprType((Node *) linitial(op->args))) == -1); + + /* + * Change opcode of call instruction to EEOP_NULLIF. + * + * XXX: historically we've not called the function usage + * pgstat infrastructure - that seems inconsistent given that + * we do so for normal function *and* operator evaluation. If + * we decided to do that here, we'd probably want separate + * opcodes for FUSAGE or not. 
+ */ + scratch.opcode = EEOP_NULLIF; + ExprEvalPushStep(state, &scratch); + break; + } + break; + case T_ScalarArrayOpExpr: + { + ScalarArrayOpExpr *opexpr = (ScalarArrayOpExpr *) node; + Expr *scalararg; + Expr *arrayarg; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + AclResult aclresult; + + Assert(list_length(opexpr->args) == 2); + scalararg = (Expr *) linitial(opexpr->args); + arrayarg = (Expr *) lsecond(opexpr->args); + + /* Check permission to call function */ + aclresult = object_aclcheck(ProcedureRelationId, opexpr->opfuncid, + GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(opexpr->opfuncid)); + InvokeFunctionExecuteHook(opexpr->opfuncid); + + if (OidIsValid(opexpr->hashfuncid)) + { + aclresult = object_aclcheck(ProcedureRelationId, opexpr->hashfuncid, + GetUserId(), + ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, + get_func_name(opexpr->hashfuncid)); + InvokeFunctionExecuteHook(opexpr->hashfuncid); + } + + /* Set up the primary fmgr lookup information */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(opexpr->opfuncid, finfo); + fmgr_info_set_expr((Node *) node, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + opexpr->inputcollid, NULL, NULL); + + /* + * If hashfuncid is set, we create a EEOP_HASHED_SCALARARRAYOP + * step instead of a EEOP_SCALARARRAYOP. This provides much + * faster lookup performance than the normal linear search + * when the number of items in the array is anything but very + * small. + */ + if (OidIsValid(opexpr->hashfuncid)) + { + + /* Evaluate scalar directly into left function argument */ + VciExecInitExprRec(scalararg, parent, state, + &fcinfo->args[0].value, &fcinfo->args[0].isnull, inittype); + + /* + * Evaluate array argument into our return value. 
There's + * no danger in that, because the return value is + * guaranteed to be overwritten by + * EEOP_HASHED_SCALARARRAYOP, and will not be passed to + * any other expression. + */ + VciExecInitExprRec(arrayarg, parent, state, resv, resnull, inittype); + + /* And perform the operation */ + scratch.opcode = EEOP_HASHED_SCALARARRAYOP; + scratch.d.hashedscalararrayop.finfo = finfo; + scratch.d.hashedscalararrayop.fcinfo_data = fcinfo; + scratch.d.hashedscalararrayop.saop = opexpr; + + ExprEvalPushStep(state, &scratch); + } + else + { + /* Evaluate scalar directly into left function argument */ + VciExecInitExprRec(scalararg, parent, state, + &fcinfo->args[0].value, &fcinfo->args[0].isnull, inittype); + + /* + * Evaluate array argument into our return value. There's + * no danger in that, because the return value is + * guaranteed to be overwritten by EEOP_SCALARARRAYOP, and + * will not be passed to any other expression. + */ + VciExecInitExprRec(arrayarg, parent, state, resv, resnull, inittype); + + /* And perform the operation */ + scratch.opcode = EEOP_SCALARARRAYOP; + scratch.d.scalararrayop.element_type = InvalidOid; + scratch.d.scalararrayop.useOr = opexpr->useOr; + scratch.d.scalararrayop.finfo = finfo; + scratch.d.scalararrayop.fcinfo_data = fcinfo; + scratch.d.scalararrayop.fn_addr = finfo->fn_addr; + ExprEvalPushStep(state, &scratch); + } + break; + } + break; + case T_BoolExpr: + { + BoolExpr *boolexpr = (BoolExpr *) node; + int nargs = list_length(boolexpr->args); + List *adjust_jumps = NIL; + int off; + ListCell *lc; + + /* allocate scratch memory used by all steps of AND/OR */ + if (boolexpr->boolop != NOT_EXPR) + scratch.d.boolexpr.anynull = (bool *) palloc(sizeof(bool)); + + /* + * For each argument evaluate the argument itself, then + * perform the bool operation's appropriate handling. + * + * We can evaluate each argument into our result area, since + * the short-circuiting logic means we only need to remember + * previous NULL values. 
+ * + * AND/OR is split into separate STEP_FIRST (one) / STEP (zero + * or more) / STEP_LAST (one) steps, as each of those has to + * perform different work. The FIRST/LAST split is valid + * because AND/OR have at least two arguments. + */ + off = 0; + foreach(lc, boolexpr->args) + { + Expr *arg = (Expr *) lfirst(lc); + + /* Evaluate argument into our output variable */ + VciExecInitExprRec(arg, parent, state, resv, resnull, inittype); + + /* Perform the appropriate step type */ + switch (boolexpr->boolop) + { + case AND_EXPR: + Assert(nargs >= 2); + + if (off == 0) + scratch.opcode = EEOP_BOOL_AND_STEP_FIRST; + else if (off + 1 == nargs) + scratch.opcode = EEOP_BOOL_AND_STEP_LAST; + else + scratch.opcode = EEOP_BOOL_AND_STEP; + break; + case OR_EXPR: + Assert(nargs >= 2); + + if (off == 0) + scratch.opcode = EEOP_BOOL_OR_STEP_FIRST; + else if (off + 1 == nargs) + scratch.opcode = EEOP_BOOL_OR_STEP_LAST; + else + scratch.opcode = EEOP_BOOL_OR_STEP; + break; + case NOT_EXPR: + Assert(nargs == 1); + + scratch.opcode = EEOP_BOOL_NOT_STEP; + break; + default: + elog(ERROR, "unrecognized boolop: %d", + (int) boolexpr->boolop); + break; + } + + scratch.d.boolexpr.jumpdone = -1; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + off++; + } + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->d.boolexpr.jumpdone == -1); + as->d.boolexpr.jumpdone = state->steps_len; + } + } + break; + case T_RelabelType: + { + /* relabel doesn't need to do anything at runtime */ + RelabelType *relabel = (RelabelType *) node; + + VciExecInitExprRec(relabel->arg, parent, state, resv, resnull, inittype); + break; + } + break; + case T_CaseExpr: + { + CaseExpr *caseExpr = (CaseExpr *) node; + List *adjust_jumps = NIL; + Datum *caseval = NULL; + bool *casenull = NULL; + ListCell *lc; + + /* + * If there's a test expression, we have to evaluate it and + * save the 
value where the CaseTestExpr placeholders can find + * it. + */ + if (caseExpr->arg != NULL) + { + /* Evaluate testexpr into caseval/casenull workspace */ + caseval = palloc(sizeof(Datum)); + casenull = palloc(sizeof(bool)); + + VciExecInitExprRec(caseExpr->arg, parent, state, + caseval, casenull, inittype); + + /* + * Since value might be read multiple times, force to R/O + * - but only if it could be an expanded datum. + */ + if (get_typlen(exprType((Node *) caseExpr->arg)) == -1) + { + /* change caseval in-place */ + scratch.opcode = EEOP_MAKE_READONLY; + scratch.resvalue = caseval; + scratch.resnull = casenull; + scratch.d.make_readonly.value = caseval; + scratch.d.make_readonly.isnull = casenull; + ExprEvalPushStep(state, &scratch); + /* restore normal settings of scratch fields */ + scratch.resvalue = resv; + scratch.resnull = resnull; + } + } + + /* + * Prepare to evaluate each of the WHEN clauses in turn; as + * soon as one is true we return the value of the + * corresponding THEN clause. If none are true then we return + * the value of the ELSE clause, or NULL if there is none. + */ + foreach(lc, caseExpr->args) + { + CaseWhen *when = (CaseWhen *) lfirst(lc); + Datum *save_innermost_caseval; + bool *save_innermost_casenull; + int whenstep; + + /* + * Make testexpr result available to CaseTestExpr nodes + * within the condition. We must save and restore prior + * setting of innermost_caseval fields, in case this node + * is itself within a larger CASE. + * + * If there's no test expression, we don't actually need + * to save and restore these fields; but it's less code to + * just do so unconditionally. 
+ */ + save_innermost_caseval = state->innermost_caseval; + save_innermost_casenull = state->innermost_casenull; + state->innermost_caseval = caseval; + state->innermost_casenull = casenull; + + /* evaluate condition into CASE's result variables */ + VciExecInitExprRec(when->expr, parent, state, resv, resnull, inittype); + + state->innermost_caseval = save_innermost_caseval; + state->innermost_casenull = save_innermost_casenull; + + /* If WHEN result isn't true, jump to next CASE arm */ + scratch.opcode = EEOP_JUMP_IF_NOT_TRUE; + scratch.d.jump.jumpdone = -1; /* computed later */ + ExprEvalPushStep(state, &scratch); + whenstep = state->steps_len - 1; + + /* + * If WHEN result is true, evaluate THEN result, storing + * it into the CASE's result variables. + */ + VciExecInitExprRec(when->result, parent, state, resv, resnull, inittype); + + /* Emit JUMP step to jump to end of CASE's code */ + scratch.opcode = EEOP_JUMP; + scratch.d.jump.jumpdone = -1; /* computed later */ + ExprEvalPushStep(state, &scratch); + + /* + * Don't know address for that jump yet, compute once the + * whole CASE expression is built. + */ + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + + /* + * But we can set WHEN test's jump target now, to make it + * jump to the next WHEN subexpression or the ELSE. 
+ */ + state->steps[whenstep].d.jump.jumpdone = state->steps_len; + } + + /* transformCaseExpr always adds a default */ + Assert(caseExpr->defresult); + + /* evaluate ELSE expr into CASE's result variables */ + VciExecInitExprRec(caseExpr->defresult, parent, state, + resv, resnull, inittype); + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_JUMP); + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + } + break; + case T_CoalesceExpr: + { + CoalesceExpr *coalesce = (CoalesceExpr *) node; + List *adjust_jumps = NIL; + ListCell *lc; + + /* We assume there's at least one arg */ + Assert(coalesce->args != NIL); + + /* + * Prepare evaluation of all coalesced arguments, after each + * one push a step that short-circuits if not null. + */ + foreach(lc, coalesce->args) + { + Expr *e = (Expr *) lfirst(lc); + + /* evaluate argument, directly into result datum */ + VciExecInitExprRec(e, parent, state, resv, resnull, inittype); + + /* if it's not null, skip to end of COALESCE expr */ + scratch.opcode = EEOP_JUMP_IF_NOT_NULL; + scratch.d.jump.jumpdone = -1; /* adjust later */ + ExprEvalPushStep(state, &scratch); + + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* + * No need to add a constant NULL return - we only can get to + * the end of the expression if a NULL already is being + * returned. 
+ */ + + /* adjust jump targets */ + foreach(lc, adjust_jumps) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + Assert(as->opcode == EEOP_JUMP_IF_NOT_NULL); + Assert(as->d.jump.jumpdone == -1); + as->d.jump.jumpdone = state->steps_len; + } + } + break; + case T_MinMaxExpr: + { + MinMaxExpr *minmaxexpr = (MinMaxExpr *) node; + int nelems = list_length(minmaxexpr->args); + TypeCacheEntry *typentry; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + ListCell *lc; + int off; + + /* Look up the btree comparison function for the datatype */ + typentry = lookup_type_cache(minmaxexpr->minmaxtype, + TYPECACHE_CMP_PROC); + if (!OidIsValid(typentry->cmp_proc)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(minmaxexpr->minmaxtype)))); + + /* + * If we enforced permissions checks on index support + * functions, we'd need to make a check here. But the index + * support machinery doesn't do that, and thus neither does + * this code. 
+ */ + + /* Perform function lookup */ + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(2)); + fmgr_info(typentry->cmp_proc, finfo); + fmgr_info_set_expr((Node *) node, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 2, + minmaxexpr->inputcollid, NULL, NULL); + + scratch.opcode = EEOP_MINMAX; + /* allocate space to store arguments */ + scratch.d.minmax.values = + (Datum *) palloc(sizeof(Datum) * nelems); + scratch.d.minmax.nulls = + (bool *) palloc(sizeof(bool) * nelems); + scratch.d.minmax.nelems = nelems; + + scratch.d.minmax.op = minmaxexpr->op; + scratch.d.minmax.finfo = finfo; + scratch.d.minmax.fcinfo_data = fcinfo; + + /* evaluate expressions into minmax->values/nulls */ + off = 0; + foreach(lc, minmaxexpr->args) + { + Expr *e = (Expr *) lfirst(lc); + + VciExecInitExprRec(e, parent, state, + &scratch.d.minmax.values[off], + &scratch.d.minmax.nulls[off], inittype); + off++; + } + + /* and push the final comparison */ + ExprEvalPushStep(state, &scratch); + break; + } + + case T_SQLValueFunction: + { + SQLValueFunction *svf = (SQLValueFunction *) node; + + scratch.opcode = EEOP_SQLVALUEFUNCTION; + scratch.d.sqlvaluefunction.svf = svf; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_JsonValueExpr: + { + JsonValueExpr *jve = (JsonValueExpr *) node; + + Assert(jve->raw_expr != NULL); + VciExecInitExprRec(jve->raw_expr, parent, state, resv, resnull, inittype); + Assert(jve->formatted_expr != NULL); + VciExecInitExprRec(jve->formatted_expr, parent, state, resv, resnull, inittype); + + break; + } + + case T_JsonConstructorExpr: + { + JsonConstructorExpr *ctor = (JsonConstructorExpr *) node; + List *args = ctor->args; + ListCell *lc; + int nargs = list_length(args); + int argno = 0; + + if (ctor->func) + { + VciExecInitExprRec(ctor->func, parent, state, resv, resnull, inittype); + } + else if ((ctor->type == JSCTOR_JSON_PARSE && !ctor->unique) || + ctor->type == JSCTOR_JSON_SERIALIZE) + { + /* Use the value of the first 
argument as result */ + VciExecInitExprRec(linitial(args), parent, state, resv, resnull, inittype); + } + else + { + JsonConstructorExprState *jcstate; + + jcstate = palloc0(sizeof(JsonConstructorExprState)); + + scratch.opcode = EEOP_JSON_CONSTRUCTOR; + scratch.d.json_constructor.jcstate = jcstate; + + jcstate->constructor = ctor; + jcstate->arg_values = (Datum *) palloc(sizeof(Datum) * nargs); + jcstate->arg_nulls = (bool *) palloc(sizeof(bool) * nargs); + jcstate->arg_types = (Oid *) palloc(sizeof(Oid) * nargs); + jcstate->nargs = nargs; + + foreach(lc, args) + { + Expr *arg = (Expr *) lfirst(lc); + + jcstate->arg_types[argno] = exprType((Node *) arg); + + if (IsA(arg, Const)) + { + /* Don't evaluate const arguments every round */ + Const *con = (Const *) arg; + + jcstate->arg_values[argno] = con->constvalue; + jcstate->arg_nulls[argno] = con->constisnull; + } + else + { + VciExecInitExprRec(arg, parent, state, &jcstate->arg_values[argno], &jcstate->arg_nulls[argno], inittype); + } + argno++; + } + + /* prepare type cache for datum_to_json[b]() */ + if (ctor->type == JSCTOR_JSON_SCALAR) + { + bool is_jsonb = + ctor->returning->format->format_type == JS_FORMAT_JSONB; + + jcstate->arg_type_cache = + palloc(sizeof(*jcstate->arg_type_cache) * nargs); + + for (int i = 0; i < nargs; i++) + { + JsonTypeCategory category; + Oid outfuncid; + Oid typid = jcstate->arg_types[i]; + + json_categorize_type(typid, is_jsonb, + &category, &outfuncid); + + jcstate->arg_type_cache[i].outfuncid = outfuncid; + jcstate->arg_type_cache[i].category = (int) category; + } + } + + ExprEvalPushStep(state, &scratch); + } + + if (ctor->coercion) + { + Datum *innermost_caseval = state->innermost_caseval; + bool *innermost_isnull = state->innermost_casenull; + + state->innermost_caseval = resv; + state->innermost_casenull = resnull; + + VciExecInitExprRec(ctor->coercion, parent, state, resv, resnull, inittype); + + state->innermost_caseval = innermost_caseval; + state->innermost_casenull = 
innermost_isnull; + } + } + break; + + case T_JsonIsPredicate: + { + JsonIsPredicate *pred = (JsonIsPredicate *) node; + + VciExecInitExprRec((Expr *) pred->expr, parent, state, resv, resnull, inittype); + + scratch.opcode = EEOP_IS_JSON; + scratch.d.is_json.pred = pred; + + ExprEvalPushStep(state, &scratch); + break; + } + + case T_JsonExpr: + { + JsonExpr *jsexpr = castNode(JsonExpr, node); + + /* + * No need to initialize a full JsonExprState For + * JSON_TABLE(), because the upstream caller tfuncFetchRows() + * is only interested in the value of formatted_expr. + */ + if (jsexpr->op == JSON_TABLE_OP) + VciExecInitExprRec((Expr *) jsexpr->formatted_expr, parent, state, + resv, resnull, inittype); + else + VciExecInitJsonExpr(jsexpr, parent, state, resv, resnull, &scratch, inittype); + break; + } + + case T_NullTest: + { + NullTest *ntest = (NullTest *) node; + + if (ntest->nulltesttype == IS_NULL) + { + if (ntest->argisrow) + scratch.opcode = EEOP_NULLTEST_ROWISNULL; + else + scratch.opcode = EEOP_NULLTEST_ISNULL; + } + else if (ntest->nulltesttype == IS_NOT_NULL) + { + if (ntest->argisrow) + scratch.opcode = EEOP_NULLTEST_ROWISNOTNULL; + else + scratch.opcode = EEOP_NULLTEST_ISNOTNULL; + } + else + { + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + } + /* initialize cache in case it's a row test */ + scratch.d.nulltest_row.rowcache.cacheptr = NULL; + + /* first evaluate argument into result variable */ + VciExecInitExprRec(ntest->arg, parent, state, + resv, resnull, inittype); + + /* then push the test of that argument */ + ExprEvalPushStep(state, &scratch); + break; + } + break; + case T_BooleanTest: + { + BooleanTest *btest = (BooleanTest *) node; + + /* + * Evaluate argument, directly into result datum. That's ok, + * because resv/resnull is definitely not used anywhere else, + * and will get overwritten by the below EEOP_BOOLTEST_IS_* + * step. 
+ */ + VciExecInitExprRec(btest->arg, parent, state, resv, resnull, inittype); + + switch (btest->booltesttype) + { + case IS_TRUE: + scratch.opcode = EEOP_BOOLTEST_IS_TRUE; + break; + case IS_NOT_TRUE: + scratch.opcode = EEOP_BOOLTEST_IS_NOT_TRUE; + break; + case IS_FALSE: + scratch.opcode = EEOP_BOOLTEST_IS_FALSE; + break; + case IS_NOT_FALSE: + scratch.opcode = EEOP_BOOLTEST_IS_NOT_FALSE; + break; + case IS_UNKNOWN: + /* Same as scalar IS NULL test */ + scratch.opcode = EEOP_NULLTEST_ISNULL; + break; + case IS_NOT_UNKNOWN: + /* Same as scalar IS NOT NULL test */ + scratch.opcode = EEOP_NULLTEST_ISNOTNULL; + break; + default: + elog(ERROR, "unrecognized booltesttype: %d", + (int) btest->booltesttype); + } + + ExprEvalPushStep(state, &scratch); + break; + } + break; + case T_CoerceViaIO: + { + CoerceViaIO *iocoerce = (CoerceViaIO *) node; + Oid iofunc; + bool typisvarlena; + Oid typioparam; + FunctionCallInfo fcinfo_in; + + /* evaluate argument into step's result area */ + VciExecInitExprRec(iocoerce->arg, parent, state, resv, resnull, inittype); + + /* + * Prepare both output and input function calls, to be + * evaluated inside a single evaluation step for speed - this + * can be a very common operation. + * + * We don't check permissions here as a type's input/output + * function are assumed to be executable by everyone. 
+ */ + if (state->escontext == NULL) + scratch.opcode = EEOP_IOCOERCE; + else + scratch.opcode = EEOP_IOCOERCE_SAFE; + + /* lookup the source type's output function */ + scratch.d.iocoerce.finfo_out = palloc0(sizeof(FmgrInfo)); + scratch.d.iocoerce.fcinfo_data_out = palloc0(SizeForFunctionCallInfo(1)); + + getTypeOutputInfo(exprType((Node *) iocoerce->arg), + &iofunc, &typisvarlena); + fmgr_info(iofunc, scratch.d.iocoerce.finfo_out); + fmgr_info_set_expr((Node *) node, scratch.d.iocoerce.finfo_out); + InitFunctionCallInfoData(*scratch.d.iocoerce.fcinfo_data_out, + scratch.d.iocoerce.finfo_out, + 1, InvalidOid, NULL, NULL); + + /* lookup the result type's input function */ + scratch.d.iocoerce.finfo_in = palloc0(sizeof(FmgrInfo)); + scratch.d.iocoerce.fcinfo_data_in = palloc0(SizeForFunctionCallInfo(3)); + + getTypeInputInfo(iocoerce->resulttype, + &iofunc, &typioparam); + fmgr_info(iofunc, scratch.d.iocoerce.finfo_in); + fmgr_info_set_expr((Node *) node, scratch.d.iocoerce.finfo_in); + InitFunctionCallInfoData(*scratch.d.iocoerce.fcinfo_data_in, + scratch.d.iocoerce.finfo_in, + 3, InvalidOid, NULL, NULL); + + /* + * We can preload the second and third arguments for the input + * function, since they're constants. 
+ */ + fcinfo_in = scratch.d.iocoerce.fcinfo_data_in; + fcinfo_in->args[1].value = ObjectIdGetDatum(typioparam); + fcinfo_in->args[1].isnull = false; + fcinfo_in->args[2].value = Int32GetDatum(-1); + fcinfo_in->args[2].isnull = false; + + fcinfo_in->context = (Node *) state->escontext; + + ExprEvalPushStep(state, &scratch); + break; + } + break; + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unrecognized node type: %s(%d)", + VciGetNodeName(nodeTag(node)), (int) nodeTag(node)); + break; + /* LCOV_EXCL_STOP */ + } + +} + +/* + * VciExecInitQual: prepare a qual for execution by ExecQual + * + * Prepares for the evaluation of a conjunctive boolean expression (qual list + * with implicit AND semantics) that returns true if none of the + * subexpressions are false. + * + * We must return true if the list is empty. Since that's a very common case, + * we optimize it a bit further by translating to a NULL ExprState pointer + * rather than setting up an ExprState that computes constant TRUE. (Some + * especially hot-spot callers of ExecQual detect this and avoid calling + * ExecQual at all.) + * + * If any of the subexpressions yield NULL, then the result of the conjunction + * is false. This makes ExecQual primarily useful for evaluating WHERE + * clauses, since SQL specifies that tuples with null WHERE results do not + * get selected. 
+ * copied from src/backend/executor/execExpr.c + */ +ExprState * +VciExecInitQual(List *qual, PlanState *parent, vci_initexpr_t inittype) +{ + ExprState *state; + ExprEvalStep scratch; + List *adjust_jumps = NIL; + + /* short-circuit (here and in ExecQual) for empty restriction list */ + if (qual == NIL) + return NULL; + + Assert(IsA(qual, List)); + + state = makeNode(ExprState); + state->expr = (Expr *) qual; + state->parent = parent; + state->ext_params = NULL; + + /* mark expression as to be used with ExecQual() */ + state->flags = EEO_FLAG_IS_QUAL; + + /* Insert setup steps as needed */ + ExecCreateExprSetupSteps(state, (Node *) qual); + + /* + * ExecQual() needs to return false for an expression returning NULL. That + * allows us to short-circuit the evaluation the first time a NULL is + * encountered. As qual evaluation is a hot-path this warrants using a + * special opcode for qual evaluation that's simpler than BOOL_AND (which + * has more complex NULL handling). + */ + scratch.opcode = EEOP_QUAL; + + /* + * We can use ExprState's resvalue/resnull as target for each qual expr. + */ + scratch.resvalue = &state->resvalue; + scratch.resnull = &state->resnull; + + foreach_ptr(Expr, node, qual) + { + + /* first evaluate expression */ + VciExecInitExprRec(node, parent, state, &state->resvalue, &state->resnull, inittype); + + /* then emit EEOP_QUAL to detect if it's false (or null) */ + scratch.d.qualexpr.jumpdone = -1; + ExprEvalPushStep(state, &scratch); + adjust_jumps = lappend_int(adjust_jumps, + state->steps_len - 1); + } + + /* adjust jump targets */ + foreach_int(jump, adjust_jumps) + { + ExprEvalStep *as = &state->steps[jump]; + + Assert(as->opcode == EEOP_QUAL); + Assert(as->d.qualexpr.jumpdone == -1); + as->d.qualexpr.jumpdone = state->steps_len; + } + + /* + * At the end, we don't need to do anything more. The last qual expr must + * have yielded TRUE, and since its result is stored in the desired output + * location, we're done. 
+ */ + scratch.opcode = EEOP_DONE_RETURN; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return state; +} + +/* + * Perform setup necessary for the evaluation of a function-like expression, + * appending argument evaluation steps to the steps list in *state, and + * setting up *scratch so it is ready to be pushed. + * + * scratch is not pushed here, so that callers may override the opcode, + * which is useful for function-like cases like DISTINCT. + */ +static void +VciExecInitFunc(ExprEvalStep *scratch, Expr *node, List *args, Oid funcid, + Oid inputcollid, PlanState *parent, ExprState *state, vci_initexpr_t inittype) +{ + int nargs = list_length(args); + AclResult aclresult; + FmgrInfo *flinfo; + FunctionCallInfo fcinfo; + int argno; + ListCell *lc; + + /* Check permission to call function */ + aclresult = object_aclcheck(ProcedureRelationId, funcid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(funcid)); + InvokeFunctionExecuteHook(funcid); + + /* + * Safety check on nargs. Under normal circumstances this should never + * fail, as parser should check sooner. But possibly it might fail if + * server has been compiled with FUNC_MAX_ARGS smaller than some functions + * declared in pg_proc? 
+ */ + if (nargs > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg_plural("cannot pass more than %d argument to a function", + "cannot pass more than %d arguments to a function", + FUNC_MAX_ARGS, + FUNC_MAX_ARGS))); + + /* Allocate function lookup data and parameter workspace for this call */ + scratch->d.func.finfo = palloc0(sizeof(FmgrInfo)); + scratch->d.func.fcinfo_data = palloc0(SizeForFunctionCallInfo(nargs)); + flinfo = scratch->d.func.finfo; + fcinfo = scratch->d.func.fcinfo_data; + + /* Set up the primary fmgr lookup information */ + fmgr_info(funcid, flinfo); + fmgr_info_set_expr((Node *) node, flinfo); + + /* Initialize function call parameter structure too */ + InitFunctionCallInfoData(*fcinfo, flinfo, + nargs, inputcollid, NULL, NULL); + + /* Keep extra copies of this info to save an indirection at runtime */ + scratch->d.func.fn_addr = flinfo->fn_addr; + scratch->d.func.nargs = nargs; + + /* We only support non-set functions here */ + if (flinfo->fn_retset) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("set-valued function called in context that cannot accept a set"), + parent ? executor_errposition(parent->state, + exprLocation((Node *) node)) : 0)); + + /* Build code to evaluate arguments directly into the fcinfo struct */ + argno = 0; + foreach(lc, args) + { + Expr *arg = (Expr *) lfirst(lc); + + if (IsA(arg, Const)) + { + /* + * Don't evaluate const arguments every round; especially + * interesting for constants in comparisons. 
+ */ + Const *con = (Const *) arg; + + fcinfo->args[argno].value = con->constvalue; + fcinfo->args[argno].isnull = con->constisnull; + } + else + { + VciExecInitExprRec(arg, parent, state, + &fcinfo->args[argno].value, &fcinfo->args[argno].isnull, inittype); + } + argno++; + } + + /* Insert appropriate opcode depending on strictness and stats level */ + if (pgstat_track_functions <= flinfo->fn_stats) + { + if (flinfo->fn_strict && nargs > 0) + { + /* Choose nargs optimized implementation if available. */ + if (nargs == 1) + scratch->opcode = EEOP_FUNCEXPR_STRICT_1; + else if (nargs == 2) + scratch->opcode = EEOP_FUNCEXPR_STRICT_2; + else + scratch->opcode = EEOP_FUNCEXPR_STRICT; + } + else + scratch->opcode = EEOP_FUNCEXPR; + } + else + { + if (flinfo->fn_strict && nargs > 0) + scratch->opcode = EEOP_FUNCEXPR_STRICT_FUSAGE; + else + scratch->opcode = EEOP_FUNCEXPR_FUSAGE; + } +} + +/* ---------------------------------------------------------------- + * ExecQual / ExecTargetList / ExecProject + * ---------------------------------------------------------------- + */ + +/** + * ExecProject + * + * projects a tuple based on projection info and stores + * it in the previously specified tuple table slot. + * + * Note: the result is always a virtual tuple; therefore it + * may reference the contents of the exprContext's scan tuples + * and/or temporary results constructed in the exprContext. + * If the caller wishes the result to be valid longer than that + * data will be valid, he must call ExecMaterializeSlot on the + * result slot. + * + * copied from src/include/executor/executor.h + */ +TupleTableSlot * +VciExecProject(VciProjectionInfo *projInfo) +{ + ExprContext *econtext = projInfo->pi_exprContext; + ExprState *state = &projInfo->pi_state; + TupleTableSlot *slot = state->resultslot; + bool isnull; + + /* + * Clear any former contents of the result slot. This makes it safe for + * us to use the slot's Datum/isnull arrays as workspace. 
+ */ + ExecClearTuple(slot); + + /* Run the expression, discarding scalar result from the last column. */ + (void) ExecEvalExprSwitchContext(state, econtext, &isnull); + + /* + * Successfully formed a result row. Mark the result slot as containing a + * valid virtual tuple (inlined version of ExecStoreVirtualTuple()). + */ + slot->tts_flags &= ~TTS_FLAG_EMPTY; + slot->tts_nvalid = slot->tts_tupleDescriptor->natts; + + return slot; +} + +/** + * Generate projection based on target list + * + * @param[in] targetlist Target list + * @param[in] econtext Execution context + * @param[in] slot + * @param[in] inputDesc + * @return VciProjectionInfo type projection + */ +VciProjectionInfo * +VciExecBuildProjectionInfo(List *targetList, + ExprContext *econtext, + TupleTableSlot *slot, + PlanState *parent, + TupleDesc inputDesc) +{ + VciProjectionInfo *projInfo; + ExprState *state; + ExprEvalStep scratch; + ListCell *lc; + int len = ExecTargetListLength(targetList); + int numSimpleVars; + bool directMap; + int exprlist_len; + int tle_id; + int *workspace; + int *varNumbers; + int *varOutputCols; + + projInfo = palloc0(sizeof(VciProjectionInfo)); + projInfo->pi_slotMap = (VciProjectionInfoSlot *) palloc0(len * sizeof(VciProjectionInfoSlot)); + projInfo->pi_tle_array = (TargetEntry **) palloc0(len * sizeof(TargetEntry *)); + + /* since these are all int arrays, we need do just one palloc */ + workspace = (int *) palloc(len * 2 * sizeof(int)); + projInfo->pi_varNumbers = varNumbers = workspace; + projInfo->pi_varOutputCols = varOutputCols = workspace + len; + + projInfo->pi_exprContext = econtext; + /* We embed ExprState into ProjectionInfo instead of doing extra palloc */ + projInfo->pi_state.type = T_ExprState; + state = &projInfo->pi_state; + state->expr = (Expr *) targetList; + state->resultslot = slot; + + numSimpleVars = 0; + tle_id = 0; + exprlist_len = 0; + directMap = true; + + /* Insert setup steps as needed */ + ExecCreateExprSetupSteps(state, (Node *) targetList); + 
+ /* Now compile each tlist column */ + foreach(lc, targetList) + { + TargetEntry *tle = lfirst_node(TargetEntry, lc); + Var *variable = NULL; + AttrNumber attnum = 0; + bool isSafeVar = false; + + /* + * If tlist expression is a safe non-system Var, use the fast-path + * ASSIGN_*_VAR opcodes. "Safe" means that we don't need to apply + * CheckVarSlotCompatibility() during plan startup. If a source slot + * was provided, we make the equivalent tests here; if a slot was not + * provided, we assume that no check is needed because we're dealing + * with a non-relation-scan-level expression. + */ + if (tle->expr != NULL && + IsA(tle->expr, Var) && + ((Var *) tle->expr)->varattno > 0) + { + /* Non-system Var, but how safe is it? */ + variable = (Var *) tle->expr; + attnum = variable->varattno; + + if (inputDesc == NULL) + isSafeVar = true; /* can't check, just assume OK */ + else if (attnum <= inputDesc->natts) + { + Form_pg_attribute attr = TupleDescAttr(inputDesc, attnum - 1); + + /* + * If user attribute is dropped or has a type mismatch, don't + * use ASSIGN_*_VAR. Instead let the normal expression + * machinery handle it (which'll possibly error out). + */ + if (!attr->attisdropped && variable->vartype == attr->atttypid) + { + isSafeVar = true; + } + } + } + + if (isSafeVar) + { + varNumbers[numSimpleVars] = attnum; + varOutputCols[numSimpleVars] = tle->resno; + + if (tle->resno != numSimpleVars + 1) + directMap = false; + + /* Fast-path: just generate an EEOP_ASSIGN_*_VAR step */ + switch (variable->varno) + { + case INNER_VAR: + /* get the tuple from the inner node */ + scratch.opcode = EEOP_ASSIGN_INNER_VAR; + break; + + case OUTER_VAR: + /* get the tuple from the outer node */ + scratch.opcode = EEOP_ASSIGN_OUTER_VAR; + break; + + /* INDEX_VAR is handled by default case */ + + default: + + /* + * Get the tuple from the relation being scanned, or the + * old/new tuple slot, if old/new values were requested. 
+ */ + switch (variable->varreturningtype) + { + case VAR_RETURNING_DEFAULT: + scratch.opcode = EEOP_ASSIGN_SCAN_VAR; + break; + case VAR_RETURNING_OLD: + scratch.opcode = EEOP_ASSIGN_OLD_VAR; + state->flags |= EEO_FLAG_HAS_OLD; + break; + case VAR_RETURNING_NEW: + scratch.opcode = EEOP_ASSIGN_NEW_VAR; + state->flags |= EEO_FLAG_HAS_NEW; + break; + } + break; + } + + scratch.d.assign_var.attnum = attnum - 1; + scratch.d.assign_var.resultnum = tle->resno - 1; + ExprEvalPushStep(state, &scratch); + + projInfo->pi_slotMap[tle_id].is_simple_var = true; + projInfo->pi_slotMap[tle_id].data.simple_var.relid = variable->varno; + projInfo->pi_slotMap[tle_id].data.simple_var.attno = variable->varattno; + + numSimpleVars++; + } + else + { + /* + * Otherwise, compile the column expression normally. + * + * We can't tell the expression to evaluate directly into the + * result slot, as the result slot (and the exprstate for that + * matter) can change between executions. We instead evaluate + * into the ExprState's resvalue/resnull and then move. + */ + VciExecInitExprRec(tle->expr, parent, state, + &state->resvalue, &state->resnull, VCI_INIT_EXPR_NORMAL); + + /* + * Column might be referenced multiple times in upper nodes, so + * force value to R/O - but only if it could be an expanded datum. 
+ */ + if (get_typlen(exprType((Node *) tle->expr)) == -1) + scratch.opcode = EEOP_ASSIGN_TMP_MAKE_RO; + else + scratch.opcode = EEOP_ASSIGN_TMP; + scratch.d.assign_tmp.resultnum = tle->resno - 1; + ExprEvalPushStep(state, &scratch); + + /* Not a simple variable, add it to generic targetlist */ + projInfo->pi_tle_array[exprlist_len] = tle; + + projInfo->pi_slotMap[tle_id].is_simple_var = false; + projInfo->pi_slotMap[tle_id].data.expr.expr_id = exprlist_len; + + exprlist_len++; + } + + tle_id++; + } + + projInfo->pi_tle_array_len = exprlist_len; + + projInfo->pi_numSimpleVars = numSimpleVars; + projInfo->pi_directMap = directMap; + + if (projInfo->pi_tle_array == 0) + projInfo->pi_tle_array = NULL; + + scratch.opcode = EEOP_DONE_RETURN; + ExprEvalPushStep(state, &scratch); + + ExecReadyExpr(state); + + return projInfo; +} + +/* + * Push steps to evaluate a JsonExpr and its various subsidiary expressions. + */ +static void +VciExecInitJsonExpr(JsonExpr *jsexpr, PlanState *parent, ExprState *state, + Datum *resv, bool *resnull, + ExprEvalStep *scratch, vci_initexpr_t inittype) +{ + JsonExprState *jsestate = palloc0(sizeof(JsonExprState)); + ListCell *argexprlc; + ListCell *argnamelc; + List *jumps_return_null = NIL; + List *jumps_to_end = NIL; + ListCell *lc; + ErrorSaveContext *escontext; + bool returning_domain = + get_typtype(jsexpr->returning->typid) == TYPTYPE_DOMAIN; + + Assert(jsexpr->on_error != NULL); + + jsestate->jsexpr = jsexpr; + + /* + * Evaluate formatted_expr storing the result into + * jsestate->formatted_expr. 
+ */ + VciExecInitExprRec((Expr *) jsexpr->formatted_expr, parent, state, + &jsestate->formatted_expr.value, + &jsestate->formatted_expr.isnull, inittype); + + /* JUMP to return NULL if formatted_expr evaluates to NULL */ + jumps_return_null = lappend_int(jumps_return_null, state->steps_len); + scratch->opcode = EEOP_JUMP_IF_NULL; + scratch->resnull = &jsestate->formatted_expr.isnull; + scratch->d.jump.jumpdone = -1; /* set below */ + ExprEvalPushStep(state, scratch); + + /* + * Evaluate pathspec expression storing the result into + * jsestate->pathspec. + */ + VciExecInitExprRec((Expr *) jsexpr->path_spec, parent, state, + &jsestate->pathspec.value, + &jsestate->pathspec.isnull, inittype); + + /* JUMP to return NULL if path_spec evaluates to NULL */ + jumps_return_null = lappend_int(jumps_return_null, state->steps_len); + scratch->opcode = EEOP_JUMP_IF_NULL; + scratch->resnull = &jsestate->pathspec.isnull; + scratch->d.jump.jumpdone = -1; /* set below */ + ExprEvalPushStep(state, scratch); + + /* Steps to compute PASSING args. */ + jsestate->args = NIL; + forboth(argexprlc, jsexpr->passing_values, + argnamelc, jsexpr->passing_names) + { + Expr *argexpr = (Expr *) lfirst(argexprlc); + String *argname = lfirst_node(String, argnamelc); + JsonPathVariable *var = palloc(sizeof(*var)); + + var->name = argname->sval; + var->typid = exprType((Node *) argexpr); + var->typmod = exprTypmod((Node *) argexpr); + + VciExecInitExprRec((Expr *) argexpr, parent, state, &var->value, &var->isnull, inittype); + + jsestate->args = lappend(jsestate->args, var); + } + + /* Step for jsonpath evaluation; see ExecEvalJsonExprPath(). */ + scratch->opcode = EEOP_JSONEXPR_PATH; + scratch->resvalue = resv; + scratch->resnull = resnull; + scratch->d.jsonexpr.jsestate = jsestate; + ExprEvalPushStep(state, scratch); + + /* + * Step to return NULL after jumping to skip the EEOP_JSONEXPR_PATH step + * when either formatted_expr or pathspec is NULL. 
Adjust jump target + * addresses of JUMPs that we added above. + */ + foreach(lc, jumps_return_null) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + as->d.jump.jumpdone = state->steps_len; + } + scratch->opcode = EEOP_CONST; + scratch->resvalue = resv; + scratch->resnull = resnull; + scratch->d.constval.value = (Datum) 0; + scratch->d.constval.isnull = true; + ExprEvalPushStep(state, scratch); + + escontext = jsexpr->on_error->btype != JSON_BEHAVIOR_ERROR ? + &jsestate->escontext : NULL; + + /* + * To handle coercion errors softly, use the following ErrorSaveContext to + * pass to VciExecInitExprRec() when initializing the coercion expressions + * and in the EEOP_JSONEXPR_COERCION step. + */ + jsestate->escontext.type = T_ErrorSaveContext; + + /* + * Steps to coerce the result value computed by EEOP_JSONEXPR_PATH or the + * NULL returned on NULL input as described above. + */ + jsestate->jump_eval_coercion = -1; + if (jsexpr->use_json_coercion) + { + + jsestate->jump_eval_coercion = state->steps_len; + + VciExecInitJsonCoercion(state, jsexpr->returning, escontext, + jsexpr->omit_quotes, + jsexpr->op == JSON_EXISTS_OP, + resv, resnull); + } + else if (jsexpr->use_io_coercion) + { + /* + * Here we only need to initialize the FunctionCallInfo for the target + * type's input function, which is called by ExecEvalJsonExprPath() + * itself, so no additional step is necessary. + */ + Oid typinput; + Oid typioparam; + FmgrInfo *finfo; + FunctionCallInfo fcinfo; + + getTypeInputInfo(jsexpr->returning->typid, &typinput, &typioparam); + finfo = palloc0(sizeof(FmgrInfo)); + fcinfo = palloc0(SizeForFunctionCallInfo(3)); + fmgr_info(typinput, finfo); + fmgr_info_set_expr((Node *) jsexpr->returning, finfo); + InitFunctionCallInfoData(*fcinfo, finfo, 3, InvalidOid, NULL, NULL); + + /* + * We can preload the second and third arguments for the input + * function, since they're constants. 
+ */ + fcinfo->args[1].value = ObjectIdGetDatum(typioparam); + fcinfo->args[1].isnull = false; + fcinfo->args[2].value = Int32GetDatum(jsexpr->returning->typmod); + fcinfo->args[2].isnull = false; + fcinfo->context = (Node *) escontext; + + jsestate->input_fcinfo = fcinfo; + } + + /* + * Add a special step, if needed, to check if the coercion evaluation ran + * into an error but was not thrown because the ON ERROR behavior is not + * ERROR. It will set jsestate->error if an error did occur. + */ + if (jsestate->jump_eval_coercion >= 0 && escontext != NULL) + { + scratch->opcode = EEOP_JSONEXPR_COERCION_FINISH; + scratch->d.jsonexpr.jsestate = jsestate; + ExprEvalPushStep(state, scratch); + } + + jsestate->jump_empty = jsestate->jump_error = -1; + + /* + * Step to check jsestate->error and return the ON ERROR expression if + * there is one. This handles both the errors that occur during jsonpath + * evaluation in EEOP_JSONEXPR_PATH and subsequent coercion evaluation. + * + * Speed up common cases by avoiding extra steps for a NULL-valued ON + * ERROR expression unless RETURNING a domain type, where constraints must + * be checked. ExecEvalJsonExprPath() already returns NULL on error, + * making additional steps unnecessary in typical scenarios. Note that the + * default ON ERROR behavior for JSON_VALUE() and JSON_QUERY() is to + * return NULL. + */ + if (jsexpr->on_error->btype != JSON_BEHAVIOR_ERROR && + (!(IsA(jsexpr->on_error->expr, Const) && + ((Const *) jsexpr->on_error->expr)->constisnull) || + returning_domain)) + { + ErrorSaveContext *saved_escontext; + + jsestate->jump_error = state->steps_len; + + /* JUMP to end if false, that is, skip the ON ERROR expression. 
*/ + jumps_to_end = lappend_int(jumps_to_end, state->steps_len); + scratch->opcode = EEOP_JUMP_IF_NOT_TRUE; + scratch->resvalue = &jsestate->error.value; + scratch->resnull = &jsestate->error.isnull; + scratch->d.jump.jumpdone = -1; /* set below */ + ExprEvalPushStep(state, scratch); + + /* + * Steps to evaluate the ON ERROR expression; handle errors softly to + * rethrow them in COERCION_FINISH step that will be added later. + */ + saved_escontext = state->escontext; + state->escontext = escontext; + VciExecInitExprRec((Expr *) jsexpr->on_error->expr, parent, + state, resv, resnull, inittype); + state->escontext = saved_escontext; + + /* Step to coerce the ON ERROR expression if needed */ + if (jsexpr->on_error->coerce) + VciExecInitJsonCoercion(state, jsexpr->returning, escontext, jsexpr->omit_quotes, false, resv, + resnull); + + /* + * Add a COERCION_FINISH step to check for errors that may occur when + * coercing and rethrow them. + */ + if (jsexpr->on_error->coerce || + IsA(jsexpr->on_error->expr, CoerceViaIO) || + IsA(jsexpr->on_error->expr, CoerceToDomain)) + { + scratch->opcode = EEOP_JSONEXPR_COERCION_FINISH; + scratch->resvalue = resv; + scratch->resnull = resnull; + scratch->d.jsonexpr.jsestate = jsestate; + ExprEvalPushStep(state, scratch); + } + + /* JUMP to end to skip the ON EMPTY steps added below. */ + jumps_to_end = lappend_int(jumps_to_end, state->steps_len); + scratch->opcode = EEOP_JUMP; + scratch->d.jump.jumpdone = -1; + ExprEvalPushStep(state, scratch); + } + + /* + * Step to check jsestate->empty and return the ON EMPTY expression if + * there is one. + * + * See the comment above for details on the optimization for NULL-valued + * expressions. 
+ */ + if (jsexpr->on_empty != NULL && + jsexpr->on_empty->btype != JSON_BEHAVIOR_ERROR && + (!(IsA(jsexpr->on_empty->expr, Const) && + ((Const *) jsexpr->on_empty->expr)->constisnull) || + returning_domain)) + { + ErrorSaveContext *saved_escontext; + + jsestate->jump_empty = state->steps_len; + + /* JUMP to end if false, that is, skip the ON EMPTY expression. */ + jumps_to_end = lappend_int(jumps_to_end, state->steps_len); + scratch->opcode = EEOP_JUMP_IF_NOT_TRUE; + scratch->resvalue = &jsestate->empty.value; + scratch->resnull = &jsestate->empty.isnull; + scratch->d.jump.jumpdone = -1; /* set below */ + ExprEvalPushStep(state, scratch); + + /* + * Steps to evaluate the ON EMPTY expression; handle errors softly to + * rethrow them in COERCION_FINISH step that will be added later. + */ + saved_escontext = state->escontext; + state->escontext = escontext; + VciExecInitExprRec((Expr *) jsexpr->on_empty->expr, parent, + state, resv, resnull, inittype); + state->escontext = saved_escontext; + + /* Step to coerce the ON EMPTY expression if needed */ + if (jsexpr->on_empty->coerce) + VciExecInitJsonCoercion(state, jsexpr->returning, escontext, jsexpr->omit_quotes, false, resv, + resnull); + + /* + * Add a COERCION_FINISH step to check for errors that may occur when + * coercing and rethrow them. + */ + if (jsexpr->on_empty->coerce || + IsA(jsexpr->on_empty->expr, CoerceViaIO) || + IsA(jsexpr->on_empty->expr, CoerceToDomain)) + { + scratch->opcode = EEOP_JSONEXPR_COERCION_FINISH; + scratch->resvalue = resv; + scratch->resnull = resnull; + scratch->d.jsonexpr.jsestate = jsestate; + ExprEvalPushStep(state, scratch); + } + } + + foreach(lc, jumps_to_end) + { + ExprEvalStep *as = &state->steps[lfirst_int(lc)]; + + as->d.jump.jumpdone = state->steps_len; + } + + jsestate->jump_end = state->steps_len; +} + +/* + * Initialize a EEOP_JSONEXPR_COERCION step to coerce the value given in resv + * to the given RETURNING type. 
+ */ +static void +VciExecInitJsonCoercion(ExprState *state, JsonReturning *returning, + ErrorSaveContext *escontext, bool omit_quotes, + bool exists_coerce, + Datum *resv, bool *resnull) +{ + ExprEvalStep scratch = {0}; + + /* For json_populate_type() */ + scratch.opcode = EEOP_JSONEXPR_COERCION; + scratch.resvalue = resv; + scratch.resnull = resnull; + scratch.d.jsonexpr_coercion.targettype = returning->typid; + scratch.d.jsonexpr_coercion.targettypmod = returning->typmod; + scratch.d.jsonexpr_coercion.json_coercion_cache = NULL; + scratch.d.jsonexpr_coercion.escontext = escontext; + scratch.d.jsonexpr_coercion.omit_quotes = omit_quotes; + scratch.d.jsonexpr_coercion.exists_coerce = exists_coerce; + scratch.d.jsonexpr_coercion.exists_cast_to_int = exists_coerce && + getBaseType(returning->typid) == INT4OID; + scratch.d.jsonexpr_coercion.exists_check_domain = exists_coerce && + DomainHasConstraints(returning->typid); + ExprEvalPushStep(state, &scratch); +} diff --git a/contrib/vci/executor/vci_fetch_column_store.c b/contrib/vci/executor/vci_fetch_column_store.c new file mode 100644 index 000000000000..8df23e17b50f --- /dev/null +++ b/contrib/vci/executor/vci_fetch_column_store.c @@ -0,0 +1,1202 @@ +/*------------------------------------------------------------------------- + * + * vci_fetch_column_store.c + * Routine to fetch from column store + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_fetch_column_store.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/xact.h" /* for IsolationIsSerializable */ +#include "access/xlog.h" /* for RecoveryInProgress() */ +#include "access/xlogrecovery.h" +#include "catalog/pg_type.h" +#include "executor/execExpr.h" +#include "executor/executor.h" /* for EXEC_FLAG_BACKWARD */ +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "nodes/nodes.h" +#include 
"nodes/plannodes.h" +#include "storage/ipc.h" /* for before_shmem_exit() */ +#include "storage/lwlock.h" +#include "tcop/pquery.h" /* for ActivePortal */ +#include "utils/cash.h" +#include "utils/date.h" +#include "utils/elog.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/timestamp.h" + +#include "vci.h" + +#include "vci_executor.h" +#include "vci_fetch.h" +#include "vci_mem.h" +#include + +#if (!defined(WIN32)) + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include + +#else +typedef signed char int8_t; +typedef signed short int16_t; +typedef signed int int32_t; +typedef unsigned char uint8_t; +typedef unsigned short uint16_t; +typedef unsigned int uint32_t; +typedef signed long long int64_t; +typedef unsigned long long uint64_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ +/** + * Used to search for VCI Scan in the plan tree with search_vci_scan_walker(). 
+ */ +typedef struct +{ + List *scan_list; /* Record discovered VciScan in a list */ +} vci_search_vci_scan_context_t; + +/** + * Store data struct for each query + */ +vci_query_context_t *vci_query_context; + +static void initialize_query_context(PlannedStmt *target, MemoryContext smccontext); +static bool search_vci_scan_walker(Plan *plan, void *context); +static void aggregate_attr_used(List *scan_list); +static void output_local_ros_size(vci_CSQueryContext query_context); +static void enter_standby_query(void); +static void exit_standby_query(void); +static void prepare_query_contexts(bool recoveryInProgress, bool estimatingLocalROSSize); +static bool estimate_and_check_localROS_size(void); +static void shutdown_standby_query(int code, Datum arg); +static bool create_all_queries_context_for_fetching_column_store(QueryDesc *queryDesc, int eflags); +static void create_attr_map(VciScanState *scanstate, VciScan *scan, int *num_attrs_p, AttrNumber **attrNumArray_p); +static void initialize_one_fetch_context_for_fetching_column_store(VciScanState *scanstate, vci_index_placeholder_t *index_ph); + +static bool is_running_standby_query; +static bool shutdown_standby_query_registered; + +/** + * Initialize query context required for column store fetch + * + * Attempt to rewrite plan for each query, and if successful, initialize + * resources necessary to execute custom plan. + * + * @param[in,out] queryDesc query description to be rewritten + * @param[in] eflags Execution flag + * + * @note If plan rewrite is succesful, per-query SMC is constructed, and the + * rewritten plan is stored in queryDesc->plannedstmt. + * Also, vci_query_context_t will be generated in vci_query_context. 
+ */ +void +vci_initialize_query_context(QueryDesc *queryDesc, int eflags) +{ + PlannedStmt *orig_stmt; + PlannedStmt *target; + MemoryContext tmpcontext; + MemoryContext oldcontext; + MemoryContext smccontext; + + /* + * When a previous query was failed, vci_query_context may be a dangling + * pointer. We'll only set vci_query_context to NULL but mustn't access + * the memory content pointed by vci_query_context. + */ + vci_query_context = NULL; + + /* + * In standalone mode or bootstrap mode, disable VCI execution. + */ + if (!IsPostmasterEnvironment) + return; + + if (!VciGuc.enable) + return; + + orig_stmt = queryDesc->plannedstmt; + + /* + * Custom plan is only for SELECT command. For other commands, plan tree + * rewrite won't be performed. + */ + if (orig_stmt->commandType != CMD_SELECT) + return; + + /* + * Stop if isolation level is serializable + */ + if (IsolationIsSerializable()) + return; + + /* + * Stop if full_page_writes is off + */ + if (!fullPageWrites) + return; + + /* + * Stop if WITH HOLD is specified in DECLARE command + */ + if (ActivePortal && + (ActivePortal->cursorOptions & (CURSOR_OPT_HOLD))) + return; + + /* + * Stop if SCROLL is specified in DECLARE command or SCROLL/NO SCROLL is + * not specified but SCROLL effect is applied internally + */ + if (eflags & EXEC_FLAG_BACKWARD) + return; + + /* + * Stop if plan cost estimate is less than threshold + */ + if ((orig_stmt->planTree == NULL) || + (orig_stmt->planTree->total_cost < (Cost) VciGuc.cost_threshold)) + return; + + elog(DEBUG1, "Call vci_initialize_query_context()"); + + tmpcontext = AllocSetContextCreate(CurrentMemoryContext, + "VCI Temporary Rewrite Plan", + ALLOCSET_DEFAULT_SIZES); + + oldcontext = MemoryContextSwitchTo(tmpcontext); + + target = vci_generate_custom_plan(orig_stmt, eflags, queryDesc->snapshot); + + MemoryContextSwitchTo(oldcontext); + + if (!target) + goto done; + + smccontext = AllocSetContextCreate(TopTransactionContext, "VCI Query", + ALLOCSET_DEFAULT_SIZES); + 
+ oldcontext = MemoryContextSwitchTo(smccontext); + + /* To rewrite plan, memory is allocated in tmpcontext, but move it to SMC */ + target = copyObjectImpl(target); + + initialize_query_context(target, smccontext); + + if (create_all_queries_context_for_fetching_column_store(queryDesc, eflags)) + { + /* Environment build is succesful, so rewrite the plan */ + queryDesc->plannedstmt = target; + } + else + { + /* Environment build failed */ + MemoryContextSwitchTo(oldcontext); + vci_finalize_query_context(); + + goto done; + } + + vci_query_context->plannedstmt = target; + + vci_query_context->max_plan_info_entries = VCI_INIT_PLAN_INFO_ENTRIES; + vci_query_context->plan_info_map = palloc0(sizeof(vci_plan_info_t) * VCI_INIT_PLAN_INFO_ENTRIES); + + vci_query_context->lock = VciShmemAddr->vci_query_context_lock; + + MemoryContextSwitchTo(oldcontext); + +done: + MemoryContextDelete(tmpcontext); +} + +static void +initialize_query_context(PlannedStmt *target, MemoryContext smccontext) +{ + vci_search_vci_scan_context_t scontext; + + vci_query_context = palloc0(sizeof(*vci_query_context)); + + vci_query_context->mcontext = smccontext; + + scontext.scan_list = NIL; + vci_plannedstmt_tree_walker(target, search_vci_scan_walker, NULL, &scontext); + aggregate_attr_used(scontext.scan_list); + list_free(scontext.scan_list); +} + +static bool +search_vci_scan_walker(Plan *plan, void *context) +{ + vci_search_vci_scan_context_t *scontext; + + scontext = (vci_search_vci_scan_context_t *) context; + + if (plan && (IsA(plan, CustomScan) || IsA(plan, CustomPlanMarkPos))) + { + uint32 plan_type = ((CustomScan *) plan)->flags & VCI_CUSTOMPLAN_MASK; + + if (plan_type == VCI_CUSTOMPLAN_SCAN) + { + VciScan *scan = (VciScan *) plan; + + if (scan->scan_mode == VCI_SCAN_MODE_COLUMN_STORE) + { + scontext->scan_list = lappend(scontext->scan_list, plan); + return false; + } + } + } + + return vci_plan_tree_walker(plan, search_vci_scan_walker, context); +} + +/** + * If there are multiple VCI 
Scan in the same table, OR of the referenced attributes is taken.
 * This is required to pass to vci_CSCreateQueryContext().
 *
 * Fills vci_query_context->num_indexes and ->index_ph_table with one
 * placeholder per distinct index OID, and stamps each scan with the 1-based
 * ids (index_ph_id, fetch_ph_id) that locate its placeholder slots.
 */
static void
aggregate_attr_used(List *scan_list)
{
	List	   *index_oids = NIL;
	ListCell   *lc;
	int			num_unique;
	int			idx;

	/* Collect each referenced index OID once, in order of first appearance. */
	foreach(lc, scan_list)
	{
		VciScan    *scan = (VciScan *) lfirst(lc);

		if (!list_member_oid(index_oids, scan->indexoid))
			index_oids = lappend_oid(index_oids, scan->indexoid);
	}
	num_unique = list_length(index_oids);

	elog(DEBUG1, "# of unique VCI indexes = %d", num_unique);

	/* num_unique can be 0 */
	vci_query_context->num_indexes = num_unique;
	vci_query_context->index_ph_table = palloc0(sizeof(vci_index_placeholder_t) * num_unique);

	idx = 0;
	foreach(lc, index_oids)
	{
		vci_query_context->index_ph_table[idx].indexoid = lfirst_oid(lc);
		idx++;
	}

	list_free(index_oids);
	index_oids = NIL;

	/* For every index, merge the attribute sets of all scans that use it. */
	for (idx = 0; idx < num_unique; idx++)
	{
		vci_index_placeholder_t *index_ph = &vci_query_context->index_ph_table[idx];

		foreach(lc, scan_list)
		{
			VciScan    *scan = (VciScan *) lfirst(lc);

			if (scan->indexoid != index_ph->indexoid)
				continue;

			index_ph->attr_used = bms_add_members(index_ph->attr_used,
												  scan->attr_used);

			/* Both ids are 1-based; 0 is never handed out here. */
			scan->index_ph_id = idx + 1;
			index_ph->num_fetches++;
			scan->fetch_ph_id = index_ph->num_fetches;
		}

		index_ph->fetch_ph_table = palloc0(sizeof(vci_fetch_placeholder_t) * index_ph->num_fetches);
	}
}

/**
 * Free query context for VCI execution
 *
 * Release if query context is secured.
 * Free if local SMC of backend process is secured.
+ */ +void +vci_free_query_context(void) +{ + if (vci_query_context) + { + MemoryContextDelete(vci_query_context->mcontext); + + vci_query_context = NULL; + } +} + +/** + * Determine whether custom plan that performs column store fetch is being executed + * + * @retval true Executing custom plan + * @retval false Not executing custom plan (including interruptions) + */ +bool +vci_is_processing_custom_plan(void) +{ + if (vci_query_context == NULL) + return false; + + return !vci_query_context->has_stopped; +} + +/** + * @description output the Data WOS size and Whiteout WOS size on log. + */ +static void +output_local_ros_size(vci_CSQueryContext query_context) +{ + elog(DEBUG1, + "A local ROS creation for VCI %d failed: Data WOS size = %ld, Whiteout WOS size = %ld", + query_context->main_relation_oid, + (long) query_context->num_data_wos_entries, (long) query_context->num_whiteout_wos_entries); +} + +/** + * Create data necessary for column store fetch. + * Call before executor runs. + */ +static bool +create_all_queries_context_for_fetching_column_store(QueryDesc *queryDesc, int eflags) +{ + int i; + bool result = true; + bool recoveryInProgress; + + /* + * In standby server query, ShareUpdateExclusiveLock lock cannot be + * performed during Local ROS creation. Instead, stop streaming + * replication WAL replay. + * + * Multiple queries simultaneously creating Local ROS are counted by + * num_standby_exec_queries. Restart WAL replay at the end of last query. 
+ */ + recoveryInProgress = RecoveryInProgress(); + + if (recoveryInProgress) + enter_standby_query(); + + prepare_query_contexts(recoveryInProgress, true); + if (!estimate_and_check_localROS_size()) + goto error; + + for (i = 0; i < vci_query_context->num_indexes; i++) + { + vci_index_placeholder_t *index_ph; + + index_ph = &vci_query_context->index_ph_table[i]; + + vci_CSDestroyQueryContext(index_ph->query_context); + index_ph->query_context = NULL; + } + prepare_query_contexts(recoveryInProgress, false); + if (!estimate_and_check_localROS_size()) + goto error; + + /* + * Create Local ROS + */ + PG_TRY(); + { + for (i = 0; i < vci_query_context->num_indexes; i++) + { + vci_index_placeholder_t *index_ph; + + index_ph = &vci_query_context->index_ph_table[i]; + + index_ph->local_ros = vci_CSGenerateLocalRos(index_ph->query_context); + + Assert(index_ph->local_ros); + } + } + PG_CATCH(); + { + if (geterrcode() == ERRCODE_OUT_OF_MEMORY) + { + /* + * Cancel VCI execution if there is an error due to insufficient + * memory during Local ROS generation. 
+ */ + if (VciGuc.log_query) + elog(WARNING, "out of memory during local ROS generation"); + + for (i = 0; i < vci_query_context->num_indexes; i++) + { + vci_index_placeholder_t *index_ph; + vci_id_t vciid; + + index_ph = &vci_query_context->index_ph_table[i]; + + vciid.oid = index_ph->indexoid; + vciid.dbid = MyDatabaseId; + + vci_SetForceNextWosRosConvFlag(&vciid, true); + + if (index_ph->query_context) + { + output_local_ros_size(index_ph->query_context); + vci_CSDestroyQueryContext(index_ph->query_context); + index_ph->query_context = NULL; + } + } + + FlushErrorState(); + + result = false; + } + else + { + if (recoveryInProgress) + exit_standby_query(); + + PG_RE_THROW(); + } + } + PG_END_TRY(); + + if (recoveryInProgress) + exit_standby_query(); + + return result; + +error: + for (i = 0; i < vci_query_context->num_indexes; i++) + { + vci_index_placeholder_t *index_ph; + vci_id_t vciid; + + index_ph = &vci_query_context->index_ph_table[i]; + + vciid.oid = index_ph->indexoid; + vciid.dbid = MyDatabaseId; + + vci_SetForceNextWosRosConvFlag(&vciid, true); + output_local_ros_size(index_ph->query_context); + + vci_CSDestroyQueryContext(index_ph->query_context); + index_ph->query_context = NULL; + } + + if (recoveryInProgress) + exit_standby_query(); + + return false; +} + +static void +enter_standby_query(void) +{ + LWLockAcquire(VciShmemAddr->standby_exec_loc, LW_EXCLUSIVE); + if (VciShmemAddr->num_standby_exec_queries == 0) + SetVciRecoveryPause(); + VciShmemAddr->num_standby_exec_queries++; + LWLockRelease(VciShmemAddr->standby_exec_loc); + + if (!shutdown_standby_query_registered) + { + before_shmem_exit(shutdown_standby_query, 0); + shutdown_standby_query_registered = true; + } + + is_running_standby_query = true; +} + +static void +exit_standby_query(void) +{ + is_running_standby_query = false; + + LWLockAcquire(VciShmemAddr->standby_exec_loc, LW_EXCLUSIVE); + VciShmemAddr->num_standby_exec_queries--; + if (VciShmemAddr->num_standby_exec_queries == 0) + 
SetRecoveryPause(false); + LWLockRelease(VciShmemAddr->standby_exec_loc); +} + +static void +shutdown_standby_query(int code, Datum arg) +{ + if (!is_running_standby_query) + return; + + is_running_standby_query = false; + + LWLockAcquire(VciShmemAddr->standby_exec_loc, LW_EXCLUSIVE); + VciShmemAddr->num_standby_exec_queries--; + if (VciShmemAddr->num_standby_exec_queries == 0) + SetRecoveryPause(false); + LWLockRelease(VciShmemAddr->standby_exec_loc); +} + +/** + * @description allocate query_contexts for VCIs. + * @param[in] recoveryInProgress true if recovery is in progress. + * @param[in] estimatingLocalROSSize true if estimating a local ROS size. + */ +static void +prepare_query_contexts(bool recoveryInProgress, bool estimatingLocalROSSize) +{ + int i; + + for (i = 0; i < vci_query_context->num_indexes; i++) + { + int j, + k, + num_attrs; + AttrNumber *attrNumArray; + vci_index_placeholder_t *index_ph; + vci_id_t vciid; + + index_ph = &vci_query_context->index_ph_table[i]; + + num_attrs = bms_num_members(index_ph->attr_used); + + attrNumArray = palloc(sizeof(AttrNumber) * num_attrs); + + j = k = 0; + do + { + if (bms_is_member(k, index_ph->attr_used)) + attrNumArray[j++] = k; + + k++; + } while (j < num_attrs); + + /* update memory entry */ + vciid.oid = index_ph->indexoid; + vciid.dbid = MyDatabaseId; + + vci_TouchMemoryEntry(&vciid, + get_rel_tablespace(index_ph->indexoid)); + + index_ph->query_context = vci_CSCreateQueryContext( + index_ph->indexoid, + num_attrs, /* Numbe of read columns */ + attrNumArray, /* Array of read columns */ + vci_query_context->mcontext, /* SMC */ + recoveryInProgress, + estimatingLocalROSSize); + + Assert(index_ph->query_context); + + pfree(attrNumArray); + } +} + +/** + * @description estimate the size of local ROS + * @return true if estimated local ROS size is smaller than upperbounds. 
+ */ +static bool +estimate_and_check_localROS_size() +{ + int i; + Size total_local_ros_size = 0; + + /* + * Estimate Local ROS size + */ + for (i = 0; i < vci_query_context->num_indexes; i++) + { + Size local_ros_size; + + local_ros_size = vci_CSEstimateLocalRosSize(vci_query_context->index_ph_table[i].query_context); + + if (local_ros_size == (Size) -1) + { + if (VciGuc.log_query) + elog(WARNING, "too many rows in Data WOS"); + + return false; + } + + total_local_ros_size += local_ros_size; + } + + if (VciGuc.max_local_ros_size * UINT64CONST(1024) < total_local_ros_size) + { + if (VciGuc.log_query) + elog(WARNING, "could not use VCI: local ROS size (%zu) exceeds vci.max_local_ros (%zu)", + total_local_ros_size, (Size) VciGuc.max_local_ros_size * UINT64CONST(1024)); + + return false; + } + + return true; +} + +/** + * Finalize query context requited for column store fetch + * + * @note Object pointed to by vci_query_context is collected. + */ +void +vci_finalize_query_context(void) +{ + int i; + + Assert(vci_query_context); + + elog(DEBUG1, "Call vci_finalize_query_context()"); + + for (i = vci_query_context->num_indexes - 1; i >= 0; i--) + { + if (vci_query_context->index_ph_table[i].local_ros) + { + vci_CSDestroyLocalRos(vci_query_context->index_ph_table[i].local_ros); + vci_query_context->index_ph_table[i].local_ros = NULL; + } + + if (vci_query_context->index_ph_table[i].query_context) + { + vci_CSDestroyQueryContext(vci_query_context->index_ph_table[i].query_context); + vci_query_context->index_ph_table[i].query_context = NULL; + } + } + vci_free_query_context(); + vci_query_context = NULL; +} + +/** + * VCI Scan assigned serial number only to attributes read from the table and creates map. 
+ * + * @param[out] scanstate VCI Scan state to be output + * @param[in] scan Original VCI Scan + * @param[out] num_attrs_p Number of attributes to read + * @param[out] attrNumArray_p Map of original attribute number in table -> serial numbers for attributes to be read + * + * @todo The order of function arguments are unnatural + */ +static void +create_attr_map(VciScanState *scanstate, VciScan *scan, int *num_attrs_p, AttrNumber **attrNumArray_p) +{ + int i, + top_attr, + attr_index, + num_attrs; + AttrNumber *attrNumArray; + + num_attrs = bms_num_members(scan->attr_used); + + attrNumArray = palloc(sizeof(AttrNumber) * num_attrs); + + top_attr = 1; /* AttrNumber starts from 1 */ + attr_index = 0; + + do + { + if (bms_is_member(top_attr, scan->attr_used)) + attrNumArray[attr_index++] = top_attr; + + top_attr++; + } while (attr_index < num_attrs); + + /* Record the biggest AttrNumber */ + scanstate->last_attr = top_attr - 1; + + /* + * Create a map of column number returned by column store fetch from + * AttrNumber so that searched can be performed from Var. + */ + scanstate->attr_map = palloc0(sizeof(int) * (scanstate->last_attr + 1)); + + for (i = 0; i < num_attrs; i++) + /* Add 1 to the index number so that 0 indicates an invalid value */ + scanstate->attr_map[attrNumArray[i]] = i + 1; + + *num_attrs_p = num_attrs; + *attrNumArray_p = attrNumArray; +} + +/** + * Create data required for a specific VCI Scan to perform column store fetch. + * + * @param[in,out] scanstate Pointer to VCI Scan + * @param[in,out] econtext expression context needed for execution + * + * @note Call only once in ExecInit for VCI Scan. 
+ */ +void +vci_create_one_fetch_context_for_fetching_column_store(VciScanState *scanstate, ExprContext *econtext) +{ + VciScan *scan = (VciScan *) scanstate->vci.css.ss.ps.plan; + vci_index_placeholder_t *index_ph; + vci_fetch_placeholder_t *fetch_ph; + int num_attrs; + AttrNumber *attrNumArray; + + elog(DEBUG1, "Call vci_create_one_fetch_context_for_fetching_column_store()"); + + create_attr_map(scanstate, scan, &num_attrs, &attrNumArray); + + Assert((1 <= scan->index_ph_id) && (scan->index_ph_id <= vci_query_context->num_indexes)); + + index_ph = &vci_query_context->index_ph_table[scan->index_ph_id - 1]; + + scanstate->fetch_context + = vci_CSCreateFetchContext(index_ph->query_context, + VCI_NUM_ROWS_READ_AT_ONCE, + num_attrs, + attrNumArray, + true, /* column store */ + false, /* Do not return TID vector */ + true /* Returns CRID */ ); + + Assert(scanstate->fetch_context); + + pfree(attrNumArray); + + initialize_one_fetch_context_for_fetching_column_store(scanstate, index_ph); + + /* Record status in shared memory area */ + Assert((1 <= scan->fetch_ph_id) && (scan->fetch_ph_id <= index_ph->num_fetches)); + + fetch_ph = &index_ph->fetch_ph_table[scan->fetch_ph_id - 1]; + + fetch_ph->fetch_context = scanstate->fetch_context; + fetch_ph->scanstate = scanstate; +} + +/** + * Parallel background worker copies data necessary for VCI Scan to perform + * column store fetch. 
+ * + * @param[in,out] scanstate Pointer to VCI Scan + * + * @note This is for parallel background worker + */ +void +vci_clone_one_fetch_context_for_fetching_column_store(VciScanState *scanstate) +{ + VciScan *scan = (VciScan *) scanstate->vci.css.ss.ps.plan; + vci_index_placeholder_t *index_ph; + int num_attrs; + AttrNumber *attrNumArray; + + Assert((1 <= scan->index_ph_id) && (scan->index_ph_id <= vci_query_context->num_indexes)); + + index_ph = &vci_query_context->index_ph_table[scan->index_ph_id - 1]; + + /* Copy fecth context created on backend */ + scanstate->fetch_context = index_ph->fetch_ph_table[scan->fetch_ph_id - 1].fetch_context; + + create_attr_map(scanstate, scan, &num_attrs, &attrNumArray); + + pfree(attrNumArray); + + initialize_one_fetch_context_for_fetching_column_store(scanstate, index_ph); + + /* + * first_extent_id, last_extent_id, first_fetch of scanstate of VCI Scan + * to be scanned in parallel are reset when the task is received. + */ +} + +static void +initialize_one_fetch_context_for_fetching_column_store(VciScanState *scanstate, vci_index_placeholder_t *index_ph) +{ + scanstate->local_fetch_context + = vci_CSLocalizeFetchContext(scanstate->fetch_context, + CurrentMemoryContext); + + scanstate->extent_status + = vci_CSCreateCheckExtent(scanstate->local_fetch_context); + + Assert(scanstate->extent_status); + + scanstate->vector_set + = vci_CSCreateVirtualTuples(scanstate->local_fetch_context); + + Assert(scanstate->vector_set); + + /* Start scanning from the negative extent id if Local ROS exists */ + scanstate->first_extent_id = -index_ph->query_context->num_local_ros_extents; + scanstate->last_extent_id = index_ph->query_context->num_ros_extents; + scanstate->first_crid = (int64) scanstate->first_extent_id * VCI_NUM_ROWS_IN_EXTENT; + scanstate->last_crid = (int64) scanstate->last_extent_id * VCI_NUM_ROWS_IN_EXTENT; + scanstate->first_fetch = false; +} + +/** + * Destroy data required for specific VCI Scan to execute column store fetch. 
+ * + * @param[in,out] scanstate Pointer to VCI Scan + * + * @note Call only once in ExecEnd for VCI Scan + */ +void +vci_destroy_one_fetch_context_for_fetching_column_store(VciScanState *scanstate) +{ + elog(DEBUG1, "Call vci_destroy_one_fetch_context_for_fetching_column_store()"); + + pfree(scanstate->attr_map); + scanstate->attr_map = NULL; + + vci_CSDestroyVirtualTuples(scanstate->vector_set); + scanstate->vector_set = NULL; + + vci_CSDestroyCheckExtent(scanstate->extent_status); + scanstate->extent_status = NULL; + + vci_CSDestroyFetchContext(scanstate->local_fetch_context); + scanstate->local_fetch_context = NULL; + + vci_CSDestroyFetchContext(scanstate->fetch_context); + scanstate->fetch_context = NULL; +} + +/** + * Specify column store read start position to VCI Scan + * + * @param[in, out] scanstate Pointer to VCI Scan + * @param[in] crid_statrt Read start CRID + * @param[in] size Number of rows to read at a time + * (VCI_NUM_ROWS_IN_EXTENT or less) + */ +void +vci_set_starting_position_for_fetching_column_store(VciScanState *scanstate, int64 crid_start, int size) +{ + int64 crid_end = crid_start + size; + int32 extent_id; + + /* + * Dividing by VCI_NUM_ROWS_IN_EXTENT doesn't work when crid_start is + * negative, so bit shift. + */ + extent_id = crid_start >> VCI_CRID_ROW_ID_BIT_WIDTH; + + Assert(crid_end <= (int64) (extent_id + 1) * VCI_NUM_ROWS_IN_EXTENT); + + scanstate->first_extent_id = extent_id; + scanstate->last_extent_id = extent_id + 1; + scanstate->first_crid = crid_start; + scanstate->last_crid = crid_end; + + scanstate->first_fetch = false; +} + +/** + * Read vector from column store fetches in VCI Scan. + * If there are unread lines in the vector, do nothing. 
+ *
+ * The first call after first_fetch was cleared positions the scan on
+ * first_extent_id / first_crid. Later calls continue with the next vector of
+ * the current extent and then with the following visible extents, up to
+ * last_extent_id / last_crid.
+ *
+ * @param[in, out] scanstate Pointer to VCI Scan
+ *
+ * @retval false Read all rows in column store
+ * @retval true One or more lines remain to be read
+ *
+ * @note Before calling this function, initialize settings
+ * such as vci_reset_vector_set_from_column_store() and
+ * vci_set_starting_position_for_fetching_column_store().
+ *
+ * @note Raises ERROR when vci_CSFetchVirtualTuples() returns no rows for a
+ * range that was expected to contain some.
+ */
+bool
+vci_fill_vector_set_from_column_store(VciScanState *scanstate)
+{
+	if (!scanstate->first_fetch)
+	{
+		int64		crid_start;
+		int64		crid_end;
+		int64		extent_start;
+		int64		vector_end;
+		vci_extent_status_t *status;
+		vci_virtual_tuples_t *vector_set;
+		uint16	   *skip_list;
+
+		scanstate->first_fetch = true;
+
+		scanstate->pos.current_extent_id = scanstate->first_extent_id;
+
+		/*
+		 * Begin with "no vector loaded, nothing left in this extent", so
+		 * that the early exits below fall through to the next-extent loop
+		 * instead of acting on position data left over from an earlier scan.
+		 */
+		scanstate->pos.current_row = 0;
+		scanstate->pos.num_fetched_rows = 0;
+		scanstate->pos.offset_in_extent = 0;
+		scanstate->pos.num_rows_in_extent = 0;
+
+		crid_start = scanstate->first_crid;
+		crid_end = scanstate->last_crid;
+
+		/* Check first extent */
+		status = scanstate->extent_status;
+
+		vci_CSCheckExtent(status,
+						  scanstate->local_fetch_context,
+						  scanstate->pos.current_extent_id,
+						  false);
+
+		if (!status->existence || !status->visible)
+			goto start;
+
+		/*
+		 * Clamp to the rows that exist in this extent. The bound is counted
+		 * from the start of the extent (as in the next-extent loop below),
+		 * not from crid_start, which may point into the middle of it.
+		 */
+		extent_start = (int64) scanstate->pos.current_extent_id * VCI_NUM_ROWS_IN_EXTENT;
+		crid_end = Min(crid_end, extent_start + status->num_rows);
+
+		/* Nothing of the requested range exists in this extent */
+		if (crid_start >= crid_end)
+			goto start;
+
+		/* Read first vector; it ends at the next VCI_MAX_FETCHING_ROWS boundary */
+		vector_end = (crid_start + VCI_MAX_FETCHING_ROWS) & ~(VCI_MAX_FETCHING_ROWS - 1);
+
+		if (crid_end < vector_end)
+			vector_end = crid_end;
+
+		vector_set = scanstate->vector_set;
+
+		scanstate->pos.fetch_starting_crid = crid_start;
+		scanstate->pos.num_fetched_rows =
+			vci_CSFetchVirtualTuples(vector_set, crid_start, vector_end - crid_start);
+
+		if (scanstate->pos.num_fetched_rows < 1)
+			elog(ERROR, "vci_CSFetchVirtualTuples returns %d num_fetched_rows(crid=" INT64_FORMAT ")",
+				 scanstate->pos.num_fetched_rows, crid_start);
+
+		scanstate->pos.offset_in_extent = (crid_start & (VCI_NUM_ROWS_IN_EXTENT - 1)) + scanstate->pos.num_fetched_rows;
+		scanstate->pos.num_rows_in_extent = ((crid_end - 1) & (VCI_NUM_ROWS_IN_EXTENT - 1)) + 1;
+
+		/* The first skip-list entry gives the first row to read */
+		skip_list = vci_CSGetSkipFromVirtualTuples(vector_set);
+		scanstate->pos.current_row = skip_list[0];
+	}
+
+start:
+	CHECK_FOR_INTERRUPTS();
+
+	if (scanstate->pos.current_row < scanstate->pos.num_fetched_rows)
+		/* Can read fetched vectors */
+		return true;
+
+	if (scanstate->pos.offset_in_extent < scanstate->pos.num_rows_in_extent)
+	{
+		/* Read the next vector in the same extent */
+		vci_virtual_tuples_t *vector_set;
+		int64		crid_start;
+		uint16	   *skip_list;
+
+		vector_set = scanstate->vector_set;
+
+		crid_start = (int64) scanstate->pos.current_extent_id * VCI_NUM_ROWS_IN_EXTENT +
+			scanstate->pos.offset_in_extent;
+
+		scanstate->pos.fetch_starting_crid = crid_start;
+		scanstate->pos.num_fetched_rows =
+			vci_CSFetchVirtualTuples(vector_set, crid_start, VCI_MAX_FETCHING_ROWS);
+
+		if (scanstate->pos.num_fetched_rows < 1)
+			elog(ERROR, "vci_CSFetchVirtualTuples returns %d num_fetched_rows(crid=" INT64_FORMAT ")",
+				 scanstate->pos.num_fetched_rows, crid_start);
+
+		Assert(vector_set->num_rows > 0);
+
+		scanstate->pos.offset_in_extent += VCI_MAX_FETCHING_ROWS;
+
+		skip_list = vci_CSGetSkipFromVirtualTuples(vector_set);
+		scanstate->pos.current_row = skip_list[0];
+
+		goto start;
+	}
+
+	/* read next extent */
+	while (scanstate->pos.current_extent_id + 1 < scanstate->last_extent_id)
+	{
+		vci_extent_status_t *status = scanstate->extent_status;
+		int64		extent_start;
+		int64		extent_end;
+
+		scanstate->pos.current_extent_id++;
+
+		vci_CSCheckExtent(status,
+						  scanstate->local_fetch_context,
+						  scanstate->pos.current_extent_id,
+						  false);
+
+		if (status->existence && status->visible)
+		{
+			extent_start = (int64) scanstate->pos.current_extent_id * VCI_NUM_ROWS_IN_EXTENT;
+			extent_end = Min(extent_start + status->num_rows, scanstate->last_crid);
+
+			scanstate->pos.offset_in_extent = 0;
+			scanstate->pos.num_rows_in_extent = extent_end - extent_start;
+
+			goto start;
+		}
+	}
+
+	/* Finished read all extent */
+	return false;
+}
+
+/**
+ * Temporarily record the read position of VCI Scan column store.
+ *
+ * The whole position (scanstate->pos) is copied, including the start CRID
+ * and row count of the vector that is currently loaded.
+ *
+ * @param[in, out] scanstate Pointer to VCI Scan
+ */
+void
+vci_mark_pos_vector_set_from_column_store(VciScanState *scanstate)
+{
+	scanstate->mark = scanstate->pos;
+}
+
+/**
+ * Return read position of VCI Scan column store to the marked position,
+ * and read data again.
+ *
+ * The vector is re-read from the start CRID and row count that were saved
+ * with the mark. Recomputing the start as
+ * "offset_in_extent - VCI_MAX_FETCHING_ROWS" is only correct when the marked
+ * vector held exactly VCI_MAX_FETCHING_ROWS rows, which is not the case for a
+ * first vector that starts off a vector boundary or is cut short by the end
+ * of the range.
+ *
+ * @param[in, out] scanstate Pointer to VCI Scan
+ *
+ * @note Raises ERROR if the marked vector cannot be read again.
+ */
+void
+vci_restr_pos_vector_set_from_column_store(VciScanState *scanstate)
+{
+	vci_virtual_tuples_t *vector_set = scanstate->vector_set;
+	int64		crid_start;
+	int			num_rows;
+
+	/* return to marked position */
+	scanstate->pos = scanstate->mark;
+
+	crid_start = scanstate->pos.fetch_starting_crid;
+	num_rows = scanstate->pos.num_fetched_rows;
+
+	/* No vector was loaded when the mark was taken: nothing to re-read */
+	if (num_rows < 1)
+		return;
+
+	/* Re-read exactly the vector that was loaded at mark time */
+	scanstate->pos.num_fetched_rows =
+		vci_CSFetchVirtualTuples(vector_set, crid_start, num_rows);
+
+	if (scanstate->pos.num_fetched_rows < 1)
+		elog(ERROR, "vci_CSFetchVirtualTuples returns %d num_fetched_rows(crid=" INT64_FORMAT ")",
+			 scanstate->pos.num_fetched_rows, crid_start);
+
+	Assert(vector_set->num_rows > 0);
+}
+
+/**
+ * When reading 1 row of vector loaded by vci_fill_vector_set_from_column_store(),
+ * set the row to be read next to pointer.
+ *
+ * @param[in, out] scanstate Pointer to VCI Scan
+ */
+void
+vci_step_next_tuple_from_column_store(VciScanState *scanstate)
+{
+	uint16	   *skips = vci_CSGetSkipFromVirtualTuples(scanstate->vector_set);
+
+	/* Advance by one row plus the skip count stored for the following row */
+	scanstate->pos.current_row += 1 + skips[scanstate->pos.current_row + 1];
+}
+
+/**
+ * Mark every row of the loaded vector as read
+ *
+ * current_row is moved to num_fetched_rows, so the next call of
+ * vci_fill_vector_set_from_column_store() has to load another vector.
+ *
+ * @param[in, out] scanstate Pointer to VCI Scan
+ */
+void
+vci_finish_vector_set_from_column_store(VciScanState *scanstate)
+{
+	scanstate->pos.current_row = scanstate->pos.num_fetched_rows;
+}
+
+/**
+ * Execute vector process corresponding to target list of VCI Scan
+ *
+ * Every entry of scanstate->vp_targets is evaluated once, in array order.
+ *
+ * @param[in,out] scanstate Pointer to VCI Scan
+ * @param[in,out] econtext expression context required for execution
+ * @param[in] max_slots max length of this vector
+ */
+void
+VciExecTargetListWithVectorProcessing(VciScanState *scanstate, ExprContext *econtext, int max_slots)
+{
+	int			target = 0;
+
+	while (target < scanstate->num_vp_targets)
+	{
+		VciExecEvalVectorProcessing(scanstate->vp_targets[target], econtext, max_slots);
+		target++;
+	}
+}
+
+/**
+ * Evaluation function for Var when performing column store fetch
+ *
+ * @param[in] exprstate expression state the step belongs to (not referenced here)
+ * @param[in,out] op expression step; supplies d.var.attnum and d.var.vci_parent_planstate, receives the result through resvalue / resnull
+ * @param[in] econtext expression context (not referenced here)
+ *
+ * The value and NULL flag of the current row (scanstate->pos.current_row) of
+ * the VCI Scan that feeds the parent plan state are stored into
+ * *op->resvalue and *op->resnull. Nothing is returned.
+ *
+ * @note Called from VciExecInitExpr()
+ */
+void
+VciExecEvalScalarVarFromColumnStore(ExprState *exprstate, ExprEvalStep *op, ExprContext *econtext)
+{
+	vci_virtual_tuples_column_info_t *data_vector;
+	int			index;
+	int			null_bit_id;
+
+	PlanState  *parent;
+	VciScanState *scanstate;
+	int			attnum;
+
+	attnum = op->d.var.attnum;
+	parent = op->d.var.vci_parent_planstate;
+	scanstate = vci_search_scan_state((VciPlanState *) parent);
+
+	/* vci_search_scan_state() returns NULL when no VCI Scan is found */
+	Assert(scanstate != NULL);
+
+	/* The actual index number is the value minus 1. 0 is invalid. */
+	index = scanstate->attr_map[attnum] - 1;
+
+	Assert(index >= 0);
+	Assert(index < scanstate->vector_set->num_columns);
+
+	data_vector = &scanstate->vector_set->column_info[index];
+
+	null_bit_id = data_vector->null_bit_id;
+
+	/*
+	 * A negative null_bit_id means there is no NULL information to read for
+	 * this column. The flag must still be written: the location behind
+	 * op->resnull may hold the result of an earlier evaluation, and a stale
+	 * "true" would turn this value into NULL.
+	 */
+	if (null_bit_id >= 0)
+		*op->resnull = vci_CSGetIsNullOfVirtualTupleColumnar(scanstate->vector_set, index)[scanstate->pos.current_row];
+	else
+		*op->resnull = false;
+
+	*op->resvalue = vci_CSGetValuesOfVirtualTupleColumnar(scanstate->vector_set, index)[scanstate->pos.current_row];
+
+}
diff --git a/contrib/vci/executor/vci_gather.c b/contrib/vci/executor/vci_gather.c
new file mode 100644
index 000000000000..3349318776c7
--- /dev/null
+++ b/contrib/vci/executor/vci_gather.c
@@ -0,0 +1,157 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_gather.c
+ *	  Routines to handle VCI Gather nodes
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/vci/executor/vci_gather.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "commands/explain.h"
+#include "commands/explain_format.h"
+#include "executor/nodeCustom.h"
+
+#include "vci.h"
+#include "vci_executor.h"
+#include "vci_utils.h"
+
+/*
+ * Declarations of Custom Plan Methods callbacks
+ */
+static void vci_gather_BeginCustomPlan(CustomScanState *node, EState *estate, int eflags);
+static TupleTableSlot
*vci_gather_ExecCustomPlan(CustomScanState *node);
+static void vci_gather_EndCustomPlan(CustomScanState *node);
+static void vci_gather_ReScanCustomPlan(CustomScanState *node);
+static void vci_gather_MarkPosCustomPlan(CustomScanState *cpstate);
+static void vci_gather_RestrPosCustomPlan(CustomScanState *cpstate);
+
+static CustomScan *vci_gather_CopyCustomPlan(const CustomScan *_from);
+
+/*
+ * Allocate a zeroed VciGatherState for the given VCI Gather plan node and
+ * attach the plan, its flags and the VCI Gather exec methods to it.
+ */
+static Node *
+vci_gather_CreateCustomScanState(CustomScan *cs)
+{
+	VciGather  *vgather;
+	VciGatherState *vgs = (VciGatherState *) palloc0(sizeof(VciGatherState));
+
+	vgather = (VciGather *) cs;
+
+	vgs->vci.css.ss.ps.type = T_CustomScanState;
+	vgs->vci.css.ss.ps.plan = (Plan *) vgather;
+	vgs->vci.css.flags = cs->flags;
+	vgs->vci.css.methods = &vci_gather_exec_methods;
+
+	return (Node *) vgs;
+}
+
+/*
+ * Initialize the node: expression context, outer child, result slot.
+ */
+static void
+vci_gather_BeginCustomPlan(CustomScanState *node, EState *estate, int eflags)
+{
+	VciGather  *gather;
+	VciGatherState *gatherstate;
+
+	gather = (VciGather *) node->ss.ps.plan;
+
+	/*
+	 * create state structure
+	 */
+	gatherstate = (VciGatherState *) node;
+
+	gatherstate->vci.css.ss.ps.state = estate;
+
+	/* create expression context for node */
+	ExecAssignExprContext(estate, &gatherstate->vci.css.ss.ps);
+
+	outerPlanState(gatherstate) = ExecInitNode(outerPlan(gather), estate, eflags);
+
+	ExecInitResultTupleSlotTL(&gatherstate->vci.css.ss.ps, &TTSOpsVirtual);
+}
+
+/*
+ * Return the next tuple of the outer child unchanged.
+ */
+static TupleTableSlot *
+vci_gather_ExecCustomPlan(CustomScanState *cstate)
+{
+	VciGatherState *gatherstate = (VciGatherState *) cstate;
+
+	return ExecProcNode(outerPlanState(gatherstate));
+}
+
+/*
+ * Clear the result slot and shut down the outer child.
+ */
+static void
+vci_gather_EndCustomPlan(CustomScanState *node)
+{
+	VciGatherState *gatherstate = (VciGatherState *) node;
+
+	/* clean out the tuple table */
+	ExecClearTuple(gatherstate->vci.css.ss.ps.ps_ResultTupleSlot);
+
+	ExecEndNode(outerPlanState(node));
+}
+
+/*
+ * Rescan the outer child, unless its chgParam is set.
+ */
+static void
+vci_gather_ReScanCustomPlan(CustomScanState *node)
+{
+	/*
+	 * if chgParam of subnode is not null then plan will be re-scanned by
+	 * first ExecProcNode.
+	 */
+	if (node->ss.ps.lefttree->chgParam == NULL)
+		ExecReScan(node->ss.ps.lefttree);
+}
+
+/* LCOV_EXCL_START */
+
+/*
+ * Mark/restore is not supported by VCI Gather.
+ *
+ * Reaching one of these callbacks is reported with ERROR, which aborts only
+ * the current query. PANIC would abort every session of the server, which is
+ * far too severe for an unsupported call.
+ */
+static void
+vci_gather_MarkPosCustomPlan(CustomScanState *node)
+{
+	elog(ERROR, "VCI Gather does not support MarkPosCustomPlan call convention");
+}
+
+static void
+vci_gather_RestrPosCustomPlan(CustomScanState *node)
+{
+	elog(ERROR, "VCI Gather does not support RestrPosCustomPlan call convention");
+}
+
+/* LCOV_EXCL_STOP */
+
+/*
+ * Copy a VCI Gather plan node: a zeroed VciGather that receives the basic
+ * VCI plan fields through vci_copy_plan() and the node tag of the source.
+ */
+static CustomScan *
+vci_gather_CopyCustomPlan(const CustomScan *_from)
+{
+	const VciGather *from = (const VciGather *) _from;
+	VciGather  *newnode;
+
+	newnode = (VciGather *) palloc0(sizeof(VciGather));
+
+	vci_copy_plan(&newnode->vci, &from->vci);
+
+	((Node *) newnode)->type = nodeTag((Node *) from);
+
+	return &newnode->vci.cscan;
+}
+
+CustomScanMethods vci_gather_scan_methods = {
+	"VCI Gather",
+	vci_gather_CreateCustomScanState,
+	vci_gather_CopyCustomPlan,
+};
+
+CustomExecMethods vci_gather_exec_methods = {
+	"VCI Gather",
+	vci_gather_BeginCustomPlan,
+	vci_gather_ExecCustomPlan,
+	vci_gather_EndCustomPlan,
+	vci_gather_ReScanCustomPlan,
+	vci_gather_MarkPosCustomPlan,
+	vci_gather_RestrPosCustomPlan,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	NULL
+};
diff --git a/contrib/vci/executor/vci_param.c b/contrib/vci/executor/vci_param.c
new file mode 100644
index 000000000000..51cc4687937a
--- /dev/null
+++ b/contrib/vci/executor/vci_param.c
@@ -0,0 +1,60 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_param.c
+ *	  Routines to handle VCI Param Expr node
+ *
+ * Param evaluation may execute ExecSetParamPlan() only the first time to execute the
+ * subquery and receive and return the result, but parallel workers in parallel execution
+ * may not be able to execute the subquery. To avoid this, the parallel worker asks
+ * the main backend process to execute ExecSetParamPlan() on its behalf.
+ *
+ * Therefore, Param is converted to dedicated VciParamState.
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/vci/executor/vci_param.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+#include "executor/execExpr.h"
+#include "executor/nodeSubplan.h"
+#include "nodes/execnodes.h"
+#include "nodes/primnodes.h"
+
+#include "vci.h"
+#include "vci_executor.h"
+
+/**
+ * Evaluation function for a PARAM_EXEC parameter
+ *
+ * The value and NULL flag of the parameter are stored into *op->resvalue and
+ * *op->resnull. Nothing is returned.
+ *
+ * @param[in] exprstate expression state the step belongs to (not referenced here)
+ * @param[in,out] op expression step; supplies d.param.paramid, receives the result
+ * @param[in] econtext execution context holding ecxt_param_exec_vals
+ */
+void
+VciExecEvalParamExec(ExprState *exprstate, ExprEvalStep *op, ExprContext *econtext)
+{
+	/*
+	 * PARAM_EXEC params (internal executor parameters) are stored in the
+	 * ecxt_param_exec_vals array, and can be accessed by array index.
+	 */
+	ParamExecData *param = &econtext->ecxt_param_exec_vals[op->d.param.paramid];
+
+	/* A pending execPlan means the parameter has not been computed yet */
+	if (param->execPlan != NULL)
+	{
+		ExecSetParamPlan(param->execPlan, econtext);
+
+		/* ExecSetParamPlan should have processed this param... */
+		Assert(param->execPlan == NULL);
+	}
+
+	*op->resnull = param->isnull;
+	*op->resvalue = param->value;
+}
diff --git a/contrib/vci/executor/vci_plan.c b/contrib/vci/executor/vci_plan.c
new file mode 100644
index 000000000000..dcc788026555
--- /dev/null
+++ b/contrib/vci/executor/vci_plan.c
@@ -0,0 +1,235 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_plan.c
+ *	  Common processing for VCI plan nodes
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "executor/instrument.h"
+#include "executor/nodeSubplan.h"
+#include "nodes/bitmapset.h"
+#include "nodes/execnodes.h"
+#include "nodes/makefuncs.h"	/* for makeVarFromTargetEntry() */
+#include "nodes/nodes.h"
+#include "nodes/pg_list.h"
+#include "nodes/plannodes.h"
+
+#include "vci.h"
+#include "vci_executor.h"
+
+static VciScan *search_scan(Plan *node, AttrNumber scan_plan_no);
+static VciScanState *search_scan_state(PlanState *node, Plan *target);
+
+/**
+ * Determine if given plan node is CustomPlan
+ *
+ * Both T_CustomScan and T_CustomPlanMarkPos count as CustomPlan.
+ *
+ * @param[in] plan plan node (must not be NULL)
+ * @return true if CustomPlan, else false
+ */
+bool
+vci_is_custom_plan(Plan *plan)
+{
+	NodeTag		tag = nodeTag(plan);
+
+	return (tag == T_CustomScan) || (tag == T_CustomPlanMarkPos);
+}
+
+/**
+ * Returns type of VCI plan node with VCI_CUSTOMPLAN_XXX macro
+ *
+ * The type is taken from the flags of the CustomScan node, masked with
+ * VCI_CUSTOMPLAN_MASK.
+ *
+ * @param[in] plan plan node (may be NULL)
+ * @retval 0 not VCI plan node
+ * @retval non 0 is a VCI plan node
+ */
+int
+vci_get_vci_plan_type(Plan *plan)
+{
+	if (plan == NULL || !vci_is_custom_plan(plan))
+		return 0;
+
+	return ((CustomScan *) plan)->flags & VCI_CUSTOMPLAN_MASK;
+}
+
+/**
+ * Copy only the basic part of the VCI-derived plan nodes given to src to dest.
+ *
+ * @param[out] dest Copy destination
+ * @param[in] src Copy source
+ */
+void
+vci_copy_plan(VciPlan *dest, const VciPlan *src)
+{
+	/* Do not copy scan_cached */
+	dest->scan_cached = NULL;
+
+	dest->scan_plan_no = src->scan_plan_no;
+}
+
+/**
+ * Search and return VCI Scan node that is the source of data input for the VCI plan node
+ *
+ * The result is remembered in node->scan_cached, so the plan tree is only
+ * searched until a scan has been found once.
+ *
+ * @param[in] node Pointer to the VCI plan that serves as search starting point
+ * @return Pointer to VCI Scan plan, or NULL if scan_plan_no is 0 or no node matches
+ */
+VciScan *
+vci_search_scan(VciPlan *node)
+{
+	if (node->scan_cached)
+		return node->scan_cached;
+
+	/* 0 means that no VCI Scan is associated with this node */
+	if (node->scan_plan_no == 0)
+		return NULL;
+
+	/* scan_cached is known to be NULL here, so store the outcome directly */
+	node->scan_cached = search_scan(&node->cscan.scan.plan, node->scan_plan_no);
+
+	return node->scan_cached;
+}
+
+/**
+ * Subroutine for vci_search_scan()
+ *
+ * Recursively descend and search for VCI Scan nodes. The node itself is
+ * tested first, then the outer subtree, then the inner subtree.
+ */
+static VciScan *
+search_scan(Plan *node, AttrNumber scan_plan_no)
+{
+	Plan	   *children[2];
+	int			side;
+
+	if (node->plan_no == scan_plan_no)
+		return (VciScan *) node;
+
+	children[0] = outerPlan(node);
+	children[1] = innerPlan(node);
+
+	for (side = 0; side < 2; side++)
+	{
+		VciScan    *found;
+
+		if (children[side] == NULL)
+			continue;
+
+		found = search_scan(children[side], scan_plan_no);
+		if (found != NULL)
+			return found;
+	}
+
+	/*
+	 * Some types of plan nodes have plans other than outerPlan and innerPlan,
+	 * but they do not contain VCI Scan nodes.
+	 */
+
+	return NULL;
+}
+
+/**
+ * Search and return VCI Scan State node that is the source of data input for the VCI plan state node
+ *
+ * The result is remembered in node->scanstate_cached.
+ *
+ * @param[in] node Pointer to the VCI plan state node that serves as search starting point
+ * @return Pointer to VCI Scan plan state node, or NULL if none is found
+ */
+VciScanState *
+vci_search_scan_state(VciPlanState *node)
+{
+	VciScan    *target;
+
+	if (node->scanstate_cached)
+		return node->scanstate_cached;
+
+	/* Find the scan in the plan tree first, then its state node */
+	target = vci_search_scan((VciPlan *) node->css.ss.ps.plan);
+	if (target == NULL)
+		return NULL;
+
+	/* scanstate_cached is known to be NULL here, so store the outcome directly */
+	node->scanstate_cached = search_scan_state(&node->css.ss.ps, &target->vci.cscan.scan.plan);
+
+	return node->scanstate_cached;
+}
+
+/**
+ * Subroutine for vci_search_scan_state()
+ *
+ * Recursively descend and search for VCI Scan state nodes. A state node
+ * matches when its plan pointer is the target plan.
+ */
+static VciScanState *
+search_scan_state(PlanState *node, Plan *target)
+{
+	PlanState  *children[2];
+	int			side;
+
+	if (node->plan == target)
+	{
+		Assert(node->type == T_CustomScanState);
+		return (VciScanState *) node;
+	}
+
+	children[0] = outerPlanState(node);
+	children[1] = innerPlanState(node);
+
+	for (side = 0; side < 2; side++)
+	{
+		VciScanState *found;
+
+		if (children[side] == NULL)
+			continue;
+
+		found = search_scan_state(children[side], target);
+		if (found != NULL)
+			return found;
+	}
+
+	/*
+	 * Depending on the type of Plan State, some may have Plan States other
+	 * than outerPlanState and innerPlanState, but they do not have VCI Scan
+	 * State.
+	 */
+
+	return NULL;
+}
+
+/**
+ * Create a target list that pass through the lower nodes required for
+ * Materialize node.
+ *
+ * Each entry of the result is a copy of the corresponding input entry whose
+ * expression is replaced by an OUTER_VAR Var made from that input entry.
+ *
+ * @param[in] targetlist target list
+ * @return created pass through target list
+ */
+List *
+vci_generate_pass_through_target_list(List *targetlist)
+{
+	List	   *pass_through = NIL;
+	ListCell   *cell;
+
+	foreach(cell, targetlist)
+	{
+		TargetEntry *orig_tle = (TargetEntry *) lfirst(cell);
+		TargetEntry *copy_tle = makeNode(TargetEntry);
+
+		/* Take over every field, then point the expression at the child's output */
+		*copy_tle = *orig_tle;
+		copy_tle->expr = (Expr *) makeVarFromTargetEntry(OUTER_VAR, orig_tle);
+
+		pass_through = lappend(pass_through, copy_tle);
+	}
+
+	return pass_through;
+}
diff --git a/contrib/vci/executor/vci_plan_func.c b/contrib/vci/executor/vci_plan_func.c
new file mode 100644
index 000000000000..78cfc4cb0b11
--- /dev/null
+++ b/contrib/vci/executor/vci_plan_func.c
@@ -0,0 +1,950 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_plan_func.c
+ *	  General-purpose manipulations of plan trees
+ *
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/vci/executor/vci_plan_func.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "executor/executor.h"
+#include "nodes/bitmapset.h"
+#include "nodes/nodeFuncs.h"
+#include "nodes/pg_list.h"
+#include "nodes/plannodes.h"
+
+#include "vci.h"
+#include "vci_executor.h"
+
+static bool expression_walker_core(Plan *plan, bool (*walker) (Node *, void *), bool (*walker_initplan) (Node *, void *), void (*attr_cb) (AttrNumber *, void *), void *context);
+static bool subplan_mutator(PlannedStmt *plannedstmt, Plan **plan_p, int plan_id, vci_mutator_t mutator, vci_topmost_plan_cb_t topmostplan, void *context, int eflags, bool *changed);
+static bool plan_tree_mutator(Plan **edge, Plan *plan, vci_mutator_t mutator, void *context, int eflags, bool *changed);
+static bool plan_list_tree_mutator(List **plan_list, Plan *plan, vci_mutator_t mutator, void *context, int eflags, bool *changed);
+
+/*---------------------------------------------------------------------------*/
+/* Plan walker */
+/*---------------------------------------------------------------------------*/
+
+/**
+ * Helper function that traverse plan tree without updating it
+ *
+ * The subplans are visited first, in list order, followed by the main plan
+ * tree. The plan_id handed to topmostplan is the 1-based position of the
+ * subplan in plannedstmt->subplans, and 0 for the main plan tree. NULL
+ * entries of the subplan list are skipped but still occupy their position.
+ *
+ * @param[in] plannedstmt Pointer to PlannedStmt type struct that holds the plan tree to be traversed
+ * @param[in] walker Callback function to be used in traverse. Returns true to stop cycle.
+ * @param[in] topmostplan Callback function to call before analyzing Topmost plan node
+ * @param[in,out] context Pointer to arbitrary data to pass to callback function
+ *
+ * @return true when callback function stop cycle, false if cycle is complete
+ */
+bool
+vci_plannedstmt_tree_walker(PlannedStmt *plannedstmt, bool (*walker) (Plan *, void *), vci_topmost_plan_cb_t topmostplan, void *context)
+{
+	int			plan_id;
+	ListCell   *l;
+
+	if (plannedstmt == NULL)
+		return false;
+
+	plan_id = 0;
+	foreach(l, plannedstmt->subplans)
+	{
+		Plan	   *subplan = (Plan *) lfirst(l);
+
+		/*
+		 * Count every list position, including NULL entries. Otherwise all
+		 * subplans behind a NULL entry would be reported with a plan_id that
+		 * is too small.
+		 */
+		plan_id++;
+
+		if (subplan == NULL)
+			continue;
+
+		if (topmostplan)
+			topmostplan(subplan, plan_id, context);
+
+		if (walker(subplan, context))
+			return true;
+	}
+
+	if (plannedstmt->planTree)
+	{
+		if (topmostplan)
+			topmostplan(plannedstmt->planTree, 0 /* plan_id */ , context);
+
+		if (walker(plannedstmt->planTree, context))
+			return true;
+	}
+
+	return false;
+}
+
+/**
+ * Helper function that traverse plan node without updating it
+ *
+ * @param[in] plan Pointer to Plan type struct that holds the plan node to be traversed
+ * @param[in] walker Callback function to be used in traverse. Returns true to stop cycle.
+ * @param[in,out] context Pointer to arbitrary data to pass to callback function
+ *
+ * @return true when callback function stop cycle, false if cycle is complete
+ */
+bool
+vci_plan_tree_walker(Plan *plan, bool (*walker) (Plan *, void *), void *context)
+{
+	List	   *child_plans = NIL;	/* list-valued children of this node */
+	Plan	   *child_plan = NULL;	/* single extra child of this node */
+	ListCell   *cell;
+
+	/* Pick up the children that are not reachable as outer/inner plan */
+	switch (nodeTag(plan))
+	{
+		case T_ForeignScan:
+		case T_ModifyTable:
+		case T_LockRows:
+			/* Reported like a "stop" request from the walker */
+			elog(DEBUG1, "unsupported node type: %s(%d)",
+				 VciGetNodeName(nodeTag(plan)), (int) nodeTag(plan));
+			return true;
+
+		case T_Append:
+			child_plans = ((Append *) plan)->appendplans;
+			break;
+
+		case T_MergeAppend:
+			child_plans = ((MergeAppend *) plan)->mergeplans;
+			break;
+
+		case T_BitmapAnd:
+			child_plans = ((BitmapAnd *) plan)->bitmapplans;
+			break;
+
+		case T_BitmapOr:
+			child_plans = ((BitmapOr *) plan)->bitmapplans;
+			break;
+
+		case T_SubqueryScan:
+			child_plan = ((SubqueryScan *) plan)->subplan;
+			break;
+
+		default:
+			break;
+	}
+
+	/* At most one of child_plans / child_plan is set */
+	foreach(cell, child_plans)
+	{
+		if (walker((Plan *) lfirst(cell), context))
+			return true;
+	}
+
+	if (child_plan != NULL && walker(child_plan, context))
+		return true;
+
+	/* Outer subtree first, then inner subtree */
+	if (outerPlan(plan) && walker(outerPlan(plan), context))
+		return true;
+
+	if (innerPlan(plan) && walker(innerPlan(plan), context))
+		return true;
+
+	return false;
+}
+
+/**
+ * Helper function that traverse expression tree in plan node without updating it
+ *
+ * @param[in] plan Pointer to Plan type struct that holds the plan node to be traversed
+ * @param[in] walker Callback function to be used in traverse. Returns true to stop cycle.
+ * @param[in,out] context Pointer to arbitrary data to pass to callback function
+ *
+ * @return true when callback function stop cycle, false if cycle is complete
+ */
+bool
+vci_expression_walker(Plan *plan, bool (*walker) (Node *, void *), void *context)
+{
+	/* Neither the initPlan list nor the column-number arrays are visited */
+	return expression_walker_core(plan,
+								  walker,
+								  NULL /* walker_initplan */ ,
+								  NULL /* attr_cb */ ,
+								  context);
+}
+
+/**
+ * Helper function that traverse expression tree in plan node without updating it
+ * If there is attribute information (AttrNumber) other than Var node included in plan node,
+ * attr_cb is executed.
+ *
+ * The initPlan list of the plan node is traversed as well, using the same
+ * walker as the other expressions.
+ *
+ * @param[in] plan Pointer to Plan type struct that holds the plan node to be traversed
+ * @param[in] walker Callback function to be used in traverse. Returns true to stop cycle.
+ * @param[in] attr_cb Callback function to be called when attribute (column) other than Var exists
+ * @param[in,out] context Pointer to arbitrary data to pass to callback function
+ *
+ * @return true when callback function stop cycle, false if cycle is complete
+ */
+bool
+vci_expression_and_colid_walker(Plan *plan, bool (*walker) (Node *, void *), void (*attr_cb) (AttrNumber *, void *), void *context)
+{
+	return expression_walker_core(plan,
+								  walker,
+								  walker /* walker_initplan */ ,
+								  attr_cb,
+								  context);
+}
+
+/**
+ * Helper function that traverse expression tree in plan node without updating it
+ * Run walker_initplan if there is an initPlan associated with the plan node.
+ *
+ * @param[in] plan Pointer to Plan type struct that holds the plan node to be traversed
+ * @param[in] walker Callback function to be used in traverse. Returns true to stop cycle.
+ * @param[in] walker_initplan Callback function to be used in initPlan traverse
+ * @param[in,out] context Pointer to arbitrary data to pass to callback function
+ *
+ * @return true when callback function stop cycle, false if cycle is complete
+ */
+bool
+vci_expression_and_initplan_walker(Plan *plan, bool (*walker) (Node *, void *), bool (*walker_initplan) (Node *, void *), void *context)
+{
+	return expression_walker_core(plan, walker, walker_initplan, NULL, context);
+}
+
+/*
+ * Common implementation of the vci_expression_*walker() functions.
+ *
+ * Visits, in this order: the initPlan list (only if walker_initplan is
+ * given), the expressions that are specific to the node type, the qual and
+ * the target list. If attr_cb is given, it is called with the address of
+ * every column number kept in the node-specific arrays (sort, group,
+ * partition, ... columns).
+ *
+ * expression_tree_walker() calls the walker only for the children of the
+ * node it is given. That is what is wanted for the list-valued fields, whose
+ * children are the individual expressions. Fields that hold one expression
+ * (Limit's offset and count, NestLoopParam's paramval) are handed to the
+ * walker directly, so that their top node is examined too.
+ *
+ * Returns true as soon as a walker returns true, and also for the node types
+ * that are not supported (ForeignScan, ModifyTable, LockRows); false when
+ * everything has been visited.
+ */
+static bool
+expression_walker_core(Plan *plan, bool (*walker) (Node *, void *), bool (*walker_initplan) (Node *, void *), void (*attr_cb) (AttrNumber *, void *), void *context)
+{
+	if (walker_initplan)
+	{
+		if (expression_tree_walker((Node *) plan->initPlan, walker_initplan, context))
+			return true;
+	}
+
+	switch (nodeTag(plan))
+	{
+		case T_Result:
+			{
+				Result	   *result = (Result *) plan;
+
+				if (expression_tree_walker((Node *) result->resconstantqual, walker, context))
+					return true;
+			}
+			break;
+
+		case T_MergeAppend:
+			if (attr_cb)
+			{
+				MergeAppend *merge_append = (MergeAppend *) plan;
+				int			i;
+
+				for (i = 0; i < merge_append->numCols; i++)
+					attr_cb(&merge_append->sortColIdx[i], context);
+
+			}
+			break;
+
+		case T_RecursiveUnion:
+			if (attr_cb)
+			{
+				RecursiveUnion *recursive_union = (RecursiveUnion *) plan;
+				int			i;
+
+				for (i = 0; i < recursive_union->numCols; i++)
+					attr_cb(&recursive_union->dupColIdx[i], context);
+			}
+			break;
+
+		case T_IndexScan:
+			{
+				IndexScan  *index_scan = (IndexScan *) plan;
+
+				if (expression_tree_walker((Node *) index_scan->indexqual, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) index_scan->indexqualorig, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) index_scan->indexorderby, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) index_scan->indexorderbyorig, walker, context))
+					return true;
+			}
+			break;
+
+		case T_IndexOnlyScan:
+			{
+				IndexOnlyScan *index_only_scan = (IndexOnlyScan *) plan;
+
+				if (expression_tree_walker((Node *) index_only_scan->indexqual, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) index_only_scan->indexorderby, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) index_only_scan->indextlist, walker, context))
+					return true;
+			}
+			break;
+
+		case T_BitmapIndexScan:
+			{
+				BitmapIndexScan *bitmap_index_scan = (BitmapIndexScan *) plan;
+
+				if (expression_tree_walker((Node *) bitmap_index_scan->indexqual, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) bitmap_index_scan->indexqualorig, walker, context))
+					return true;
+			}
+			break;
+
+		case T_BitmapHeapScan:
+			{
+				BitmapHeapScan *bitmap_heap_scan = (BitmapHeapScan *) plan;
+
+				if (expression_tree_walker((Node *) bitmap_heap_scan->bitmapqualorig, walker, context))
+					return true;
+			}
+			break;
+
+		case T_TidScan:
+			{
+				TidScan    *tid_scan = (TidScan *) plan;
+
+				if (expression_tree_walker((Node *) tid_scan->tidquals, walker, context))
+					return true;
+			}
+			break;
+
+		case T_TidRangeScan:
+			{
+				TidRangeScan *tid_range_scan = (TidRangeScan *) plan;
+
+				if (expression_tree_walker((Node *) tid_range_scan->tidrangequals, walker, context))
+					return true;
+			}
+			break;
+
+		case T_FunctionScan:
+			{
+				FunctionScan *func_scan = (FunctionScan *) plan;
+
+				if (expression_tree_walker((Node *) func_scan->functions, walker, context))
+					return true;
+			}
+			break;
+
+		case T_ValuesScan:
+			{
+				ValuesScan *values_scan = (ValuesScan *) plan;
+
+				if (expression_tree_walker((Node *) values_scan->values_lists, walker, context))
+					return true;
+			}
+			break;
+
+		case T_CteScan:
+			break;
+
+		case T_WorkTableScan:
+			break;
+
+		case T_NestLoop:
+			{
+				NestLoop   *nest_loop = (NestLoop *) plan;
+				ListCell   *lc;
+
+				if (expression_tree_walker((Node *) nest_loop->join.joinqual, walker, context))
+					return true;
+
+				foreach(lc, nest_loop->nestParams)
+				{
+					NestLoopParam *nlp = (NestLoopParam *) lfirst(lc);
+
+					if (walker((Node *) nlp->paramval, context))
+						return true;
+				}
+			}
+			break;
+
+		case T_Memoize:
+			{
+				Memoize    *memoize = (Memoize *) plan;
+
+				if (expression_tree_walker((Node *) memoize->param_exprs, walker, context))
+					return true;
+			}
+			break;
+
+		case T_MergeJoin:
+			{
+				MergeJoin  *merge_join = (MergeJoin *) plan;
+
+				if (expression_tree_walker((Node *) merge_join->join.joinqual, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) merge_join->mergeclauses, walker, context))
+					return true;
+			}
+			break;
+
+		case T_HashJoin:
+			{
+				HashJoin   *hash_join = (HashJoin *) plan;
+
+				if (expression_tree_walker((Node *) hash_join->join.joinqual, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) hash_join->hashclauses, walker, context))
+					return true;
+
+				if (expression_tree_walker((Node *) hash_join->hashkeys, walker, context))
+					return true;
+			}
+			break;
+
+		case T_Sort:
+			if (attr_cb)
+			{
+				Sort	   *sort = (Sort *) plan;
+				int			i;
+
+				for (i = 0; i < sort->numCols; i++)
+					attr_cb(&sort->sortColIdx[i], context);
+			}
+			break;
+
+		case T_Group:
+			if (attr_cb)
+			{
+				Group	   *group = (Group *) plan;
+				int			i;
+
+				for (i = 0; i < group->numCols; i++)
+					attr_cb(&group->grpColIdx[i], context);
+			}
+			break;
+
+		case T_Agg:
+			if (attr_cb)
+			{
+				int			i;
+				Agg		   *agg = (Agg *) plan;
+
+				for (i = 0; i < agg->numCols; i++)
+					attr_cb(&agg->grpColIdx[i], context);
+			}
+			break;
+
+		case T_WindowAgg:
+			if (attr_cb)
+			{
+				int			i;
+				WindowAgg  *window_agg = (WindowAgg *) plan;
+
+				for (i = 0; i < window_agg->partNumCols; i++)
+					attr_cb(&window_agg->partColIdx[i], context);
+
+				for (i = 0; i < window_agg->ordNumCols; i++)
+					attr_cb(&window_agg->ordColIdx[i], context);
+			}
+			break;
+
+		case T_Unique:
+			if (attr_cb)
+			{
+				int			i;
+				Unique	   *unique = (Unique *) plan;
+
+				for (i = 0; i < unique->numCols; i++)
+					attr_cb(&unique->uniqColIdx[i], context);
+			}
+			break;
+
+		case T_Hash:
+			break;
+
+		case T_SetOp:
+			if (attr_cb)
+			{
+				int			i;
+				SetOp	   *setop = (SetOp *) plan;
+
+				for (i = 0; i < setop->numCols; i++)
+					attr_cb(&setop->cmpColIdx[i], context);
+			}
+			break;
+
+		case T_Limit:
+			{
+				Limit	   *limit = (Limit *) plan;
+
+				/*
+				 * limitOffset and limitCount are single expressions, not
+				 * lists. Hand them to the walker itself, otherwise their top
+				 * node (for instance a bare Param) would never be examined.
+				 * Either of them may be absent.
+				 */
+				if (limit->limitOffset != NULL &&
+					walker((Node *) limit->limitOffset, context))
+					return true;
+
+				if (limit->limitCount != NULL &&
+					walker((Node *) limit->limitCount, context))
+					return true;
+			}
+			break;
+
+		case T_CustomScan:
+		case T_CustomPlanMarkPos:
+			/* No node-specific expressions are visited for any VCI plan type */
+			switch (vci_get_vci_plan_type(plan))
+			{
+				case VCI_CUSTOMPLAN_SCAN:
+				case VCI_CUSTOMPLAN_SORT:
+				case VCI_CUSTOMPLAN_AGG:
+				case VCI_CUSTOMPLAN_GATHER:
+					break;
+
+				default:
+					break;
+			}
+			break;
+
+		case T_ForeignScan:
+		case T_ModifyTable:
+		case T_LockRows:
+			elog(DEBUG1, "unsupported node type: %s(%d)",
+				 VciGetNodeName(nodeTag(plan)), (int) nodeTag(plan));
+			return true;
+
+		default:
+			break;
+	}
+
+	if (expression_tree_walker((Node *) plan->qual, walker, context))
+		return true;
+
+	if (expression_tree_walker((Node *) plan->targetlist, walker, context))
+		return true;
+
+	/* Success */
+	return false;
+}
+
+/*---------------------------------------------------------------------------*/
+/* Plan mutator */
+/*---------------------------------------------------------------------------*/
+
+/**
+ * Rewrite each plan node in PlannedStmt according to conditions from mutator
+ *
+ * @param[in,out] plannedstmt Pointer to PlannedStmt type struct containing plan tree to be rewritten
+ * @param[in] mutator Callback function to be used in rewrite
+ * @param[in] topmostplan Callback function to be called before parsing Topmost plan
+ * @param[in,out] context Pointer to arbitrary data to pass to callback function
+ * @param[in] eflags Specify same eflags value as the one passed when plan tree starts ExecutorStart()
+ * @param[out] changed Write true to *changed if any rewrite is done. Do nothing if not.
+ * + * @return true when callback function stop cycle, false if cycle is complete + * + * @note rewrite is executed in in-place + */ +bool +vci_plannedstmt_tree_mutator(PlannedStmt *plannedstmt, vci_mutator_t mutator, vci_topmost_plan_cb_t topmostplan, void *context, int eflags, bool *changed) +{ + int i; + ListCell *l; + + if (plannedstmt == NULL) + return false; + + i = 1; + foreach(l, plannedstmt->subplans) + { + Plan **plan_p = (Plan **) &lfirst(l); + + if (*plan_p == NULL) + continue; + + if (subplan_mutator(plannedstmt, plan_p, i, mutator, topmostplan, context, eflags, changed)) + return true; + + i++; + } + + if (plannedstmt->planTree) + { + Plan *oldplan; + Plan *newplan; + + oldplan = newplan = plannedstmt->planTree; + + if (topmostplan) + topmostplan(oldplan, 0 /* plan_id */ , context); + + if (mutator(&newplan, NULL, context, eflags, changed)) + return true; + + if (newplan != oldplan) + plannedstmt->planTree = newplan; + + if (topmostplan) + topmostplan(newplan, 0 /* plan_id */ , context); + } + + return false; +} + +/** + * Rewrite plan node in PlannedStmt according to conditions from mutator. But, specify the rewrite order of subplan. + * + * @param[in,out] plannedstmt Pointer to PlannedStmt type struct containing plan tree to be rewritten + * @param[in] mutator Callback function to be used in rewrite + * @param[in] topmostplan Callback function to be called before parsing Topmost plan + * @param[in,out] context Pointer to arbitrary data to pass to callback function + * @param[in] eflags Specify same eflags value as the one passed when plan tree starts ExecutorStart() + * @param[out] changed Write true to *changed if any rewrite is done. Do nothing if not. 
+ * @param[in] subplan_order Array of ID in order of subplan to be parsed(including main plan) + * + * @return true when callback function stops cycle, false if cycle is complete + * + * @note rewrite is executed in in-place + */ +bool +vci_plannedstmt_tree_mutator_order(PlannedStmt *plannedstmt, vci_mutator_t mutator, vci_topmost_plan_cb_t topmostplan, void *context, int eflags, bool *changed, int *subplan_order) +{ + int i; + int max_subplans; + bool mainplan_changed = false, + subplans_changed = false; + Plan **subplan_array; + List *subplans = NIL; + ListCell *l; + + if (plannedstmt == NULL) + return false; + + max_subplans = list_length(plannedstmt->subplans); + + subplan_array = palloc0(sizeof(Plan *) * max_subplans); + + i = 0; + foreach(l, plannedstmt->subplans) + subplan_array[i++] = (Plan *) lfirst(l); + + for (i = 0; i < max_subplans + 1; i++) + { + int plan_id = subplan_order[i]; + + if (plan_id == 0) + { + Plan *oldplan; + Plan *newplan; + + oldplan = newplan = plannedstmt->planTree; + + if (topmostplan) + topmostplan(oldplan, 0 /* plan_id */ , context); + + if (mutator(&newplan, NULL, context, eflags, &mainplan_changed)) + return true; + + if (newplan != oldplan) + plannedstmt->planTree = newplan; + + if (topmostplan) + topmostplan(newplan, 0 /* plan_id */ , context); + } + else + { + Plan **plan_p = &subplan_array[plan_id - 1]; + + if (*plan_p == NULL) + continue; + + if (subplan_mutator(plannedstmt, plan_p, plan_id, mutator, topmostplan, context, eflags, &subplans_changed)) + return true; + } + } + + *changed = mainplan_changed || subplans_changed; + + if (subplans_changed) + { + for (i = 0; i < max_subplans; i++) + subplans = lappend(subplans, subplan_array[i]); + + plannedstmt->subplans = subplans; + } + + pfree(subplan_array); + + return false; +} + +static bool +subplan_mutator(PlannedStmt *plannedstmt, Plan **plan_p, int plan_id, vci_mutator_t mutator, vci_topmost_plan_cb_t topmostplan, void *context, int eflags, bool *changed) +{ + int 
sp_eflags; + Plan *oldplan; + Plan *newplan; + + /* + * A subplan will never need to do BACKWARD scan nor MARK/RESTORE. If it + * is a parameterless subplan (not initplan), we suggest that it be + * prepared to handle REWIND efficiently; otherwise there is no need. + */ + sp_eflags = eflags + & (EXEC_FLAG_EXPLAIN_ONLY | EXEC_FLAG_WITH_NO_DATA); + if (bms_is_member(plan_id, plannedstmt->rewindPlanIDs)) + sp_eflags |= EXEC_FLAG_REWIND; + + oldplan = newplan = *plan_p; + + if (topmostplan) + topmostplan(oldplan, plan_id, context); + + if (mutator(&newplan, NULL, context, sp_eflags, changed)) + return true; + + if (newplan != oldplan) + { + *plan_p = (void *) newplan; + *changed = true; + } + + if (topmostplan) + topmostplan(newplan, plan_id, context); + + return false; +} + +/** + * Rewrite nodes under plan + * (Do not rewrite plan itself) + * + * @param[in,out] plan_p Pointer to a pointer to Plan typepe struct that holds the plan node to be rewritten + * @param[in] parent Parent plan node of plan node to rewrite. NULL is there is no parent. + * @param[in] mutator Callback function to be used in rewrite + * @param[in,out] context Pointer to arbitrary data to pass to callback function + * @param[in] eflags eflags value same as the one passed by plan tree in ExecutorStart() + * @param[out] changed Write true to *changed if any rewrite is done. Do nothing if not. 
+ *
+ * @return true when callback function stops cycle, false if cycle is complete
+ */
+bool
+vci_plan_tree_mutator(Plan **plan_p, Plan *parent, vci_mutator_t mutator, void *context, int eflags, bool *changed)
+{
+	int eflags_outer,
+		eflags_inner;
+	Plan *plan;
+
+	plan = *plan_p;
+
+	if (plan == NULL)
+		return false;
+
+	/*
+	 * Determine unsupported plan nodes. Returning true from here stops the
+	 * traversal, exactly as if the mutator callback had asked to stop.
+	 */
+	switch (nodeTag(plan))
+	{
+		case T_ForeignScan:
+		case T_ModifyTable:
+		case T_LockRows:
+			elog(DEBUG1, "unsupported node type: %s(%d)",
+				 VciGetNodeName(nodeTag(plan)), (int) nodeTag(plan));
+			return true;
+		case T_Agg:
+			{
+				if ((parent != NULL) && (nodeTag(parent) == T_Gather || nodeTag(parent) == T_GatherMerge))
+					return true;	/* An Aggregate directly below a Gather
+									 * node: skip using VCI because OSS
+									 * parallel aggregation performs better */
+			}
+			break;
+
+		case T_Gather:
+		case T_GatherMerge:
+
+			/*
+			 * For parallel aggregates, there will be two aggregate nodes:
+			 * partial and final. The Gather node could be in between these
+			 * two nodes with a Sort in between. So check whether either the
+			 * parent or the child of an Aggregate is a Gather node, e.g.:
+			 * Finalize Aggregate->Gather->Sort->Partial Aggregate
+			 *
+			 * When the parent is not an Aggregate, control intentionally
+			 * falls through to the default case, which does nothing.
+			 */
+			if ((parent != NULL) && (nodeTag(parent) == T_Agg))
+				return true;
+		default:
+			break;
+	}
+
+	eflags_outer = eflags_inner = eflags;	/* default: children inherit eflags */
+
+	switch (nodeTag(plan))	/* NOTE(review): presumably mirrors the eflags adjustments of the executor's ExecInit* routines - confirm */
+	{
+		case T_Material:
+		case T_Sort:
+			eflags_outer = eflags_inner = (eflags & ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK));
+			break;
+
+		case T_CteScan:
+			eflags_outer = eflags_inner = (eflags | EXEC_FLAG_REWIND);
+			break;
+
+		case T_MergeJoin:
+			eflags_inner = eflags | EXEC_FLAG_MARK;	/* only the inner side is asked for MARK */
+			break;
+
+		case T_NestLoop:
+			if (((NestLoop *) plan)->nestParams == NIL)
+				eflags_inner = (eflags | EXEC_FLAG_REWIND);
+			else
+				eflags_inner = (eflags & ~EXEC_FLAG_REWIND);
+			break;
+
+		case T_SetOp:
+			if (((SetOp *) plan)->strategy == SETOP_HASHED)
+				eflags_outer &= ~EXEC_FLAG_REWIND;
+			break;
+
+		default:
+			break;
+	}
+
+	if (plan_tree_mutator(&plan->lefttree, plan, mutator, context, eflags_outer, changed))
+		return true;
+
+	if (plan_tree_mutator(&plan->righttree, plan, mutator, context, eflags_inner, changed))
+		return true;
+
+	/*
+	 * Process child nodes other than lefttree and righttree that are
+	 * connected to this plan node. All of them receive eflags_outer.
+	 */
+	switch (nodeTag(plan))
+	{
+		case T_Append:
+			{
+				Append *node = (Append *) plan;
+
+				if (plan_list_tree_mutator(&node->appendplans, plan, mutator, context, eflags_outer, changed))
+					return true;
+			}
+			break;
+
+		case T_MergeAppend:
+			{
+				MergeAppend *node = (MergeAppend *) plan;
+
+				if (plan_list_tree_mutator(&node->mergeplans, plan, mutator, context, eflags_outer, changed))
+					return true;
+			}
+			break;
+
+		case T_BitmapAnd:
+			{
+				BitmapAnd *node = (BitmapAnd *) plan;
+
+				if (plan_list_tree_mutator(&node->bitmapplans, plan, mutator, context, eflags_outer, changed))
+					return true;
+			}
+			break;
+
+		case T_BitmapOr:
+			{
+				BitmapOr *node = (BitmapOr *) plan;
+
+				if (plan_list_tree_mutator(&node->bitmapplans, plan, mutator, context, eflags_outer, changed))
+					return true;
+			}
+			break;
+
+		case T_SubqueryScan:
+			{
+				SubqueryScan *node = (SubqueryScan *) plan;
+
+				if (plan_tree_mutator(&node->subplan, plan, mutator, context, eflags_outer, changed))
+					return true;
+			}
+			break;
+
+		default:
+			break;
+	}
+
+	return false;
+}
+
+/**
+ * Rewrite edge connected to plan node of interest (*plan_p)
+ *
+ * Does nothing when *plan_p is NULL; otherwise simply forwards to mutator.
+ *
+ * @param[in,out] plan_p Pointer to a pointer to Plan type struct that holds the plan node to be rewritten
+ * @param[in] parent Parent plan node of plan node to rewrite. NULL if there is no parent.
+ * @param[in] mutator Callback function to be used in rewrite
+ * @param[in,out] context Pointer to arbitrary data to pass to callback function
+ * @param[in] eflags eflags value same as the one passed by plan tree in ExecutorStart()
+ *
+ * @param[out] changed Write true to *changed if any rewrite is done. Do nothing if not.
+ *
+ * @return true when the mutator stops the cycle, false otherwise
+ */
+static bool
+plan_tree_mutator(Plan **plan_p, Plan *parent, vci_mutator_t mutator, void *context, int eflags, bool *changed)
+{
+	if (*plan_p == NULL)
+		return false;
+
+	if (mutator(plan_p, parent, context, eflags, changed))
+		return true;
+
+	return false;
+}
+
+/**
+ * Rewrite plan node list
+ *
+ * @param[in,out] plan_list Pointer to List type struct that holds list of plan node to be rewritten
+ * @param[in] parent Parent plan node of plan node list to be rewritten. NULL if no parent.
+ * @param[in] mutator Callback function to be used in rewrite
+ * @param[in,out] context Pointer to arbitrary data to pass to callback function
+ * @param[in] eflags eflags value same as the one passed by plan tree in ExecutorStart()
+ * @param[out] changed Write true to *changed if any rewrite is done. Do nothing if not.
+ */ +static bool +plan_list_tree_mutator(List **plan_list, Plan *parent, vci_mutator_t mutator, void *context, int eflags, bool *changed) +{ + List *newlist = NIL; + List *list = *plan_list; + ListCell *lc; + bool any_changed = false; + + if (list == NIL) + return false; + + if (list_length(list) == 0) + return false; + + foreach(lc, list) + { + Plan *child = (Plan *) lfirst(lc); + + /* + * In case of List of plans, we need to verify any of the list item + * has Gather node in top-level plan.i.e., + * Appenedplans->Gather->Parallel Seq scan. If yes, plan tree walker + * cannot replace the gather node properly. So, skip re-writing VCI + * plan in such scenarios. + */ + if (newlist == NIL) /* Using this just to make this code check + * work only for the first time which is what + * needed */ + { + if (nodeTag(child) == T_Gather || nodeTag(child) == T_GatherMerge) + return true; + } + + if (plan_tree_mutator(&child, parent, mutator, context, eflags, &any_changed)) + return true; + + newlist = lappend(newlist, child); + } + + if (any_changed) + { + *plan_list = newlist; + *changed = true; + } + else + { + list_free(newlist); + } + + return false; +} diff --git a/contrib/vci/executor/vci_planner.c b/contrib/vci/executor/vci_planner.c new file mode 100644 index 000000000000..9f565882f4bf --- /dev/null +++ b/contrib/vci/executor/vci_planner.c @@ -0,0 +1,1913 @@ +/*------------------------------------------------------------------------- + * + * vci_planner.c + * Plan rewrite routine(sequential only) + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_planner.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "access/transam.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_am.h" +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include 
"executor/nodeIndexscan.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#include "optimizer/optimizer.h" +#include "optimizer/cost.h" +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" +#include "optimizer/plancat.h" +#include "optimizer/planmain.h" +#include "optimizer/planner.h" +#include "optimizer/restrictinfo.h" +#include "parser/parsetree.h" +#include "utils/fmgroids.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/snapmgr.h" +#include "utils/snapshot.h" +#include "utils/syscache.h" + +#include "vci.h" +#include "vci_columns_data.h" + +#include "vci_mem.h" +#include "vci_executor.h" +#include "vci_utils.h" +#include "vci_planner.h" +#include "vci_supported_oid.h" + +/* + * rt_fetch + * + * NB: this will crash and burn if handed an out-of-range RT index + */ +#define rt_fetch(rangetable_index, rangetable) \ + ((RangeTblEntry *) list_nth(rangetable, (rangetable_index)-1)) + +/* + * getrelid + * + * Given the range index of a relation, return the corresponding + * relation OID. Note that InvalidOid will be returned if the + * RTE is for a non-relation-type RTE. + */ +#define getrelid(rangeindex,rangetable) \ + (rt_fetch(rangeindex, rangetable)->relid) + +/** + * Used to pass auxiliary information about the table to vci_can_rewrite_custom_scan(). + */ +typedef struct +{ + /** reloid of table to be selected for rewrite */ + Oid reloid; + + /** oid of selected VCI index. InvalidOid if not rewriteable. */ + Oid indexOid; + + /** Copy reltuples of selected table */ + double estimate_tuples; + + /** Bitmap of referenced column (attribute). NULL if not rewriteable. */ + Bitmapset *attrs_used; +} vci_table_info_t; + +/** + * Used to search plan tree with vci_gather_used_attrs() and vci_gather_one_used_attr(), + * and record attributes references in tables specified by scanrelid. 
+ */ +typedef struct +{ + Index scanrelid; + + Bitmapset *attrs_used; +} vci_gather_used_attrs_t; + +/** + * Search plan tree with vci_renumber_attrs() and vci_renumber_on_attr(), and rewrite attribute number. + * Used to replace varattno in Var. + */ +typedef struct +{ + Index scanrelid; + + /** New attribute number map. newattno = attr_map[oldattno] */ + AttrNumber *attr_map; +} vci_renumber_attrs_t; + +typedef struct +{ + Plan *father_plan; + Plan *gather_plan; +} father_gather_plans; + +static bool vci_optimize_phase1(PlannedStmt *plannedstmt, vci_rewrite_plan_context_t *rp_context, int eflags); +static bool vci_rewrite_plan_tree_mutator(Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed); +static bool vci_rewrite_plan_node(Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed); +static bool vci_rewrite_scan_node_via_column_store(Plan **plan_p, Plan *parent, void *context, bool *changed); +static bool vci_insert_material_node_mutator(Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed); +static VciSort *vci_create_custom_sort(Sort *sortnode, AttrNumber scan_plan_no); +static VciAgg *vci_create_custom_agg(Agg *aggnode, AttrNumber scan_plan_no, bool suppress_vp); +static List *vci_reconstruct_qualification(Scan *scannode); +static bool vci_can_rewrite_custom_scan(Scan *scannode, List *targetlist, List *qual, Plan *parent, vci_table_info_t *result); +static Bitmapset *vci_gather_used_attrs_in_plan(Plan *plan, Index scanrelid); +static bool vci_gather_used_attrs(Node *node, void *context); +static void vci_gather_one_used_attr(AttrNumber *attr_p, void *context); +static void vci_minimize_tlist_of_scan(Scan *scannode, Plan *parent, Index parent_refer_relid, Bitmapset *attrs_used_from_parent); +static VciScan *vci_create_custom_scan_via_column_store(Scan *scannode, const vci_table_info_t *table_info, List *tlist, List *qual, bool suppress_vp); + +static bool vci_contain_inapplicable_expr_walker(Node *node, void 
*context);
+static bool vci_contain_nestloop_param_expr_walker(Node *node, void *context);
+static bool vci_renumber_attrs(Node *node, void *context);
+static void vci_renumber_one_attr(AttrNumber *attr_p, void *context);
+static bool vci_tlist_consists_of_only_simple_vars(List *tlist, Index scanrelid);
+
+static AttrNumber vci_satisfies_vci_join(vci_rewrite_plan_context_t *rp_context, Join *join);
+
+static bool vci_is_supported_operation(Oid oid);
+static bool vci_is_not_user_defined_type(Oid oid);
+
+static void vci_update_plan_tree(PlannedStmt *plannedstmt);
+static List *vci_update_target_list(Plan *plan, Plan *gather_plan);
+
+static bool vci_update_plan_walker(Plan *plan, void *plans);
+
+/**
+ * Attempt to rewrite the plan and return the rewritten planned stmt if
+ * successful.
+ *
+ * The rewrite works on a copy of src; src itself is not modified here.
+ *
+ * @param[in] src original planned stmt
+ * @param[in] eflags flag to be passed to ExecInitNode
+ * @param[in] snapshot snapshot
+ *
+ * @retval non NULL plan after rewrite
+ * @retval NULL rewrite failed (plan judged unsuitable by the pre-analysis,
+ *              or nothing was rewritten)
+ */
+PlannedStmt *
+vci_generate_custom_plan(PlannedStmt *src, int eflags, Snapshot snapshot)
+{
+	int i,
+		nParamExec;
+	bool changed,
+		dummy;	/* dummy: out-parameter of the Material pass, never read */
+	bool isGather = false;
+	PlannedStmt *target;
+	vci_rewrite_plan_context_t rp_context;
+
+	vci_register_applicable_udf(snapshot);
+
+	target = copyObjectImpl(src);
+
+	/*
+	 * Initialize plan rewrite information. There is one subplan slot per
+	 * entry of target->subplans plus one extra slot; plan_attr_map starts
+	 * with room for 16 entries.
+	 */
+	memset(&rp_context, 0, sizeof(rp_context));
+
+	rp_context.plannedstmt = target;
+	rp_context.max_subplan_attrs = list_length(target->subplans) + 1;
+	rp_context.subplan_attr_map = palloc0(sizeof(vci_subplan_attr_t) * rp_context.max_subplan_attrs);
+	rp_context.subplan_order_array = palloc(sizeof(int) * rp_context.max_subplan_attrs);
+	rp_context.max_plan_attrs = 16;
+	rp_context.plan_attr_map = palloc0(sizeof(vci_plan_attr_t) * rp_context.max_plan_attrs);
+	rp_context.last_plan_no = 0;
+	nParamExec = list_length(target->paramExecTypes);
+	rp_context.param_exec_attr_map = palloc0(sizeof(vci_param_exec_attr_t) * nParamExec);
+
+	for (i = 0; i < rp_context.max_subplan_attrs; i++)
+		rp_context.subplan_order_array[i] = i;	/* initial order: 0, 1, 2, ... */
+
+	/*
+	 * Preparing for analysis. A true result means the plan is not suitable
+	 * for rewriting.
+	 */
+	if (vci_preanalyze_plan_tree(target, &rp_context, eflags, &isGather))
+	{
+		elog(DEBUG1, "Not suitable plan");
+		return NULL;
+	}
+
+	/* Adjust plan tree by moving oss gather plan */
+	if (isGather)
+		vci_update_plan_tree(target);
+
+	/*
+	 * Phase 1: Basic VCI plan rewrite
+	 */
+	changed = vci_optimize_phase1(target, &rp_context, eflags);
+
+	if (!changed)
+	{
+		elog(DEBUG1, "No plan to be rewritten");
+		return NULL;
+	}
+
+	/*
+	 * VCI plan nodes do not support backward scan and mark/restore, so
+	 * insert a Material node if eflags needs them. The return value of this
+	 * pass is not checked.
+	 */
+	vci_plannedstmt_tree_mutator(target, vci_insert_material_node_mutator, vci_register_plan_id, &rp_context, eflags, &dummy);
+
+	/* Disable community parallelism */
+	/* target->parallelModeNeeded=0; */
+
+	elog(DEBUG1, "Rewrite plan tree");
+
+	return target;
+}
+
+/*==========================================================================*/
+/* Plan rewrite                                                             */
+/*==========================================================================*/
+
+/**
+ * Basic part of VCI plan rewrite
+ *
+ * Runs vci_rewrite_plan_tree_mutator over the main plan and the subplans in
+ * the order given by rp_context->subplan_order_array.
+ *
+ * @param[in] plannedstmt plan
+ * @param[in,out] rp_context Plan rewrite information
+ * @param[in] eflags flag to be passed to ExecInitNode
+ *
+ * @return true if at least one node was rewritten; false if the traversal
+ *         was stopped or nothing was rewritten
+ */
+static bool
+vci_optimize_phase1(PlannedStmt *plannedstmt, vci_rewrite_plan_context_t *rp_context, int eflags)
+{
+	bool changed = false;
+
+	rp_context->forbid_parallel_exec = false;
+
+	if (vci_plannedstmt_tree_mutator_order(plannedstmt, vci_rewrite_plan_tree_mutator, vci_register_plan_id, rp_context,
+										   eflags, &changed, rp_context->subplan_order_array))
+		return false;
+
+	return changed;
+}
+
+/**
+ * Rewrite plan subtree starting with plan into VCI plan
+ *
+ * @param[in,out] plan_p Pointer to the plan subtree to start rewriting
+ * @param[in,out] parent parent plan node of plan
+ * @param[in,out] 
context additional context + * @param[in] eflags flag to be passed to ExecInitNode + * @param[out] changed write true if rewrite is executed + * + * @return true when callback function stops cycle, false if cycle is complete + */ +static bool +vci_rewrite_plan_tree_mutator(Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed) +{ + vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context; + Plan *plan; + bool saved_forbid_parallel_exec; + bool result; + + plan = *plan_p; + + saved_forbid_parallel_exec = rp_context->forbid_parallel_exec; + rp_context->forbid_parallel_exec = false; + + if (vci_plan_tree_mutator(plan_p, parent, vci_rewrite_plan_tree_mutator, context, eflags, changed)) + return true; + + result = vci_rewrite_plan_node(plan_p, parent, context, eflags, changed); + + if (rp_context->plan_attr_map[plan->plan_no].plan_compat == VCI_PLAN_COMPAT_OK) + rp_context->plan_attr_map[plan->plan_no].plan_compat = rp_context->forbid_parallel_exec ? VCI_PLAN_COMPAT_UNSUPPORTED_OBJ : VCI_PLAN_COMPAT_OK; + rp_context->forbid_parallel_exec |= saved_forbid_parallel_exec; + + return result; +} + +/** + * Rewrute plan node of *plan_p with VCI plan + * + * @param[in,out] plan_p Pointer to the plan to start rewriting + * @param[in,out] parent parent plan of plan + * @param[in,out] context additional context + * @param[in] eflags flag to be passed to ExecInitNode + * @param[out] changed write true if rewrite is executed + * + * @return true when callback function stops cycle, false if cycle is complete + */ +static bool +vci_rewrite_plan_node(Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed) +{ + vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context; + Plan *plan; + AttrNumber scan_plan_no = 0; + + plan = *plan_p; + + /* Determine if Vector processing is possible */ + rp_context->suppress_vp = false; + + if (rp_context->plan_attr_map[plan->plan_no].plan_compat != VCI_PLAN_COMPAT_OK) + { + 
rp_context->forbid_parallel_exec = true; + return false; + } + + /* Determine if there are any expression that cannot be rewritten in plan */ + if (vci_expression_walker(plan, vci_contain_inapplicable_expr_walker, context)) + { + rp_context->forbid_parallel_exec = true; + return false; + } + + switch (nodeTag(plan)) + { + default: + break; + + case T_HashJoin: + { + HashJoin *hjnode = (HashJoin *) plan; + + if (!VciGuc.enable_hashjoin) + return false; + + scan_plan_no = vci_satisfies_vci_join(rp_context, &hjnode->join); + + if (scan_plan_no == 0) + return false; + + elog(DEBUG1, "Replace VCI HashJoin"); + + *changed = true; + + vci_set_inner_plan_type_and_scan_plan_no(rp_context, plan, VCI_INNER_PLAN_TYPE_HASHJOIN, scan_plan_no); + } + break; + + case T_NestLoop: + { + NestLoop *nlnode = (NestLoop *) plan; + + if (!VciGuc.enable_nestloop) + return false; + + scan_plan_no = vci_satisfies_vci_join(rp_context, &nlnode->join); + + if (scan_plan_no == 0) + return false; + + elog(DEBUG1, "Replace VCI NestLoop"); + + *changed = true; + + vci_set_inner_plan_type_and_scan_plan_no(rp_context, plan, VCI_INNER_PLAN_TYPE_NESTLOOP, scan_plan_no); + } + break; + + case T_Sort: + { + Sort *sortnode = (Sort *) plan; + + if (!VciGuc.enable_sort) + return false; + + /* + * Can only be rewritten when outer is VCI + * Scan/HashJoin/NestLoop. VCI Agg cannot be rewritten. Sort + * plan nodes are note consecutive, so VCI Sort will not + * occur. 
+ */ + switch (vci_get_inner_plan_type(rp_context, outerPlan(plan))) + { + case VCI_INNER_PLAN_TYPE_SCAN: + case VCI_INNER_PLAN_TYPE_HASHJOIN: + case VCI_INNER_PLAN_TYPE_NESTLOOP: + /* OK */ + scan_plan_no = vci_get_inner_scan_plan_no(rp_context, outerPlan(plan)); + break; + default: + return false; + } + + Assert(scan_plan_no > 0); + + elog(DEBUG1, "Replace VCI Sort"); + + *plan_p = (Plan *) vci_create_custom_sort(sortnode, scan_plan_no); + *changed = true; + + vci_set_inner_plan_type_and_scan_plan_no(rp_context, plan, VCI_INNER_PLAN_TYPE_SORT, scan_plan_no); + } + break; + + case T_Agg: + { + Agg *aggnode = (Agg *) plan; + + switch (aggnode->aggstrategy) + { + case AGG_SORTED: + if (!VciGuc.enable_sortagg) + return false; + break; + + case AGG_HASHED: + if (!VciGuc.enable_hashagg) + return false; + break; + + case AGG_PLAIN: + if (!VciGuc.enable_plainagg) + return false; + break; + + default: + break; /* LCOV_EXCL_LINE */ + } + + switch (aggnode->aggstrategy) + { + case AGG_SORTED: + if (vci_get_inner_plan_type(rp_context, outerPlan(plan)) != VCI_INNER_PLAN_TYPE_SORT) + return false; + /* OK */ + scan_plan_no = vci_get_inner_scan_plan_no(rp_context, outerPlan(plan)); + break; + + case AGG_HASHED: + case AGG_PLAIN: + switch (vci_get_inner_plan_type(rp_context, outerPlan(plan))) + { + case VCI_INNER_PLAN_TYPE_SCAN: + case VCI_INNER_PLAN_TYPE_HASHJOIN: + case VCI_INNER_PLAN_TYPE_NESTLOOP: + /* OK */ + scan_plan_no = vci_get_inner_scan_plan_no(rp_context, outerPlan(plan)); + break; + default: + return false; + } + break; + + default: + break; /* LCOV_EXCL_LINE */ + } + + Assert(scan_plan_no > 0); + + elog(DEBUG1, "Replace VCI Agg"); + + *plan_p = (Plan *) vci_create_custom_agg(aggnode, scan_plan_no, rp_context->suppress_vp); + *changed = true; + + vci_set_inner_plan_type_and_scan_plan_no(rp_context, plan, VCI_INNER_PLAN_TYPE_AGG, scan_plan_no); + } + break; + + case T_SeqScan: + if (!VciGuc.enable_seqscan) + return false; + else + { + bool each_changed = false; + + 
switch (VciGuc.table_scan_policy) + { + case VCI_TABLE_SCAN_POLICY_COLUMN_ONLY: + if (true == vci_rewrite_scan_node_via_column_store(plan_p, parent, context, &each_changed)) + return false; + break; + + default: + break; + } + + *changed |= each_changed; + } + break; + + case T_IndexScan: + if (!VciGuc.enable_indexscan) + return false; + + if (((IndexScan *) plan)->indexorderdir != NoMovementScanDirection) + { + elog(DEBUG1, "Need sorting rows if indexscan with indexorderdir(%d) is replaced", + ((IndexScan *) plan)->indexorderdir); + return false; + } + goto process_scan_like_plan; + + case T_BitmapHeapScan: + if (!VciGuc.enable_bitmapheapscan) + return false; + + goto process_scan_like_plan; + + process_scan_like_plan: + { + if (VciGuc.table_scan_policy == VCI_TABLE_SCAN_POLICY_COLUMN_ONLY) + if (true == vci_rewrite_scan_node_via_column_store(plan_p, parent, context, changed)) + return false; + } + break; + + case T_CustomScan: + case T_CustomPlanMarkPos: + return false; + } + + return false; +} + +/** + * Get bitmap of parameters updated by SubPlan via initPlan called from + * the given plan. 
+ */
+/*
+ * vci_rewrite_scan_node_via_column_store
+ *
+ * Replace the scan node *plan_p by a VCI Scan on the column store when
+ *  - the node is not preset to need backward scan or mark/restore,
+ *  - for nodes other than SeqScan, the reconstructed qual passes
+ *    vci_contain_nestloop_param_expr_walker, and
+ *  - vci_can_rewrite_custom_scan() accepts the scan.
+ *
+ * On replacement *changed is set to true and the node is registered as
+ * VCI_INNER_PLAN_TYPE_SCAN under its own plan_no. The function returns false
+ * on every path.
+ */
+static bool
+vci_rewrite_scan_node_via_column_store(Plan **plan_p, Plan *parent, void *context, bool *changed)
+{
+	vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context;
+	Plan *orig_plan = *plan_p;
+	Scan *scan = (Scan *) orig_plan;
+	vci_table_info_t info;
+	List *scan_tlist;
+	List *scan_qual;
+	AttrNumber own_plan_no;
+	bool needs_backward_or_mark;
+
+	needs_backward_or_mark =
+		(rp_context->plan_attr_map[orig_plan->plan_no].preset_eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) != 0;
+	if (needs_backward_or_mark)
+		return false;
+
+	/* Seed the lookup request; the callee fills in the result fields */
+	info.reloid = getrelid(scan->scanrelid, rp_context->plannedstmt->rtable);
+	info.indexOid = InvalidOid;
+	info.attrs_used = NULL;
+
+	scan_tlist = scan->plan.targetlist;
+	scan_qual = vci_reconstruct_qualification(scan);
+
+	own_plan_no = orig_plan->plan_no;
+
+	if (nodeTag(orig_plan) != T_SeqScan &&
+		expression_tree_walker((Node *) scan_qual, vci_contain_nestloop_param_expr_walker, context))
+	{
+		elog(DEBUG1, "Scan's qual contains any inapplicable expression");
+		return false;
+	}
+
+	/*
+	 * Determine whether a VCI index contains the attributes accessed by the
+	 * query; if so the callee returns the OID of that index and the
+	 * bitmapset of accessed attributes in "info".
+	 */
+	if (!vci_can_rewrite_custom_scan(scan, scan_tlist, scan_qual, parent, &info))
+		return false;
+
+	/* Re-read the target list after the check before building the new node */
+	scan_tlist = scan->plan.targetlist;
+
+	elog(DEBUG1, "Replace VCI Scan [column store]: convert from %s",
+		 VciGetNodeName(nodeTag(orig_plan)));
+
+	*plan_p = (Plan *) vci_create_custom_scan_via_column_store(scan, &info, scan_tlist, scan_qual, rp_context->suppress_vp);
+	*changed = true;
+
+	vci_set_inner_plan_type_and_scan_plan_no(rp_context, orig_plan, VCI_INNER_PLAN_TYPE_SCAN, own_plan_no);
+
+	return false;
+}
+
+/**
+ * Insert Material node into the tree that has already been rewritten to VCI plan node
+ * as necessary.
+ *
+ * @param[in,out] plan_p rewritten plan tree
+ * @param[in,out] parent parent plan of plan
+ * @param[in,out] context additional context
+ * @param[in] eflags flag to be passed to ExecInitNode
+ * @param[out] changed write true if rewrite is executed
+ *
+ * @return true when callback function stop cycle, false if cycle is complete
+ *
+ * None of the VCI plan nodes support mark/restore, backward scan, or rewind (efficient scan).
+ * If they are needed, insert a Materialnode above them to handle.
+ */
+static bool
+vci_insert_material_node_mutator(Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed)
+{
+	vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context;
+	Material *newplan;
+	VciPlan *targetplan;	/* VCI node whose preset_eflags get cleared */
+	Plan *plan;
+
+	plan = *plan_p;
+
+	switch (nodeTag(plan))
+	{
+		case T_CustomScan:
+		case T_CustomPlanMarkPos:
+			switch (vci_get_inner_plan_type(rp_context, plan))
+			{
+				case VCI_INNER_PLAN_TYPE_SORT:
+
+					/*
+					 * VCI Sort node does not support EXEC_FLAG_BACKWARD and
+					 * EXEC_FLAG_MARK, so insert a Material node above it
+					 * when either flag is requested.
+					 *
+					 * VCI Sort can be used as is when only EXEC_FLAG_REWIND
+					 * is requested.
+					 */
+					if ((eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) != 0)
+					{
+						targetplan = (VciPlan *) plan;
+						goto maybe_need_material_node;
+					}
+					break;
+
+				case VCI_INNER_PLAN_TYPE_SCAN:
+				case VCI_INNER_PLAN_TYPE_AGG:
+					Assert(!(eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)));
+					/* FALLTHROUGH (intentional) */
+					/* pgr0007 */
+
+				default:
+					break;
+			}
+			break;
+
+		case T_Limit:
+			if (outerPlan(plan) && (vci_get_inner_plan_type(rp_context, outerPlan(plan)) == VCI_INNER_PLAN_TYPE_SORT))
+			{
+				if ((eflags & (EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) != 0)
+				{
+					targetplan = (VciPlan *) outerPlan(plan);
+					goto maybe_need_material_node;
+				}
+			}
+			break;
+
+		default:
+			break;
+	}
+
+	if (vci_plan_tree_mutator(plan_p, parent, vci_insert_material_node_mutator, context, eflags, changed))
+		return true;
+
+	return false;
+
+maybe_need_material_node:
+	newplan = makeNode(Material);	/* the children of *plan_p are not visited on this path */
+
+	newplan->plan.targetlist = vci_generate_pass_through_target_list(plan->targetlist);
+	newplan->plan.qual = NIL;
+	newplan->plan.lefttree = plan;	/* Material sits above *plan_p; in the T_Limit case that is the Limit, not the VCI Sort */
+	newplan->plan.plan_no = ++rp_context->last_plan_no;
+	vci_expand_plan_attr_map(rp_context);
+
+	copy_plan_costsize(&newplan->plan, plan);
+
+	newplan->plan.extParam = bms_copy(plan->extParam);
+	newplan->plan.allParam = bms_copy(plan->allParam);
+
+	newplan->plan.initPlan = plan->initPlan;	/* initPlans move up to the new top node */
+	plan->initPlan = NULL;
+
+	*plan_p = (Plan *) newplan;
+	*changed = true;
+
+	rp_context->plan_attr_map[targetplan->cscan.scan.plan.plan_no].preset_eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK);
+
+	return false;
+}
+
+/**
+ * Create VCI Sort node
+ *
+ * The new node starts as a copy of the Sort's Plan header (with the node tag
+ * changed), shares the Sort's column/operator arrays rather than copying
+ * them, and keeps a pointer to the original Sort in orig_plan.
+ *
+ * @param[in] sortnode     Sort plan node to be replaced
+ * @param[in] scan_plan_no stored as-is in the new node's scan_plan_no
+ *
+ * @return newly allocated VciSort
+ */
+static VciSort *
+vci_create_custom_sort(Sort *sortnode, AttrNumber scan_plan_no)
+{
+	VciSort *sort;
+
+	sort = (VciSort *) palloc0(sizeof(VciSort));
+
+	sort->vci.cscan.scan.plan = sortnode->plan;
+	sort->vci.cscan.scan.plan.type = T_CustomPlanMarkPos;	/* Mark restore support */
+	sort->vci.cscan.flags = VCI_CUSTOMPLAN_SORT | CUSTOMPATH_SUPPORT_BACKWARD_SCAN | CUSTOMPATH_SUPPORT_MARK_RESTORE;
+	sort->vci.cscan.custom_relids = NULL;
+	sort->vci.cscan.methods = &vci_sort_scan_methods;
+
+	sort->vci.scan_plan_no = scan_plan_no;
+	sort->vci.orig_plan = (Plan *) sortnode;
+
+	sort->numCols = sortnode->numCols;
+	sort->sortColIdx = sortnode->sortColIdx;
+	sort->sortOperators = sortnode->sortOperators;
+	sort->collations = sortnode->collations;
+	sort->nullsFirst = sortnode->nullsFirst;
+
+	return sort;
+}
+
+/**
+ * Create VCI Agg node
+ *
+ * The new node starts as a copy of the Agg's Plan header (with the node tag
+ * changed to T_CustomScan), shares the Agg's grouping arrays rather than
+ * copying them, and keeps a pointer to the original Agg in orig_plan.
+ *
+ * @param[in] aggnode      Agg plan node to be replaced
+ * @param[in] scan_plan_no stored as-is in the new node's scan_plan_no
+ * @param[in] suppress_vp  not used by this function
+ *
+ * @return newly allocated VciAgg. The scan methods are only set for the
+ *         AGG_HASHED, AGG_SORTED and AGG_PLAIN strategies; for any other
+ *         strategy they stay zeroed (NULL).
+ */
+static VciAgg *
+vci_create_custom_agg(Agg *aggnode, AttrNumber scan_plan_no, bool suppress_vp)
+{
+	VciAgg *agg;
+
+	agg = (VciAgg *) palloc0(sizeof(VciAgg));
+
+	agg->vci.cscan.scan.plan = aggnode->plan;
+	agg->vci.cscan.scan.plan.type = T_CustomScan;	/* No mark/restore
+													 * support */
+	agg->vci.cscan.flags = VCI_CUSTOMPLAN_AGG;
+	agg->vci.cscan.custom_relids = NULL;
+
+	switch (aggnode->aggstrategy)
+	{
+		case AGG_HASHED:
+			agg->vci.cscan.methods = &vci_hashagg_scan_methods;
+			break;
+
+		case AGG_SORTED:
+			agg->vci.cscan.methods = &vci_groupagg_scan_methods;
+			break;
+
+		case AGG_PLAIN:
+			agg->vci.cscan.methods = &vci_agg_scan_methods;
+			break;
+
+		default:
+			break;	/* LCOV_EXCL_LINE */
+	}
+
+	agg->vci.scan_plan_no = scan_plan_no;
+	agg->vci.orig_plan = (Plan *) aggnode;
+
+	agg->aggstrategy = aggnode->aggstrategy;
+	agg->numCols = aggnode->numCols;
+	agg->grpColIdx = aggnode->grpColIdx;
+	agg->grpOperators = aggnode->grpOperators;
+	agg->grpCollations = aggnode->grpCollations;
+	agg->numGroups = aggnode->numGroups;
+
+	return agg;
+}
+
+/**
+ * Determine if Scan plan node can be rewritten to VCI Scan
+ *
+ * Checks that there is a VCI index with all columns (attributes) to be read
+ * from the table. If there is more than one matching VCI index, the earlier one
+ * in the index list is used.
+ * + * @param[in] scannode target Scan plan node (scannode->scanrelid is the relid of + * target table; its oid is taken from table_info->reloid) + * @param[in] targetlist targetlist of target Scan plan + * @param[in] qual qual of target Scan plan + * @param[in] parent Parent plan node of target Scan plan + * @param[in,out] table_info Input information about target table and returns information obtained within this function + * + * @retval true if rewriteable, false if not + */ +static bool +vci_can_rewrite_custom_scan(Scan *scannode, List *targetlist, List *qual, Plan *parent, vci_table_info_t *table_info) +{ + Index scanrelid; + vci_gather_used_attrs_t gcontext; + int orig_natts, + opt_natts; + Relation tableRel; + double estimate_tuples; + Oid foundVciIndexOid = InvalidOid; + Bitmapset *attrs_used = NULL; + bool do_minimize_tlist = false; + Bitmapset *attrs_used_from_parent = NULL; + List *indexoidlist = NIL; + ListCell *indexoidscan; + Index parent_refer_relid = 0; + + scanrelid = scannode->scanrelid; + + gcontext.scanrelid = scanrelid; + gcontext.attrs_used = NULL; + + if (expression_tree_walker((Node *) qual, vci_gather_used_attrs, &gcontext) || + expression_tree_walker((Node *) targetlist, vci_gather_used_attrs, &gcontext)) + return false; + + attrs_used = gcontext.attrs_used; + + orig_natts = opt_natts = bms_num_members(attrs_used); + + if (orig_natts == 0) + return false; + + if (parent) + { + if ((Plan *) scannode == outerPlan(parent)) + parent_refer_relid = OUTER_VAR; + else if ((Plan *) scannode == innerPlan(parent)) + parent_refer_relid = INNER_VAR; + } + + /* + * To improve the read performance of SeqScan, PostgreSQL may sort the + * target list according to the order of columns in the heap tuple, + * including columns that are not actually referenced by the upper node. + * + * In a columnar system, such optimizations are harmful, so optimizations + * are needed to stop reading unnecessary columns. + * + * First, calculate the columns that are truly referenced from the upper + * node. 
+ * + * This optimization only looks at the next higher node. Hash does not + * work because it works in conjunction with HashJoin, which is even + * higher up. + */ + if ((parent_refer_relid != 0) && vci_tlist_consists_of_only_simple_vars(targetlist, scanrelid)) + { + switch (nodeTag(parent)) + { + case T_Agg: + case T_Group: + case T_HashJoin: + case T_MergeJoin: + case T_NestLoop: + attrs_used_from_parent = vci_gather_used_attrs_in_plan(parent, parent_refer_relid); + do_minimize_tlist = true; + break; + default: + break; + } + } + + if (do_minimize_tlist) + { + Bitmapset *new_attrs_used; + + gcontext.scanrelid = scanrelid; + gcontext.attrs_used = NULL; + + expression_tree_walker((Node *) qual, vci_gather_used_attrs, &gcontext); + + new_attrs_used = bms_add_members(gcontext.attrs_used, attrs_used_from_parent); + + /* + * Compare the attributes referenced by Scan with the attributes + * referenced by the WHERE clause and the attributes referenced by the + * parent node. + */ + if (bms_equal(attrs_used, new_attrs_used)) + { + bms_free(new_attrs_used); + bms_free(attrs_used_from_parent); + + attrs_used_from_parent = NULL; + do_minimize_tlist = false; + } + else + { + bms_free(attrs_used); + + attrs_used = new_attrs_used; + + opt_natts = bms_num_members(attrs_used); + } + } + + /* + * Lock table for index calculation + */ + tableRel = table_open(table_info->reloid, AccessShareLock); + + estimate_tuples = (double) Max(tableRel->rd_rel->reltuples, 0); + + elog(DEBUG1, "vci index: target table \"%s\"(oid=%u) tuples(rows=%.0f,extents=%u)", + NameStr(tableRel->rd_rel->relname), table_info->reloid, + estimate_tuples, (int) (estimate_tuples / VCI_NUM_ROWS_IN_EXTENT)); + + if (estimate_tuples < (double) VciGuc.table_rows_threshold) + { + elog(DEBUG1, "vci index: target table \"%s\"(oid=%u) is too few rows. 
threshold=%d", + NameStr(tableRel->rd_rel->relname), table_info->reloid, VciGuc.table_rows_threshold); + + goto done; + } + + /* + * Find the VCI index from the indexes existing in the table and check + * whether the table contains attrs_used. + */ + indexoidlist = RelationGetIndexList(tableRel); + + foreach(indexoidscan, indexoidlist) + { + Relation indexRel; + Oid indexOid; + + indexOid = lfirst_oid(indexoidscan); + indexRel = index_open(indexOid, AccessShareLock); + + if (isVciIndexRelation(indexRel)) + { + Form_pg_index indexStruct = indexRel->rd_index; + Bitmapset *attrs_indexed; + + /* + * If the index is valid, but cannot yet be used, ignore it. (See + * L.190 src/backend/optimizer/util/plancat.c) See + * src/backend/access/heap/README.HOT for discussion. + */ + if (indexStruct->indcheckxmin && + !TransactionIdPrecedes(HeapTupleHeaderGetXmin(indexRel->rd_indextuple->t_data), + TransactionXmin)) + { + index_close(indexRel, AccessShareLock); + continue; + } + + attrs_indexed = vci_MakeIndexedColumnBitmap(indexOid, + CurrentMemoryContext, + AccessShareLock); + + if (bms_is_subset(attrs_used, attrs_indexed)) + { + elog(DEBUG1, "vci index: adopt index \"%s\"(oid=%u)", + NameStr(indexRel->rd_rel->relname), indexOid); + + foundVciIndexOid = indexOid; + } + else + { + int num, + x; + + elog(DEBUG1, "vci index: don't match index \"%s\"(oid=%u)", + NameStr(indexRel->rd_rel->relname), indexOid); + + num = bms_num_members(attrs_used); + x = 1; + while (num > 0) + { + if (bms_is_member(x, attrs_used)) + { + elog(DEBUG1, "\tattrnum = %d%s", x, bms_is_member(x, attrs_indexed) ? 
" x" : ""); + num--; + } + x++; + } + } + + bms_free(attrs_indexed); + } + + index_close(indexRel, AccessShareLock); + + if (OidIsValid(foundVciIndexOid)) + break; + } + + list_free(indexoidlist); + +done: + table_close(tableRel, AccessShareLock); + + if (OidIsValid(foundVciIndexOid)) + { + if (do_minimize_tlist) + { + elog(DEBUG1, "vci index: minimize targetlist %d -> %d handing over %s", + orig_natts, opt_natts, VciGetNodeName(nodeTag(parent))); + + vci_minimize_tlist_of_scan(scannode, parent, parent_refer_relid, attrs_used_from_parent); + bms_free(attrs_used_from_parent); + } + + table_info->indexOid = foundVciIndexOid; + table_info->estimate_tuples = estimate_tuples; + table_info->attrs_used = attrs_used; + } + else + { + bms_free(attrs_used); + } + + return OidIsValid(foundVciIndexOid); +} + +/** + * Determine if the given target list consists only of Simple Vars + * referencing a single input tuple. + */ +static bool +vci_tlist_consists_of_only_simple_vars(List *tlist, Index scanrelid) +{ + ListCell *tl; + Index attno = 1; + + foreach(tl, tlist) + { + TargetEntry *tle; + Var *var; + + tle = (TargetEntry *) lfirst(tl); + + if (!tle->expr || !IsA(tle->expr, Var)) + return false; + + var = (Var *) tle->expr; + + if (var->varno != scanrelid) + return false; + + if (var->varattno != attno) + return false; + + attno++; + } + + return true; +} + +/** + * Collect the bitmap of attributes referenced as scanrelid within the specified plan node. + */ +static Bitmapset * +vci_gather_used_attrs_in_plan(Plan *plan, Index scanrelid) +{ + vci_gather_used_attrs_t gcontext; + + gcontext.scanrelid = scanrelid; + gcontext.attrs_used = NULL; + + if (vci_expression_and_colid_walker(plan, vci_gather_used_attrs, vci_gather_one_used_attr, &gcontext)) + { + bms_free(gcontext.attrs_used); + return NULL; + } + + return gcontext.attrs_used; +} + +/** + * Scan Var node in the VCI Scan node and obtain the attrno of attributes + * that require data supply from the VCI index. 
+ */ +static bool +vci_gather_used_attrs(Node *node, void *context) +{ + vci_gather_used_attrs_t *gcontext = (vci_gather_used_attrs_t *) context; + + if (node == NULL) + return false; + + switch (nodeTag(node)) + { + case T_Var: + { + Var *var = (Var *) node; + + if (gcontext->scanrelid != var->varno) + return false; + + gcontext->attrs_used = bms_add_member(gcontext->attrs_used, var->varattno); + } + return false; + + default: + break; + } + + return expression_tree_walker(node, vci_gather_used_attrs, context); +} + +/** + * Record in vci_gather_used_attrs_t that the attribute *attr_p is going to be referenced + */ +static void +vci_gather_one_used_attr(AttrNumber *attr_p, void *context) +{ + vci_gather_used_attrs_t *gcontext = (vci_gather_used_attrs_t *) context; + + Assert(*attr_p > 0); + + gcontext->attrs_used = bms_add_member(gcontext->attrs_used, *attr_p); +} + +/** + * Delete nodes in targetlist of Scan node that are not referenced by higher-level nodes. + * At the same time, change the attno of outer var or inner var within higher-level nodes. 
+ */ +static void +vci_minimize_tlist_of_scan(Scan *scannode, Plan *parent, Index parent_refer_relid, Bitmapset *attrs_used_from_parent) +{ + vci_renumber_attrs_t rcontext; + AttrNumber last_attr; + int i, + j; + AttrNumber resno; + List *tlist; + List *new_tlist = NIL; + ListCell *lc; + + tlist = scannode->plan.targetlist; + + last_attr = list_length(tlist); + + rcontext.scanrelid = parent_refer_relid; + rcontext.attr_map = palloc0(sizeof(AttrNumber) * (last_attr + 1)); + + j = 1; + for (i = 1; i <= last_attr; i++) + if (bms_is_member(i, attrs_used_from_parent)) + rcontext.attr_map[i] = j++; + + if (vci_expression_and_colid_walker(parent, vci_renumber_attrs, vci_renumber_one_attr, &rcontext)) + elog(ERROR, "planner failed to minimize tlist of scan"); + + resno = 1; + new_tlist = NIL; + foreach(lc, tlist) + { + TargetEntry *tle = (TargetEntry *) lfirst(lc); + + Assert(IsA(tle, TargetEntry)); + + if (rcontext.attr_map[tle->resno] > 0) + { + tle->resno = resno++; + new_tlist = lappend(new_tlist, tle); + } + } + + pfree(rcontext.attr_map); + + scannode->plan.targetlist = new_tlist; +} + +/** + * Renumber attribute numbers in the subtree under the specified expression node. 
+ */ +static bool +vci_renumber_attrs(Node *node, void *context) +{ + vci_renumber_attrs_t *rcontext = (vci_renumber_attrs_t *) context; + + if (node == NULL) + return false; + + switch (nodeTag(node)) + { + case T_Var: + { + Var *var = (Var *) node; + + if (rcontext->scanrelid != var->varno) + return false; + + if (var->varattno <= InvalidAttrNumber) + return true; + + var->varattno = rcontext->attr_map[var->varattno]; + } + return false; + + default: + break; + } + + return expression_tree_walker(node, vci_renumber_attrs, context); +} + +/** + * Renumber attribute number located at the position of attr_p + */ +static void +vci_renumber_one_attr(AttrNumber *attr_p, void *context) +{ + vci_renumber_attrs_t *rcontext = (vci_renumber_attrs_t *) context; + + Assert(*attr_p > 0); + + *attr_p = rcontext->attr_map[*attr_p]; +} + +/** + * Combine qual when Scan derived node returned to SeqScan node + */ +static List * +vci_reconstruct_qualification(Scan *scannode) +{ + List *qual = scannode->plan.qual; + + switch (nodeTag(scannode)) + { + case T_SeqScan: + qual = list_copy(qual); + break; + + case T_IndexScan: + qual = list_copy(qual); + if (((IndexScan *) scannode)->indexqualorig) + qual = list_concat(qual, ((IndexScan *) scannode)->indexqualorig); + break; + + case T_BitmapHeapScan: + qual = list_copy(qual); + if (((BitmapHeapScan *) scannode)->bitmapqualorig) + qual = list_concat(qual, ((BitmapHeapScan *) scannode)->bitmapqualorig); + break; + + default: + Assert(0); + break; + } + + return qual; +} + +/** + * Create VCI Scan node + */ +static VciScan * +vci_create_custom_scan_via_column_store(Scan *scannode, const vci_table_info_t *table_info, List *tlist, List *qual, bool suppress_vp) +{ + VciScan *scan; + + scan = (VciScan *) palloc0(sizeof(VciScan)); + + scan->vci.cscan.scan.plan = scannode->plan; + scan->vci.cscan.scan.plan.parallel_aware = false; + scan->vci.cscan.scan.plan.type = T_CustomPlanMarkPos; + + scan->vci.cscan.scan.plan.targetlist = tlist; + 
scan->vci.cscan.scan.plan.qual = qual; + + scan->vci.cscan.scan.scanrelid = scannode->scanrelid; + + scan->vci.cscan.flags = VCI_CUSTOMPLAN_SCAN | CUSTOMPATH_SUPPORT_MARK_RESTORE; + scan->vci.cscan.custom_relids = bms_make_singleton(scannode->scanrelid); + scan->vci.cscan.methods = &vci_scan_scan_methods; + + scan->vci.scan_plan_no = scan->vci.cscan.scan.plan.plan_no; + scan->vci.orig_plan = (Plan *) scannode; + + scan->scan_mode = VCI_SCAN_MODE_COLUMN_STORE; + scan->scanrelid = scannode->scanrelid; + scan->reloid = table_info->reloid; + scan->indexoid = table_info->indexOid; + scan->attr_used = table_info->attrs_used; + scan->num_attr_used = bms_num_members(table_info->attrs_used); + scan->estimate_tuples = table_info->estimate_tuples; + scan->is_all_simple_vars = vci_tlist_consists_of_only_simple_vars(tlist, scannode->scanrelid); + + return scan; +} + +/** + * Return true when expression node that cannot be executed in custom plan is detected, + * false if they are all custom plan applicable + */ +static bool +vci_contain_inapplicable_expr_walker(Node *node, void *context) +{ + vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context; + + Assert(context); + + if (node == NULL) + return false; + + switch (nodeTag(node)) + { + case T_Var: + { + Var *var = (Var *) node; + + /* varattno == InvalidAttrNumber means it's a whole-row Var */ + if (var->varattno == InvalidAttrNumber) + return true; + + /* + * varattno < InvalidAttrNumber means it's a system-defined + * attribute + */ + else if (var->varattno < InvalidAttrNumber) + return true; + } + break; + + case T_FuncExpr: + { + FuncExpr *expr = (FuncExpr *) node; + + if (expr->funcretset) + { + elog(DEBUG1, "FuncExpr contains returning-set function"); + return true; + } + + if (expr->funcvariadic) + { + elog(DEBUG1, "FuncExpr contains funcvariadic"); + return true; + } + + if (!vci_is_supported_function(expr->funcid)) + { + elog(DEBUG1, "FuncExpr contains not-supported function: oid=%u", 
expr->funcid); + return true; + } + + if (!vci_is_not_user_defined_type(expr->funcresulttype)) + { + elog(DEBUG1, "FuncExpr contains user defined type: oid=%u", expr->funcresulttype); + return true; + } + + /* + * Always returns true here to create vci_runs_in_plan() + * result. Overwrite to function. + */ + if (expr->funcid == vci_special_udf_info.vci_runs_in_plan_funcoid) + expr->funcid = vci_special_udf_info.vci_always_return_true_funcoid; + } + break; + + case T_OpExpr: + case T_DistinctExpr: /* struct-equivalent to OpExpr */ + case T_NullIfExpr: /* struct-equivalent to OpExpr */ + { + OpExpr *expr = (OpExpr *) node; + + if (expr->opretset) + { + elog(DEBUG1, "%s contains returning-set function", VciGetNodeName(nodeTag(node))); + return true; + } + + if (!vci_is_supported_operation(expr->opfuncid)) + { + elog(DEBUG1, "%s contains not-supported operation: oid=%u", VciGetNodeName(nodeTag(node)), expr->opfuncid); + return true; + } + + if (!vci_is_not_user_defined_type(expr->opresulttype)) + { + elog(DEBUG1, "%s contains user defined type: oid=%u", VciGetNodeName(nodeTag(node)), expr->opresulttype); + return true; + } + } + break; + + case T_Param: + { + Param *param = (Param *) node; + int paramid = param->paramid; + + /* Not support PARAM_EXTERN or PARAM_SUBLINK */ + if (param->paramkind != PARAM_EXEC) + { + elog(DEBUG1, "Param contains extern or sublink"); + return true; + } + + /* + * Check Param defined or referenced by multiple plan node + */ + switch (rp_context->param_exec_attr_map[paramid].type) + { + case VCI_PARAM_EXEC_NESTLOOP: + /* VCI compatible, for calls via NestLoop */ + break; + + case VCI_PARAM_EXEC_INITPLAN: + case VCI_PARAM_EXEC_SUBPLAN: + + /* + * not VCI compatible, for calls via initPlan or + * SubPlan + */ + if (rp_context->param_exec_attr_map[paramid].num_def_plans > 1) + { + elog(DEBUG1, "Param contains multi defining plans"); + return true; + } + + if (rp_context->param_exec_attr_map[paramid].num_use_plans > 1) + { + elog(DEBUG1, "Param 
contains multi referencing plans"); + return true; + } + break; + + /* LCOV_EXCL_START */ + default: + + /* + * Commenting out below code as there is possibility + * to reach here when optimizer optimizes the plan to + * remove subplan node itself. E.g: Create view V1 as + * SELECT *, (SELECT d FROM t11 WHERE t11.a = t1.a + * LIMIT 1) AS d FROM t1 WHERE a > 5; and run SELECT * + * FROM v1 where a=3; + */ + /* elog(PANIC, "Should not reach here."); */ + break; + /* LCOV_EXCL_STOP */ + } + } + break; + + case T_Const: + case T_List: + break; + + case T_Aggref: + { + Aggref *aggref = (Aggref *) node; + + /* Not support ordered-set or hypothetical */ + if (aggref->aggkind != AGGKIND_NORMAL) + { + elog(DEBUG1, "Aggref contains %c", aggref->aggkind); + return true; + } + + /* Not support polymorphic and variadic aggregation */ + if (aggref->aggvariadic) + { + elog(DEBUG1, "Aggref contains variadic aggregation"); + return true; + } + + /* Not support FILTER expression */ + if (aggref->aggfilter != NULL) + { + elog(DEBUG1, "Aggref contains FILTER expression"); + return true; + } + + /* Not support DISTINCT */ + if (aggref->aggdistinct != NIL) + { + elog(DEBUG1, "Aggref contains DISTINCT"); + return true; + } + + /* Not support ORDER BY */ + if (aggref->aggorder != NIL) + { + elog(DEBUG1, "Aggref contains ORDER BY"); + return true; + } + + /* Not support user-defined aggregation */ + if (!vci_is_supported_aggregation(aggref)) + return true; + } + break; + + case T_ScalarArrayOpExpr: + break; + + case T_BoolExpr: + break; + + case T_RelabelType: + case T_CoalesceExpr: + case T_MinMaxExpr: + break; + case T_NullTest: + { + NullTest *ntest = (NullTest *) node; + + if (ntest->argisrow) + { + elog(DEBUG1, "NullTest contains row-format"); + return true; + } + } + break; + + case T_BooleanTest: + case T_TargetEntry: + break; + + case T_CoerceViaIO: + break; + + case T_CaseExpr: + case T_CaseTestExpr: + break; + + case T_SubPlan: + return true; + + case T_ArrayExpr: + case 
T_ArrayCoerceExpr: + case T_ConvertRowtypeExpr: + case T_RowExpr: + case T_RowCompareExpr: + case T_SubscriptingRef: + case T_WindowFunc: + case T_XmlExpr: + case T_WindowClause: + case T_CommonTableExpr: + case T_FieldSelect: + case T_FieldStore: + case T_RangeTblFunction: + case T_AlternativeSubPlan: + case T_SetOperationStmt: + case T_AppendRelInfo: + case T_WithCheckOption: /* inserting/updating an auto-updatable view */ + case T_CurrentOfExpr: /* CURRENT OF cursor_name */ + case T_CoerceToDomain: + case T_CoerceToDomainValue: + case T_GroupingFunc: + case T_SQLValueFunction: + case T_NextValueExpr: + return true; + + case T_Query: + case T_FromExpr: + case T_JoinExpr: + case T_PlaceHolderVar: + case T_PlaceHolderInfo: + case T_CollateExpr: + case T_SubLink: + case T_RangeTblRef: + case T_SortGroupClause: + case T_NamedArgExpr: + case T_SetToDefault: /* a DEFAULT marker in an INSERT or UPDATE + * command */ + return true; /* LCOV_EXCL_LINE */ + + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unrecognized node type: %s(%d)", + VciGetNodeName(nodeTag(node)), (int) nodeTag(node)); + break; + /* LCOV_EXCL_STOP */ + } + + return expression_tree_walker(node, vci_contain_inapplicable_expr_walker, context); +} + +/** + * Returns true if it references Param defined in NestLoop + */ +static bool +vci_contain_nestloop_param_expr_walker(Node *node, void *context) +{ + vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context; + + if (node == NULL) + return false; + + if (nodeTag(node) == T_Param) + { + Param *param = (Param *) node; + int paramid = param->paramid; + + if (rp_context->param_exec_attr_map[paramid].type == VCI_PARAM_EXEC_NESTLOOP) + { + elog(DEBUG1, "Param contains non-permitted paramId"); + return true; + } + } + + return expression_tree_walker(node, vci_contain_nestloop_param_expr_walker, context); +} + +/*==========================================================================*/ +/* Determine if function/type is supported in VCI */ 
+/*==========================================================================*/ + +/** + * Determine if Join is supported + * + * @param[in] jointype Join type + * @return true if supported, false if not + */ +bool +vci_is_supported_jointype(JoinType jointype) +{ + switch (jointype) + { + case JOIN_INNER: + case JOIN_SEMI: + case JOIN_ANTI: + case JOIN_LEFT: + return true; + + case JOIN_RIGHT: + case JOIN_FULL: + default: + return false; + } +} + +/** + * Determine whether Join plan node can be incorporated into parallel plan group. + * + * @param[in] rp_context Pointer to plan rewrite information + * @param[in] join Join plan node to be examined + * + * @return 0 if cannot be incorporated, return plan_no of the VCI Scan that will + * result in partitioned table + */ +static AttrNumber +vci_satisfies_vci_join(vci_rewrite_plan_context_t *rp_context, Join *join) +{ + Plan *outer, + *inner; + + if (!vci_is_supported_jointype(join->jointype)) + return 0; + + outer = outerPlan(join); + inner = innerPlan(join); + + if (rp_context->plan_attr_map[outer->plan_no].plan_compat != VCI_PLAN_COMPAT_OK) + { + elog(DEBUG1, "Join's outer subtree contains not-parallel-executable plannode"); + return 0; + } + + if (rp_context->plan_attr_map[inner->plan_no].plan_compat != VCI_PLAN_COMPAT_OK) + { + elog(DEBUG1, "Join's inner subtree contains not-parallel-executable plannode"); + return 0; + } + + /* + * Check if outer can be used as partitioned table + * + * Rewriteable only when VCI Scan/HashJoin/NestLoop. Not rewriteable for + * VCI Sort/VCI Agg. + */ + switch (vci_get_inner_plan_type(rp_context, outer)) + { + case VCI_INNER_PLAN_TYPE_SCAN: + case VCI_INNER_PLAN_TYPE_HASHJOIN: + case VCI_INNER_PLAN_TYPE_NESTLOOP: + /* OK */ + return vci_get_inner_scan_plan_no(rp_context, outer); + + default: + break; + } + + /* + * If outer cannot be used as partitioned table, try to use inner. + * However, inner-side is generally unsuitable for partitioned table, so + * stricter restrictions are imposed than on outer. 
+ */ + + if (nodeTag(inner) == T_Hash) + inner = outerPlan(inner); + else + return 0; + + if ((inner == NULL) || (join->jointype != JOIN_INNER)) + return 0; + + if (inner->plan_rows < (double) VciGuc.table_rows_threshold) + return 0; + + /* + * outer-side should be less than threshold + * + * This restriction is imposed because performance deteriorates when a + * partitioned table is established on the inner-side when the outer side + * is too large. + */ + if ((double) VciGuc.table_rows_threshold <= outer->plan_rows) + return 0; + + if ((vci_get_inner_plan_type(rp_context, inner) == VCI_INNER_PLAN_TYPE_SCAN) && + (inner->allParam == NULL)) + { + switch (nodeTag(outer)) + { + case T_SeqScan: + case T_BitmapHeapScan: + + case T_IndexScan: + /* OK */ + return vci_get_inner_scan_plan_no(rp_context, inner); + + default: + break; + } + } + + return 0; +} + +/** + * Determine whether the given oid is an operation supported by VCI + */ +static bool +vci_is_supported_operation(Oid oid) +{ + return oid < FirstNormalObjectId; +} + +/** + * Determine whether the given oid is not user defined type + */ +static bool +vci_is_not_user_defined_type(Oid oid) +{ + return oid < FirstNormalObjectId; +} + +/*==========================================================================*/ +/* Register map of Plan on SMC and Plan State on backend */ +/*==========================================================================*/ + +/*==========================================================================*/ +/* Implementation of PG function to check VCI execution */ +/*==========================================================================*/ + +PG_FUNCTION_INFO_V1(vci_runs_in_query); +PG_FUNCTION_INFO_V1(vci_runs_in_plan); +PG_FUNCTION_INFO_V1(vci_always_return_true); + +/** + * PG function that returns whether query is being executed by VCI + * + * @param[in] PG_FUNCTION_ARGS Pointer to data struct passed to PG function + * @return true if VCI is running, false if not + */ +Datum 
+vci_runs_in_query(PG_FUNCTION_ARGS) +{ + return BoolGetDatum(vci_is_processing_custom_plan()); +} + +/** + * PG function that returns whether the plan node containing this function call is VCI plan node + * + * @param[in] PG_FUNCTION_ARGS Pointer to data struct passed to PG function + * @return always false + */ +Datum +vci_runs_in_plan(PG_FUNCTION_ARGS) +{ + return BoolGetDatum(false); +} + +/** + * Function that always returns true + * + * @param[in] PG_FUNCTION_ARGS Pointer to data struct passed to PG function + * @return always true + * + * The vci_runs_in_plan function in the query is overridden by this function, + * which always returns true if the plan rewrite determines that a VCI plan node is connected. + */ +Datum +vci_always_return_true(PG_FUNCTION_ARGS) +{ + return BoolGetDatum(true); +} + +/* + * This function is used to update the plan tree by removing + * the gather plan from the tree and adjust the targetlist + * in custom_vci_plan based on the partial_plan and gather_plan. + */ +static void +vci_update_plan_tree(PlannedStmt *plannedstmt) +{ + Plan *plan = NULL; + List *newsubplans = NIL; + + father_gather_plans plans; + + memset(&plans, 0, sizeof(father_gather_plans)); + + if (plannedstmt->planTree) + { + plan = plannedstmt->planTree; + + if (nodeTag(plan) == T_Gather || nodeTag(plan) == T_GatherMerge) + { + plannedstmt->planTree = plan->lefttree; + plans.gather_plan = plan; + + /* + * The targetlist of the Gather/GatherMerge node and the + * underlying node should be the same (this is enforced in + * preanalyze_plan_tree_mutator()). However, the + * Gather/GatherMerge node may have additional information that + * needs to be retained (by the underlying node) once it is + * removed. 
+ */ + vci_update_target_list(plannedstmt->planTree, plan); + } + plans.father_plan = plan; + vci_plan_tree_walker(plan, vci_update_plan_walker, &plans); + + } + + if (plannedstmt->subplans) + { + ListCell *l; + + foreach(l, plannedstmt->subplans) + { + Plan *subplan = (Plan *) lfirst(l); + + if (subplan == NULL) + continue; + + plans.father_plan = subplan; + if (nodeTag(subplan) == T_Gather || nodeTag(subplan) == T_GatherMerge) + { + plans.gather_plan = subplan; + subplan = subplan->lefttree; + } + newsubplans = lappend(newsubplans, subplan); + vci_plan_tree_walker(subplan, vci_update_plan_walker, &plans); + } + plannedstmt->subplans = newsubplans; + } + +} +static bool +vci_update_plan_walker(Plan *plan, void *plans) +{ + father_gather_plans *fg_plans = (father_gather_plans *) plans; + father_gather_plans fg_plans_local; + + if (plan == NULL) + return false; + /* Go through the every plan here */ + if (nodeTag(plan) == T_Gather || nodeTag(plan) == T_GatherMerge) + { + if (fg_plans->father_plan->lefttree == plan) + { + fg_plans->father_plan->lefttree = plan->lefttree; + } + else if (fg_plans->father_plan->righttree == plan) + { + fg_plans->father_plan->righttree = plan->lefttree; + } + else + { + /* + * Not expected scenario, All other cases should already mark that + * VCI is not possible. + */ + elog(ERROR, "The plan must be either left or right child of parent."); + } + + fg_plans->gather_plan = plan; + } + else if (nodeTag(plan) == T_CustomPlanMarkPos && fg_plans->gather_plan) + { + plan->plan_rows = fg_plans->gather_plan->plan_rows; + plan->parallel_aware = 0; + vci_update_target_list(plan, fg_plans->gather_plan); + } + + fg_plans_local.gather_plan = fg_plans->gather_plan; + fg_plans_local.father_plan = plan; + + return vci_plan_tree_walker(plan, vci_update_plan_walker, &fg_plans_local); + +} + +/* + * If vci_scan is created based on partial scan, some fields will be updated + * by the targetlist in gather_plan. This function is used to do this job. 
+ * + */ +static List * +vci_update_target_list(Plan *plan, Plan *gather_plan) +{ + ListCell *cell1, + *cell2; + + forboth(cell1, plan->targetlist, cell2, gather_plan->targetlist) + { + TargetEntry *te1 = (TargetEntry *) lfirst(cell1); + TargetEntry *te2 = (TargetEntry *) lfirst(cell2); + + te1->resname = te2->resname; + } + + return plan->targetlist; +} diff --git a/contrib/vci/executor/vci_planner_preanalyze.c b/contrib/vci/executor/vci_planner_preanalyze.c new file mode 100644 index 000000000000..007dee7c6bfe --- /dev/null +++ b/contrib/vci/executor/vci_planner_preanalyze.c @@ -0,0 +1,415 @@ +/*------------------------------------------------------------------------- + * + * vci_planner_preanalyze.c + * Preprocessing for plan rewrite routine + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_planner_preanalyze.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include /* for qsort() */ + +#include "access/htup_details.h" +#include "access/sysattr.h" +#include "access/transam.h" +#include "catalog/pg_aggregate.h" +#include "catalog/pg_am.h" +#include "catalog/pg_namespace.h" /* for PG_PUBLIC_NAMESPACE */ +#include "catalog/pg_proc.h" /* for ProcedureRelationId, Form_pg_proc */ +#include "catalog/pg_type.h" /* for BOOLOID */ +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include "miscadmin.h" +#include "nodes/nodeFuncs.h" +#include "optimizer/clauses.h" +#include "optimizer/cost.h" +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" +#include "optimizer/plancat.h" +#include "optimizer/planmain.h" +#include "optimizer/planner.h" +#include "optimizer/restrictinfo.h" +#include "parser/parsetree.h" +#include "utils/fmgroids.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/snapmgr.h" +#include "utils/snapshot.h" +#include "utils/syscache.h" + 
+#include "vci.h" + +#include "vci_mem.h" +#include "vci_executor.h" +#include "vci_utils.h" +#include "vci_planner.h" + +static bool preanalyze_plan_tree_mutator(Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed); +static bool collect_data_in_expression(Node *node, void *context); +static bool collect_data_in_initplan(Node *node, void *context); + +static bool isGatherExists; + +/** + * Analysis before plan rewrite + * + * @param[in] target Pointer to PlannedStmt holding the query + * @param[in,out] rp_context Pointer to plan rewrite information + * @param[in] eflags execution flags + * @param[out] isGather Set to true if a Gather/GatherMerge node was seen while scanning the plan + * @return true when callback function stops cycle, false if cycle is complete + */ +bool +vci_preanalyze_plan_tree(PlannedStmt *target, vci_rewrite_plan_context_t *rp_context, int eflags, bool *isGather) +{ + bool dummy; + int i, + nParamExec; + isGatherExists = false; /* file-level flag: clear the value left by a previous call */ + /* + * Scans target's plan tree and gathers information. Use + * vci_plannedstmt_tree_mutator () instead of vci_plannedstmt_tree_walker + * () because target cannot be written but eflags information is collected + * for plan nodes. + */ + if (vci_plannedstmt_tree_mutator(target, preanalyze_plan_tree_mutator, vci_register_plan_id, rp_context, eflags, &dummy)) + { + *isGather = isGatherExists; + return true; + } + + nParamExec = list_length(target->paramExecTypes); + for (i = 0; i < nParamExec; i++) + { + rp_context->param_exec_attr_map[i].num_def_plans = bms_num_members(rp_context->param_exec_attr_map[i].def_plan_nos); + rp_context->param_exec_attr_map[i].num_use_plans = bms_num_members(rp_context->param_exec_attr_map[i].use_plan_nos); + } + *isGather = isGatherExists; + return false; +} + +/** + * Callback function to record Topmost plan node and subplan number + * + * @param[in] plan Topmost plan node + * @param[in] plan_id subplan number + * @param[in,out] context Pointer to plan rewrite information + * + * This function specifies topmostplan for vci_plannedstmt_tree_mutator(). 
+ */ +void +vci_register_plan_id(Plan *plan, int plan_id, void *context) +{ + vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context; + + rp_context->current_plan_id = plan_id; + + rp_context->subplan_attr_map[plan_id].topmostplan = plan; +} + +/** + * Analysis before plan rewrite + * + * @param[in] plan_p Pointer to a pointer to plan node + * @param[in] parent Pointer to plan node that is the parent of *plan_p + * @param[in,out] context Pointer to plan rewrite information + * @param[in] eflags execution flags + * @param[out] changed Set true when plan tree has been rewritten + * + * @return true when callback function stops cycle, false if cycle is complete + * + * This function is specified as mutator for vci_plannedstmt_tree_mutator(). + * Since the plan tree is not rewritten, nothing is written to *changed. + */ +static bool +preanalyze_plan_tree_mutator(Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed) +{ + vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context; + Plan *plan; + AttrNumber plan_no; + bool saved_forbid_parallel_exec; + bool result; + + plan = *plan_p; + + Assert(plan->plan_no == 0); + plan_no = plan->plan_no = ++rp_context->last_plan_no; + + /* If the capacity of plan_attr_map[] is insufficient, double it */ + vci_expand_plan_attr_map(rp_context); + + rp_context->plan_attr_map[plan_no].preset_eflags = eflags; + + rp_context->current_plan_no = plan_no; + + /* + * Investigate plan nodes that prohibit parallel execution Set + * rp_context->forbid_parallel_exec to false and scan subplan tree + */ + saved_forbid_parallel_exec = rp_context->forbid_parallel_exec; + rp_context->forbid_parallel_exec = false; + + /* Scan expression tree included in plan and collect data */ + vci_expression_and_initplan_walker(plan, collect_data_in_expression, collect_data_in_initplan, context); + + switch (nodeTag(plan)) + { + case T_SubqueryScan: /* Since using VCI custom scan for initplans + * slows 
down the performance, block VCI scan + * to be replaced for subquery scan */ + return true; + case T_ModifyTable: + case T_TidScan: + case T_TidRangeScan: + case T_FunctionScan: + case T_ValuesScan: + case T_CteScan: + case T_ForeignScan: + case T_CustomScan: + case T_CustomPlanMarkPos: + case T_LockRows: + rp_context->forbid_parallel_exec = true; + break; + case T_Gather: + case T_GatherMerge: + + /* + * Verify the targetlist of Gather node and underlying node is + * same or not. VCI scan replacement assumes Gather node and + * underlying node has same targetlist. But, in some scenarios it + * is not the case. So, avoid rewriting VCI plan where Gather node + * has different targetlist than underlying node. E.g: SELECT c2, + * (select key from testtable1 where key=1 ) FROM testtable2 where + * c1 = 1 limit 1. + */ + + if (list_length(plan->targetlist) != list_length(plan->lefttree->targetlist)) + return true; + + /* + * Set the flag to verify the presence of Gather node in current + * query plan generated by OSS. If there are no Gather nodes + * present, then the step to update the query plan to remove + * Gather node can be skipped. 
This way unnecessary recursive + * function calls to remove Gather nodes will be skipped when + * there are no Gather plan exists in query plan + */ + + isGatherExists = true; + break; + case T_NestLoop: + { + NestLoop *nl; + ListCell *lc; + + nl = (NestLoop *) plan; + + foreach(lc, nl->nestParams) + { + NestLoopParam *nlp = (NestLoopParam *) lfirst(lc); + int paramid = nlp->paramno; + + if ((rp_context->param_exec_attr_map[paramid].type != VCI_PARAM_EXEC_UNKNOWN) && + (rp_context->param_exec_attr_map[paramid].type != VCI_PARAM_EXEC_NESTLOOP)) + return true; + + rp_context->param_exec_attr_map[paramid].type = VCI_PARAM_EXEC_NESTLOOP; + rp_context->param_exec_attr_map[paramid].def_plan_nos = + bms_add_member(rp_context->param_exec_attr_map[paramid].def_plan_nos, plan_no); + rp_context->plan_attr_map[plan_no].def_param_ids = + bms_add_member(rp_context->plan_attr_map[plan_no].def_param_ids, paramid); + + if (bms_is_member(paramid, rp_context->plan_attr_map[plan_no].use_param_ids)) + return true; + } + } + break; + + default: + break; + } + + rp_context->current_plan_no = 0; + + result = vci_plan_tree_mutator(plan_p, parent, preanalyze_plan_tree_mutator, context, eflags, changed); + + rp_context->plan_attr_map[plan_no].plan_compat = rp_context->forbid_parallel_exec ? VCI_PLAN_COMPAT_FORBID_TYPE : VCI_PLAN_COMPAT_OK; + rp_context->forbid_parallel_exec |= saved_forbid_parallel_exec; + + return result; +} + +/** + * Search expression tree and collect data related to PARAM_EXEC type Param + * and subquery calls. 
+ */ +static bool +collect_data_in_expression(Node *node, void *context) +{ + vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context; + AttrNumber plan_no; + + if (node == NULL) + return false; + + plan_no = rp_context->current_plan_no; + + switch (nodeTag(node)) + { + case T_SubPlan: + { + SubPlan *subplan = (SubPlan *) node; + ListCell *lc; + + if ((rp_context->subplan_attr_map[subplan->plan_id].type != VCI_SUBPLAN_UNKNOWN) && + (rp_context->subplan_attr_map[subplan->plan_id].type != VCI_SUBPLAN_SUBPLAN)) + return true; + + rp_context->subplan_attr_map[subplan->plan_id].type = VCI_SUBPLAN_SUBPLAN; + rp_context->subplan_attr_map[rp_context->current_plan_id].plan_ids = + bms_add_member(rp_context->subplan_attr_map[rp_context->current_plan_id].plan_ids, subplan->plan_id); + + foreach(lc, subplan->parParam) + { + int paramid = lfirst_int(lc); + + if ((rp_context->param_exec_attr_map[paramid].type != VCI_PARAM_EXEC_UNKNOWN) && + (rp_context->param_exec_attr_map[paramid].type != VCI_PARAM_EXEC_SUBPLAN)) + return true; + + rp_context->param_exec_attr_map[paramid].type = VCI_PARAM_EXEC_SUBPLAN; + rp_context->param_exec_attr_map[paramid].def_plan_nos = + bms_add_member(rp_context->param_exec_attr_map[paramid].def_plan_nos, plan_no); + rp_context->plan_attr_map[plan_no].def_param_ids = + bms_add_member(rp_context->plan_attr_map[plan_no].def_param_ids, paramid); + } + + return expression_tree_walker((Node *) subplan->args, collect_data_in_expression, context); + } + + case T_Param: + { + Param *param = (Param *) node; + + if (param->paramkind == PARAM_EXEC) + { + int paramid = param->paramid; + + rp_context->param_exec_attr_map[paramid].use_plan_nos = + bms_add_member(rp_context->param_exec_attr_map[paramid].use_plan_nos, plan_no); + rp_context->plan_attr_map[plan_no].use_param_ids = + bms_add_member(rp_context->plan_attr_map[plan_no].use_param_ids, paramid); + + if (rp_context->param_exec_attr_map[paramid].type == VCI_PARAM_EXEC_INITPLAN) + { + 
rp_context->param_exec_attr_map[paramid].def_plan_nos = + bms_add_member(rp_context->param_exec_attr_map[paramid].def_plan_nos, plan_no); + rp_context->plan_attr_map[plan_no].def_param_ids = + bms_add_member(rp_context->plan_attr_map[plan_no].def_param_ids, paramid); + } + } + } + return false; + + default: + break; + } + + return expression_tree_walker(node, collect_data_in_expression, context); +} + +/** + * Search for initPlan and analyze SubPlan + */ +static bool +collect_data_in_initplan(Node *node, void *context) +{ + vci_rewrite_plan_context_t *rp_context = (vci_rewrite_plan_context_t *) context; + + if (node == NULL) + return false; + + if (IsA(node, SubPlan)) + { + SubPlan *subplan = (SubPlan *) node; + ListCell *lc; + + if ((rp_context->subplan_attr_map[subplan->plan_id].type != VCI_SUBPLAN_UNKNOWN) && + (rp_context->subplan_attr_map[subplan->plan_id].type != VCI_SUBPLAN_INITPLAN)) + return true; + + rp_context->subplan_attr_map[subplan->plan_id].type = VCI_SUBPLAN_INITPLAN; + rp_context->subplan_attr_map[rp_context->current_plan_id].plan_ids = + bms_add_member(rp_context->subplan_attr_map[rp_context->current_plan_id].plan_ids, subplan->plan_id); + + foreach(lc, subplan->setParam) + { + int paramid = lfirst_int(lc); + + if ((rp_context->param_exec_attr_map[paramid].type != VCI_PARAM_EXEC_UNKNOWN) && + (rp_context->param_exec_attr_map[paramid].type != VCI_PARAM_EXEC_INITPLAN)) + return true; + + rp_context->param_exec_attr_map[paramid].type = VCI_PARAM_EXEC_INITPLAN; + + rp_context->param_exec_attr_map[paramid].plan_id = subplan->plan_id; + } + + return false; + } + + return expression_tree_walker(node, collect_data_in_initplan, context); +} + +/** + * Expand array of analysis data for each plan node as necessary + */ +void +vci_expand_plan_attr_map(vci_rewrite_plan_context_t *rp_context) +{ + if (rp_context->max_plan_attrs <= rp_context->last_plan_no) + { + int i; + int old_max_plan_attrs = rp_context->max_plan_attrs; + vci_plan_attr_t *old_plan_attr_map 
= rp_context->plan_attr_map; + + rp_context->max_plan_attrs *= 2; + rp_context->plan_attr_map = palloc0(sizeof(vci_plan_attr_t) * rp_context->max_plan_attrs); + + for (i = 0; i < old_max_plan_attrs; i++) + rp_context->plan_attr_map[i] = old_plan_attr_map[i]; + + pfree(old_plan_attr_map); + } +} + +vci_inner_plan_type_t +vci_get_inner_plan_type(vci_rewrite_plan_context_t *context, const Plan *plan) +{ + Assert(plan->plan_no > 0); + + return context->plan_attr_map[plan->plan_no].plan_type; +} + +AttrNumber +vci_get_inner_scan_plan_no(vci_rewrite_plan_context_t *context, const Plan *plan) +{ + Assert(plan->plan_no > 0); + + return context->plan_attr_map[plan->plan_no].scan_plan_no; +} + +void +vci_set_inner_plan_type_and_scan_plan_no(vci_rewrite_plan_context_t *context, Plan *plan, vci_inner_plan_type_t plan_type, AttrNumber scan_plan_no) +{ + Assert(plan->plan_no > 0); + + context->plan_attr_map[plan->plan_no].plan_type = plan_type; + context->plan_attr_map[plan->plan_no].scan_plan_no = scan_plan_no; +} diff --git a/contrib/vci/executor/vci_scan.c b/contrib/vci/executor/vci_scan.c new file mode 100644 index 000000000000..158c6c58e221 --- /dev/null +++ b/contrib/vci/executor/vci_scan.c @@ -0,0 +1,632 @@ +/*------------------------------------------------------------------------- + * + * vci_scan.c + * Routines to handle VCI Scan nodes + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_scan.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/relscan.h" +#include "commands/explain.h" +#include "commands/explain_format.h" +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include "executor/nodeSubplan.h" +#include "miscadmin.h" +#include "optimizer/cost.h" +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" +#include "optimizer/plancat.h" +#include "optimizer/planmain.h" +#include 
"optimizer/planner.h" +#include "optimizer/restrictinfo.h" +#include "parser/parsetree.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +#include "vci.h" +#include "vci_executor.h" +#include "vci_utils.h" +#include "vci_fetch_row_store.h" + +static Node *vci_scan_CreateCustomScanState(CustomScan *cscan); + +/* +* Declarations of Custom Scan Methods callbacks +*/ +static void vci_scan_BeginCustomPlan(CustomScanState *node, EState *estate, int eflags); + +static void vci_scan_BeginCustomPlan_postprocess_enabling_vp(VciScan *scan, VciScanState *scanstate); +static TupleTableSlot *vci_scan_ExecCustomPlan(CustomScanState *node); +static void vci_scan_EndCustomPlan(CustomScanState *node); + +static void vci_scan_ReScanCustomPlan(CustomScanState *node); +static void vci_scan_MarkPosCustomPlan(CustomScanState *cpstate); +static void vci_scan_RestrPosCustomPlan(CustomScanState *cpstate); + +static void vci_scan_ExplainCustomPlanTargetRel(CustomScanState *node, ExplainState *es); +static CustomScan *vci_scan_CopyCustomPlan(const CustomScan *_from); + +static int exec_proc_scan_vector(VciScanState *scanstate); +static TupleTableSlot *exec_custom_plan_enabling_vp(VciScanState *scanstate); + +/*****************************************************************************/ +/* Column-store (basic) */ +/*****************************************************************************/ + +static Node * +vci_scan_CreateCustomScanState(CustomScan *cscan) +{ + VciScan *vscan; + VciScanState *vss = (VciScanState *) palloc0(sizeof(VciScanState)); + + vscan = (VciScan *) cscan; + + vss->vci.css.ss.ps.type = T_CustomScanState; + vss->vci.css.ss.ps.plan = (Plan *) vscan; + vss->vci.css.flags = cscan->flags; + + switch (vscan->scan_mode) + { + case VCI_SCAN_MODE_COLUMN_STORE: + vss->vci.css.methods = &vci_scan_exec_column_store_methods; + break; + + default: + Assert(0); + break; + } + return (Node *) vss; +} + +static void 
+vci_scan_BeginCustomPlan(CustomScanState *node, EState *estate, int eflags) +{ + VciScanState *scanstate = (VciScanState *) node; + VciScan *scan = (VciScan *) node->ss.ps.plan; + Relation currentRelation; + TableScanDesc currentScanDesc; + vci_initexpr_t initexpr = VCI_INIT_EXPR_NONE; + TupleDesc scanDesc; + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + + if (ScanDirectionIsBackward(estate->es_direction)) + elog(ERROR, "VCI Scan does not support backward scan"); + + switch (scan->scan_mode) + { + case VCI_SCAN_MODE_COLUMN_STORE: + initexpr = VCI_INIT_EXPR_FETCHING_COLUMN_STORE; + break; + + default: + Assert(0); + break; + } + + /* + * create state structure + */ + scanstate->is_subextent_grain = scan->is_subextent_grain; + scanstate->vci.css.ss.ps.state = estate; + + /* create expression context for node */ + ExecAssignExprContext(estate, &scanstate->vci.css.ss.ps); + + /* initialize child expressions */ + scanstate->vci.css.ss.ps.qual = + VciExecInitQual(scan->vci.cscan.scan.plan.qual, &scanstate->vci.css.ss.ps, + initexpr); + + if (scan->scan_mode == VCI_SCAN_MODE_COLUMN_STORE) + { + vci_create_one_fetch_context_for_fetching_column_store(scanstate, scanstate->vci.css.ss.ps.ps_ExprContext); + } + + switch (scan->scan_mode) + { + case VCI_SCAN_MODE_COLUMN_STORE: + + /* + * get the relation object id from the relid'th entry in the range + * table, open that relation and acquire appropriate lock on it. 
+ */ + currentRelation = ExecOpenScanRelation(estate, scan->scanrelid, eflags); + + /* initialize a heapscan */ + currentScanDesc = table_beginscan(currentRelation, + estate->es_snapshot, + 0, + NULL); + + scanstate->vci.css.ss.ss_currentRelation = currentRelation; + scanstate->vci.css.ss.ss_currentScanDesc = currentScanDesc; + + /* and report the scan tuple slot's rowtype */ + scanDesc = RelationGetDescr(currentRelation); + break; + + default: + outerPlanState(scanstate) = ExecInitNode(outerPlan(scan), estate, eflags); + scanDesc = ExecGetResultType(outerPlanState(scanstate)); + break; + } + + /* tuple table initialization */ + ExecInitScanTupleSlot(estate, &scanstate->vci.css.ss, scanDesc, &TTSOpsMinimalTuple); + ExecInitResultTupleSlotTL(&scanstate->vci.css.ss.ps, &TTSOpsMinimalTuple); + + /* ExecAssignScanProjectionInfo() ???? */ + if (scan->scan_mode == VCI_SCAN_MODE_COLUMN_STORE) + { + vci_scan_BeginCustomPlan_postprocess_enabling_vp(scan, scanstate); + } +} + +static void +vci_scan_BeginCustomPlan_postprocess_enabling_vp(VciScan *scan, VciScanState *scanstate) +{ + int i, + max_targetlist; + uint16 *skip_list; + ListCell *l; + + max_targetlist = list_length(scanstate->vci.css.ss.ps.plan->targetlist); + + skip_list = vci_CSGetSkipAddrFromVirtualTuples(scanstate->vector_set); + + if (scanstate->vci.css.ss.ps.qual) + { + + scanstate->vp_qual = VciBuildVectorProcessing(scanstate->vci.css.ss.ps.qual->expr, + (PlanState *) scanstate, + scanstate->vci.css.ss.ps.ps_ExprContext, + skip_list); + } + scanstate->result_values = palloc(sizeof(Datum *) * max_targetlist); + scanstate->result_isnull = palloc(sizeof(bool *) * max_targetlist); + scanstate->vp_targets = palloc0(sizeof(VciVPContext *) * max_targetlist); + + i = 0; + foreach(l, scanstate->vci.css.ss.ps.plan->targetlist) + { + TargetEntry *tle = castNode(TargetEntry, lfirst(l)); + AttrNumber resind = tle->resno - 1; + + if (tle->expr && IsA(tle->expr, Var)) + { + Var *var = (Var *) tle->expr; + int index; + + 
Assert(var->varno == scan->scanrelid); + + index = scanstate->attr_map[var->varattno] - 1; + + Assert(index >= 0); + Assert(index < scanstate->vector_set->num_columns); + + scanstate->result_values[resind] = vci_CSGetValueAddrFromVirtualTuplesColumnwise(scanstate->vector_set, index); + scanstate->result_isnull[resind] = vci_CSGetIsNullAddrFromVirtualTuplesColumnwise(scanstate->vector_set, index); + } + else + { + scanstate->vp_targets[i] = + VciBuildVectorProcessing((Expr *) tle->expr, + (PlanState *) scanstate, + scanstate->vci.css.ss.ps.ps_ExprContext, + skip_list); + + scanstate->result_values[resind] = scanstate->vp_targets[i]->resultValue; + scanstate->result_isnull[resind] = scanstate->vp_targets[i]->resultIsNull; + + i++; + } + } + scanstate->num_vp_targets = i; +} + +static TupleTableSlot * +vci_scan_ExecCustomPlan(CustomScanState *cstate) +{ + VciScanState *scanstate = (VciScanState *) cstate; + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + + return VciExecProcScanTuple(scanstate); +} + +/** + * Processing equivalent to ExecProcNode() for VCI Scan. + * + * When calling the ExecProcNode of a lower VCI Scan from an upper VCI plan, + * overhead occurs if going through the CustomPlanState's ExecCustomPlan. + * This is a special version to avoid that. + * + * @param[in] scanstate VCI Scan state + * @return output tuple + * + * @todo This can be abolished. + */ +TupleTableSlot * +VciExecProcScanTuple(VciScanState *scanstate) +{ + TupleTableSlot *result; + PlanState *node; + bool use_instrumentation; + + node = &scanstate->vci.css.ss.ps; + + /* + * XXX - This is a workaround to make sure that the plan node we are + * reusing had not already started timing. This is needed to prevent + * "ERROR: InstrStartNode called twice in a row", which can happen for + * EXPLAIN ANALYZE SELECT ... 
+ */ + use_instrumentation = node->instrument && INSTR_TIME_IS_ZERO(node->instrument->starttime); + + CHECK_FOR_INTERRUPTS(); + + if (node->chgParam != NULL) /* something changed */ + ExecReScan(node); /* let ReScan handle this */ + + if (use_instrumentation) + InstrStartNode(node->instrument); + + result = exec_custom_plan_enabling_vp(scanstate); + + if (use_instrumentation) + InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0); + + return result; +} + +static TupleTableSlot * +exec_custom_plan_enabling_vp(VciScanState *scanstate) +{ + ExprContext *econtext; + TupleTableSlot *outputslot; + TupleDesc tupdesc; + int i, + slot_index; + + econtext = scanstate->vci.css.ss.ps.ps_ExprContext; + + if (!scanstate->first_fetch || (scanstate->pos.num_fetched_rows <= scanstate->pos.current_row)) + { + int result; + + ResetExprContext(econtext); + + do + { + result = exec_proc_scan_vector(scanstate); + + if (result == -1) + { + ExecClearTuple(scanstate->vci.css.ss.ss_ScanTupleSlot); + + return NULL; + } + } while (result == 0); + } + + outputslot = scanstate->vci.css.ss.ps.ps_ResultTupleSlot; + tupdesc = outputslot->tts_tupleDescriptor; + slot_index = scanstate->pos.current_row; + + ExecClearTuple(outputslot); + + for (i = 0; i < tupdesc->natts; i++) + { + outputslot->tts_values[i] = (scanstate->result_values[i])[slot_index]; + outputslot->tts_isnull[i] = (scanstate->result_isnull[i])[slot_index]; + } + + ExecStoreVirtualTuple(outputslot); + + vci_step_next_tuple_from_column_store(scanstate); + + return outputslot; +} + +int +VciExecProcScanVector(VciScanState *scanstate) +{ + int result; + PlanState *node; + + node = &scanstate->vci.css.ss.ps; + + CHECK_FOR_INTERRUPTS(); + + if (node->chgParam != NULL) /* something changed */ + ExecReScan(node); /* let ReScan handle this */ + + if (node->instrument) + InstrStartNode(node->instrument); + + do + { + result = exec_proc_scan_vector(scanstate); + + if (result == -1) + { + 
ExecClearTuple(scanstate->vci.css.ss.ss_ScanTupleSlot); + + result = 0; + break; + } + } while (result == 0); + + if (node->instrument) + InstrStopNode(node->instrument, 1.0 * result); + + return result; +} + +static int +exec_proc_scan_vector(VciScanState *scanstate) +{ + int max_slots; + int num_slots = 0; + int slot_index; + int check_slot_index; + ExprContext *econtext; + ExprState *qual; + TupleTableSlot *old_tts; + MemoryContext oldContext; + uint16 *skip_list; + + econtext = scanstate->vci.css.ss.ps.ps_ExprContext; + qual = scanstate->vci.css.ss.ps.qual; + + CHECK_FOR_INTERRUPTS(); + + ResetExprContext(econtext); + + if (!vci_fill_vector_set_from_column_store(scanstate)) + return -1; + + old_tts = econtext->ecxt_scantuple; + econtext->ecxt_scantuple = NULL; /* safety */ + max_slots = scanstate->pos.num_fetched_rows; + + Assert(max_slots > 0); + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + + num_slots = 0; + skip_list = vci_CSGetSkipFromVirtualTuples(scanstate->vector_set); + slot_index = skip_list[0]; + check_slot_index = 0; + + if (qual) + { + VciVPContext *vpcontext = scanstate->vp_qual; + + VciExecEvalVectorProcessing(vpcontext, econtext, max_slots); + + for (; slot_index < max_slots; slot_index += skip_list[slot_index + 1] + 1) + { + if (!vpcontext->resultIsNull[slot_index] && DatumGetBool(vpcontext->resultValue[slot_index])) + { + check_slot_index = slot_index + 1; + num_slots++; + } + else + { + InstrCountFiltered1(&scanstate->vci.css.ss, 1); + skip_list[check_slot_index] += skip_list[slot_index + 1] + 1; + } + } + + scanstate->pos.current_row = skip_list[0]; + + VciExecTargetListWithVectorProcessing(scanstate, econtext, max_slots); + } + else + { + VciExecTargetListWithVectorProcessing(scanstate, econtext, max_slots); + + for (; slot_index < max_slots; slot_index += skip_list[slot_index + 1] + 1) + num_slots++; + } + + MemoryContextSwitchTo(oldContext); + + econtext->ecxt_scantuple = old_tts; + + if (num_slots == 0) + { + 
scanstate->pos.current_row = scanstate->pos.num_fetched_rows; + return 0; + } + + return max_slots; +} + +static void +vci_scan_EndCustomPlan(CustomScanState *node) +{ + VciScan *scan; + VciScanState *scanstate = (VciScanState *) node; + TableScanDesc scanDesc; + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + + scan = (VciScan *) scanstate->vci.css.ss.ps.plan; + + scanDesc = scanstate->vci.css.ss.ss_currentScanDesc; + + switch (scan->scan_mode) + { + case VCI_SCAN_MODE_COLUMN_STORE: + vci_destroy_one_fetch_context_for_fetching_column_store(scanstate); + + /* close the heap scan */ + table_endscan(scanDesc); + + break; + + default: + /* LCOV_EXCL_START */ + elog(PANIC, "Should not reach here"); + /* LCOV_EXCL_STOP */ + break; + } +} + +static void +vci_scan_ReScanCustomPlan(CustomScanState *node) +{ + VciScanState *scanstate; + + scanstate = (VciScanState *) node; + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + /* Rescan EvalPlanQual tuple if we're inside an EvalPlanQual recheck */ + Assert(scanstate->vci.css.ss.ps.state->es_epq_active == NULL); + + scanstate->first_fetch = false; +} + +static void +vci_scan_MarkPosCustomPlan(CustomScanState *node) +{ + VciScanState *scanstate = (VciScanState *) node; + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + + vci_mark_pos_vector_set_from_column_store(scanstate); +} + +static void +vci_scan_RestrPosCustomPlan(CustomScanState *node) +{ + VciScanState *scanstate = (VciScanState *) node; + + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + + ExecClearTuple(scanstate->vci.css.ss.ss_ScanTupleSlot); + + vci_restr_pos_vector_set_from_column_store(scanstate); +} + +static void +vci_scan_ExplainCustomPlanTargetRel(CustomScanState *node, ExplainState *es) +{ + VciScanState *scanstate; + VciScan *scan; + Index scanrelid; + char *refname; + char *objectname = NULL; + char *namespace = NULL; + const char *indexname = NULL; + RangeTblEntry *rte; + + scanstate = (VciScanState 
*) node; + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + scan = (VciScan *) scanstate->vci.css.ss.ps.plan; + scanrelid = scan->scanrelid; + + rte = rt_fetch(scanrelid, es->rtable); + Assert(rte->rtekind == RTE_RELATION); + + refname = (char *) list_nth(es->rtable_names, scanrelid - 1); + if (refname == NULL) + refname = rte->eref->aliasname; + objectname = get_rel_name(rte->relid); + if (es->verbose) + namespace = get_namespace_name(get_rel_namespace(rte->relid)); + + indexname = get_rel_name(scan->indexoid); + if (indexname == NULL) + elog(ERROR, "cache lookup failed for index %u", scan->indexoid); + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + appendStringInfo(es->str, " using %s on", + quote_identifier(indexname)); + + if (namespace != NULL) + appendStringInfo(es->str, " %s.%s", quote_identifier(namespace), + quote_identifier(objectname)); + else if (objectname != NULL) + appendStringInfo(es->str, " %s", quote_identifier(objectname)); + if (objectname == NULL || strcmp(refname, objectname) != 0) + appendStringInfo(es->str, " %s", quote_identifier(refname)); + } + else + { + ExplainPropertyText("Index Name", indexname, es); + if (objectname != NULL) + ExplainPropertyText("Relation Name", objectname, es); + if (namespace != NULL) + ExplainPropertyText("Schema", namespace, es); + ExplainPropertyText("Alias", refname, es); + } +} + +static CustomScan * +vci_scan_CopyCustomPlan(const CustomScan *_from) +{ + const VciScan *from = (const VciScan *) _from; + VciScan *newnode; + + newnode = (VciScan *) palloc0(sizeof(VciScan)); + + vci_copy_plan(&newnode->vci, &from->vci); + + newnode->scan_mode = from->scan_mode; + newnode->scanrelid = from->scanrelid; + newnode->reloid = from->reloid; + newnode->indexoid = from->indexoid; + newnode->attr_used = bms_copy(from->attr_used); + newnode->num_attr_used = from->num_attr_used; + newnode->is_all_simple_vars = from->is_all_simple_vars; + newnode->estimate_tuples = from->estimate_tuples; + 
newnode->is_subextent_grain = from->is_subextent_grain; + newnode->index_ph_id = from->index_ph_id; + newnode->fetch_ph_id = from->fetch_ph_id; + + ((Node *) newnode)->type = nodeTag((Node *) from); + + return &newnode->vci.cscan; +} + +/*****************************************************************************/ +/* Callback */ +/*****************************************************************************/ + +CustomScanMethods vci_scan_scan_methods = { + "VCI Scan", + vci_scan_CreateCustomScanState, + vci_scan_CopyCustomPlan +}; + +CustomExecMethods vci_scan_exec_column_store_methods = { + "VCI Scan", + vci_scan_BeginCustomPlan, + vci_scan_ExecCustomPlan, + vci_scan_EndCustomPlan, + vci_scan_ReScanCustomPlan, + vci_scan_MarkPosCustomPlan, + vci_scan_RestrPosCustomPlan, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + vci_scan_ExplainCustomPlanTargetRel +}; diff --git a/contrib/vci/executor/vci_sort.c b/contrib/vci/executor/vci_sort.c new file mode 100644 index 000000000000..029e02739ab5 --- /dev/null +++ b/contrib/vci/executor/vci_sort.c @@ -0,0 +1,415 @@ +/*------------------------------------------------------------------------- + * + * vci_sort.c + * Routines to handle VCI Sort nodes + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_sort.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "commands/explain.h" +#include "commands/explain_format.h" +#include "executor/execdebug.h" +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include "miscadmin.h" +#include "optimizer/cost.h" +#include "optimizer/pathnode.h" +#include "optimizer/paths.h" +#include "optimizer/plancat.h" +#include "optimizer/planmain.h" +#include "optimizer/planner.h" +#include "optimizer/restrictinfo.h" +#include "utils/tuplesort.h" + +#include "vci.h" +#include "vci_executor.h" +#include "vci_mem.h" + +/* 
---------------- + * VCI Sort information + * ---------------- + */ +static Node * +vci_sort_CreateCustomScanState(CustomScan *cs) +{ + VciSort *vsort; + VciSortState *vss = (VciSortState *) palloc0(sizeof(VciSortState)); + + vsort = (VciSort *) cs; + + vss->vci.css.ss.ps.type = T_CustomScanState; + vss->vci.css.ss.ps.plan = (Plan *) vsort; + vss->vci.css.flags = cs->flags; + vss->vci.css.methods = &vci_sort_exec_methods; + + return (Node *) vss; +} + +static TupleTableSlot * +vci_sort_ExecCustomPlan(CustomScanState *node) +{ + EState *estate; + ScanDirection dir; + Tuplesortstate *tuplesortstate; + TupleTableSlot *slot; + VciSortState *sortstate; + + sortstate = (VciSortState *) node; + + SO1_printf("ExecCustomSort: %s\n", + "entering routine"); + + estate = sortstate->vci.css.ss.ps.state; + dir = estate->es_direction; + tuplesortstate = (Tuplesortstate *) sortstate->tuplesortstate; + + if (!sortstate->sort_Done) + { + PlanState *outerNode; + + SO1_printf("ExecCustomSort: %s\n", + "custom sorting subplan"); + + SO1_printf("ExecCustomSort: %s\n", + "calling tuplesort_begin"); + + outerNode = outerPlanState(node); + + tuplesortstate = vci_sort_exec_top_half(sortstate); + + for (;;) + { + VciScanState *scanstate = (VciScanState *) outerNode; + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + slot = VciExecProcScanTuple(scanstate); + + if (TupIsNull(slot)) + break; + + tuplesort_puttupleslot(tuplesortstate, slot); + } + + vci_sort_perform_sort(sortstate); + + sortstate->sort_Done = true; + sortstate->bounded_Done = sortstate->bounded; + sortstate->bound_Done = sortstate->bound; + + SO1_printf("ExecCustomSort: %s\n", "sorting done"); + } + + SO1_printf("ExecCustomSort: %s\n", + "retrieving tuple from tuplesort"); + + slot = sortstate->vci.css.ss.ps.ps_ResultTupleSlot; + + tuplesort_gettupleslot(tuplesortstate, + ScanDirectionIsForward(dir), false, + slot, NULL); + return slot; +} + +Tuplesortstate * +vci_sort_exec_top_half(VciSortState *sortstate) +{ + 
EState *estate; + Tuplesortstate *tuplesortstate; + VciSort *plannode = (VciSort *) sortstate->vci.css.ss.ps.plan; + PlanState *outerNode; + TupleDesc tupDesc; + int tuplesortopts = TUPLESORT_NONE; + + estate = sortstate->vci.css.ss.ps.state; + sortstate->saved_dir = estate->es_direction; + tuplesortstate = (Tuplesortstate *) sortstate->tuplesortstate; + + estate->es_direction = ForwardScanDirection; + + outerNode = outerPlanState(sortstate); + tupDesc = ExecGetResultType(outerNode); + + if (sortstate->randomAccess) + tuplesortopts |= TUPLESORT_RANDOMACCESS; + if (sortstate->bounded) + tuplesortopts |= TUPLESORT_ALLOWBOUNDED; + + tuplesortstate = tuplesort_begin_heap(tupDesc, + plannode->numCols, + plannode->sortColIdx, + plannode->sortOperators, + plannode->collations, + plannode->nullsFirst, + work_mem, + NULL, + tuplesortopts); + + if (sortstate->bounded) + tuplesort_set_bound(tuplesortstate, sortstate->bound); + + sortstate->tuplesortstate = (void *) tuplesortstate; + + return tuplesortstate; +} + +void +vci_sort_perform_sort(VciSortState *sortstate) +{ + EState *estate; + Tuplesortstate *tuplesortstate; + + estate = sortstate->vci.css.ss.ps.state; + tuplesortstate = (Tuplesortstate *) sortstate->tuplesortstate; + + tuplesort_performsort(tuplesortstate); + + estate->es_direction = sortstate->saved_dir; +} + +static void +vci_sort_BeginCustomPlan(CustomScanState *node, EState *estate, int eflags) +{ + VciSort *sort; + VciSortState *sortstate; + + SO1_printf("vci_sort_BeginCustomPlan: %s\n", + "initializing custom sort node"); + + sort = (VciSort *) node->ss.ps.plan; + + /* + * create state structure + */ + sortstate = (VciSortState *) node; + + sortstate->vci.css.ss.ps.state = estate; + + sortstate->randomAccess = (eflags & (EXEC_FLAG_REWIND | + EXEC_FLAG_BACKWARD | + EXEC_FLAG_MARK)) != 0; + + sortstate->bounded = false; + sortstate->sort_Done = false; + sortstate->tuplesortstate = NULL; + + /* + * initialize child nodes + * + * We shield the child node from 
the need to support REWIND, BACKWARD, or + * MARK/RESTORE. + */ + + eflags &= ~(EXEC_FLAG_REWIND | EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK); + + outerPlanState(sortstate) = ExecInitNode(outerPlan(sort), estate, eflags); + + /* + * Initialize scan slot and type. + */ + ExecCreateScanSlotFromOuterPlan(estate, &sortstate->vci.css.ss, &TTSOpsVirtual); + + /* + * Initialize return slot and type. No need to initialize projection info + * because this node doesn't do projections. + */ + ExecInitResultTupleSlotTL(&sortstate->vci.css.ss.ps, &TTSOpsMinimalTuple); + sortstate->vci.css.ss.ps.ps_ProjInfo = NULL; + + SO1_printf("vci_sort_BeginCustomPlan: %s\n", + "sort node initialized"); +} + +static void +vci_sort_EndCustomPlan(CustomScanState *node) +{ + VciSortState *sortstate; + + sortstate = (VciSortState *) node; + + SO1_printf("ExecEndSort: %s\n", + "shutting down custom sort node"); + + ExecClearTuple(sortstate->vci.css.ss.ss_ScanTupleSlot); + ExecClearTuple(sortstate->vci.css.ss.ps.ps_ResultTupleSlot); + + if (sortstate->tuplesortstate != NULL) + tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate); + + sortstate->tuplesortstate = NULL; + + ExecEndNode(outerPlanState(sortstate)); + + SO1_printf("ExecEndSort: %s\n", + "VCI Sort node shutdown"); +} + +static void +vci_sort_ReScanCustomPlan(CustomScanState *node) +{ + VciSortState *sortstate; + + sortstate = (VciSortState *) node; + + if (!sortstate->sort_Done) + return; + + ExecClearTuple(sortstate->vci.css.ss.ps.ps_ResultTupleSlot); + + if (sortstate->vci.css.ss.ps.lefttree->chgParam != NULL || + sortstate->bounded != sortstate->bounded_Done || + sortstate->bound != sortstate->bound_Done || + !sortstate->randomAccess) + { + sortstate->sort_Done = false; + tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate); + sortstate->tuplesortstate = NULL; + + if (sortstate->vci.css.ss.ps.lefttree->chgParam == NULL) + ExecReScan(sortstate->vci.css.ss.ps.lefttree); + } + else + tuplesort_rescan((Tuplesortstate *) 
sortstate->tuplesortstate); +} + +/* LCOV_EXCL_START */ + +static void +vci_sort_MarkPosCustomPlan(CustomScanState *node) +{ + + elog(PANIC, "VCI Sort does not support MarkPosCustomPlan call convention"); + +} + +static void +vci_sort_RestrPosCustomPlan(CustomScanState *node) +{ + elog(PANIC, "VCI Sort does not support RestrPosCustomPlan call convention"); +} + +/* LCOV_EXCL_STOP */ + +static void +vci_sort_ExplainCustomPlan(CustomScanState *csstate, + List *ancestors, + ExplainState *es) +{ + VciSortState *sortstate = (VciSortState *) csstate; + VciSort *sort = (VciSort *) csstate->ss.ps.plan; + + ExplainPropertySortGroupKeys(&csstate->ss.ps, "Sort Key", + sort->numCols, sort->sortColIdx, + ancestors, es); + + if (es->analyze && sortstate->sort_Done && + sortstate->tuplesortstate != NULL) + { + Tuplesortstate *state = (Tuplesortstate *) sortstate->tuplesortstate; + TuplesortInstrumentation stats; + const char *sortMethod; + const char *spaceType; + int64 spaceUsed; + + tuplesort_get_stats(state, &stats); + sortMethod = tuplesort_method_name(stats.sortMethod); + spaceType = tuplesort_space_type_name(stats.spaceType); + spaceUsed = stats.spaceUsed; + + if (es->format == EXPLAIN_FORMAT_TEXT) + { + ExplainIndentText(es); + appendStringInfo(es->str, "Sort Method: %s %s: " INT64_FORMAT "kB\n", + sortMethod, spaceType, spaceUsed); + } + else + { + ExplainPropertyText("Sort Method", sortMethod, es); + ExplainPropertyInteger("Sort Space Used", "kB", spaceUsed, es); + ExplainPropertyText("Sort Space Type", spaceType, es); + } + } +} + +static CustomScan * +vci_sort_CopyCustomPlan(const CustomScan *_from) +{ + const VciSort *from = (const VciSort *) _from; + VciSort *newnode = (VciSort *) palloc0(sizeof(VciSort)); + int numCols; + + vci_copy_plan(&newnode->vci, &from->vci); + + numCols = from->numCols; + + newnode->numCols = numCols; + + if (numCols > 0) + { + int i; + + newnode->sortColIdx = palloc(sizeof(AttrNumber) * numCols); + newnode->sortOperators = palloc(sizeof(Oid) 
* numCols); + newnode->collations = palloc(sizeof(Oid) * numCols); + newnode->nullsFirst = palloc(sizeof(bool) * numCols); + + for (i = 0; i < numCols; i++) + { + newnode->sortColIdx[i] = from->sortColIdx[i]; + newnode->sortOperators[i] = from->sortOperators[i]; + newnode->collations[i] = from->collations[i]; + newnode->nullsFirst[i] = from->nullsFirst[i]; + } + } + + ((Node *) newnode)->type = nodeTag((Node *) from); + + return &newnode->vci.cscan; +} + +static void +vci_sort_SetBoundCustomScan(const LimitState *node, CustomScanState *css) +{ + VciSortState *sortState = (VciSortState *) css; + int64 tuples_needed = node->count + node->offset; + + /* negative test checks for overflow in sum */ + if (node->noCount || tuples_needed < 0) + { + /* make sure flag gets reset if needed upon rescan */ + sortState->bounded = false; + } + else + { + sortState->bounded = true; + sortState->bound = tuples_needed; + } +} + +CustomScanMethods vci_sort_scan_methods = { + "VCI Sort", + vci_sort_CreateCustomScanState, + vci_sort_CopyCustomPlan +}; + +CustomExecMethods vci_sort_exec_methods = { + "VCI Sort", + vci_sort_BeginCustomPlan, + vci_sort_ExecCustomPlan, + vci_sort_EndCustomPlan, + vci_sort_ReScanCustomPlan, + vci_sort_MarkPosCustomPlan, + vci_sort_RestrPosCustomPlan, + NULL, + NULL, + NULL, + NULL, + NULL, + vci_sort_ExplainCustomPlan, + vci_sort_SetBoundCustomScan, + NULL +}; diff --git a/contrib/vci/executor/vci_vector_executor.c b/contrib/vci/executor/vci_vector_executor.c new file mode 100644 index 000000000000..d89988b4e75a --- /dev/null +++ b/contrib/vci/executor/vci_vector_executor.c @@ -0,0 +1,2338 @@ +/*------------------------------------------------------------------------- + * + * vci_vector_executor.c + * Routines to build and evaluate vector processing object + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/executor/vci_vector_executor.c + * + 
*------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup_details.h" +#include "access/nbtree.h" +#include "access/relscan.h" +#include "access/transam.h" +#include "access/tupconvert.h" +#include "catalog/index.h" +#include "catalog/objectaccess.h" +#include "catalog/pg_type.h" +#include "catalog/pg_proc.h" +#include "commands/typecmds.h" +#include "executor/execdebug.h" +#include "executor/executor.h" +#include "executor/nodeCustom.h" +#include "executor/nodeSubplan.h" +#include "fmgr.h" +#include "funcapi.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "nodes/makefuncs.h" +#include "nodes/nodeFuncs.h" +#include "nodes/nodes.h" +#include "optimizer/planner.h" +#include "parser/parse_coerce.h" +#include "parser/parsetree.h" +#include "pgstat.h" +#include "storage/lmgr.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/typcache.h" +#include "utils/xml.h" + +#include "vci.h" +#include "vci_executor.h" +#include "vci_utils.h" + +/* Private Structure to Vector processing */ +typedef struct FuncExprinfo +{ + FmgrInfo *finfo; + FunctionCallInfo fcinfo_data; /* arguments etc */ + PGFunction fn_addr; /* actual call address */ + int nargs; /* number of arguments */ + List *args; /* states of argument expressions */ + Oid funcid; + Oid inputcollid; +} FuncExprinfo; + +/* + * VciScalarArrayOpExprHashEntry + * Hash table entry type used during VciVPExecHashedScalarArrayOpExpr + * Copied from OSS ScalarArrayOpExprHashEntry + */ +typedef struct VciScalarArrayOpExprHashEntry +{ + Datum key; + uint32 status; /* hash status */ + uint32 hash; /* hash value (cached) */ +} VciScalarArrayOpExprHashEntry; + +#define SH_PREFIX saophash +#define SH_ELEMENT_TYPE VciScalarArrayOpExprHashEntry +#define SH_KEY_TYPE Datum +#define SH_SCOPE static inline +#define SH_DECLARE +#include "lib/simplehash.h" + +static bool 
saop_hash_element_match(struct saophash_hash *tb, Datum key1, + Datum key2); +static uint32 saop_element_hash(struct saophash_hash *tb, Datum key); + +/* + * VciScalarArrayOpExprHashTable + * Hash table for VciVPExecHashedScalarArrayOpExpr + * Copied from OSS ScalarArrayOpExprHashTable + */ +typedef struct VciScalarArrayOpExprHashTable +{ + saophash_hash *hashtab; /* underlying hash table */ + struct VciVPNode *pnode; +} VciScalarArrayOpExprHashTable; + +/* Define parameters for ScalarArrayOpExpr hash table code generation. */ +#define SH_PREFIX saophash +#define SH_ELEMENT_TYPE VciScalarArrayOpExprHashEntry +#define SH_KEY_TYPE Datum +#define SH_KEY key +#define SH_HASH_KEY(tb, key) saop_element_hash(tb, key) +#define SH_EQUAL(tb, a, b) saop_hash_element_match(tb, a, b) +#define SH_SCOPE static inline +#define SH_STORE_HASH +#define SH_GET_HASH(tb, a) a->hash +#define SH_DEFINE +#include "lib/simplehash.h" + +static void VciVPExecFunc(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecFunc_arg0(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecFunc_arg1(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecFunc_arg2(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecDistinctExpr(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecNullIfExpr(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecScalarArrayOpExpr(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecHashedScalarArrayOpExpr(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecNullTest(Expr 
*expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecBooleanTest(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecNot(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecAnd_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecAnd_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecAnd_nullasfalse_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecOr_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecOr_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecMinMax_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecMinMax_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecCoalesce_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecCoalesce_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecCase_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecCase_arg(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecCase_cond(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecCase_result(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int 
max_slots); +static void VciVPExecCaseTest(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecParamExec(Expr *expression, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecCoerceViaIO(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecVar(Expr *expression, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void VciVPExecConst(Expr *expression, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +static void vci_vp_exec_simple_copy(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots); + +static VciVPContext *vci_create_vp_context(void); +static vci_vp_item_id vci_add_vp_node(VciVPExecOp_func func, Expr *expr, VciVPContext *vpcontext, int len_args, vci_vp_item_id *arg_items, bool allocValueAndIsNull, uint16 *skip_list); +static vci_vp_item_id vci_add_var_node(Var *variable, PlanState *parent, VciVPContext *vpcontext, uint16 *skip_list); +static vci_vp_item_id vci_add_param_node(Param *param, PlanState *parent, VciVPContext *vpcontext, uint16 *skip_list); +static vci_vp_item_id vci_add_const_node(Const *con, VciVPContext *vpcontext, uint16 *skip_list); +static vci_vp_item_id vci_add_control_nodes(VciVPExecOp_func head_func, VciVPExecOp_func next_func, List *args, Expr *expr, PlanState *parent, ExprContext *econtext, VciVPContext *vpcontext, uint16 *skip_list); +static vci_vp_item_id traverse_expr_state_tree(Expr *node, PlanState *parent, ExprContext *econtext, VciVPContext *vpcontext, uint16 *skip_list); + +static void VciVPExecInitFunc(Expr *node, List *args, Oid funcid, Oid inputcollid, PlanState *parent, FuncExprinfo *funcinfo); +static vci_vp_item_id vci_add_func_expr_node(Expr *expr, VciVPContext *vpcontext, FuncExprinfo *funcinfo, PlanState *parent, ExprContext 
*econtext, uint16 *skip_list); +static Datum VciExecEvalParamExec_vp(VciVPNode *vpnode, ExprContext *econtext, bool *isNull); + +/***************************************************************************** + * Vector processing execution function + *****************************************************************************/ + +/** + * Execute vector processing + * + * @param[in,out] vpcontext + * @param[in] econtext + * @param[in] max_slots + */ +void +VciExecEvalVectorProcessing(VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + vci_vp_item_id i, + max; + + max = vpcontext->num_item; + + for (i = 1; i < max; i++) + { + VciVPNode *vpnode = &vpcontext->itemNode[i]; + + vpnode->evalfunc(vpnode->expr, vpnode, vpcontext, econtext, max_slots); + } +} + +static void +VciVPExecFunc(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + FunctionCallInfo fcinfo; + PgStat_FunctionCallUsage fcusage; + + /* inlined, simplified version of ExecEvalFuncArgs */ + fcinfo = vpnode->data.func.fcinfo_data; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + int i; + Datum value = (Datum) 0; + bool isnull = true; + + for (i = 0; i < vpnode->len_args; i++) + { + vci_vp_item_id item = vpnode->arg_items[i]; + VciVPNode *arg_node = &vpcontext->itemNode[item]; + + fcinfo->args[i].value = arg_node->itemValue[slot_index]; + fcinfo->args[i].isnull = arg_node->itemIsNull[slot_index]; + } + + if (vpnode->data.func.finfo->fn_strict) + { + while (--i >= 0) + { + if (fcinfo->args[i].isnull) + { + goto done; + } + } + } + + pgstat_init_function_usage(fcinfo, &fcusage); + + fcinfo->isnull = false; + value = FunctionCallInvoke(fcinfo); + isnull = fcinfo->isnull; + + pgstat_end_function_usage(&fcusage, true); + +done: + 
itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecFunc_arg0(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + FunctionCallInfo fcinfo; + + /* PgStat_FunctionCallUsage fcusage; */ + + /* inlined, simplified version of ExecEvalFuncArgs */ + fcinfo = vpnode->data.func.fcinfo_data; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value = (Datum) 0; + bool isnull = true; + + /* pgstat_init_function_usage(fcinfo, &fcusage); */ + + fcinfo->isnull = false; + value = FunctionCallInvoke(fcinfo); + isnull = fcinfo->isnull; + + /* pgstat_end_function_usage(&fcusage, true); */ + + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecFunc_arg1(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + FunctionCallInfo fcinfo; + + /* PgStat_FunctionCallUsage fcusage; */ + + VciVPNode *arg_node; + + /* inlined, simplified version of ExecEvalFuncArgs */ + fcinfo = vpnode->data.func.fcinfo_data; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value = (Datum) 0; + bool isnull = true; + + fcinfo->args[0].value = arg_node->itemValue[slot_index]; + fcinfo->args[0].isnull = arg_node->itemIsNull[slot_index]; + + if (vpnode->data.func.finfo->fn_strict) + if (fcinfo->args[0].isnull) + goto done; + + /* pgstat_init_function_usage(fcinfo, &fcusage); */ + + fcinfo->isnull = false; + value = FunctionCallInvoke(fcinfo); + isnull = 
fcinfo->isnull; + + /* pgstat_end_function_usage(&fcusage, true); */ + +done: + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecFunc_arg2(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + FunctionCallInfo fcinfo; + + /* PgStat_FunctionCallUsage fcusage; */ + + VciVPNode *arg_node0, + *arg_node1; + + /* inlined, simplified version of ExecEvalFuncArgs */ + fcinfo = vpnode->data.func.fcinfo_data; + + arg_node0 = &vpcontext->itemNode[vpnode->arg_items[0]]; + arg_node1 = &vpcontext->itemNode[vpnode->arg_items[1]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value = (Datum) 0; + bool isnull = true; + + fcinfo->args[0].value = arg_node0->itemValue[slot_index]; + fcinfo->args[0].isnull = arg_node0->itemIsNull[slot_index]; + + fcinfo->args[1].value = arg_node1->itemValue[slot_index]; + fcinfo->args[1].isnull = arg_node1->itemIsNull[slot_index]; + + if (vpnode->data.func.finfo->fn_strict) + if (fcinfo->args[0].isnull || fcinfo->args[1].isnull) + goto done; + + /* pgstat_init_function_usage(fcinfo, &fcusage); */ + + fcinfo->isnull = false; + value = FunctionCallInvoke(fcinfo); + isnull = fcinfo->isnull; + + /* pgstat_end_function_usage(&fcusage, true); */ + +done: + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecDistinctExpr(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + FunctionCallInfo fcinfo; + + /* inlined, simplified version of ExecEvalFuncArgs */ + fcinfo = vpnode->data.func.fcinfo_data; + + for (slot_index = 
skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + int i; + Datum value = (Datum) 0; + bool isnull = false; + + Assert(vpnode->len_args == 2); + + for (i = 0; i < 2; i++) + { + vci_vp_item_id item = vpnode->arg_items[i]; + VciVPNode *arg_node = &vpcontext->itemNode[item]; + + fcinfo->args[i].value = arg_node->itemValue[slot_index]; + fcinfo->args[i].isnull = arg_node->itemIsNull[slot_index]; + } + + if (fcinfo->args[0].isnull && fcinfo->args[1].isnull) + { + /* Both NULL? Then is not distinct... */ + value = BoolGetDatum(false); + } + else if (fcinfo->args[0].isnull || fcinfo->args[1].isnull) + { + /* Only one is NULL? Then is distinct... */ + value = BoolGetDatum(true); + } + else + { + fcinfo->isnull = false; + value = FunctionCallInvoke(fcinfo); + isnull = fcinfo->isnull; + /* Must invert result of "=" */ + value = BoolGetDatum(!DatumGetBool(value)); + } + + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecNullIfExpr(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + FunctionCallInfo fcinfo; + + /* inlined, simplified version of ExecEvalFuncArgs */ + fcinfo = vpnode->data.func.fcinfo_data; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + int i; + Datum value = (Datum) 0; + bool isnull = false; + + Assert(vpnode->len_args == 2); + + for (i = 0; i < 2; i++) + { + vci_vp_item_id item = vpnode->arg_items[i]; + VciVPNode *arg_node = &vpcontext->itemNode[item]; + + fcinfo->args[i].value = arg_node->itemValue[slot_index]; + fcinfo->args[i].isnull = arg_node->itemIsNull[slot_index]; + } + + /* if either argument is NULL they can't be equal */ + if (!fcinfo->args[0].isnull && !fcinfo->args[1].isnull) + { + fcinfo->isnull = false; 
+ value = FunctionCallInvoke(fcinfo); + /* if the arguments are equal return null */ + if (!fcinfo->isnull && DatumGetBool(value)) + { + value = (Datum) 0; + isnull = true; + goto equal_two_arguments; + } + } + + value = fcinfo->args[0].value; + isnull = fcinfo->args[0].isnull; + +equal_two_arguments: + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecScalarArrayOpExpr(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + bool useOr = vpnode->data.scalararrayop.useOr; + FunctionCallInfo fcinfo; + + fcinfo = vpnode->data.scalararrayop.fcinfo_data; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + ArrayType *arr; + int nitems; + Datum result; + bool resultnull; + int i; + int16 typlen; + bool typbyval; + char typalign; + char *s; + bits8 *bitmap; + int bitmask; + + result = (Datum) 0; + resultnull = false; /* Set default values for result flags: + * non-null, not a set result */ + + for (i = 0; i < 2; i++) + { + vci_vp_item_id item = vpnode->arg_items[i]; + VciVPNode *arg_node = &vpcontext->itemNode[item]; + + fcinfo->args[i].value = arg_node->itemValue[slot_index]; + fcinfo->args[i].isnull = arg_node->itemIsNull[slot_index]; + } + + /* + * If the array is NULL then we return NULL --- it's not very + * meaningful to do anything else, even if the operator isn't strict. + */ + if (fcinfo->args[1].isnull) + { + result = (Datum) 0; + resultnull = true; + goto done; + } + + /* Else okay to fetch and detoast the array */ + arr = DatumGetArrayTypeP(fcinfo->args[1].value); + + /* + * If the array is empty, we return either FALSE or TRUE per the useOr + * flag. 
This is correct even if the scalar is NULL; since we would + * evaluate the operator zero times, it matters not whether it would + * want to return NULL. + */ + nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); + if (nitems <= 0) + { + result = BoolGetDatum(!useOr); + goto done; + } + + /* + * If the scalar is NULL, and the function is strict, return NULL; no + * point in iterating the loop. + */ + if (fcinfo->args[0].isnull && vpnode->data.scalararrayop.finfo->fn_strict) + { + result = (Datum) 0; + resultnull = true; + goto done; + } + + /* + * We arrange to look up info about the element type only once per + * series of calls, assuming the element type doesn't change + * underneath us. + */ + if (vpnode->data.scalararrayop.element_type != ARR_ELEMTYPE(arr)) + { + get_typlenbyvalalign(ARR_ELEMTYPE(arr), + &vpnode->data.scalararrayop.typlen, + &vpnode->data.scalararrayop.typbyval, + &vpnode->data.scalararrayop.typalign); + vpnode->data.scalararrayop.element_type = ARR_ELEMTYPE(arr); + } + typlen = vpnode->data.scalararrayop.typlen; + typbyval = vpnode->data.scalararrayop.typbyval; + typalign = vpnode->data.scalararrayop.typalign; + + result = BoolGetDatum(!useOr); + resultnull = false; + + /* Loop over the array elements */ + s = (char *) ARR_DATA_PTR(arr); + bitmap = ARR_NULLBITMAP(arr); + bitmask = 1; + + for (i = 0; i < nitems; i++) + { + Datum elt; + Datum thisresult; + + /* Get array element, checking for NULL */ + if (bitmap && (*bitmap & bitmask) == 0) + { + fcinfo->args[1].value = (Datum) 0; + fcinfo->args[1].isnull = true; + } + else + { + elt = fetch_att(s, typbyval, typlen); + s = att_addlength_pointer(s, typlen, s); + s = (char *) att_align_nominal(s, typalign); + fcinfo->args[1].value = elt; + fcinfo->args[1].isnull = false; + } + + /* Call comparison function */ + if (fcinfo->args[1].isnull && vpnode->data.scalararrayop.finfo->fn_strict) + { + fcinfo->isnull = true; + thisresult = (Datum) 0; + } + else + { + fcinfo->isnull = false; + thisresult = 
FunctionCallInvoke(fcinfo); + } + + /* Combine results per OR or AND semantics */ + if (fcinfo->isnull) + resultnull = true; + else if (useOr) + { + if (DatumGetBool(thisresult)) + { + result = BoolGetDatum(true); + resultnull = false; + break; /* needn't look at any more elements */ + } + } + else + { + if (!DatumGetBool(thisresult)) + { + result = BoolGetDatum(false); + resultnull = false; + break; /* needn't look at any more elements */ + } + } + + /* advance bitmap pointer if any */ + if (bitmap) + { + bitmask <<= 1; + if (bitmask == 0x100) + { + bitmap++; + bitmask = 1; + } + } + } + +done: + itemValue[slot_index] = result; + itemIsNull[slot_index] = resultnull; + } +} + +/* + * Hash function for scalar array hash op elements. + * + * We use the element type's default hash opclass, and the column collation + * if the type is collation-sensitive. + */ +static uint32 +saop_element_hash(struct saophash_hash *tb, Datum key) +{ + VciScalarArrayOpExprHashTable *elements_tab = (VciScalarArrayOpExprHashTable *) tb->private_data; + FunctionCallInfo fcinfo = elements_tab->pnode->data.hashedscalararrayop.fcinfo_data; + Datum hash; + + fcinfo->args[0].value = key; + fcinfo->args[0].isnull = false; + + hash = elements_tab->pnode->data.hashedscalararrayop.hash_fn_addr(fcinfo); + + return DatumGetUInt32(hash); +} + +/* + * Matching function for scalar array hash op elements, to be used in hashtable + * lookups. 
+ */ +static bool +saop_hash_element_match(struct saophash_hash *tb, Datum key1, Datum key2) +{ + Datum result; + + VciScalarArrayOpExprHashTable *elements_tab = (VciScalarArrayOpExprHashTable *) tb->private_data; + FunctionCallInfo fcinfo = elements_tab->pnode->data.hashedscalararrayop.fcinfo_data; + + fcinfo->args[0].value = key1; + fcinfo->args[0].isnull = false; + fcinfo->args[1].value = key2; + fcinfo->args[1].isnull = false; + + result = elements_tab->pnode->data.hashedscalararrayop.fn_addr(fcinfo); + + return DatumGetBool(result); +} + +static void +VciVPExecHashedScalarArrayOpExpr(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + FunctionCallInfo fcinfo; + bool strictfunc; + Datum scalar; + bool scalar_isnull; + VciScalarArrayOpExprHashTable *elements_tab; + + fcinfo = vpnode->data.hashedscalararrayop.fcinfo_data; + strictfunc = vpnode->data.hashedscalararrayop.finfo->fn_strict; + elements_tab = vpnode->data.hashedscalararrayop.elements_tab; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum result; + bool resultnull; + bool hashfound; + + /* We don't setup a hashed scalar array op if the array const is null. */ + Assert(!fcinfo->args[1].isnull); + + for (int i = 0; i < 2; i++) + { + vci_vp_item_id item = vpnode->arg_items[i]; + VciVPNode *arg_node = &vpcontext->itemNode[item]; + + fcinfo->args[i].value = arg_node->itemValue[slot_index]; + fcinfo->args[i].isnull = arg_node->itemIsNull[slot_index]; + } + scalar = fcinfo->args[0].value; + scalar_isnull = fcinfo->args[0].isnull; + + /* + * If the scalar is NULL, and the function is strict, return NULL; no + * point in executing the search. 
+ */ + if (fcinfo->args[0].isnull && strictfunc) + { + result = (Datum) 0; + resultnull = true; + goto done; + } + + /* Build the hash table on first evaluation */ + if (elements_tab == NULL) + { + int16 typlen; + bool typbyval; + char typalign; + int nitems; + bool has_nulls = false; + char *s; + bits8 *bitmap; + int bitmask; + MemoryContext oldcontext; + ArrayType *arr; + + arr = DatumGetArrayTypeP(fcinfo->args[1].value); + nitems = ArrayGetNItems(ARR_NDIM(arr), ARR_DIMS(arr)); + + get_typlenbyvalalign(ARR_ELEMTYPE(arr), + &typlen, + &typbyval, + &typalign); + + oldcontext = MemoryContextSwitchTo(econtext->ecxt_per_query_memory); + + elements_tab = (VciScalarArrayOpExprHashTable *) + palloc(sizeof(VciScalarArrayOpExprHashTable)); + vpnode->data.hashedscalararrayop.elements_tab = elements_tab; + elements_tab->pnode = vpnode; + + /* + * Create the hash table sizing it according to the number of + * elements in the array. This does assume that the array has no + * duplicates. If the array happens to contain many duplicate + * values then it'll just mean that we sized the table a bit on + * the large side. + */ + elements_tab->hashtab = saophash_create(CurrentMemoryContext, nitems, + elements_tab); + + MemoryContextSwitchTo(oldcontext); + + s = (char *) ARR_DATA_PTR(arr); + bitmap = ARR_NULLBITMAP(arr); + bitmask = 1; + for (int i = 0; i < nitems; i++) + { + /* Get array element, checking for NULL. */ + if (bitmap && (*bitmap & bitmask) == 0) + { + has_nulls = true; + } + else + { + Datum element; + + element = fetch_att(s, typbyval, typlen); + s = att_addlength_pointer(s, typlen, s); + s = (char *) att_align_nominal(s, typalign); + + saophash_insert(elements_tab->hashtab, element, &hashfound); + } + + /* Advance bitmap pointer if any. 
*/ + if (bitmap) + { + bitmask <<= 1; + if (bitmask == 0x100) + { + bitmap++; + bitmask = 1; + } + } + } + + /* + * Remember if we had any nulls so that we know if we need to + * execute non-strict functions with a null lhs value if no match + * is found. + */ + vpnode->data.hashedscalararrayop.has_nulls = has_nulls; + } + + /* Check the hash to see if we have a match. */ + hashfound = NULL != saophash_lookup(elements_tab->hashtab, scalar); + + result = BoolGetDatum(hashfound); + resultnull = false; + + /* + * If we didn't find a match in the array, we still might need to + * handle the possibility of null values. We didn't put any NULLs + * into the hashtable, but instead marked if we found any when + * building the table in has_nulls. + */ + if (!DatumGetBool(result) && vpnode->data.hashedscalararrayop.has_nulls) + { + if (strictfunc) + { + + /* + * We have nulls in the array so a non-null lhs and no match + * must yield NULL. + */ + result = (Datum) 0; + resultnull = true; + } + else + { + /* + * Execute function will null rhs just once. + * + * The hash lookup path will have scribbled on the lhs + * argument so we need to set it up also (even though we + * entered this function with it already set). 
+ */ + fcinfo->args[0].value = scalar; + fcinfo->args[0].isnull = scalar_isnull; + fcinfo->args[1].value = (Datum) 0; + fcinfo->args[1].isnull = true; + + result = FunctionCallInvoke(fcinfo); + resultnull = fcinfo->isnull; + } + } + +done: + itemValue[slot_index] = result; + itemIsNull[slot_index] = resultnull; + } +} + +static void +VciVPExecNullTest(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + NullTest *ntest = (NullTest *) expr; + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value; + bool isnull; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + Assert(!ntest->argisrow); + + /* Simple scalar-argument case, or a null rowtype datum */ + switch (ntest->nulltesttype) + { + case IS_NULL: + if (isnull) + { + value = BoolGetDatum(true); + isnull = false; + } + else + value = BoolGetDatum(false); + break; + + case IS_NOT_NULL: + if (isnull) + { + value = BoolGetDatum(false); + isnull = false; + } + else + value = BoolGetDatum(true); + break; + + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unrecognized nulltesttype: %d", + (int) ntest->nulltesttype); + break; + /* LCOV_EXCL_STOP */ + } + + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecBooleanTest(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + BooleanTest *btest = (BooleanTest *) expr; + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = 
skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value; + bool isnull; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + switch (btest->booltesttype) + { + case IS_TRUE: + if (isnull) + { + value = BoolGetDatum(false); + isnull = false; + } + else if (DatumGetBool(value)) + value = BoolGetDatum(true); + else + value = BoolGetDatum(false); + break; + + case IS_NOT_TRUE: + if (isnull) + { + value = BoolGetDatum(true); + isnull = false; + } + else if (DatumGetBool(value)) + value = BoolGetDatum(false); + else + value = BoolGetDatum(true); + break; + + case IS_FALSE: + if (isnull) + { + value = BoolGetDatum(false); + isnull = false; + } + else if (DatumGetBool(value)) + value = BoolGetDatum(false); + else + value = BoolGetDatum(true); + break; + + case IS_NOT_FALSE: + if (isnull) + { + value = BoolGetDatum(true); + isnull = false; + } + else if (DatumGetBool(value)) + value = BoolGetDatum(true); + else + value = BoolGetDatum(false); + break; + + case IS_UNKNOWN: + if (isnull) + { + value = BoolGetDatum(true); + isnull = false; + } + else + value = BoolGetDatum(false); + break; + + case IS_NOT_UNKNOWN: + if (isnull) + { + value = BoolGetDatum(false); + isnull = false; + } + else + value = BoolGetDatum(true); + break; + + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unrecognized booltesttype: %d", + (int) btest->booltesttype); + break; + /* LCOV_EXCL_STOP */ + } + + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecNot(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += 
skip_list[slot_index + 1] + 1) + { + Datum value; + bool isnull; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + if (isnull) + value = (Datum) 0; + else + value = BoolGetDatum(!DatumGetBool(value)); + + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecAnd_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + int i; + + for (i = 0; i < VCI_MAX_FETCHING_ROWS; i++) + vpnode->itemValue[i] = BoolGetDatum(true); + + memset(vpnode->itemIsNull, 0, sizeof(bool) * VCI_MAX_FETCHING_ROWS); + memcpy(vpnode->skip_list, vpnode->data.init.orig_skip_list, sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); +} + +static void +VciVPExecAnd_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + int check_slot_index = 0; + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value; + bool isnull; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + if (isnull) + { + itemValue[slot_index] = (Datum) 0; + itemIsNull[slot_index] = true; + + check_slot_index = slot_index + 1; + } + else if (!DatumGetBool(value)) + { + itemValue[slot_index] = BoolGetDatum(false); + itemIsNull[slot_index] = false; + + skip_list[check_slot_index] += skip_list[slot_index + 1] + 1; + } + else + { + check_slot_index = slot_index + 1; + } + } +} + +static void +VciVPExecAnd_nullasfalse_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + 
int slot_index; + + int check_slot_index = 0; + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value; + bool isnull; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + if (isnull || !DatumGetBool(value)) + { + itemValue[slot_index] = BoolGetDatum(false); + itemIsNull[slot_index] = false; + + skip_list[check_slot_index] += skip_list[slot_index + 1] + 1; + } + else + { + check_slot_index = slot_index + 1; + } + } +} + +static void +VciVPExecOr_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + int i; + + for (i = 0; i < VCI_MAX_FETCHING_ROWS; i++) + vpnode->itemValue[i] = BoolGetDatum(false); + + memset(vpnode->itemIsNull, 0, sizeof(bool) * VCI_MAX_FETCHING_ROWS); + memcpy(vpnode->skip_list, vpnode->data.init.orig_skip_list, sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); +} + +static void +VciVPExecOr_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + int check_slot_index = 0; + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value; + bool isnull; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + if (isnull) + { + itemValue[slot_index] = (Datum) 0; + itemIsNull[slot_index] = true; + + check_slot_index = slot_index + 1; + } + else if (DatumGetBool(value)) + { + itemValue[slot_index] = BoolGetDatum(true); + itemIsNull[slot_index] = false; + + skip_list[check_slot_index] += skip_list[slot_index + 1] + 1; + } + else + { + check_slot_index = 
slot_index + 1; + } + } +} + +static void +VciVPExecMinMax_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + int i; + + for (i = 0; i < VCI_MAX_FETCHING_ROWS; i++) + vpnode->itemIsNull[i] = true; + + memcpy(vpnode->skip_list, vpnode->data.init.orig_skip_list, sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); +} + +static void +VciVPExecMinMax_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + VciVPNode *arg_node; + FmgrInfo *finfo; + TypeCacheEntry *typentry; + + MinMaxExpr *minmax = (MinMaxExpr *) expr; + Oid collation = minmax->inputcollid; + MinMaxOp op = minmax->op; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + finfo = palloc0(sizeof(FmgrInfo)); /* will be freed as part of query + * context free */ + + /* Look up the btree comparison function for the datatype */ + typentry = lookup_type_cache(minmax->minmaxtype, + TYPECACHE_CMP_PROC); + + if (!OidIsValid(typentry->cmp_proc)) + ereport(ERROR, + (errcode(ERRCODE_UNDEFINED_FUNCTION), + errmsg("could not identify a comparison function for type %s", + format_type_be(minmax->minmaxtype)))); + + fmgr_info(typentry->cmp_proc, finfo); + fmgr_info_set_expr((Node *) expr, finfo); + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value; + bool isnull; + + LOCAL_FCINFO(locfcinfo, 2); + int32 cmpresult; + + InitFunctionCallInfoData(*locfcinfo, finfo, 2, + collation, NULL, NULL); + locfcinfo->args[0].isnull = false; + locfcinfo->args[1].isnull = false; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + if (isnull) + { + } + else if (itemIsNull[slot_index] == true) + { + /* first nonnull input, adopt value */ + itemValue[slot_index] = value; + 
itemIsNull[slot_index] = false; + } + else + { + /* apply comparison function */ + locfcinfo->args[0].value = itemValue[slot_index]; + locfcinfo->args[1].value = value; + locfcinfo->isnull = false; + cmpresult = DatumGetInt32(FunctionCallInvoke(locfcinfo)); + if (cmpresult > 0 && op == IS_LEAST) + itemValue[slot_index] = value; + else if (cmpresult < 0 && op == IS_GREATEST) + itemValue[slot_index] = value; + } + } +} + +static void +VciVPExecCoalesce_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + int i; + + for (i = 0; i < VCI_MAX_FETCHING_ROWS; i++) + { + vpnode->itemValue[i] = (Datum) 0; + vpnode->itemIsNull[i] = true; + } + + memcpy(vpnode->skip_list, vpnode->data.init.orig_skip_list, sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); +} + +static void +VciVPExecCoalesce_next(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + int check_slot_index = 0; + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + Datum value; + bool isnull; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + if (isnull) + { + itemValue[slot_index] = (Datum) 0; + itemIsNull[slot_index] = true; + + check_slot_index = slot_index + 1; + } + else + { + itemValue[slot_index] = value; + itemIsNull[slot_index] = false; + + skip_list[check_slot_index] += skip_list[slot_index + 1] + 1; + } + } +} + +static void +VciVPExecCase_head(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + int i; + + for (i = 0; i < VCI_MAX_FETCHING_ROWS; i++) + { + vpnode->itemValue[i] = (Datum) 0; + vpnode->itemIsNull[i] = true; + } + + 
memcpy(vpnode->skip_list, vpnode->data.init.orig_skip_list, sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); +} + +static void +VciVPExecCase_arg(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + vci_vp_exec_simple_copy(expr, vpnode, vpcontext, econtext, max_slots); +} + +static void +VciVPExecCase_cond(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list0 = vpnode->data.init.orig_skip_list; + uint16 *skip_list1 = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index0; + int slot_index1; + + int check_slot_index0 = 0; + int check_slot_index1 = 0; + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + memcpy(skip_list1, skip_list0, sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); + + for (slot_index0 = skip_list0[0], + slot_index1 = skip_list1[0]; + slot_index0 < max_slots; + slot_index0 += skip_list0[slot_index0 + 1] + 1, + slot_index1 += skip_list1[slot_index1 + 1] + 1) + { + Datum clause_value; + bool isnull; + + clause_value = arg_node->itemValue[slot_index0]; + isnull = arg_node->itemIsNull[slot_index0]; + + if (DatumGetBool(clause_value) && !isnull) + { + itemValue[slot_index0] = arg_node->itemValue[slot_index0]; + itemIsNull[slot_index0] = arg_node->itemIsNull[slot_index0]; + + skip_list0[check_slot_index0] += skip_list0[slot_index0 + 1] + 1; + check_slot_index1 = slot_index1 + 1; + } + else + { + check_slot_index0 = slot_index0 + 1; + skip_list1[check_slot_index1] += skip_list1[slot_index1 + 1] + 1; + } + } +} + +static void +VciVPExecCase_result(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + vci_vp_exec_simple_copy(expr, vpnode, vpcontext, econtext, max_slots); +} + +static void +VciVPExecCaseTest(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + /* Do nothging */ +} 
+ +static void +VciVPExecParamExec(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + bool first_eval_exec = false; + Datum paramValue; + bool paramIsNull; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + + if (!first_eval_exec) + { + paramValue = VciExecEvalParamExec_vp(vpnode, econtext, ¶mIsNull); + first_eval_exec = true; + } + + itemValue[slot_index] = paramValue; + itemIsNull[slot_index] = paramIsNull; + } +} + +static void +VciVPExecCoerceViaIO(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + VciVPNode *arg_node; + + FmgrInfo *outfunc = vpnode->data.iocoerce.finfo_out; /* lookup info for + * source output + * function */ + FmgrInfo *infunc = vpnode->data.iocoerce.finfo_in; /* lookup info for + * result input function */ + Oid typioparam = vpnode->data.iocoerce.typioparam; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + char *string; + Datum value; + bool isnull; + + value = arg_node->itemValue[slot_index]; + isnull = arg_node->itemIsNull[slot_index]; + + if (isnull) + string = NULL; + else + string = OutputFunctionCall(outfunc, value); + + value = InputFunctionCall(infunc, + string, + typioparam, + -1); + + itemValue[slot_index] = value; + itemIsNull[slot_index] = isnull; + } +} + +static void +VciVPExecVar(Expr *expression, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + /* Do nothging */ +} + +static void +VciVPExecConst(Expr *expression, VciVPNode *vpnode, 
VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + /* Do nothging */ +} + +static void +vci_vp_exec_simple_copy(Expr *expr, VciVPNode *vpnode, VciVPContext *vpcontext, ExprContext *econtext, int max_slots) +{ + uint16 *skip_list = vpnode->skip_list; + Datum *itemValue = vpnode->itemValue; + bool *itemIsNull = vpnode->itemIsNull; + int slot_index; + + VciVPNode *arg_node; + + arg_node = &vpcontext->itemNode[vpnode->arg_items[0]]; + + for (slot_index = skip_list[0]; + slot_index < max_slots; + slot_index += skip_list[slot_index + 1] + 1) + { + itemValue[slot_index] = arg_node->itemValue[slot_index]; + itemIsNull[slot_index] = arg_node->itemIsNull[slot_index]; + } +} + +/***************************************************************************** + * Vector processing setting function + *****************************************************************************/ + +VciVPContext * +VciBuildVectorProcessing(Expr *node, PlanState *parent, ExprContext *econtext, uint16 *skip_list) +{ + VciVPContext *vpcontext; + VciVPNode *lastNode; + + if (node == NULL) + return NULL; + + vpcontext = vci_create_vp_context(); + + traverse_expr_state_tree(node, parent, econtext, vpcontext, skip_list); + + lastNode = &vpcontext->itemNode[vpcontext->num_item - 1]; + + vpcontext->resultValue = lastNode->itemValue; + vpcontext->resultIsNull = lastNode->itemIsNull; + + return vpcontext; +} + +static VciVPContext * +vci_create_vp_context(void) +{ + const vci_vp_item_id max = 16; + VciVPContext *vpcontext; + + vpcontext = (VciVPContext *) palloc0(sizeof(VciVPContext)); + + vpcontext->num_item = 1; + vpcontext->max_item = max; + vpcontext->itemNode = (VciVPNode *) palloc0(sizeof(VciVPNode) * max); + + return vpcontext; +} + +static vci_vp_item_id +vci_add_vp_node(VciVPExecOp_func evalfunc, Expr *expr, VciVPContext *vpcontext, int len_args, vci_vp_item_id *arg_items, bool allocValueAndIsNull, uint16 *skip_list) +{ + vci_vp_item_id item; + VciVPNode *vpnode; + + item = 
vpcontext->num_item; + + if (vpcontext->num_item + 1 >= vpcontext->max_item) + { + vci_vp_item_id j; + VciVPNode *oldnodes = vpcontext->itemNode; + VciVPNode *newnodes = (VciVPNode *) palloc0(sizeof(VciVPNode) * vpcontext->max_item * 2); + + for (j = 1; j < vpcontext->max_item; j++) + newnodes[j] = oldnodes[j]; + + vpcontext->max_item *= 2; + vpcontext->itemNode = newnodes; + + pfree(oldnodes); + } + + vpnode = &vpcontext->itemNode[item]; + + vpnode->evalfunc = evalfunc; + vpnode->expr = expr; + vpnode->len_args = len_args; + + if (len_args > 0) + { + int i; + + vpnode->arg_items = (vci_vp_item_id *) palloc(sizeof(vci_vp_item_id) * len_args); + + for (i = 0; i < len_args; i++) + vpnode->arg_items[i] = arg_items[i]; + } + + if (allocValueAndIsNull) + { + vpnode->itemValue = palloc(sizeof(Datum) * VCI_MAX_FETCHING_ROWS); + vpnode->itemIsNull = palloc(sizeof(bool) * VCI_MAX_FETCHING_ROWS); + } + + vpnode->skip_list = skip_list; + + vpcontext->num_item++; + + return item; +} + +static vci_vp_item_id +vci_add_var_node(Var *variable, PlanState *parent, VciVPContext *vpcontext, uint16 *skip_list) +{ + vci_vp_item_id ret; + VciVPNode *vpnode; + + VciScanState *scanstate = vci_search_scan_state((VciPlanState *) parent); + + ret = vci_add_vp_node(VciVPExecVar, (Expr *) variable, vpcontext, 0, NULL, false, skip_list); + + vpnode = &vpcontext->itemNode[ret]; + + if (variable->varno == OUTER_VAR) + { + vpnode->itemValue = scanstate->result_values[variable->varattno - 1]; + vpnode->itemIsNull = scanstate->result_isnull[variable->varattno - 1]; + } + else + { + int index; + + index = scanstate->attr_map[variable->varattno] - 1; + + Assert(index >= 0); + Assert(index < scanstate->vector_set->num_columns); + + vpnode->itemValue = vci_CSGetValueAddrFromVirtualTuplesColumnwise(scanstate->vector_set, index); + vpnode->itemIsNull = vci_CSGetIsNullAddrFromVirtualTuplesColumnwise(scanstate->vector_set, index); + } + + return ret; +} +static vci_vp_item_id +vci_add_param_node(Param 
*param, PlanState *parent, VciVPContext *vpcontext, uint16 *skip_list) +{ + vci_vp_item_id ret; + VciVPNode *vpnode; + + ret = vci_add_vp_node((VciVPExecOp_func) VciVPExecParamExec, (Expr *) param, vpcontext, 0, NULL, true, skip_list); + + vpnode = &vpcontext->itemNode[ret]; + + vpnode->data.param.paramid = param->paramid; + vpnode->data.param.paramtype = param->paramtype; + vpnode->data.param.vci_parent_plan = parent->plan; + + return ret; +} +static vci_vp_item_id +vci_add_const_node(Const *con, VciVPContext *vpcontext, uint16 *skip_list) +{ + vci_vp_item_id ret; + int i; + VciVPNode *vpnode; + Datum *itemValue; + bool *itemIsNull; + + ret = vci_add_vp_node(VciVPExecConst, (Expr *) con, vpcontext, 0, NULL, true, skip_list); + + vpnode = &vpcontext->itemNode[ret]; + + itemValue = vpnode->itemValue; + itemIsNull = vpnode->itemIsNull; + + for (i = 0; i < VCI_MAX_FETCHING_ROWS; i++) + { + itemValue[i] = con->constvalue; + itemIsNull[i] = con->constisnull; + } + + return ret; +} +static vci_vp_item_id +vci_add_func_expr_node(Expr *expr, VciVPContext *vpcontext, FuncExprinfo *funcinfo, PlanState *parent, ExprContext *econtext, uint16 *skip_list) +{ + + vci_vp_item_id result; + int i; + int len_args = 0; + vci_vp_item_id *arg_items; + ListCell *l; + + len_args = list_length(funcinfo->args); + if (len_args > 0) + arg_items = (vci_vp_item_id *) palloc(sizeof(vci_vp_item_id) * len_args); + else + arg_items = NULL; + + i = 0; + foreach(l, funcinfo->args) + { + Expr *arg = (Expr *) lfirst(l); + + arg_items[i] = traverse_expr_state_tree(arg, parent, econtext, vpcontext, skip_list); + i++; + } + + /* + * pgstat_init_function_usage() + */ + + if (pgstat_track_functions <= funcinfo->fcinfo_data->flinfo->fn_stats) + { + switch (list_length(funcinfo->args)) + { + case 0: + result = vci_add_vp_node((VciVPExecOp_func) VciVPExecFunc_arg0, + expr, vpcontext, len_args, arg_items, true, skip_list); + goto func_expr_state_done; + + case 1: + result = vci_add_vp_node((VciVPExecOp_func) 
VciVPExecFunc_arg1, + expr, vpcontext, len_args, arg_items, true, skip_list); + goto func_expr_state_done; + + case 2: + result = vci_add_vp_node((VciVPExecOp_func) VciVPExecFunc_arg2, + expr, vpcontext, len_args, arg_items, true, skip_list); + goto func_expr_state_done; + + default: + break; + } + } + + result = vci_add_vp_node((VciVPExecOp_func) VciVPExecFunc, + expr, vpcontext, len_args, arg_items, true, skip_list); + +func_expr_state_done: + { + VciVPNode *vpnode; + + vpnode = &vpcontext->itemNode[result]; + vpnode->data.func.finfo = funcinfo->finfo; + vpnode->data.func.fcinfo_data = funcinfo->fcinfo_data; + vpnode->data.func.fn_addr = funcinfo->fn_addr; + vpnode->data.func.nargs = funcinfo->nargs; + + if (arg_items) + pfree(arg_items); + } + return result; +} +static vci_vp_item_id +vci_add_control_nodes(VciVPExecOp_func head_func, VciVPExecOp_func next_func, List *args, + Expr *expr, PlanState *parent, ExprContext *econtext, VciVPContext *vpcontext, uint16 *skip_list) +{ + vci_vp_item_id ret; + ListCell *l; + Datum *itemValue = palloc(sizeof(Datum) * VCI_MAX_FETCHING_ROWS); + bool *itemIsNull = palloc(sizeof(bool) * VCI_MAX_FETCHING_ROWS); + uint16 *inner_skip_list = palloc(sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); + VciVPNode *head_node; + + ret = vci_add_vp_node(head_func, expr, vpcontext, 0, NULL, false, inner_skip_list); + + head_node = &vpcontext->itemNode[ret]; + head_node->itemValue = itemValue; + head_node->itemIsNull = itemIsNull; + head_node->data.init.orig_skip_list = skip_list; + + foreach(l, args) + { + Expr *arg = (Expr *) lfirst(l); + vci_vp_item_id next_item; + VciVPNode *next_node; + + next_item = traverse_expr_state_tree(arg, parent, econtext, vpcontext, inner_skip_list); + ret = vci_add_vp_node(next_func, expr, vpcontext, 1, &next_item, false, inner_skip_list); + + next_node = &vpcontext->itemNode[ret]; + next_node->itemValue = itemValue; + next_node->itemIsNull = itemIsNull; + } + + return ret; +} + +static vci_vp_item_id 
+traverse_expr_state_tree(Expr *node, PlanState *parent, ExprContext *econtext, VciVPContext *vpcontext, uint16 *skip_list) +{ + + if (node == NULL) + return 0; + + /* Guard against stack overflow due to overly complex expressions */ + check_stack_depth(); + + if (IsA(node, List)) + { + int num_args = list_length((List *) node); + + if (num_args > 1) + return vci_add_control_nodes(VciVPExecAnd_head, VciVPExecAnd_nullasfalse_next, (List *) node, + node, parent, econtext, vpcontext, skip_list); + else if (num_args == 1) + return traverse_expr_state_tree(linitial((List *) node), parent, econtext, vpcontext, skip_list); + } + + switch (nodeTag(node)) + { + case T_Var: + + /* + * Assert(state->evalfunc == (ExprStateEvalFunc) + * VciExecEvalScalarVarFromColumnStore); + * + * If execinitexpr for qual is decided to be not needed, then this + * assertion also becomes invalid + */ + + return vci_add_var_node((Var *) node, parent, vpcontext, skip_list); + + case T_Const: + return vci_add_const_node((Const *) node, vpcontext, skip_list); + + case T_Param: + return vci_add_param_node((Param *) node, parent, vpcontext, skip_list); + + /* + * return vci_add_vp_node((VciVPExecOp_func) VciVPExecParamExec, + * node, vpcontext, 0, NULL, true, skip_list); + */ + + case T_Aggref: + /* LCOV_EXCL_START */ + elog(ERROR, "Aggref should not be targeted by vector processing"); + node = NULL; + break; + /* LCOV_EXCL_STOP */ + + case T_OpExpr: + { + OpExpr *op = (OpExpr *) node; + FuncExprinfo *funcinfo = palloc0(sizeof(struct FuncExprinfo)); /* will be freed as part + * of query context free */ + + VciVPExecInitFunc(node, op->args, op->opfuncid, op->inputcollid, parent, funcinfo); + return vci_add_func_expr_node(node, vpcontext, funcinfo, parent, econtext, skip_list); + } + + case T_FuncExpr: + { + FuncExpr *func = (FuncExpr *) node; + FuncExprinfo *funcinfo = palloc0(sizeof(struct FuncExprinfo)); /* will be freed as part + * of query context free */ + + VciVPExecInitFunc(node, func->args, 
func->funcid, func->inputcollid, parent, funcinfo); + return vci_add_func_expr_node(node, vpcontext, funcinfo, parent, econtext, skip_list); + } + + case T_DistinctExpr: + { + DistinctExpr *op = (DistinctExpr *) node; + FuncExprinfo *funcinfo = palloc0(sizeof(struct FuncExprinfo)); /* will be freed as part + * of query context free */ + vci_vp_item_id result; + VciVPNode *vpnode; + + vci_vp_item_id arg_items[2]; + + /* + * Not required as this was the value always set earlier in + * execinitexpr stage + */ + + VciVPExecInitFunc(node, op->args, op->opfuncid, op->inputcollid, parent, funcinfo); + + Assert(list_length(funcinfo->args) == 2); + + arg_items[0] = traverse_expr_state_tree(list_nth(funcinfo->args, 0), parent, econtext, vpcontext, skip_list); + arg_items[1] = traverse_expr_state_tree(list_nth(funcinfo->args, 1), parent, econtext, vpcontext, skip_list); + + result = vci_add_vp_node((VciVPExecOp_func) VciVPExecDistinctExpr, + node, vpcontext, 2, arg_items, true, skip_list); + + vpnode = &vpcontext->itemNode[result]; + vpnode->data.func.finfo = funcinfo->finfo; + vpnode->data.func.fcinfo_data = funcinfo->fcinfo_data; + vpnode->data.func.fn_addr = funcinfo->fn_addr; + vpnode->data.func.nargs = funcinfo->nargs; + + return result; + } + + case T_NullIfExpr: + { + NullIfExpr *op = (NullIfExpr *) node; + vci_vp_item_id arg_items[2]; + FuncExprinfo *funcinfo = palloc0(sizeof(struct FuncExprinfo)); /* will be freed as part + * of query context free */ + vci_vp_item_id result; + VciVPNode *vpnode; + + /* + * Not required as this was the value always set earlier in + * execinitexpr stage + */ + VciVPExecInitFunc(node, op->args, op->opfuncid, op->inputcollid, parent, funcinfo); + + Assert(list_length(funcinfo->args) == 2); + + arg_items[0] = traverse_expr_state_tree(list_nth(funcinfo->args, 0), parent, econtext, vpcontext, skip_list); + arg_items[1] = traverse_expr_state_tree(list_nth(funcinfo->args, 1), parent, econtext, vpcontext, skip_list); + + result = 
vci_add_vp_node((VciVPExecOp_func) VciVPExecNullIfExpr, + node, vpcontext, 2, arg_items, true, skip_list); + + vpnode = &vpcontext->itemNode[result]; + vpnode->data.func.finfo = funcinfo->finfo; + vpnode->data.func.fcinfo_data = funcinfo->fcinfo_data; + vpnode->data.func.fn_addr = funcinfo->fn_addr; + vpnode->data.func.nargs = funcinfo->nargs; + + return result; + } + + case T_ScalarArrayOpExpr: + { + ScalarArrayOpExpr *op = (ScalarArrayOpExpr *) node; + vci_vp_item_id arg_items[2]; + FuncExprinfo *funcinfo = palloc0(sizeof(struct FuncExprinfo)); /* will be freed as part + * of query context free */ + vci_vp_item_id result; + VciVPNode *vpnode; + + /* + * Not required as this was the value always set earlier in + * execinitexpr stage + */ + VciVPExecInitFunc(node, op->args, op->opfuncid, op->inputcollid, parent, funcinfo); + + Assert(list_length(funcinfo->args) == 2); + + arg_items[0] = traverse_expr_state_tree(list_nth(funcinfo->args, 0), parent, econtext, vpcontext, skip_list); + arg_items[1] = traverse_expr_state_tree(list_nth(funcinfo->args, 1), parent, econtext, vpcontext, skip_list); + + if (OidIsValid(op->hashfuncid)) + { + FmgrInfo *hash_finfo = palloc0(sizeof(FmgrInfo)); + FunctionCallInfo hash_fcinfo = palloc0(SizeForFunctionCallInfo(1)); + + fmgr_info(op->hashfuncid, hash_finfo); + fmgr_info_set_expr((Node *) node, hash_finfo); + InitFunctionCallInfoData(*hash_fcinfo, hash_finfo, + 1, op->inputcollid, NULL, + NULL); + + result = vci_add_vp_node((VciVPExecOp_func) VciVPExecHashedScalarArrayOpExpr, + node, vpcontext, 2, arg_items, true, skip_list); + + vpnode = &vpcontext->itemNode[result]; + vpnode->data.hashedscalararrayop.finfo = funcinfo->finfo; + vpnode->data.hashedscalararrayop.fcinfo_data = funcinfo->fcinfo_data; + vpnode->data.hashedscalararrayop.fn_addr = funcinfo->fn_addr; + + vpnode->data.hashedscalararrayop.hash_finfo = funcinfo->finfo; + vpnode->data.hashedscalararrayop.hash_fcinfo_data = funcinfo->fcinfo_data; + 
vpnode->data.hashedscalararrayop.hash_fn_addr = funcinfo->fn_addr; + } + else + { + result = vci_add_vp_node((VciVPExecOp_func) VciVPExecScalarArrayOpExpr, + node, vpcontext, 2, arg_items, true, skip_list); + + vpnode = &vpcontext->itemNode[result]; + vpnode->data.scalararrayop.element_type = InvalidOid; + vpnode->data.scalararrayop.useOr = op->useOr; + vpnode->data.scalararrayop.finfo = funcinfo->finfo; + vpnode->data.scalararrayop.fcinfo_data = funcinfo->fcinfo_data; + vpnode->data.scalararrayop.fn_addr = funcinfo->fn_addr; + } + + return result; + } + + case T_RelabelType: + { + RelabelType *relabel = (RelabelType *) node; + + return traverse_expr_state_tree(relabel->arg, parent, econtext, vpcontext, skip_list); + } + case T_NullTest: + { + vci_vp_item_id ret; + NullTest *ntest = (NullTest *) node; + + ret = traverse_expr_state_tree(ntest->arg, parent, econtext, vpcontext, skip_list); + return vci_add_vp_node((VciVPExecOp_func) VciVPExecNullTest, + node, vpcontext, 1, &ret, true, skip_list); + } + + case T_BooleanTest: + { + BooleanTest *booltest = (BooleanTest *) node; + vci_vp_item_id ret; + + ret = traverse_expr_state_tree(booltest->arg, parent, econtext, vpcontext, skip_list); + return vci_add_vp_node((VciVPExecOp_func) VciVPExecBooleanTest, + node, vpcontext, 1, &ret, true, skip_list); + } + + case T_BoolExpr: + { + BoolExpr *boolexpr = (BoolExpr *) node; + vci_vp_item_id arg_item; + + switch (boolexpr->boolop) + { + case AND_EXPR: + return vci_add_control_nodes(VciVPExecAnd_head, VciVPExecAnd_next, boolexpr->args, + node, parent, econtext, vpcontext, skip_list); + break; + + case OR_EXPR: + return vci_add_control_nodes(VciVPExecOr_head, VciVPExecOr_next, boolexpr->args, + node, parent, econtext, vpcontext, skip_list); + break; + + case NOT_EXPR: + arg_item = traverse_expr_state_tree((Expr *) linitial(boolexpr->args), parent, econtext, vpcontext, skip_list); + return vci_add_vp_node((VciVPExecOp_func) VciVPExecNot, + node, vpcontext, 1, &arg_item, true, 
skip_list); + + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unrecognized boolop: %d", + (int) boolexpr->boolop); + break; + /* LCOV_EXCL_STOP */ + } + } + break; + + case T_MinMaxExpr: + { + MinMaxExpr *minmaxexpr = (MinMaxExpr *) node; + + return vci_add_control_nodes(VciVPExecMinMax_head, VciVPExecMinMax_next, minmaxexpr->args, + node, parent, econtext, vpcontext, skip_list); + } + + case T_CoalesceExpr: + { + CoalesceExpr *cexpr = (CoalesceExpr *) node; + + return vci_add_control_nodes(VciVPExecCoalesce_head, VciVPExecCoalesce_next, cexpr->args, + node, parent, econtext, vpcontext, skip_list); + } + + case T_CoerceViaIO: + { + CoerceViaIO *coerceViaIOexpr = (CoerceViaIO *) node; + vci_vp_item_id ret; + VciVPNode *vpnode; + Oid iofunc; + Oid typioparam; + bool typisvarlena; + FmgrInfo *finfo_out = palloc0(sizeof(struct FmgrInfo)); /* will be freed as part + * of query context free */ + FmgrInfo *finfo_in = palloc0(sizeof(struct FmgrInfo)); /* will be freed as part + * of query context free */ + + ret = traverse_expr_state_tree(coerceViaIOexpr->arg, parent, econtext, vpcontext, skip_list); + + ret = vci_add_vp_node((VciVPExecOp_func) VciVPExecCoerceViaIO, + node, vpcontext, 1, &ret, true, skip_list); + getTypeOutputInfo(exprType((Node *) coerceViaIOexpr->arg), + &iofunc, &typisvarlena); + fmgr_info(iofunc, finfo_out); + fmgr_info_set_expr((Node *) node, finfo_out); + + getTypeInputInfo(coerceViaIOexpr->resulttype, + &iofunc, &typioparam); + fmgr_info(iofunc, finfo_in); + fmgr_info_set_expr((Node *) node, finfo_in); + + vpnode = &vpcontext->itemNode[ret]; + vpnode->data.iocoerce.finfo_out = finfo_out; + vpnode->data.iocoerce.finfo_in = finfo_in; + vpnode->data.iocoerce.typioparam = typioparam; + + return ret; + } + + case T_CaseExpr: + { + CaseExpr *caseExpr = (CaseExpr *) node; + vci_vp_item_id head, + ret = 0, + save_caseValue; + ListCell *lc; + Datum *itemValue = palloc(sizeof(Datum) * VCI_MAX_FETCHING_ROWS); + bool *itemIsNull = palloc(sizeof(bool) * 
VCI_MAX_FETCHING_ROWS); + Datum *caseValue = palloc(sizeof(Datum) * VCI_MAX_FETCHING_ROWS); + bool *caseIsNull = palloc(sizeof(bool) * VCI_MAX_FETCHING_ROWS); + uint16 *case_whole_skip_list = palloc(sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); + VciVPNode *arg_node; + + head = vci_add_vp_node(VciVPExecCase_head, node, vpcontext, 0, NULL, false, case_whole_skip_list); + + arg_node = &vpcontext->itemNode[head]; + arg_node->itemValue = caseValue; + arg_node->itemIsNull = caseIsNull; + arg_node->data.init.orig_skip_list = skip_list; + + save_caseValue = vpcontext->caseValue; + vpcontext->caseValue = head; + + if (caseExpr->arg) + { + vci_vp_item_id arg_item; + + arg_item = traverse_expr_state_tree(caseExpr->arg, parent, econtext, vpcontext, case_whole_skip_list); + ret = vci_add_vp_node(VciVPExecCase_arg, node, vpcontext, 1, &arg_item, false, case_whole_skip_list); + arg_node = &vpcontext->itemNode[ret]; + arg_node->itemValue = caseValue; + arg_node->itemIsNull = caseIsNull; + } + + foreach(lc, caseExpr->args) + { + CaseWhen *when = lfirst(lc); + vci_vp_item_id arg_item; + uint16 *each_case_skip_list = palloc(sizeof(uint16) * VCI_MAX_SKIP_LIST_SLOTS); + + /* WHEN evaluation */ + + arg_item = traverse_expr_state_tree(when->expr, parent, econtext, vpcontext, case_whole_skip_list); + ret = vci_add_vp_node(VciVPExecCase_cond, node, vpcontext, 1, &arg_item, false, each_case_skip_list); + + arg_node = &vpcontext->itemNode[ret]; + arg_node->itemValue = caseValue; + arg_node->itemIsNull = caseIsNull; + arg_node->data.init.orig_skip_list = case_whole_skip_list; + + vpcontext->caseValue = save_caseValue; + + /* THEN evaluation */ + + arg_item = traverse_expr_state_tree(when->result, parent, econtext, vpcontext, each_case_skip_list); + ret = vci_add_vp_node(VciVPExecCase_result, node, vpcontext, 1, &arg_item, false, each_case_skip_list); + + arg_node = &vpcontext->itemNode[ret]; + arg_node->itemValue = itemValue; + arg_node->itemIsNull = itemIsNull; + + save_caseValue = 
vpcontext->caseValue; + vpcontext->caseValue = head; + } + + vpcontext->caseValue = save_caseValue; + + if (caseExpr->defresult) + { + /* ELSE evaluation */ + vci_vp_item_id arg_item; + vci_vp_item_id save_caseValue_defresult; + + save_caseValue_defresult = vpcontext->caseValue; + + arg_item = traverse_expr_state_tree(caseExpr->defresult, parent, econtext, vpcontext, case_whole_skip_list); + ret = vci_add_vp_node(VciVPExecCase_result, node, vpcontext, 1, &arg_item, false, case_whole_skip_list); + + arg_node = &vpcontext->itemNode[ret]; + arg_node->itemValue = itemValue; + arg_node->itemIsNull = itemIsNull; + + vpcontext->caseValue = save_caseValue_defresult; + } + + Assert(ret > 0); + + return ret; + } + + case T_CaseTestExpr: + { + vci_vp_item_id ret; + VciVPNode *arg_node, + *caseValue_node; + + ret = vci_add_vp_node(VciVPExecCaseTest, node, vpcontext, 0, NULL, false, skip_list); + + caseValue_node = &vpcontext->itemNode[vpcontext->caseValue]; + + arg_node = &vpcontext->itemNode[ret]; + arg_node->itemValue = caseValue_node->itemValue; + arg_node->itemIsNull = caseValue_node->itemIsNull; + + return ret; + } + + case T_List: + case T_TargetEntry: + /* LCOV_EXCL_START */ + Assert(0); + break; + /* LCOV_EXCL_STOP */ + + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unrecognized node type: %s(%d)", + VciGetNodeName(nodeTag(node)), (int) nodeTag(node)); + node = NULL; /* keep compiler quiet */ + break; + /* LCOV_EXCL_STOP */ + } + + Assert(0); + return 0; +} +static +void +VciVPExecInitFunc(Expr *node, List *args, Oid funcid, Oid inputcollid, PlanState *parent, FuncExprinfo *funcinfo) +{ + int nargs = list_length(args); + AclResult aclresult; + FmgrInfo *flinfo; + FunctionCallInfo fcinfo; + + /* Check permission to call function */ + aclresult = object_aclcheck(ProcedureRelationId, funcid, GetUserId(), ACL_EXECUTE); + if (aclresult != ACLCHECK_OK) + aclcheck_error(aclresult, OBJECT_FUNCTION, get_func_name(funcid)); + InvokeFunctionExecuteHook(funcid); + + /* + * 
Safety check on nargs. Under normal circumstances this should never + * fail, as parser should check sooner. But possibly it might fail if + * server has been compiled with FUNC_MAX_ARGS smaller than some functions + * declared in pg_proc? + */ + if (nargs > FUNC_MAX_ARGS) + ereport(ERROR, + (errcode(ERRCODE_TOO_MANY_ARGUMENTS), + errmsg_plural("cannot pass more than %d argument to a function", + "cannot pass more than %d arguments to a function", + FUNC_MAX_ARGS, + FUNC_MAX_ARGS))); + + /* Allocate function lookup data and parameter workspace for this call */ + funcinfo->finfo = palloc0(sizeof(FmgrInfo)); + funcinfo->fcinfo_data = palloc0(SizeForFunctionCallInfo(nargs)); + flinfo = funcinfo->finfo; + fcinfo = funcinfo->fcinfo_data; + + /* Set up the primary fmgr lookup information */ + fmgr_info(funcid, flinfo); + fmgr_info_set_expr((Node *) node, flinfo); + + /* Initialize function call parameter structure too */ + InitFunctionCallInfoData(*fcinfo, flinfo, + nargs, inputcollid, NULL, NULL); + + /* Keep extra copies of this info to save an indirection at runtime */ + funcinfo->fn_addr = flinfo->fn_addr; + funcinfo->nargs = nargs; + + funcinfo->args = args; + funcinfo->funcid = funcid; + funcinfo->inputcollid = inputcollid; + + Assert(!flinfo->fn_retset); + +} + +Datum +VciExecEvalParamExec_vp(VciVPNode *vpnode, ExprContext *econtext, + bool *isNull) +{ + int thisParamId = vpnode->data.param.paramid; + ParamExecData *prm; + + /* + * PARAM_EXEC params (internal executor parameters) are stored in the + * ecxt_param_exec_vals array, and can be accessed by array index. + */ + prm = &(econtext->ecxt_param_exec_vals[thisParamId]); + + if (prm->execPlan != NULL) + { + /* Parameter not evaluated yet, so go do it */ + ExecSetParamPlan(prm->execPlan, econtext); + /* ExecSetParamPlan should have processed this param... 
*/ + Assert(prm->execPlan == NULL); + } + + *isNull = prm->isnull; + return prm->value; +} diff --git a/contrib/vci/expected/bugs.out b/contrib/vci/expected/bugs.out new file mode 100644 index 000000000000..be0f567ec57b --- /dev/null +++ b/contrib/vci/expected/bugs.out @@ -0,0 +1,149 @@ +-- Bug reported by Japin Li that caused a vci_beginscan PANIC +-- See https://www.postgresql.org/message-id/ME0P300MB04457E24CA8965F008FB2CDBB648A%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM +CREATE TABLE t1 (id int, info text); +CREATE INDEX t1_id_idx ON t1 USING vci (id); +INSERT INTO t1 SELECT id, md5(id::text) FROM generate_series(1, 1000) id; +SET enable_seqscan TO off; +SELECT * FROM t1 WHERE id = 100; + id | info +-----+---------------------------------- + 100 | f899139df5e1059396431415e770c6dd +(1 row) + +DROP TABLE t1; +-- Bug reported by Japin Li that VACUUM caused a TRAP +-- See https://www.postgresql.org/message-id/SY8P300MB0442BEC3F5CF432F0121ACC4B642A%40SY8P300MB0442.AUSP300.PROD.OUTLOOK.COM +CREATE TABLE t2 (id int, info text) WITH (autovacuum_enabled = off); +CREATE INDEX t2_id_idx ON t2 USING vci (id); +INSERT INTO t2 SELECT id, 'test' || id FROM generate_series(1, 1000) id; +DELETE FROM t2 WHERE id % 10 = 0; +VACUUM t2; +DROP TABLE t2; +-- Bug reported by Japin Li that caused a Segmentation Violation attempting to REFRESH a VCI internal relation +-- See https://www.postgresql.org/message-id/ME0P300MB0445EBA04D6947DD717074DFB65CA%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM +CREATE TABLE t3 (id int, info text); +CREATE INDEX ON t3 USING vci (id); +SELECT relname FROM pg_class WHERE relname ~ '^pg_vci_*' LIMIT 1 \gset +SELECT * FROM :relname; + bindata +--------- + +(1 row) + +\d+ :relname + Materialized view "public.pg_vci_0000016582_00000_d" + Column | Type | Collation | Nullable | Default | Storage | Stats target | Description +---------+-------+-----------+----------+---------+---------+--------------+------------- + bindata | bytea | | | | plain | | +View definition: 
+ + +REFRESH MATERIALIZED VIEW :relname; +ERROR: extension "vci" prohibits this operation on view "pg_vci_0000016582_00000_d" +DROP TABLE t3; +-- Bug missing logic. Ensure VCI internal relations get removed when the TABLE is dropped. +CREATE TABLE t4 (id int, info text); +CREATE INDEX t4_idx ON t4 USING vci (id); +SELECT relname FROM pg_class WHERE relname ~ '^pg_vci_*' ORDER BY relname; + relname +--------------------------- + pg_vci_0000016602_00000_d + pg_vci_0000016602_00000_m + pg_vci_0000016602_65526_d + pg_vci_0000016602_65527_d + pg_vci_0000016602_65530_0 + pg_vci_0000016602_65530_1 + pg_vci_0000016602_65531_d + pg_vci_0000016602_65531_m + pg_vci_0000016602_65533_d + pg_vci_0000016602_65533_m + pg_vci_0000016602_65534_d + pg_vci_0000016602_65534_m + pg_vci_0000016602_65535_d + pg_vci_0000016602_65535_m +(14 rows) + +DROP TABLE t4; +SELECT relname FROM pg_class WHERE relname ~ '^pg_vci_*'; + relname +--------- +(0 rows) + +-- Bug reported by Japin Li that REINDEX forgot to restore security context +-- See https://www.postgresql.org/message-id/ME0P300MB0445827B6E9CC04E0FAEE446B624A%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM +CREATE TABLE t5 (id int, info text); +CREATE INDEX t5_idx ON t5 USING vci (id); +REINDEX TABLE t5; +REINDEX TABLE t5; +DROP TABLE t5; +-- InstrStartNode bug: +-- Unexpected error "InstrStartNode called twice in a row" +-- NOTE -Change the EXPLAIN below to use TIMING TRUE reproduce the bug, +-- otherwise leave it FALSE so timings don't cause 'make check' to fail. 
+CREATE TABLE t6(id int, info text); +CREATE INDEX t6_id_idx ON t6 USING vci (id); +INSERT INTO t6 SELECT id, 'info' || id FROM generate_series(1, 500) id; +ANALYZE t6; +EXPLAIN (ANALYZE, COSTS FALSE, BUFFERS FALSE, TIMING FALSE, SUMMARY FALSE) SELECT max(id) FROM t6; + QUERY PLAN +---------------------------------------------------------------------------------- + Custom Scan (VCI Aggregate) (actual rows=1.00 loops=1) + Disabled: true + -> Custom Scan (VCI Scan) using t6_id_idx on t6 (actual rows=1000.00 loops=1) + Disabled: true +(4 rows) + +DROP TABLE t6; +-- Bug reported by Timur: VCI Sort does not work on top of a non-VCI join +-- See https://www.postgresql.org/message-id/a27f68845af78d404459fcab940bfae2ec7755e5.camel%40postgrespro.ru +CREATE TABLE main (id BIGSERIAL PRIMARY KEY); +CREATE TABLE secondary (id BIGSERIAL PRIMARY KEY, main_id BIGINT REFERENCES main (id), val INTEGER); +CREATE INDEX main_vci ON main USING vci (id); +CREATE INDEX sec_vci ON secondary USING vci (id, main_id, val); +-- Check VCI Sort is not put on top of non-VCI join +EXPLAIN (ANALYZE, COSTS FALSE, BUFFERS FALSE, TIMING FALSE, SUMMARY FALSE) +SELECT * + FROM main m + JOIN secondary s + ON m.id = s.main_id + WHERE s.val in ( + SELECT MAX(val) + FROM secondary s2 + WHERE s2.main_id = m.id) + ORDER BY s.val; + QUERY PLAN +-------------------------------------------------------------------------------------------------- + Sort (actual rows=0.00 loops=1) + Sort Key: s.val + Sort Method: quicksort Memory: 25kB + -> Nested Loop (actual rows=0.00 loops=1) + Join Filter: (s.val = (max(s2.val))) + -> Hash Join (actual rows=0.00 loops=1) + Hash Cond: (s.main_id = m.id) + -> Custom Scan (VCI Scan) using sec_vci on secondary s (actual rows=0.00 loops=1) + Disabled: true + -> Hash (never executed) + -> Index Only Scan using main_pkey on main m (never executed) + Heap Fetches: 0 + Index Searches: 0 + -> Custom Scan (VCI Aggregate) (never executed) + Disabled: true + -> Custom Scan (VCI Scan) using 
sec_vci on secondary s2 (never executed) + Disabled: true + Filter: (main_id = m.id) +(18 rows) + +-- Check VCI Sort is used if suitable +EXPLAIN (ANALYZE, COSTS FALSE, BUFFERS FALSE, TIMING FALSE, SUMMARY FALSE) +SELECT * FROM secondary s ORDER BY s.val; + QUERY PLAN +-------------------------------------------------------------------------------------- + Custom Scan (VCI Sort) (actual rows=0.00 loops=1) + Sort Key: val + Sort Method: quicksort Memory: 25kB + -> Custom Scan (VCI Scan) using sec_vci on secondary s (actual rows=0.00 loops=1) + Disabled: true +(5 rows) + +DROP TABLE secondary; +DROP TABLE main; diff --git a/contrib/vci/expected/vci.out b/contrib/vci/expected/vci.out new file mode 100644 index 000000000000..d5e55b0b7192 --- /dev/null +++ b/contrib/vci/expected/vci.out @@ -0,0 +1,128 @@ +CREATE EXTENSION vci; +SELECT amname, amhandler, amtype FROM pg_am WHERE amname = 'vci'; + amname | amhandler | amtype +--------+-------------+-------- + vci | vci_handler | i +(1 row) + +SET vci.table_rows_threshold = 0; +CREATE TABLE testtable ( + key int, + cond int, + c01a bool, + c01b bool, + c02 bytea, + c03 "char", + c05 int8, + c06 int2, + c07 int4, + c08 text, + c09 float4, + c10 float8, + c13 interval, + c15 money, + c16 bpchar, + c17 varchar, + c18 date, + c19 time, + c20 timetz, + c21 timestamp, + c22 timestamptz, + c23a bit, + c23b bit, + c24a varbit, + c24b varbit, + c25 numeric, + c26 uuid); +-- Input data +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) +SELECT + i % 10, -- key int + i % 21.000000000000, -- cond int + (i % 3.000000000000) > 0, -- c01a bool + (i % 11.000000000000) > 0, -- c01b bool + CAST(to_char((i % 1001.000000000000), '9999') AS bytea), -- c02 bytea + CAST(CAST((i % 249.000000000000) AS character varying) AS "char"), -- c03 "char" + i % 651.000000000000 + i % 350, -- c05 int8 + i % 1001.000000000000, -- c06 in2 + i % 
1001.000000000000, -- c07 int4 + i % 1001.000000000000, -- c08 text + i % 1001.000000000000, -- c09 float4 + i % 1001.000000000000, -- c10 float8 + i % 1001.000000000000 * interval '1h', -- c13 interval + (i % 1001.000000000000)::integer::money, -- c15 money + i % 1001.000000000000, -- c16 bpchar + i % 1001.000000000000, -- c17 varchar + date '2015-12-21' + 1 % 1001.000000000000 * interval '1d', -- c18 date + TIMESTAMP '2015-12-21' + (i % 1001.000000000000) * interval '1h', -- c19 time + TIMESTAMP WITH TIME ZONE '2015-12-21 10:00:00+09' + (i % 1001.000000000000) * interval '1h', -- c20 timetz + TIMESTAMP '2015-12-21' + (i % 1001.000000000000) * interval '1h', -- c21 timestamp + TIMESTAMP WITH TIME ZONE '2015-12-21 10:00:00+09' + (i % 1001.000000000000) * interval '1h', -- c22 timestamptz + ((i % 3.000000000000)>0)::integer::bit(1), -- c23a bit + ((i % 11.000000000000)>0)::integer::bit(1), -- c23b bit + (i % 1001.000000000000)::integer::bit(10), -- c24a varbit + (i % 999.000000000000)::integer::bit(10), -- c24b varbit + i % 1001.000000000000, -- c25 numeric + 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'::uuid -- c26 uuid +FROM generate_series(1, 10000) AS i; +-- Testcase: insert with some NULL values +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) SELECT i, 1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM generate_series(1, 9) AS i; +-- Testcase: insert with some special values +INSERT INTO testtable (key, cond, c09, c10, c18, c21, c22) VALUES (7, 1, 'Infinity', 'Infinity', 'Infinity', 'infinity', 'infinity'); +INSERT INTO testtable (key, cond, c09, c10, c18, c21, c22) VALUES (8, 1, 'Infinity', 'Infinity', 'Infinity', 'infinity', 'infinity'); +INSERT INTO testtable (key, cond, c09, c10, c18, c21, c22) VALUES (8, 1, '-Infinity', '-Infinity', 
'-Infinity', '-infinity', '-infinity'); +INSERT INTO testtable (key, cond, c09, c10) VALUES (9, 1, 'NaN', 'NaN'); +-- Testcase: NaN only +INSERT INTO testtable (key, cond, c09, c10) VALUES (10, 1, 'NaN', 'NaN'); +INSERT INTO testtable (key, cond, c09, c10) VALUES (10, 1, 'NaN', 'NaN'); +INSERT INTO testtable (key, cond, c05) VALUES (10, 1, 1); +-- Testcase: Timestamp with timezone +INSERT INTO testtable (key, cond, c18, c22) VALUES (11, 1, TIMESTAMP WITH TIME ZONE '2004-10-19 01:00:00+01', TIMESTAMP WITH TIME ZONE '2004-10-19 02:00:00+01'); +INSERT INTO testtable (key, cond, c18, c22) VALUES (11, 1, TIMESTAMP WITH TIME ZONE '2004-10-19 02:00:00+02', TIMESTAMP WITH TIME ZONE '2004-10-19 02:00:00+02'); +INSERT INTO testtable (key, cond, c18, c22) VALUES (11, 1, TIMESTAMP WITH TIME ZONE '2004-10-19 01:00:00+02', TIMESTAMP WITH TIME ZONE '2004-10-19 01:00:00+02'); +INSERT INTO testtable (key, cond, c05) VALUES (11, 1, 1); +-- Testcase: few attributes are valid +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) SELECT 98, 1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM generate_series(1, 50) AS i; +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25) VALUES (98, 1, true, true, 'text', 1::char, 1, 1, 1, 'text', 1.0, 1.0, 1, 'text', 'text', timestamp '2015-12-22', timestamp '2015-12-22', timestamp with time zone '2015-12-22 10:23:54+02', timestamp '2015-12-22', timestamp with time zone '2015-12-22 10:23:54+02', 1::bit(1), 1::bit(1), 1::bit(10), 1::bit(10), 1); +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) SELECT 98, 1, NULL, NULL, 
NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM generate_series(1, 50) AS i; +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) SELECT 99, 1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM generate_series(1, 100) AS i; +-- Create an index which uses VCI index access method +CREATE INDEX testindex ON testtable USING vci (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26); +-- We expect VCI plans are chosen here +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF, BUFFERS OFF) +SELECT key, count(*) AS count_star, count(c05) AS count_c05 FROM testtable WHERE NOT cond = 0 GROUP BY key ORDER BY key; + QUERY PLAN +------------------------------------------------------------------------------------------------ + Sort (actual rows=14.00 loops=1) + Sort Key: key + Sort Method: quicksort Memory: 25kB + -> Custom Scan (VCI HashAggregate) (actual rows=14.00 loops=1) + Group Key: key + -> Custom Scan (VCI Scan) using testindex on testtable (actual rows=10221.00 loops=1) + Filter: (cond <> 0) + Rows Removed by Filter: 476 +(8 rows) + +-- Confirms the aggregation can work. The first column indicates whether the +-- VCI scan was used. 
+SELECT vci_runs_in_query() AS vci_runs_in_query, key, count(*) AS count_star, count(c05) AS count_c05 FROM testtable WHERE NOT cond = 0 GROUP BY key ORDER BY key; + vci_runs_in_query | key | count_star | count_c05 +-------------------+-----+------------+----------- + t | 0 | 953 | 953 + t | 1 | 953 | 952 + t | 2 | 953 | 952 + t | 3 | 953 | 952 + t | 4 | 953 | 952 + t | 5 | 953 | 952 + t | 6 | 953 | 952 + t | 7 | 955 | 953 + t | 8 | 956 | 953 + t | 9 | 955 | 953 + t | 10 | 3 | 1 + t | 11 | 4 | 1 + t | 98 | 101 | 1 + t | 99 | 100 | 0 +(14 rows) + +-- cleanup +DROP TABLE testtable; diff --git a/contrib/vci/include/postgresql_copy.h b/contrib/vci/include/postgresql_copy.h new file mode 100644 index 000000000000..f302232aebde --- /dev/null +++ b/contrib/vci/include/postgresql_copy.h @@ -0,0 +1,176 @@ +/*------------------------------------------------------------------------- + * + * postgresql_copy.h + * Definitions copied from PostgreSQL core + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/postgresql_copy.h + * + *------------------------------------------------------------------------- + */ +#ifndef POSTGRESQL_COPY_H +#define POSTGRESQL_COPY_H + +/* + * src/backend/utils/adt/float.c + */ +#include "postgres.h" + +#include + +#include "catalog/pg_type.h" +#include "datatype/timestamp.h" +#include "utils/array.h" +#include "utils/date.h" +#include "utils/elog.h" +#include "utils/errcodes.h" + +/* + * check to see if a float4/8 val has underflowed or overflowed + */ +#define CHECKFLOATVAL(val, inf_is_valid, zero_is_valid) \ +do { \ + if (isinf(val) && !(inf_is_valid)) \ + ereport(ERROR, \ + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), \ + errmsg("value out of range: overflow"))); \ + \ + if ((val) == 0.0 && !(zero_is_valid)) \ + ereport(ERROR, \ + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), \ + errmsg("value out of range: underflow"))); \ +} while(0) + +/* + * src/backend/utils/adt/float.c + */ 
+static inline float8 * +check_float8_array(ArrayType *transarray, const char *caller, int n) +{ + /* + * We expect the input to be an N-element float array; verify that. We + * don't need to use deconstruct_array() since the array data is just + * going to look like a C array of N float8 values. + */ + if (ARR_NDIM(transarray) != 1 || + ARR_DIMS(transarray)[0] != n || + ARR_HASNULL(transarray) || + ARR_ELEMTYPE(transarray) != FLOAT8OID) + elog(ERROR, "%s: expected %d-element float8 array", caller, n); + return (float8 *) ARR_DATA_PTR(transarray); +} + +typedef struct Int8TransTypeData +{ + int64 count; + int64 sum; +} Int8TransTypeData; + +#ifdef VCI_USE_CMP_FUNC +/* + * interval_relop - is interval1 relop interval2 + * + * collate invalid interval at the end + */ +static inline TimeOffset +interval_cmp_value(const Interval *interval) +{ + TimeOffset span; + + span = interval->time; + +#ifdef HAVE_INT64_TIMESTAMP + span += interval->month * INT64CONST(30) * USECS_PER_DAY; + span += interval->day * INT64CONST(24) * USECS_PER_HOUR; +#else + span += interval->month * ((double) DAYS_PER_MONTH * SECS_PER_DAY); + span += interval->day * ((double) HOURS_PER_DAY * SECS_PER_HOUR); +#endif + + return span; +} + +static int +interval_cmp_internal(Interval *interval1, Interval *interval2) +{ + TimeOffset span1 = interval_cmp_value(interval1); + TimeOffset span2 = interval_cmp_value(interval2); + + return ((span1 < span2) ? -1 : (span1 > span2) ? 
1 : 0); +} + +static int +timetz_cmp_internal(TimeTzADT *time1, TimeTzADT *time2) +{ + TimeOffset t1, + t2; + + /* Primary sort is by true (GMT-equivalent) time */ +#ifdef HAVE_INT64_TIMESTAMP + t1 = time1->time + (time1->zone * USECS_PER_SEC); + t2 = time2->time + (time2->zone * USECS_PER_SEC); +#else + t1 = time1->time + time1->zone; + t2 = time2->time + time2->zone; +#endif + + if (t1 > t2) + return 1; + if (t1 < t2) + return -1; + + /* + * If same GMT time, sort by timezone; we only want to say that two + * timetz's are equal if both the time and zone parts are equal. + */ + if (time1->zone > time2->zone) + return 1; + if (time1->zone < time2->zone) + return -1; + + return 0; +} +#endif + +/* taken from numeric.c */ + +typedef int16 NumericDigit; +struct NumericShort +{ + uint16 n_header; /* Sign + display scale + weight */ + NumericDigit n_data[1]; /* Digits */ +}; + +struct NumericLong +{ + uint16 n_sign_dscale; /* Sign + display scale */ + int16 n_weight; /* Weight of 1st digit */ + NumericDigit n_data[1]; /* Digits */ +}; + +union NumericChoice +{ + uint16 n_header; /* Header word */ + struct NumericLong n_long; /* Long form (4-byte header) */ + struct NumericShort n_short; /* Short form (2-byte header) */ +}; + +struct NumericData +{ + int32 vl_len_; /* varlena header (do not touch directly!) */ + union NumericChoice choice; /* choice of format */ +}; + +typedef struct NumericVar +{ + int ndigits; /* # of digits in digits[] - can be 0! 
*/ + int weight; /* weight of first digit */ + int sign; /* NUMERIC_POS, NUMERIC_NEG, or NUMERIC_NAN */ + int dscale; /* display scale */ + NumericDigit *buf; /* start of palloc'd space for digits[] */ + NumericDigit *digits; /* base-NBASE digits */ +} NumericVar; + +#endif /* POSTGRESQL_COPY_H */ diff --git a/contrib/vci/include/vci.h b/contrib/vci/include/vci.h new file mode 100644 index 000000000000..f1bc7a99aaa0 --- /dev/null +++ b/contrib/vci/include/vci.h @@ -0,0 +1,153 @@ +/*------------------------------------------------------------------------- + * + * vci.h + * Primary include file for VCI .c files + * + * This should be the first file included by VCI modules. + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_H +#define VCI_H + +/* define our text domain for translations */ +#undef TEXTDOMAIN +#define TEXTDOMAIN PG_TEXTDOMAIN("vci") + +#include "postgres.h" +#include "access/heapam.h" +#include "access/htup.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "access/relscan.h" +#include "c.h" +#include "catalog/objectaddress.h" +#include "catalog/pg_am.h" +#include "executor/nodeModifyTable.h" +#include "storage/itemptr.h" /* for ItemPointer */ +#include "tcop/utility.h" +#include "utils/rel.h" +#include "utils/relcache.h" /* for Relation */ +#include "utils/syscache.h" + +#define VCI_STRING "vci" + +#define VCI_INTERNAL_RELATION_TEMPLATE "pg_vci_%010d_%05d_%c" + +/** Use compact form to keep varlena with short header at some parts */ +#define VCI_USE_COMPACT_VARLENA + +#ifdef WIN32 +#define strtok_r strtok_s +#endif + +/** Restart time for Daemon(Background Worker) */ +#define VCI_DAEMON_RESTART_TIME (3) + +/** + * Scan policy + */ +typedef enum +{ + VCI_TABLE_SCAN_POLICY_NONE, + VCI_TABLE_SCAN_POLICY_COLUMN_ONLY, /* Only reads column store */ +} 
VciTableScanPolicy; + +/** + * VCI Scan mode + */ +typedef enum +{ + VCI_SCAN_MODE_NONE, + VCI_SCAN_MODE_COLUMN_STORE, /* Reads column store */ +} VciScanMode; + +typedef struct VciFetchPos +{ + int64 fetch_starting_crid; + + int32 current_extent_id; + + int num_rows_in_extent; + + int offset_in_extent; + + int num_fetched_rows; + + int current_row; +} VciFetchPos; + +extern void vci_add_index_delete(Relation heapRel, ItemPointer heap_tid, TransactionId xmin); +extern List *vci_add_should_index_insert(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, ItemPointer tupleid, EState *estate); +extern bool vci_add_drop_relation(const ObjectAddress *object, int flags); +extern bool vci_add_reindex_index(Relation indexRel); +extern bool vci_add_skip_vci_index(Relation indexRel); +extern bool vci_add_alter_tablespace(Relation indexRel); +extern void vci_process_utility(PlannedStmt *pstmt, const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + QueryCompletion *qc); +extern void vci_alter_table_change_owner(Oid relOid, char relKind, Oid newOwnerId); +extern void vci_alter_table_change_schema(Oid relOid, char relKind, Oid newNspOid); + +extern void vci_read_guc_variables(void); +extern void vci_setup_shmem(void); +extern void vci_shmem_startup_routine(void); +extern void vci_setup_executor_hook(void); +extern void vci_xact_change_handler(XactEvent event); +extern void vci_subxact_change_handler(SubXactEvent event, SubTransactionId mySubid); +extern void vci_set_copy_transaction_and_command_id(TransactionId xid, + CommandId cid); + +extern bool VCITupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer); + +/* for index_build */ +typedef enum +{ + vcirc_invalid = 0, + vcirc_reindex, + vcirc_truncate, + vcirc_vacuum_full, + vcirc_cluster, + vcirc_alter_table, + + vcirc_num +} vci_RebuildCommand; + +extern vci_RebuildCommand vci_rebuild_command; + +extern bool 
vci_is_in_vci_create_extension; + +extern ProcessUtility_hook_type process_utility_prev; +extern ProcessUtility_hook_type post_process_utility_prev; + +static inline +bool +isVciIndexRelation(Relation rel) +{ + if (rel->rd_rel->relam != 0) + { + Form_pg_am aform; + HeapTuple amtuple; + + amtuple = SearchSysCache1(AMOID, ObjectIdGetDatum(rel->rd_rel->relam)); + if (!HeapTupleIsValid(amtuple)) + elog(ERROR, "cache lookup failed for access method %u", + rel->rd_rel->relam); + aform = (Form_pg_am) GETSTRUCT(amtuple); + ReleaseSysCache(amtuple); + + if (strcmp(NameStr(aform->amname), "vci") == 0) + return true; + } + return false; +} +#endif /* VCI_H */ diff --git a/contrib/vci/include/vci_aggref.h b/contrib/vci/include/vci_aggref.h new file mode 100644 index 000000000000..f2b3ad7accf3 --- /dev/null +++ b/contrib/vci/include/vci_aggref.h @@ -0,0 +1,227 @@ +/*------------------------------------------------------------------------- + * + * vci_aggref.h + * Definitions and declarations about VCI Aggref + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_aggref.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_AGGREF_H +#define VCI_AGGREF_H + +#include "postgres.h" + +#include "access/attnum.h" +#include "access/tupdesc.h" +#include "executor/tuptable.h" +#include "fmgr.h" +#include "nodes/execnodes.h" +#include "nodes/primnodes.h" +#include "utils/tuplesort.h" + +#include "vci_executor.h" + +/** + * AggStatePerAggData - per-aggregate working state for the Agg scan + * + * copied from src/backend/executor/nodeAgg.c + */ +typedef struct VciAggStatePerAggData +{ + /* + * These values are set up during ExecInitAgg() and do not change + * thereafter: + */ + + /* Links to Aggref expr and state nodes this working state is for */ + Aggref *aggref; + + /* + * Nominal number of arguments for aggregate function. 
For plain aggs, + * this excludes any ORDER BY expressions. For ordered-set aggs, this + * counts both the direct and aggregated (ORDER BY) arguments. + */ + int numArguments; + + /* + * Number of aggregated input columns. This includes ORDER BY expressions + * in both the plain-agg and ordered-set cases. Ordered-set direct args + * are not counted, though. + */ + int numInputs; + + /* + * Number of aggregated input columns to pass to the transfn. This + * includes the ORDER BY columns for ordered-set aggs, but not for plain + * aggs. (This doesn't count the transition state value!) + */ + int numTransInputs; + + /* + * Number of arguments to pass to the finalfn. This is always at least 1 + * (the transition state value) plus any ordered-set direct args. If the + * finalfn wants extra args then we pass nulls corresponding to the + * aggregated input columns. + */ + int numFinalArgs; + + /* Oids of transfer functions */ + Oid transfn_oid; + Oid finalfn_oid; /* may be InvalidOid */ + + /* + * fmgr lookup data for transfer functions --- only valid when + * corresponding oid is not InvalidOid. Note in particular that fn_strict + * flags are kept here. + */ + FmgrInfo transfn; + FmgrInfo finalfn; + + /* Input collation derived for aggregate */ + Oid aggCollation; + + /* number of sorting columns */ + int numSortCols; + + /* number of sorting columns to consider in DISTINCT comparisons */ + /* (this is either zero or the same as numSortCols) */ + int numDistinctCols; + + /* deconstructed sorting information (arrays of length numSortCols) */ + AttrNumber *sortColIdx; + Oid *sortOperators; + Oid *sortCollations; + bool *sortNullsFirst; + + /* + * fmgr lookup data for input columns' equality operators --- only + * set/used when aggregate has DISTINCT flag. Note that these are in + * order of sort column index, not parameter index. 
+ */ + FmgrInfo *equalfns; /* array of length numDistinctCols */ + + /* + * initial value from pg_aggregate entry + */ + Datum initValue; + bool initValueIsNull; + + /* + * We need the len and byval info for the agg's input, result, and + * transition data types in order to know how to copy/delete values. + * + * Note that the info for the input type is used only when handling + * DISTINCT aggs with just one argument, so there is only one input type. + */ + int16 inputtypeLen, + resulttypeLen, + transtypeLen; + bool inputtypeByVal, + resulttypeByVal, + transtypeByVal; + + /* + * Stuff for evaluation of inputs. We used to just use ExecEvalExpr, but + * with the addition of ORDER BY we now need at least a slot for passing + * data to the sort object, which requires a tupledesc, so we might as + * well go whole hog and use ExecProject too. + */ + TupleDesc evaldesc; /* descriptor of input tuples */ + VciProjectionInfo *evalproj; /* projection machinery */ + + /* + * Slots for holding the evaluated input arguments. These are set up + * during ExecInitAgg() and then used for each input row. + */ + TupleTableSlot *evalslot; /* current input tuple */ + TupleTableSlot *uniqslot; /* used for multi-column DISTINCT */ + + /* + * These values are working state that is initialized at the start of an + * input tuple group and updated for each input tuple. + * + * For a simple (non DISTINCT/ORDER BY) aggregate, we just feed the input + * values straight to the transition function. If it's DISTINCT or + * requires ORDER BY, we pass the input values into a Tuplesort object; + * then at completion of the input tuple group, we scan the sorted values, + * eliminate duplicates if needed, and run the transition function on the + * rest. + */ + + Tuplesortstate *sortstate; /* sort object, if DISTINCT or ORDER BY */ + + /* + * This field is a pre-initialized FunctionCallInfo struct used for + * calling this aggregate's transfn. 
We save a few cycles per row by not + * re-initializing the unchanging fields; which isn't much, but it seems + * worth the extra space consumption. + */ + FunctionCallInfo transfn_fcinfo; + + /*----------------------------------------------------------------------*/ + /* Definitions above must same as AggStatePerAggData */ + /*----------------------------------------------------------------------*/ + + VciAdvanceAggref_Func advance_aggref; /* advance aggregation function */ + + Datum (*copy_trans) (Datum, bool, int); /* transition data copy + * function */ + FmgrInfo merge_transfn; /* function information for merging transition + * data */ + FmgrInfo send_transfn; /* function information for converting + * transition data to binary */ + FmgrInfo recv_transfn; /* function informayion for converting + * transition data from binary */ + + Oid recv_trans_typioparam; /* information to be passed as + * argument when recv_transfn is + * called */ + + FunctionCallInfo merge_trans_fcinfo; /* datastruct needed to call + * function via merge_transfn */ + FunctionCallInfo send_trans_fcinfo; /* datastruct needed to call function + * via send_trans */ + FunctionCallInfo recv_trans_fcinfo; /* datastruct needed to call function + * via recv_transfn */ + +} VciAggStatePerAggData; + +/** + * AggStatePerGroupData - per-aggregate-per-group working state + * + * These values are working state that is initialized at the start of + * an input tuple group and updated for each input tuple. + * + * In AGG_PLAIN and AGG_SORTED modes, we have a single array of these + * structs (pointed to by aggstate->pergroup); we re-use the array for + * each input group, if it's AGG_SORTED mode. In AGG_HASHED mode, the + * hash table contains an array of these structs for each tuple group. 
+ * + * Logically, the sortstate field belongs in this struct, but we do not + * keep it here for space reasons: we don't support DISTINCT aggregates + * in AGG_HASHED mode, so there's no reason to use up a pointer field + * in every entry of the hashtable. + * + * copied from src/backend/executor/nodeAgg.c + */ +typedef struct VciAggStatePerGroupData +{ + Datum transValue; /* current transition value */ + bool transValueIsNull; + + bool noTransValue; /* true if transValue not set yet */ + + /* + * Note: noTransValue initially has the same value as transValueIsNull, + * and if true both are cleared to false at the same time. They are not + * the same though: if transfn later returns a NULL, we want to keep that + * NULL and not auto-replace it with a later input value. Only the first + * non-NULL input will be auto-substituted. + */ +} VciAggStatePerGroupData; + +#endif /* VCI_AGGREF_H */ diff --git a/contrib/vci/include/vci_aggref_impl.inc b/contrib/vci/include/vci_aggref_impl.inc new file mode 100644 index 000000000000..95fba4d19ee5 --- /dev/null +++ b/contrib/vci/include/vci_aggref_impl.inc @@ -0,0 +1,873 @@ +/*------------------------------------------------------------------------- + * + * vci_aggref_impl.inc + * Templates for specialized advance_aggref functions + * + * This file is included by vci_aggref.c.
This template can be used like: + * + * #define VCI_ADVANCE_AGGREF_FUNC aggref_0input_default + * #include "executor/vci_aggref_impl.h" + * #undef VCI_ADVANCE_AGGREF_FUNC * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_aggref.impl.h + * + *------------------------------------------------------------------------- + */ + +#include "utils/float.h" +#include "datatype/timestamp.h" + +#include "vci_executor.h" + +static void +VCI_ADVANCE_AGGREF_FUNC(VciAggState *aggstate, + int aggno, + VciAggStatePerGroup *entries, + int max_slots) +{ + MemoryContext oldContext; + int slot_index; + VciScanState *scanstate; + uint16 *skip_list; + +#if VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_0 + + /* + * aggref_0input_int8inc does not use these variable, skip the + * declaration. + */ +#if VCI_TRANFN_OID != F_INT8INC + Datum *inputValues = NULL; + bool *inputIsNulls = NULL; + VciAggStatePerAgg peraggstate = &aggstate->peragg[aggno]; +#endif /* VCI_TRANFN_OID */ + +#elif VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_1_SIMPLEVAR + Datum *inputValues = NULL; + bool *inputIsNulls = NULL; + VciAggStatePerAgg peraggstate = &aggstate->peragg[aggno]; + VciProjectionInfo *projInfo; + int attno; + +#elif VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_1_EVALEXPR + Datum *inputValues = NULL; + bool *inputIsNulls = NULL; + VciAggStatePerAgg peraggstate = &aggstate->peragg[aggno]; + VciProjectionInfo *projInfo; + ExprContext *econtext; + VciVPContext *vpcontext; +#endif /* SELECT VCI_TRANS_INPUTS_ARG */ + + scanstate = (VciScanState *) outerPlanState(aggstate); + Assert(scanstate->vci.css.ss.ps.type == T_CustomScanState); + skip_list = vci_CSGetSkipFromVirtualTuples(scanstate->vector_set); + +#if VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_0 + +#elif VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_1_SIMPLEVAR + projInfo = peraggstate->evalproj; + + attno = projInfo->pi_varNumbers[0]; + + inputValues = scanstate->result_values[attno - 1]; + 
inputIsNulls = scanstate->result_isnull[attno - 1]; + +#elif VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_1_EVALEXPR + + projInfo = peraggstate->evalproj; + econtext = projInfo->pi_exprContext; + + vpcontext = projInfo->pi_vp_tle_array[0]; + + oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory); + VciExecEvalVectorProcessing(vpcontext, econtext, max_slots); + MemoryContextSwitchTo(oldContext); + + inputValues = vpcontext->resultValue; + inputIsNulls = vpcontext->resultIsNull; + +#endif /* SELECT VCI_TRANS_INPUTS_ARG */ + + for (slot_index = skip_list[0]; slot_index < max_slots; slot_index += skip_list[slot_index + 1] + 1) + { + VciAggStatePerGroup pergroupstate; + Datum newVal; + bool newIsNull; + + pergroupstate = &(entries[slot_index])[aggno]; + + if (VCI_TRANS_FN_STRICT) /* peraggstate->transfn.fn_strict or 1 or + * 0 */ + { + /* + * For a strict transfn, nothing happens when there's a NULL + * input; we just keep the prior transValue. + */ +#if (VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_1_SIMPLEVAR) || (VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_1_EVALEXPR) + if (inputIsNulls[slot_index]) + continue; +#endif + + if (pergroupstate->noTransValue) + { + /* + * transValue has not been initialized. This is the first + * non-NULL input value. We use it as the initial value for + * transValue. (We already checked that the agg's input type + * is binary-compatible with its transtype, so straight copy + * here is OK.) + * + * We must copy the datum into aggcontext if it is + * pass-by-ref. We do not need to pfree the old transValue, + * since it's NULL. 
+ */ + oldContext = MemoryContextSwitchTo(aggstate->aggcontext); +#if VCI_TRANS_INPUTS_ARG == VCI_TRANS_INPUTS_0 + pergroupstate->transValue = 0; +#elif VCI_TRANS_TYPE_BYVAL <= 0 + pergroupstate->transValue = datumCopy(inputValues[slot_index], + peraggstate->transtypeByVal, + peraggstate->transtypeLen); +#else + pergroupstate->transValue = inputValues[slot_index]; +#endif + pergroupstate->transValueIsNull = false; + pergroupstate->noTransValue = false; + MemoryContextSwitchTo(oldContext); + continue; + } + if (pergroupstate->transValueIsNull) + { + /* + * Don't call a strict function with NULL inputs. Note it is + * possible to get here despite the above tests, if the + * transfn is strict *and* returned a NULL on a prior cycle. + * If that happens we will propagate the NULL all the way to + * the end. + */ + continue; + } + } + +#if VCI_TRANS_TYPE_BYVAL <= 0 + /* We run the transition functions in per-input-tuple memory context */ + oldContext = MemoryContextSwitchTo(aggstate->tmpcontext->ecxt_per_tuple_memory); +#endif + +#ifdef VCI_TRANS_USE_CURPERAGG + /* set up aggstate->curperagg for AggGetAggref() */ + aggstate->pseudo_aggstate->curperagg = (AggStatePerAgg) peraggstate; /* @remark */ +#endif + +#if VCI_TRANFN_OID == F_FLOAT4_ACCUM /* 208 */ + /* float4_accum */ + { + ArrayType *transarray = DatumGetArrayTypeP(pergroupstate->transValue); + + float8 newval = DatumGetFloat4(inputValues[slot_index]); + float8 *transvalues; + float8 N, + Sx, + Sxx, + tmp; + + transvalues = check_float8_array(transarray, "float4_accum", 3); + N = transvalues[0]; + Sx = transvalues[1]; + Sxx = transvalues[2]; + + /* + * Use the Youngs-Cramer algorithm to incorporate the new value + * into the transition values. + */ + + N += 1.0; + Sx += newval; + if (transvalues[0] > 0.0) + { + tmp = newval * N - Sx; + Sxx += tmp * tmp / (N * transvalues[0]); + + /* + * Overflow check. We only report an overflow error when + * finite inputs lead to infinite results. 
Note also that Sxx + * should be NaN if any of the inputs are infinite, so we + * intentionally prevent Sxx from becoming infinite. + */ + if (isinf(Sx) || isinf(Sxx)) + { + if (!isinf(transvalues[1]) && !isinf(newval)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: overflow"))); + + Sxx = get_float8_nan(); + } + } + + transvalues[0] = N; + transvalues[1] = Sx; + transvalues[2] = Sxx; + + newVal = pergroupstate->transValue; + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_FLOAT4PL /* 204 */ + /* float4pl */ + { + float4 arg1 = DatumGetFloat4(pergroupstate->transValue); + float4 arg2 = DatumGetFloat4(inputValues[slot_index]); + float4 result; + + result = arg1 + arg2; + + CHECKFLOATVAL(result, isinf(arg1) || isinf(arg2), true); + newVal = Float4GetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_FLOAT4LARGER /* 209 */ + /* float4larger */ + { + float4 arg1 = DatumGetFloat4(pergroupstate->transValue); + float4 arg2 = DatumGetFloat4(inputValues[slot_index]); + float4 result; + + if (float4_gt(arg1, arg2)) + result = arg1; + else + result = arg2; + newVal = Float4GetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_FLOAT4SMALLER /* 211 */ + /* float4smaller */ + { + float4 arg1 = DatumGetFloat4(pergroupstate->transValue); + float4 arg2 = DatumGetFloat4(inputValues[slot_index]); + float4 result; + + if (float4_lt(arg1, arg2)) + result = arg1; + else + result = arg2; + newVal = Float4GetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_FLOAT8PL /* 218 */ + /* float8pl */ + { + float8 arg1 = DatumGetFloat8(pergroupstate->transValue); + float8 arg2 = DatumGetFloat8(inputValues[slot_index]); + float8 result; + + result = arg1 + arg2; + + CHECKFLOATVAL(result, isinf(arg1) || isinf(arg2), true); + newVal = Float8GetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT4LARGER /* 768 */ + /* int4larger */ + { + int32 arg1 = 
DatumGetInt32(pergroupstate->transValue); + int32 arg2 = DatumGetInt32(inputValues[slot_index]); + + newVal = Int32GetDatum((arg1 > arg2) ? arg1 : arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT4SMALLER /* 769 */ + /* int4smaller */ + { + int32 arg1 = DatumGetInt32(pergroupstate->transValue); + int32 arg2 = DatumGetInt32(inputValues[slot_index]); + + newVal = Int32GetDatum((arg1 < arg2) ? arg1 : arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_CASH_PL /* 894 */ + /* cash_pl; NOTE(review): c1 + c2 is added with no overflow check -- confirm this matches the behavior of the core cash_pl */ + { + Cash c1 = DatumGetCash(pergroupstate->transValue); + Cash c2 = DatumGetCash(inputValues[slot_index]); + Cash result; + + result = c1 + c2; + + newVal = CashGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_CASHLARGER /* 898 */ + /* cashlarger */ + { + Cash c1 = DatumGetCash(pergroupstate->transValue); + Cash c2 = DatumGetCash(inputValues[slot_index]); + Cash result; + + result = (c1 > c2) ? c1 : c2; + + newVal = CashGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_CASHSMALLER /* 899 */ + /* cashsmaller */ + { + Cash c1 = DatumGetCash(pergroupstate->transValue); + Cash c2 = DatumGetCash(inputValues[slot_index]); + Cash result; + + result = (c1 < c2) ? c1 : c2; + + newVal = CashGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_DATE_LARGER /* 1138 */ + /* date_larger */ + { + DateADT dateVal1 = DatumGetDateADT(pergroupstate->transValue); + DateADT dateVal2 = DatumGetDateADT(inputValues[slot_index]); + + newVal = DateADTGetDatum((dateVal1 > dateVal2) ? dateVal1 : dateVal2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_DATE_SMALLER /* 1139 */ + /* date_smaller */ + { + DateADT dateVal1 = DatumGetDateADT(pergroupstate->transValue); + DateADT dateVal2 = DatumGetDateADT(inputValues[slot_index]); + + newVal = DateADTGetDatum((dateVal1 < dateVal2) ?
dateVal1 : dateVal2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INTERVAL_PL /* 1169 */ + /* interval_pl */ + { + Interval *span1 = DatumGetIntervalP(pergroupstate->transValue); + Interval *span2 = DatumGetIntervalP(inputValues[slot_index]); + Interval *result; + + result = (Interval *) palloc(sizeof(Interval)); + + /* + * Handle infinities. + * + * We treat anything that amounts to "infinity - infinity" as an + * error, since the interval type has nothing equivalent to NaN. + */ + if (INTERVAL_IS_NOBEGIN(span1)) + { + if (INTERVAL_IS_NOEND(span2)) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("interval out of range"))); + else + INTERVAL_NOBEGIN(result); + } + else if (INTERVAL_IS_NOEND(span1)) + { + if (INTERVAL_IS_NOBEGIN(span2)) + ereport(ERROR, + (errcode(ERRCODE_DATETIME_VALUE_OUT_OF_RANGE), + errmsg("interval out of range"))); + else + INTERVAL_NOEND(result); + } + else if (INTERVAL_NOT_FINITE(span2)) + memcpy(result, span2, sizeof(Interval)); + else + finite_interval_pl(span1, span2, result); + + newVal = IntervalPGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_TIMESTAMP_SMALLER /* 1195 */ + /* timestamp_smaller */ + { + Timestamp dt1 = DatumGetTimestamp(pergroupstate->transValue); + Timestamp dt2 = DatumGetTimestamp(inputValues[slot_index]); + Timestamp result; + + /* + * use timestamp_cmp_internal to be sure this agrees with + * comparisons + */ + if (timestamp_cmp_internal(dt1, dt2) < 0) + result = dt1; + else + result = dt2; + newVal = TimestampGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_TIMESTAMP_LARGER /* 1196 */ + /* timestamp_larger */ + { + Timestamp dt1 = DatumGetTimestamp(pergroupstate->transValue); + Timestamp dt2 = DatumGetTimestamp(inputValues[slot_index]); + Timestamp result; + + if (timestamp_cmp_internal(dt1, dt2) > 0) + result = dt1; + else + result = dt2; + newVal = TimestampGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == 
F_INTERVAL_SMALLER /* 1197 */ + /* interval_smaller */ + { + Interval *interval1 = DatumGetIntervalP(pergroupstate->transValue); + Interval *interval2 = DatumGetIntervalP(inputValues[slot_index]); + Interval *result; + + /* + * use interval_cmp_internal to be sure this agrees with + * comparisons + */ + if (interval_cmp_internal(interval1, interval2) < 0) + result = interval1; + else + result = interval2; + newVal = IntervalPGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INTERVAL_LARGER /* 1198 */ + /* interval_larger */ + { + Interval *interval1 = DatumGetIntervalP(pergroupstate->transValue); + Interval *interval2 = DatumGetIntervalP(inputValues[slot_index]); + Interval *result; + + if (interval_cmp_internal(interval1, interval2) > 0) + result = interval1; + else + result = interval2; + newVal = IntervalPGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT8INC /* 1219 */ + { + newVal = Int64GetDatum(DatumGetInt64(pergroupstate->transValue) + 1); + + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT8INC_ANY /* 2804 */ + /* Mostly same as F_INT8INC, but NULL-check for arguments is done */ + { + newVal = Int64GetDatum(DatumGetInt64(pergroupstate->transValue) + 1); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_TIME_LARGER /* 1377 */ + /* time_larger */ + { + TimeADT time1 = DatumGetTimeADT(pergroupstate->transValue); + TimeADT time2 = DatumGetTimeADT(inputValues[slot_index]); + + newVal = TimeADTGetDatum((time1 > time2) ? time1 : time2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_TIME_SMALLER /* 1378 */ + /* time_smaller */ + { + TimeADT time1 = DatumGetTimeADT(pergroupstate->transValue); + TimeADT time2 = DatumGetTimeADT(inputValues[slot_index]); + + newVal = TimeADTGetDatum((time1 < time2) ? 
time1 : time2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_TIMETZ_LARGER /* 1379 */ + /* timetz_larger */ + { + TimeTzADT *time1 = DatumGetTimeTzADTP(pergroupstate->transValue); + TimeTzADT *time2 = DatumGetTimeTzADTP(inputValues[slot_index]); + TimeTzADT *result; + + if (timetz_cmp_internal(time1, time2) > 0) + result = time1; + else + result = time2; + newVal = TimeTzADTPGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_TIMETZ_SMALLER /* 1380 */ + /* timetz_smaller */ + { + TimeTzADT *time1 = DatumGetTimeTzADTP(pergroupstate->transValue); + TimeTzADT *time2 = DatumGetTimeTzADTP(inputValues[slot_index]); + TimeTzADT *result; + + if (timetz_cmp_internal(time1, time2) < 0) + result = time1; + else + result = time2; + newVal = TimeTzADTPGetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT2_SUM /* 1840 */ + /* int2_sum */ + { + int64 newval; + + newIsNull = false; + if (pergroupstate->transValueIsNull) + { + if (inputIsNulls[slot_index]) + { + newval = 0; + newIsNull = true; + } + else + newval = (int64) DatumGetInt16(inputValues[slot_index]); + } + else + { + int64 oldsum = DatumGetInt64(pergroupstate->transValue); + + if (inputIsNulls[slot_index]) + newval = oldsum; + else + newval = oldsum + (int64) DatumGetInt16(inputValues[slot_index]); + } + newVal = Int64GetDatum(newval); + } + +#elif VCI_TRANFN_OID == F_INT4_SUM /* 1841 */ + /* int4_sum */ + { + int64 newval; + + newIsNull = false; + if (pergroupstate->transValueIsNull) + { + if (inputIsNulls[slot_index]) + { + newval = 0; + newIsNull = true; + } + else + newval = (int64) DatumGetInt32(inputValues[slot_index]); + } + else + { + int64 oldsum = DatumGetInt64(pergroupstate->transValue); + + if (inputIsNulls[slot_index]) + newval = oldsum; + else + newval = oldsum + (int64) DatumGetInt32(inputValues[slot_index]); + } + newVal = Int64GetDatum(newval); + } + +#elif VCI_TRANFN_OID == F_INT4AND /* 1898 */ + /* int4and */ + { + int32 arg1 = 
DatumGetInt32(pergroupstate->transValue); + int32 arg2 = DatumGetInt32(inputValues[slot_index]); + + newVal = Int32GetDatum(arg1 & arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT4OR /* 1899 */ + /* int4or */ + { + int32 arg1 = DatumGetInt32(pergroupstate->transValue); + int32 arg2 = DatumGetInt32(inputValues[slot_index]); + + newVal = Int32GetDatum(arg1 | arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT4_AVG_ACCUM /* 1963 */ + /* int4_avg_accum */ + { + ArrayType *transarray = DatumGetArrayTypeP(pergroupstate->transValue); + int32 newval = DatumGetInt32(inputValues[slot_index]); + Int8TransTypeData *transdata; + + if (ARR_HASNULL(transarray) || + ARR_SIZE(transarray) != ARR_OVERHEAD_NONULLS(1) + sizeof(Int8TransTypeData)) + elog(ERROR, "expected 2-element int8 array"); + + transdata = (Int8TransTypeData *) ARR_DATA_PTR(transarray); + transdata->count++; + transdata->sum += newval; + + newVal = pergroupstate->transValue; + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_BOOLAND_STATEFUNC /* 2515 */ + /* booland_statefunc */ + { + newVal = BoolGetDatum( + DatumGetBool(pergroupstate->transValue) && DatumGetBool(inputValues[slot_index])); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_BOOLOR_STATEFUNC /* 2516 */ + /* boolor_statefunc */ + { + newVal = BoolGetDatum( + DatumGetBool(pergroupstate->transValue) || DatumGetBool(inputValues[slot_index])); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT2LARGER /* 770 */ + /* int2larger */ + { + int16 arg1 = DatumGetInt16(pergroupstate->transValue); + int16 arg2 = DatumGetInt16(inputValues[slot_index]); + + newVal = Int16GetDatum((arg1 > arg2) ? arg1 : arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT2SMALLER /* 771 */ + /* int2smaller */ + { + int16 arg1 = DatumGetInt16(pergroupstate->transValue); + int16 arg2 = DatumGetInt16(inputValues[slot_index]); + + newVal = Int16GetDatum((arg1 < arg2) ? 
arg1 : arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT2AND /* 1892 */ + /* int2and */ + { + int16 arg1 = DatumGetInt16(pergroupstate->transValue); + int16 arg2 = DatumGetInt16(inputValues[slot_index]); + + newVal = Int16GetDatum(arg1 & arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT2OR /* 1893 */ + /* int2or */ + { + int16 arg1 = DatumGetInt16(pergroupstate->transValue); + int16 arg2 = DatumGetInt16(inputValues[slot_index]); + + newVal = Int16GetDatum(arg1 | arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT2_AVG_ACCUM /* 1962 */ + /* int2_avg_accum */ + { + ArrayType *transarray = DatumGetArrayTypeP(pergroupstate->transValue); + int16 newval = DatumGetInt16(inputValues[slot_index]); + Int8TransTypeData *transdata; + + if (ARR_HASNULL(transarray) || + ARR_SIZE(transarray) != ARR_OVERHEAD_NONULLS(1) + sizeof(Int8TransTypeData)) + elog(ERROR, "expected 2-element int8 array"); + + transdata = (Int8TransTypeData *) ARR_DATA_PTR(transarray); + transdata->count++; + transdata->sum += newval; + + newVal = pergroupstate->transValue; + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT8LARGER /* 1236 */ + /* int8larger */ + { + int64 arg1 = DatumGetInt64(pergroupstate->transValue); + int64 arg2 = DatumGetInt64(inputValues[slot_index]); + + newVal = Int64GetDatum((arg1 > arg2) ? arg1 : arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT8SMALLER /* 1237 */ + /* int8smaller */ + { + int64 arg1 = DatumGetInt64(pergroupstate->transValue); + int64 arg2 = DatumGetInt64(inputValues[slot_index]); + + newVal = Int64GetDatum((arg1 < arg2) ? 
arg1 : arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT8AND /* 1904 */ + /* int8and */ + { + int64 arg1 = DatumGetInt64(pergroupstate->transValue); + int64 arg2 = DatumGetInt64(inputValues[slot_index]); + + newVal = Int64GetDatum(arg1 & arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_INT8OR /* 1905 */ + /* int8or */ + { + int64 arg1 = DatumGetInt64(pergroupstate->transValue); + int64 arg2 = DatumGetInt64(inputValues[slot_index]); + + newVal = Int64GetDatum(arg1 | arg2); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_FLOAT8_ACCUM /* 222 */ + /* float8_accum */ + { + ArrayType *transarray = DatumGetArrayTypeP(pergroupstate->transValue); + + float8 newval = DatumGetFloat8(inputValues[slot_index]); + float8 *transvalues; + float8 N, + Sx, + Sxx, + tmp; + + transvalues = check_float8_array(transarray, "float8_accum", 3); + N = transvalues[0]; + Sx = transvalues[1]; + Sxx = transvalues[2]; + + N += 1.0; + Sx += newval; + if (transvalues[0] > 0.0) + { + tmp = newval * N - Sx; + Sxx += tmp * tmp / (N * transvalues[0]); + + /* + * Overflow check. We only report an overflow error when + * finite inputs lead to infinite results. Note also that Sxx + * should be NaN if any of the inputs are infinite, so we + * intentionally prevent Sxx from becoming infinite. 
+ */ + if (isinf(Sx) || isinf(Sxx)) + { + if (!isinf(transvalues[1]) && !isinf(newval)) + ereport(ERROR, + (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE), + errmsg("value out of range: overflow"))); + + Sxx = get_float8_nan(); + } + } + + transvalues[0] = N; + transvalues[1] = Sx; + transvalues[2] = Sxx; + + newVal = pergroupstate->transValue; + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_FLOAT8LARGER /* 223 */ + /* float8larger */ + { + float8 arg1 = DatumGetFloat8(pergroupstate->transValue); + float8 arg2 = DatumGetFloat8(inputValues[slot_index]); + float8 result; + + if (float8_cmp_internal(arg1, arg2) > 0) + result = arg1; + else + result = arg2; + newVal = Float8GetDatum(result); + newIsNull = false; + } + +#elif VCI_TRANFN_OID == F_FLOAT8SMALLER /* 224 */ + /* float8smaller */ + { + float8 arg1 = DatumGetFloat8(pergroupstate->transValue); + float8 arg2 = DatumGetFloat8(inputValues[slot_index]); + float8 result; + + if (float8_cmp_internal(arg1, arg2) < 0) + result = arg1; + else + result = arg2; + newVal = Float8GetDatum(result); + newIsNull = false; + } + +#else /* default; NOTE(review): under VCI_TRANS_INPUTS_0 inputValues and inputIsNulls are declared NULL and never assigned, yet they are indexed below -- confirm this branch is never instantiated for 0-input aggregates */ + { + FunctionCallInfo fcinfo = peraggstate->transfn_fcinfo; + + fcinfo->args[0].value = pergroupstate->transValue; + fcinfo->args[0].isnull = pergroupstate->transValueIsNull; + fcinfo->args[1].value = inputValues[slot_index]; + fcinfo->args[1].isnull = inputIsNulls[slot_index]; + fcinfo->isnull = false; + newVal = FunctionCallInvoke(fcinfo); + newIsNull = fcinfo->isnull; + } +#endif + +#ifdef VCI_TRANS_USE_CURPERAGG + aggstate->pseudo_aggstate->curperagg = NULL; +#endif + +#if VCI_TRANS_TYPE_BYVAL == -1 + if (!peraggstate->transtypeByVal && + DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue)) + { + if (!newIsNull) + { + MemoryContextSwitchTo(aggstate->aggcontext); + newVal = datumCopy(newVal, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + } + if (!pergroupstate->transValueIsNull) + pfree(DatumGetPointer(pergroupstate->transValue)); + } +#elif
VCI_TRANS_TYPE_BYVAL == 0 + if (DatumGetPointer(newVal) != DatumGetPointer(pergroupstate->transValue)) + { + if (!newIsNull) + { + MemoryContextSwitchTo(aggstate->aggcontext); + newVal = datumCopy(newVal, + peraggstate->transtypeByVal, + peraggstate->transtypeLen); + } + if (!pergroupstate->transValueIsNull) + pfree(DatumGetPointer(pergroupstate->transValue)); + } +#endif + + pergroupstate->transValue = newVal; + pergroupstate->transValueIsNull = newIsNull; + +#if VCI_TRANS_TYPE_BYVAL <= 0 + MemoryContextSwitchTo(oldContext); +#endif + } +} diff --git a/contrib/vci/include/vci_chunk.h b/contrib/vci/include/vci_chunk.h new file mode 100644 index 000000000000..9c4f628c5d7f --- /dev/null +++ b/contrib/vci/include/vci_chunk.h @@ -0,0 +1,114 @@ +/*------------------------------------------------------------------------- + * + * vci_chunk.h + * Definitions and Declarations of ROS chunk buffer storage. + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_chunk.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_CHUNK_H +#define VCI_CHUNK_H + +#include "postgres.h" + +#include "miscadmin.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" +#include "utils/uuid.h" + +#include "vci.h" +#include "vci_ros.h" + +/** + * @brief RosChunkBuffer is a buffer to store one chunk. + * + * We use RosChunkBuffer for two purposes. One is to store data obtained + * directly from a PostgreSQL relation. For this purpose, we prepare this + * buffer to have enough space to store data even when all the attributes + * have their worst-case size, which never happens. Once the chunk is + * stored in this buffer, we inspect the size of each column in the chunk. + * Afterward, we copy all the chunk data into RosChunkStorage, removing + * unused space.
Here, we use RosChunkBuffer for each chunk, but this + * time we prepare the buffer with the size suitable for each chunk. ROS + * without compression is built from RosChunkStorage directly. + */ +typedef struct RosChunkBuffer +{ + int16 numColumns; /* number of columns */ + int16 numNullableColumns; /* number of nullable columns */ + + /** number of columns which need offset data for each entry because they + * have variable-length fields or fields longer than eight bytes, say, + * reference Datum. + */ + int16 numColumnsWithIndex; + + int nullWidthInByte; /* The byte width of null bit vector. */ + int numRowsAtOnce; /* the maximum number of rows in the chunk */ + int numFilled; /* the number of rows actually contained here */ + vcis_compression_type_t *compType; /* Array of compression type for + * columns. */ + int16 *nullBitId; /* -1 for NOT NULLABLE */ + int16 *columnSizeList; /* the sizes of columns in the worst case */ + void *dataAllocPtr; /* pointer keeping allocated area */ + char **data; /* buffer for each column */ + vci_offset_in_extent_t **dataOffset; /* offset to each datum */ + char *nullData; /* pointer to array of null bit vector. */ + char *tidData; /* pointer to array of TID. */ + char *deleteData; /* pointer to array of delete information */ +} RosChunkBuffer; + +/** + * @brief Structure to keep buffers that keeps column-wise data built from WOS. + */ +typedef struct RosChunkStorage +{ + int numChunks; /* The length of allocated chunk. */ + int numFilled; /* The number of chunk actually used. */ + int numTotalRows; /* The sum of rows in registered chunks. */ + bool forAppending; /* True to append data to the shrunken extent. */ + + /** Array of pointers to RosChunkBuffer, which is copied in a compact + * manner to reduce the memory. 
+ */ + RosChunkBuffer **chunk; +} RosChunkStorage; + +extern void + vci_InitOneRosChunkBuffer(RosChunkBuffer *rosChunkBuffer, + int numRowsAtOnce, + int16 *columnSizeList, + int numColumns, + bool useDeleteVector, + vci_MainRelHeaderInfo *info); +extern void + vci_InitRosChunkStorage(RosChunkStorage *rosChunkStorage, + int numRowsAtOnce, + bool forAppending); +extern void + vci_DestroyOneRosChunkBuffer(RosChunkBuffer *rosChunkBuffer); +extern void + vci_DestroyRosChunkStorage(RosChunkStorage *rosChunkStorage); +extern PGDLLEXPORT void + vci_ResetRosChunkStorage(RosChunkStorage *rosChunkStorage); +extern void + vci_FillOneRowInRosChunkBuffer(RosChunkBuffer *rosChunkBuffer, + vci_MainRelHeaderInfo *info, + ItemPointer tid, + HeapTuple tuple, + int16 *dstColumnIdList, + AttrNumber *heapAttrNumList, + TupleDesc tupleDesc); +extern void + vci_ResetRosChunkBufferCounter(RosChunkBuffer *buffer); +extern void + vci_RegisterChunkBuffer(RosChunkStorage *rosChunkStorage, RosChunkBuffer *src); +extern Size + vci_GetDataSizeInChunkStorage(RosChunkStorage *src, int columnId, bool asFixed); + +#endif /* #ifndef VCI_CHUNK_H */ diff --git a/contrib/vci/include/vci_columns.h b/contrib/vci/include/vci_columns.h new file mode 100644 index 000000000000..18d6e9aecc16 --- /dev/null +++ b/contrib/vci/include/vci_columns.h @@ -0,0 +1,319 @@ +/*------------------------------------------------------------------------- + * + * vci_columns.h + * Definitions and declarations of VCI column store and extents + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_columns.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_COLUMNS_H +#define VCI_COLUMNS_H + +#include "postgres.h" + +#include "vci.h" +#include "vci_chunk.h" +#include "vci_ros.h" +#include "vci_tidcrid.h" + +/** header page ID of column meta data */ +#define VCI_COLUMN_META_HEADER_PAGE_ID (0) + +/** First page of 
Column data relations */ +#define VCI_COLUMN_DATA_FIRST_PAGE_ID (0) + +/** Column number of Column meta header page */ +#define VCI_NUM_COLUMN_META_HEADER_PAGE (1) + +/** Column ID of first Normal Column */ +#define VCI_FIRST_NORMALCOLUMN_ID (0) + +/** Column ID of special column */ +#define VCI_COLUMN_ID_TID (-1) +#define VCI_COLUMN_ID_NULL (-2) +#define VCI_COLUMN_ID_DELETE (-3) +#define VCI_COLUMN_ID_CRID (-4) /** @todo what is this? */ + +/** The data below are not column-stored data. + * We prepare them for convenience. + */ +#define VCI_COLUMN_ID_TID_CRID (-5) +#define VCI_COLUMN_ID_TID_CRID_UPDATE (-6) +#define VCI_COLUMN_ID_TID_CRID_WRITE (-7) +#define VCI_COLUMN_ID_TID_CRID_CDR (-8) +#define VCI_COLUMN_ID_DATA_WOS (-9) +#define VCI_COLUMN_ID_WHITEOUT_WOS (-10) + +#define VCI_INVALID_COLUMN_ID ((int16) -11) + +/** Vector bit count in one item (tuple) for delete vector */ +#define VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE (1024) + +/** Item number in page for delete vector */ +#define VCI_ITEMS_IN_PAGE_FOR_DELETE (52) + +/** Page number in extent for delete vector */ +#define VCI_NUM_PAGES_IN_EXTENT_FOR_DELETE (5) + +static inline BlockNumber +vci_CalcBlockNumberFromCrid64ForDelete(uint64 crid64) +{ + return (vci_CalcExtentIdFromCrid64(crid64) * + VCI_NUM_PAGES_IN_EXTENT_FOR_DELETE) + + (vci_CalcRowIdInExtentFromCrid64(crid64) / + (VCI_ITEMS_IN_PAGE_FOR_DELETE * + VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE)); +} + +static inline OffsetNumber +vci_CalcOffsetNumberFromCrid64ForDelete(uint64 crid64) +{ + return ((vci_CalcRowIdInExtentFromCrid64(crid64) / + VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE) % + VCI_ITEMS_IN_PAGE_FOR_DELETE) + FirstOffsetNumber; +} + +static inline uint32 +vci_CalcByteFromCrid64ForDelete(uint64 crid64) +{ + return (crid64 % VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE) / BITS_PER_BYTE; +} + +static inline uint32 +vci_CalcBitFromCrid64ForDelete(uint64 crid64) +{ + return crid64 & (BITS_PER_BYTE - 1); +} + +/** + * Pointing extent position of each column in 
BlockNumber. + * + * @description + * This is used in vcis_column_meta_t.block_number_extent. + * The field is not defined in the definition of the structure, because + * we have the other variable length field "common_dict_info". + * This block_number_extent follows the field. + * + * @note + * unused entries have InvalidBlockNumber in block_number and + * zero in num_blocks. + */ +typedef struct vcis_c_extent +{ + BlockNumber block_number; /* the position in the column data relation */ + BlockNumber num_blocks; /* the length in DB page unit */ + + bool enabled; /* block_number is enabled if true */ + + /* FIXME */ /* fill me */ + bool valid_min_max; /* size of min is + * vcis_column_meta_t.min_max_element_size */ + char min[1]; /* max follows min. */ +} vcis_c_extent_t; + +/** + * common dictionary info of each column + * + * @descriptions + *This is used in vcis_column_meta_t.common_dict_info + * + * @note + * unused entries have InvalidBlockNumber in block_number and + * zero in num_blocks. + */ +typedef struct vcis_c_common_dict +{ + BlockNumber block_number; /* the position in the column data relation */ + BlockNumber num_blocks; /* the length in DB page unit */ +} vcis_c_common_dict_t; + +typedef struct vcis_column_meta +{ + vcis_attribute_type_t vcis_attr_type; /* Attribute type */ + + Oid pgsql_atttypid; /* taken from FormData_pg_attribute.atttypid */ + int16 pgsql_attnum; /* taken from FormData_pg_attribute.attnum */ + int16 pgsql_attlen; /* taken from FormData_pg_attribute.attlen */ + int32 pgsql_atttypmod; /* taken from + * FormData_pg_attribute.atttypmod */ + uint32 num_extents; /* number of extents (for debug) */ + uint32 num_extents_old; /* previous number of extents (for + * recovery) */ + + BlockNumber free_page_begin_id; /* page ID of the first free area */ + + BlockNumber free_page_end_id; /* page ID of the last free area */ + + /** + * The DB page ID of free area that located in front of the added or + * deleted extent by the ROS command. 
(for recovery) + * This is used to recover the free area list. + */ + BlockNumber free_page_prev_id; + + /** + * Same as free_page_prev_id, but just behind the added or deleted extent. + */ + BlockNumber free_page_next_id; + + /** + * The freespace size of added or deleted extent by the ROS command (for recovery) + */ + uint32 free_page_old_size; + + /** + * The freespace position of added or deleted extent in BlockNumber + * by the ROS command (for recovery) + */ + BlockNumber new_data_head; + + BlockNumber num_free_pages; /* number of free DB pages in the listed free + * area */ + BlockNumber num_free_pages_old; /* for recovery */ + BlockNumber num_free_page_blocks; /* number of free areas, not number of + * free DB pages */ + BlockNumber num_free_page_blocks_old; /* for recovery */ + + /*--- Above must be same as vcis_tidcrid_meta_t ---*/ + + uint32 common_flag_0; /* vcis_column_meta_flag */ + + uint32 min_max_field_size; /* size of the min_max field */ + uint32 min_max_content_size; /* size of the min_max content */ + uint16 num_common_dicts; /* Number of common dictionaries */ + int16 latest_common_dict_id; /* Id of the latest common dictionary */ + uint32 common_dict_info_offset; /* offset of common_dict_info[0] */ + uint32 block_number_extent_offset; /* offset of extent_pointer[0] */ + + vcis_c_common_dict_t common_dict_info[1]; /* common dictionary + * information */ + /* block_number_extent follows common_dict_info[num_common_dicts - 1] */ +} vcis_column_meta_t; + +/** + * @brief Get pointer to vcis_extent_t in the given DB page.
+ */ +#define vci_GetExtentT(page) \ + ((vcis_extent_t *) &((page)[VCI_MIN_PAGE_HEADER])) + +/* + * Extent header + */ +typedef struct vcis_extent +{ + uint32 size; /* Size of extent */ + vcis_extent_type_t type; + uint32 id; /* Extent id */ + vcis_compression_type_t comp_type; /* Compression method */ + uint32 offset_offset; /* Offset to the offset */ + uint32 offset_size; /* Size of the offset */ + uint32 data_offset; /* Offset to the data */ + uint32 data_size; /* Data size */ + uint16 compressed; /* 0 for not compressed, 1 for compressed */ + int16 dict_offset; /* or common dictionary ID (>= -1) when + * dict_size == 0 */ + uint32 dict_size; /* Size of the dictionary data */ + vcis_dict_type_t dict_type; /* The type of dictionary */ + char dict_body[1]; /* the main body of the dictionary */ + /* offset_body and data_body follow dict_body */ +} vcis_extent_t; + +typedef vci_RelationPair vci_ColumnRelations; + +extern PGDLLEXPORT vcis_column_meta_t *vci_GetColumnMeta(Buffer *buffer, Relation rel); +extern PGDLLEXPORT vcis_c_extent_t *vci_GetColumnExtent(Buffer *buffer, + BlockNumber *blockNumber, + Relation rel, + int32 extentId); + +extern PGDLLEXPORT void vci_OpenColumnRelations(vci_ColumnRelations *rel, + vci_MainRelHeaderInfo *info, + int16 columnId, + LOCKMODE lockmode); + +extern void vci_CloseColumnRelations(vci_ColumnRelations *rel, + LOCKMODE lockmode); + +extern void vci_InitializeColumnRelations(vci_MainRelHeaderInfo *info, + TupleDesc tupdesc, + Relation heapRel); + +extern void vci_WriteRawDataExtentInfo(Relation rel, + int32 extentId, + uint32 startPageID, + uint32 numBlocks, + char *minData, + char *maxData, + bool validMinMax, + bool checkOverwrite); + +extern void vci_WriteOneExtent(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int extentId, + TransactionId xgen, /* xgen in extent info */ + TransactionId xdel, /* xdel in extent info */ + TransactionId xid); /* in tuple header */ + +/* columns to fetcher Interface */ +extern void
vci_GetElementPosition(uint32 *offset, + BlockNumber *blockNumberBase, + uint32 *dataOffset, + vci_ColumnRelations *rel, + int32 extentId, + uint32 rowIdInExtent, + Form_pg_attribute attr); + +extern PGDLLEXPORT void vci_GetChunkPositionAndSize(uint32 *offset, + Size *totalSize, + BlockNumber *blockNumberBase, + uint32 *dataOffset, + vci_ColumnRelations *rel, + int32 extentId, + uint32 rowIdInExtent, + int32 numUnit, + Form_pg_attribute attr); + +extern uint16 + vci_GetFixedColumnSize(vci_MainRelHeaderInfo *info, int16 columnId); +extern void + vci_GetPositionForFixedColumn(BlockNumber *blockNumber, + uint32 *offset, + vci_MainRelHeaderInfo *info, + int16 columnId, + int32 extentId, + uint32 rowIdInExtent, + bool atEnd); + +extern PGDLLEXPORT void + vci_InitializeDictInfo(vci_DictInfo *dictInfo); + +/* *************************** + * Min-Max info + * *************************** + */ + +static inline void +vci_Initvci_ColumnRelations(vci_ColumnRelations *rel) +{ + rel->meta = NULL; + rel->data = NULL; +} + +/* function to write meta data header + * arguments + * Relation relMeta + * Buffer buffer + */ +static inline void +vci_WriteColumnMetaDataHeader(Relation relMeta, + Buffer buffer) +{ + vci_WriteOneItemPage(relMeta, buffer); +} + +#endif /* VCI_COLUMNS_H */ diff --git a/contrib/vci/include/vci_columns_data.h b/contrib/vci/include/vci_columns_data.h new file mode 100644 index 000000000000..d21c0a150aab --- /dev/null +++ b/contrib/vci/include/vci_columns_data.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * vci_columns_data.h + * Declarations of functions to check which columns are indexed.
+ * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_columns_data.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_COLUMNS_DATA_H +#define VCI_COLUMNS_DATA_H + +#include "access/tupdesc.h" +#include "access/attnum.h" +#include "nodes/bitmapset.h" +#include "storage/lock.h" +#include "utils/palloc.h" +#include "utils/rel.h" + +#include "vci_ros.h" + +extern TupleDesc vci_ExtractColumnDataUsingIds(const char *vci_column_ids, Relation heapRel); +extern PGDLLEXPORT TupleDesc vci_GetTupleDescr(vci_MainRelHeaderInfo *info); +extern Bitmapset *vci_MakeIndexedColumnBitmap(Oid mainRelationOid, MemoryContext sharedMemCtx, LOCKMODE lockmode); +extern Bitmapset *vci_MakeDroppedColumnBitmap(Relation indexRel); +extern char *vci_ConvertAttidBitmap2String(Bitmapset *attid_bitmap); +extern AttrNumber vci_GetAttNum(TupleDesc desc, const char *name); + +#endif /* VCI_COLUMNS_DATA_H */ diff --git a/contrib/vci/include/vci_executor.h b/contrib/vci/include/vci_executor.h new file mode 100644 index 000000000000..93e15fd0c3d5 --- /dev/null +++ b/contrib/vci/include/vci_executor.h @@ -0,0 +1,893 @@ +/*------------------------------------------------------------------------- + * vci_executor.h + * Definitions and declarations about executor modules + * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_executor.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_EXECUTOR_H +#define VCI_EXECUTOR_H + +#include "postgres.h" + +#include "access/htup.h" +#include "access/tupdesc.h" +#include "executor/execdesc.h" +#include "executor/execExpr.h" +#include "executor/instrument.h" +#include "nodes/bitmapset.h" +#include "nodes/execnodes.h" +#include "nodes/extensible.h" +#include "nodes/nodes.h" +#include "nodes/parsenodes.h" +#include 
"nodes/plannodes.h" +#include "nodes/pathnodes.h" +#include "storage/buffile.h" + +#include "vci_fetch.h" + +struct VciAgg; +struct VciAggState; + +/* + * MemoryContext size used during query execution + */ +#define VCI_ALLOCSET_DEFAULT_MINSIZE (0) +#define VCI_ALLOCSET_DEFAULT_INITSIZE ( 8 * 1024 * 1024) +#define VCI_ALLOCSET_DEFAULT_MAXSIZE (512 * 1024 * 1024) + +/** + * Maximum number of fetch rows specified in vci_CSCreateFetchContext() + */ +#define VCI_NUM_ROWS_READ_AT_ONCE (32 * 1024) + +/** + * Maximum number of rows to fetch at one time specified in vci_CSGetSkipFromVirtualTuples() + */ +#define VCI_MAX_FETCHING_ROWS (128) + +/** + * Number of slot to allocate for Skip List + */ +#define VCI_MAX_SKIP_LIST_SLOTS (VCI_MAX_FETCHING_ROWS + 1) + +/** + * Initial number of element in plan_info_map[] + */ +#define VCI_INIT_PLAN_INFO_ENTRIES (16) + +struct ExplainState; +struct VciScanState; +struct VciVPContext; +struct VciVPNode; +struct VciScalarArrayOpExprHashTable; + +/** + * Column store fetch management information per VCI Scan + * + * @note This struct is instantiated on SMC + */ +typedef struct +{ + /** + * The fetch context created by vci_CSCreateFetchContext() in the backend + * is recorded in the fetch_context member of VciScanState as the master. + * However, it is also recorded in this member variable so that it can be referenced from parallel workers. + */ + vci_CSFetchContext fetch_context; + + /** + * Pointer to VCI Scan State for reading VCI index (referenced only when abort) + * Used only on backend side. Reading it on the parallel worker side results in dangling pointer + */ + struct VciScanState *scanstate; + +} vci_fetch_placeholder_t; + +/** + * Column store fetch management information for each VCI index + * + * + * @note This struct instance is taken on SMC. 
+ */ +typedef struct +{ + Oid indexoid; /* OID of VCI index */ + Bitmapset *attr_used; /* Bitmap indicating the column position + * referenced in VCI index */ + int num_fetches; /* Number of VCI Scan that refer to VCI index + * of indexoid */ + + vci_CSQueryContext query_context; /* Column Store Query Context */ + vci_local_ros_t *volatile local_ros; /* Pointer to Local ROS */ + + vci_fetch_placeholder_t *fetch_ph_table; /* Pointer to + * vci_fetch_placeholder_t + * array struct. Number of + * elements in the array is + * num_fetches */ + +} vci_index_placeholder_t; + +/** + * Data struct that records the correspondence between Plan State and Plan in query + * + * - Plan is on SMC and is common between backend processes and parallel workers. + * - Plan State refers to data in the local memory of the backend process. + * + * @note This struct instance is taken on SMC. + */ +typedef struct +{ + Plan *plan; /* plan (on SMC) */ + PlanState *planstate; /* PlanState on backend side */ + Instrumentation instrument; /* Instrumentation for aggregating + * Instrumentation of parallel workers during + * parallel execution */ +} vci_plan_info_t; + +/** + * Column store fetch management information for each query + */ +typedef struct vci_query_context +{ + /** + * Memory context for allocating memory related to Column Store Fetch + * + * - Expect it to be SMC + * - vci_query_context is also instantiated in mcontext + */ + MemoryContext mcontext; + + /** + * Used to handle contention when writing data in the vci_query_context + * struct from a parallel worker. + */ + LWLock *lock; + + /** + * Number of VCI indexes referenced in query + */ + int num_indexes; + + /** + * Array into placeholder for VCI index referenced in this query + * The size is num_indexes.
+ */ + vci_index_placeholder_t *index_ph_table; + + /** + * True if execution stopped in the middle of custom plan execution + */ + bool has_stopped; + + /** + * planned stmt rewritten into VCI plan + */ + PlannedStmt *plannedstmt; + + /** + * Original planned stmt before rewrite. + * Used when custom plan execution is canceled. + */ + PlannedStmt *origplannedstmt; + + /** + * Maximum number of elements allocated for plan_info_map[] + */ + int max_plan_info_entries; + + /** + * Array containing all Plan and PlanState pairs on backend side + * Accessed by plan_info_map[plan->plan_no - 1] + * + * Used to find PlanState corresponding to plan in + * vci_exec_set_param_plan_as_proxy(). + */ + vci_plan_info_t *plan_info_map; + +} vci_query_context_t; + +/** + * Pointer to column store fetch management object for each query + */ +extern vci_query_context_t *vci_query_context; + +/* ---------------- + * Vector processing + * ---------------- + */ + +/** + * ExprState number in VciVPContext + */ +typedef unsigned int vci_vp_item_id; + +/** + * Template of function pointer for Vector Processing + */ +typedef void (*VciVPExecOp_func) (Expr *expression, struct VciVPNode *vpnode, struct VciVPContext *vpcontext, ExprContext *econtext, int max_slots); + +/** + * Vector Processing's node + * + * Converted from Expression state node. + */ +typedef struct VciVPNode +{ + VciVPExecOp_func evalfunc; /* Function to process this VP node */ + Expr *expr; + int len_args; /* Max number of elements in arg_items[] */ + vci_vp_item_id *arg_items; /* Item number of the child VP node of this VP + * node */ + + Datum *itemValue; /* Datum array that records this VP node + * process result. Number of elements + * allocated is VCI_MAX_FETCHING_ROWS. */ + bool *itemIsNull; /* bool array that records this VP node + * process result. Number of elements + * allocated is VCI_MAX_FETCHING_ROWS.
*/ + uint16 *skip_list; /* Skip list used during this VP node process */ + + /** Auxiliary information for some VP node types */ + union + { + /** Original skip list configured on the control VP node */ + struct + { + uint16 *orig_skip_list; + } init; + + /** Used as storage location for intermediate data during processing of VP nodes based on T_CoerceToDomain */ + struct + { + Oid resulttype; + char *name; + } coerce_to_domain; + + struct + { + int paramid; /* numeric ID for parameter */ + Oid paramtype; /* OID of parameter's datatype */ + Plan *vci_parent_plan; + } param; + + struct + { + FmgrInfo *finfo; /* function's lookup data */ + FunctionCallInfo fcinfo_data; /* arguments etc */ + /* faster to access without additional indirection: */ + PGFunction fn_addr; /* actual call address */ + int nargs; /* number of arguments */ + } func; + + struct + { + /* element_type/typlen/typbyval/typalign are filled at runtime */ + Oid element_type; /* InvalidOid if not yet filled */ + bool useOr; /* use OR or AND semantics?
*/ + int16 typlen; /* array element type storage info */ + bool typbyval; + char typalign; + FmgrInfo *finfo; /* function's lookup data */ + FunctionCallInfo fcinfo_data; /* arguments etc */ + /* faster to access without additional indirection: */ + PGFunction fn_addr; /* actual call address */ + } scalararrayop; + + struct + { + bool has_nulls; + struct VciScalarArrayOpExprHashTable *elements_tab; + FmgrInfo *finfo; /* function's lookup data */ + FunctionCallInfo fcinfo_data; /* arguments etc */ + /* faster to access without additional indirection: */ + PGFunction fn_addr; /* actual call address */ + FmgrInfo *hash_finfo; /* function's lookup data */ + FunctionCallInfo hash_fcinfo_data; /* arguments etc */ + /* faster to access without additional indirection: */ + PGFunction hash_fn_addr; /* actual call address */ + } hashedscalararrayop; + + struct + { + /* lookup and call info for source type's output function */ + FmgrInfo *finfo_out; + /* lookup and call info for result type's input function */ + FmgrInfo *finfo_in; + Oid typioparam; + + /* + * Below ones used in OSS are not required for VCI as these + * information will be filled by InitFunctionCallInfoData in eval + * execute function itself FunctionCallInfo fcinfo_data_out; + * FunctionCallInfo fcinfo_data_in; + */ + } iocoerce; + + } data; +} VciVPNode; + +/** + * Vector processing context + * + * Converted from Expression tree. 
+ */ +typedef struct VciVPContext +{ + vci_vp_item_id num_item; /* Currently assigned maximum item number */ + vci_vp_item_id max_item; /* Maximum number of nodes reserved by VP + * context */ + VciVPNode *itemNode; /* Array of VP node */ + + Datum *resultValue; /* Array of Datum that is the final result + * when VP context is processed */ + bool *resultIsNull; /* Array of bool that is the final result when + * VP context is processed */ + + vci_vp_item_id caseValue; /* Temporarily records caseValue during + * execution of VciExecEvalVectorProcessing() */ + vci_vp_item_id domainValue; /* Temporarily records domainValue during + * execution of VciExecEvalVectorProcessing() */ + +} VciVPContext; + +extern void VciExecEvalVectorProcessing(VciVPContext *vpcontext, ExprContext *econtext, int max_slots); +extern VciVPContext *VciBuildVectorProcessing(Expr *node, PlanState *parent, ExprContext *econtext, uint16 *skip_list); + +/* ---------------- + * Projection information for VCI + * ---------------- + */ + +/** + * Data struct that records how each target in the target list was processed in VciProjectionInfo + */ +typedef struct VciProjectionInfoSlot +{ + bool is_simple_var; + + union + { + /* Record here if is_simple_var is true */ + struct + { + Index relid; /* Copy varno value of Var */ + AttrNumber attno; /* Copy varattno value of Var */ + } simple_var; + + /* Record here if is_simple_var is false */ + struct + { + int expr_id; /* Converted to pi_vp_tle_array[expr_id] + * in VciProjectionInfo */ + } expr; + } data; +} VciProjectionInfoSlot; + +/** + * ProjectionInfo for VCI + * + * The exprlist in ProjectionInfo is an array of VciVPContext pointers for vector processing. 
+ * + * @note The ProjectionInfo type in PostgreSQL and the VciProjectionInfo type in VCI are almost identical, + * but the former loses information about which position in the original target list the simple_var and pi_targetlist were + * created from, while the latter manages this information using pi_slotMap. + */ +typedef struct VciProjectionInfo +{ + /* instructions to evaluate projection */ + ExprState pi_state; + TargetEntry **pi_tle_array; /* Array of expression state tree under + * TargetEntry that was converted */ + VciVPContext **pi_vp_tle_array; /* Array of VP context */ + int pi_tle_array_len; /* Maximum number of element of + * pi_vp_tle_array[] */ + ExprContext *pi_exprContext; /* Execute context for executing this + * VciProjectionInfo */ + TupleTableSlot *pi_slot; /* TupleTableSlot that contains this + * VciProjectionInfo result */ + bool pi_directMap; + int pi_numSimpleVars; /* Number of Simple Vars */ + int *pi_varSlotOffsets; /* Pointer of mapping information used by + * Simple Vars */ + int *pi_varNumbers; /* Pointer of mapping information used by + * Simple Vars */ + int *pi_varOutputCols; /* Pointer of mapping information used by + * Simple Vars */ + VciProjectionInfoSlot *pi_slotMap; /* Map information that records + * whether each target list was + * converted to SimpleVar or VP + * context. */ + int pi_lastInnerVar; + int pi_lastOuterVar; + int pi_lastScanVar; +} VciProjectionInfo; + +/* ---------------- + * VCI Scan/Sort/Agg Common Definitions + * ---------------- + */ + +/* + * Macros specified in flags of CustomScan and CustomScanState + */ +#define VCI_CUSTOMPLAN_MASK (0x00F0) +#define VCI_CUSTOMPLAN_SCAN (0x0010) +#define VCI_CUSTOMPLAN_SORT (0x0020) +#define VCI_CUSTOMPLAN_AGG (0x0030) +#define VCI_CUSTOMPLAN_GATHER (0x0060) + +/** + * VCI based Plan node + */ +typedef struct VciPlan +{ + CustomScan cscan; /* Base class CustomScan */ + + /* + * The following parameters are set by the (sequential) scheduler. 
+ */ + int preset_eflags; /* eflags precalculated for parallel + * scheduling */ + + AttrNumber scan_plan_no; /* Plan Number for VCI Scan that becomes a + * partitioned table */ + + /** Cache of vci_search_scan() result */ + struct VciScan *scan_cached; + + /** Plan to be rewritten. Becomes NULL when copyObject() is called */ + Plan *orig_plan; +} VciPlan; + +/** + * VCI based Plan State node + */ +typedef struct VciPlanState +{ + CustomScanState css; /* Base class CustomScanState */ + + /** Cache of vci_search_scan_state() result */ + struct VciScanState *scanstate_cached; + +} VciPlanState; + +/** + * VCI Scan node + */ +typedef struct VciScan +{ + VciPlan vci; /* Base class VCI Plan */ + + VciScanMode scan_mode; + + Index scanrelid; /* relid of table to be scanned */ + Oid reloid; /* OID of table to be scanned */ + Oid indexoid; /* OID of VCI index that actually reads data */ + Bitmapset *attr_used; /* Bitmap of columns (attributes) to scan */ + int num_attr_used; /* Number of columns to scan */ + bool is_all_simple_vars; /* Target list is configured with + * SimpleVar */ + double estimate_tuples; /* Estimated number of rows in the scanned + * table */ + bool is_subextent_grain; /* Execute sub-extent fine-grained + * parallelization or not */ + Index index_ph_id; /* index_ph_table[index_ph_id-1] of + * vci_query_context_t */ + Index fetch_ph_id; /* index_ph_table[index_ph_id-1].fetch_ph_table[fetch_ph_id-1] + * of vci_query_context_t */ +} VciScan; + +/** + * VCI Scan State node + */ +typedef struct VciScanState +{ + VciPlanState vci; /* Base class VCI Plan State */ + + bool is_subextent_grain; /* Execute sub-extent fine-grained + * parallelization or not */ + + /* + * Column store fetch setting + */ + vci_CSFetchContext fetch_context; /* Columnar fetch context (master) */ + vci_CSFetchContext local_fetch_context; /* Columnar fetch context (local + * to each process) */ + vci_extent_status_t *extent_status; /* extent information */ + vci_virtual_tuples_t *vector_set;
/* vector set */ + + AttrNumber last_attr; /* Biggest Attr Number */ + int *attr_map; /* Map that obtains column store fetch id + * from Attr Number */ + + int32 first_extent_id; /* Extent number that starts reading */ + int32 last_extent_id; /* Extent number that finishes reading + * (exclusive) */ + int64 first_crid; /* CRID that starts read */ + int64 last_crid; /* CRID that finishes read (exclusive) */ + + /* + * The following are read and written during column store fetch execution. + */ + + /** + * true when the first column store fetch is executed + * + * Set to false before executing column store fetch + */ + bool first_fetch; + + VciFetchPos pos; /* Current column store fetch location */ + VciFetchPos mark; /* Column store fetch location recorded in + * mergr */ + + VciVPContext *vp_qual; /* VP context converted from qual */ + + VciProjectionInfo *vps_ProjInfo; /* when generating output with non-VP */ + + /* + * The result of vector processing will be recorded in + * result_values[resind][i] and result_isnull[resind][i].
With resind is + * order of target list and i is number in vector + */ + Datum **result_values; /** Process result after Vector processing (value information) */ + bool **result_isnull; /** Process result after Vector processing (NULL information) */ + + /** + * Number of Vector processing context + */ + int num_vp_targets; + + /** + * Arrays to pointer to Vector processing context + */ + VciVPContext **vp_targets; + + /*** + * true when parallel worker receives NULL + */ + bool scan_done; + +} VciScanState; + +/** + * VCI Sort node + */ +typedef struct VciSort +{ + VciPlan vci; /* Base class VCI Plan */ + + int numCols; /* number of sort-key columns */ + AttrNumber *sortColIdx; /* their indexes in the target list */ + Oid *sortOperators; /* OIDs of operators to sort them by */ + Oid *collations; /* OIDs of collations */ + bool *nullsFirst; /* NULLS FIRST/LAST directions */ +} VciSort; + +/** + * VCI Sort State node + */ +typedef struct VciSortState +{ + VciPlanState vci; /* Base class VCI Plan State */ + + bool randomAccess; /* need random access to sort output? */ + bool bounded; /* is the result set bounded? */ + int64 bound; /* if bounded, how many tuples are needed */ + bool sort_Done; /* sort completed yet? 
*/ + bool bounded_Done; /* value of bounded we did the sort with */ + int64 bound_Done; /* value of bound we did the sort with */ + void *tuplesortstate; /* private state of tuplesort.c */ + + ScanDirection saved_dir; /* area to store estate->es_direction */ +} VciSortState; + +/** + * VCI Agg node + */ +typedef struct VciAgg +{ + VciPlan vci; /* Base class VCI Plan */ + + AggStrategy aggstrategy; + int numCols; /* number of grouping columns */ + AttrNumber *grpColIdx; /* their indexes in the target list */ + Oid *grpOperators; /* equality operators to compare with */ + Oid *grpCollations; + int64 numGroups; /* estimated number of groups in input */ +} VciAgg; + +typedef struct VciAggStatePerAggData *VciAggStatePerAgg; +typedef struct VciAggStatePerGroupData *VciAggStatePerGroup; + +/** + * VCI Agg State node + */ +typedef struct VciAggState +{ + VciPlanState vci; /* Base class VCI Plan State */ + + bool enable_vp; /* Is vector processing possible or not */ + + VciProjectionInfo *vps_ProjInfo; /* ProjectionInfo when generating Agg + * State output */ + + List *aggs; /* all Aggref nodes in targetlist & quals */ + int numaggs; /* length of list (could be zero!)
*/ + Oid *eqfuncoids; /* per-grouping-field equality fn oids */ + ExprState **eqfunctions; /* expression returning equality */ + FmgrInfo *hashfunctions; /* per-grouping-field hash fns */ + VciAggStatePerAgg peragg; /* per-Aggref information */ + MemoryContext aggcontext; /* memory context for long-lived data */ + ExprContext *tmpcontext; /* econtext for input expressions */ + bool agg_done; /* indicates completion of Agg scan */ + /* these fields are used in AGG_PLAIN and AGG_SORTED modes: */ + VciAggStatePerGroup pergroup; /* per-Aggref-per-group working state */ + HeapTuple grp_firstTuple; /* copy of first tuple of current group */ + /* these fields are used in AGG_HASHED mode: */ + TupleTableSlot *hashslot; /* slot for loading hash table */ + TupleHashTable hashtable; /* hash table with one entry per group */ + int last_hash_column; + int *hash_needed; /* array of columns needed in hash table */ + int num_hash_needed; /* number of columns needed in hash table */ + Datum **hash_input_values; /* array of pointers to datum vector for + * each hash key */ + bool **hash_input_isnull; /* array of pointers to null vector for + * each hash key */ + bool table_filled; /* hash table filled yet? */ + TupleHashIterator hashiter; /* for iterating through hash table */ + + /* + * aggregation function changes its behaviour by checking AggState + * Therefore, ExecEvalExpr() shows dummy AggState, not VciAggState + */ + AggState *pseudo_aggstate; + + /** + * Record VciAggHashEntry before copying to SMC, in case a parallel worker + * encounters an out-of-memory error in SMC. + * Usually set to NULL. + */ + volatile TupleHashEntry saved_entry; + + /** + * Similar to saved_entry, but only records the first HeapTuple of + * each group in plain/sorted aggregation + * Usually set to NULL.
+ */ + volatile HeapTuple saved_grp_firstTuple; + +} VciAggState; + +typedef void (*VciAdvanceAggref_Func) (VciAggState *, int, VciAggStatePerGroup *, int); + +extern VciAdvanceAggref_Func VciGetSpecialAdvanceAggrefFunc(VciAggStatePerAgg peraggstate); + +/* ---------------- + * VCI Gather information + * ---------------- + */ + +typedef struct VciGather +{ + VciPlan vci; + +} VciGather; + +typedef struct VciGatherState +{ + VciPlanState vci; +} VciGatherState; + +/* ---------------- + * VCI Var State + * ---------------- + */ +/** + * Var expression state for VCI + * + * Normally, Var expression is converted to ExprState expression state in ExecInitNode(), + * but in VCI, additional information is required, so a dedicated class is created. + */ +typedef struct VciVarState +{ + ExprState xprstate; /* Base class ExprState */ + VciScanState *scanstate; /* Pointer to VciScanState from which Var will + * load data */ +} VciVarState; + +/** + * Param expression state for VCI + * + * Normally, Param expression is converted to ExprState expression state in ExecInitNode(), + * but in VCI, additional information is required, so a dedicated class is created.
+ */ +typedef struct VciParamState +{ + ExprState xprstate; /* Base class ExprState */ + Plan *plan; /* the plan to hold this Param */ + +} VciParamState; + +extern CustomScanMethods vci_scan_scan_methods; +extern CustomExecMethods vci_scan_exec_column_store_methods; +extern CustomScanMethods vci_sort_scan_methods; +extern CustomExecMethods vci_sort_exec_methods; +extern CustomScanMethods vci_agg_scan_methods; +extern CustomExecMethods vci_agg_exec_methods; +extern CustomScanMethods vci_hashagg_scan_methods; +extern CustomExecMethods vci_hashagg_exec_methods; +extern CustomScanMethods vci_groupagg_scan_methods; +extern CustomExecMethods vci_groupagg_exec_methods; +extern CustomScanMethods vci_gather_scan_methods; +extern CustomExecMethods vci_gather_exec_methods; + +/* ---------------- + * vci_executor.c + * ---------------- + */ + +/** + * Enum that specifies how Var is handled in ExecInitNode() + */ +typedef enum vci_initexpr +{ + VCI_INIT_EXPR_NONE, + + /** Var converts to ExprState like original */ + VCI_INIT_EXPR_NORMAL, + + /** Var converts to VciVarState */ + VCI_INIT_EXPR_FETCHING_COLUMN_STORE, + + /** Var converts to VciVarState, but Aggref and later convert to ExprState like original */ + VCI_INIT_EXPR_FETCHING_COLUMN_STORE_AGGREF, +} vci_initexpr_t; + +extern ExprState *VciExecInitQual(List *qual, PlanState *parent, vci_initexpr_t inittype); +extern TupleTableSlot *VciExecProject(VciProjectionInfo *projInfo); + +extern VciProjectionInfo *VciExecBuildProjectionInfo(List *targetList, + ExprContext *econtext, + TupleTableSlot *slot, + PlanState *parent, + TupleDesc inputDesc); + +/* ---------------- + * vci_planner.c + * ---------------- + */ +extern bool vci_is_supported_jointype(JoinType jointype); + +/* ---------------- + * vci_plan.c + * ---------------- + */ + +extern bool vci_is_custom_plan(Plan *plan); +extern int vci_get_vci_plan_type(Plan *plan); +extern void vci_copy_plan(VciPlan *dest, const VciPlan *src); +extern struct VciScan
*vci_search_scan(VciPlan *); +extern struct VciScanState *vci_search_scan_state(VciPlanState *); +extern List *vci_generate_pass_through_target_list(List *targetlist); + +/* ---------------- + * vci_plan_func.c + * ---------------- + */ + +struct QueryDesc; + +/** + * Callback to notify plan_id before analyzing topmost plan + * (top of main plan tree and each subplan tree) in vci_plannedstmt_tree_walker() + * or vci_plannedstmt_tree_mutator() analyze. + */ +typedef void (*vci_topmost_plan_cb_t) (Plan *, int plan_id, void *context); + +/** + * Template for a function pointer passed as a callback to a mutator routine that rewrites a plan. + */ +typedef bool (*vci_mutator_t) (Plan **plan_p, Plan *parent, void *context, int eflags, bool *changed); + +extern PGDLLEXPORT bool vci_plannedstmt_tree_walker(PlannedStmt *plannedstmt, bool (*walker) (Plan *, void *), vci_topmost_plan_cb_t topmostplan, void *context); +extern PGDLLEXPORT bool vci_plan_tree_walker(Plan *plan, bool (*walker) (Plan *, void *), void *context); +extern bool vci_expression_walker(Plan *plan, bool (*walker) (Node *, void *), void *context); +extern bool vci_expression_and_colid_walker(Plan *plan, bool (*walker) (Node *, void *), void (*attr_cb) (AttrNumber *, void *), void *context); +extern bool vci_expression_and_initplan_walker(Plan *plan, bool (*walker) (Node *, void *), bool (*walker_initplan) (Node *, void *), void *context); + +extern bool vci_plannedstmt_tree_mutator(PlannedStmt *plannedstmt, vci_mutator_t mutator, vci_topmost_plan_cb_t topmostplan, void *context, int eflags, bool *changed); +extern bool vci_plannedstmt_tree_mutator_order(PlannedStmt *plannedstmt, vci_mutator_t mutator, vci_topmost_plan_cb_t topmostplan, void *context, int eflags, bool *changed, int *subplan_order); +extern bool vci_plan_tree_mutator(Plan **plan_p, Plan *parent, vci_mutator_t mutator, void *context, int eflags, bool *changed); + +/* ---------------- + * vci_scan.c + * ---------------- + */ +extern 
TupleTableSlot *VciExecProcScanTuple(VciScanState *node); +extern int VciExecProcScanVector(VciScanState *scanstate); + +/* ---------------- + * vci_sort.c + * ---------------- + */ +struct Tuplesortstate; + +extern struct Tuplesortstate *vci_sort_exec_top_half(VciSortState *sortstate); +extern void vci_sort_perform_sort(VciSortState *sortstate); + +/* ---------------- + * vci_agg.c + * ---------------- + */ + +extern void vci_agg_fill_hash_table(VciAggState *aggstate); +extern TupleTableSlot *vci_agg_retrieve_hash_table(VciAggState *aggstate); +extern TupleHashEntry vci_agg_find_group_from_hash_table(VciAggState *aggstate); +extern void vci_initialize_aggregates(VciAggState *aggstate, + VciAggStatePerAgg peragg, + VciAggStatePerGroup pergroup); +extern void vci_finalize_aggregate(VciAggState *aggstate, VciAggStatePerAgg peraggstate, VciAggStatePerGroup pergroupstate, Datum *resultVal, bool *resultIsNull); +extern void vci_advance_aggregates(VciAggState *aggstate, VciAggStatePerGroup pergroup); + +/* ---------------- + * vci_aggmergetranstype.c + * ---------------- + */ + +/** + * Template for function pointer for copying Datum + */ +typedef Datum (*VciCopyDatumFunc) (Datum, bool, int); + +extern bool vci_is_supported_aggregation(Aggref *aggref); + +/* ---------------- + * vci_gather.c + * ---------------- + */ + +/* ---------------- + * vci_param.c + * ---------------- + */ +extern void VciExecEvalParamExec(ExprState *exprstate, ExprEvalStep *op, ExprContext *econtext); + +/* ---------------- + * Column store fetching (vci_fetch_column_store.c) + * ---------------- + */ +extern void vci_initialize_query_context(QueryDesc *queryDesc, int eflags); +extern void vci_finalize_query_context(void); +extern void vci_free_query_context(void); +extern bool vci_is_processing_custom_plan(void); + +extern void vci_create_one_fetch_context_for_fetching_column_store(VciScanState *scanstate, ExprContext *econtext); +extern void 
vci_clone_one_fetch_context_for_fetching_column_store(VciScanState *scanstate); +extern void vci_destroy_one_fetch_context_for_fetching_column_store(VciScanState *scanstate); + +extern void vci_set_starting_position_for_fetching_column_store(VciScanState *scanstate, int64 crid, int size); + +extern bool vci_fill_vector_set_from_column_store(VciScanState *scanstate); +extern void vci_mark_pos_vector_set_from_column_store(VciScanState *scanstate); +extern void vci_restr_pos_vector_set_from_column_store(VciScanState *scanstate); +extern void vci_step_next_tuple_from_column_store(VciScanState *scanstate); +extern void vci_finish_vector_set_from_column_store(VciScanState *scanstate); + +extern void VciExecTargetListWithVectorProcessing(VciScanState *scanstate, ExprContext *econtext, int max_slots); +extern void VciExecEvalScalarVarFromColumnStore(ExprState *exprstate, ExprEvalStep *op, ExprContext *econtext); + +/* ---------------- + * vci_planner.c + * ---------------- + */ +extern PlannedStmt *vci_generate_custom_plan(PlannedStmt *src, int eflags, Snapshot snapshot); + +#endif /* VCI_EXECUTOR_H */ diff --git a/contrib/vci/include/vci_fetch.h b/contrib/vci/include/vci_fetch.h new file mode 100644 index 000000000000..60326eed58d0 --- /dev/null +++ b/contrib/vci/include/vci_fetch.h @@ -0,0 +1,1007 @@ +/*------------------------------------------------------------------------- + * + * vci_fetch.h + * Definitions and declarations of Column store fetch + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_fetch.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_FETCH_H +#define VCI_FETCH_H + +#include "postgres.h" + +#include "access/attnum.h" +#include "utils/guc.h" + +#include "vci.h" +#include "vci_columns.h" + +#include "vci_mem.h" +#include "vci_ros.h" + +/* Get function of virtual tuples may used to get the storage area. 
+ * In that case, no rows may stored. + * So, skipping the assertion check by default. + * To use the assertion check, define CHECK_VTUPLE_GET_RANGE. + */ +#define CHECK_VTUPLE_GET_RANGE + +/* + * memory image of data loaded by vci_CSFetchVirtualTuples(). + * The area is allocated by vci_CSCreateVirtualTuples(), and the addresses + * are fixed except for each data in "column N area"s. + * + * ADDRESS CONTENT + * allocated (palloc-ed address) + * (no padding) + * flags (or skip) uint8 of tuple[0] + * (bit 0 is a copy of uint8 of tuple[1] + * delete vector) . + * . + * uint8 of tuple[num_rows_read_at_once-1] + * uint8 of tuple[num_rows_read_at_once] (extra element) + * (no padding) + * isnull bool[0]--bool[num_columns-1] of tuple[0] + * bool[0]--bool[num_columns-1] of tuple[1] + * . + * . + * bool[0]--bool[num_columns-1] of tuple[num_..._once-1] + * (padding if necessary) + * crid (aligned) int64 of tuple[0] + * (when need_crid is int64 of tuple[1] + * true) . + * . + * int64 of tuple[num_rows_read_at_once-1] + * (no padding) + * tid (aligned) int64 of tuple[0] + * (when need_tid is int64 of tuple[1] + * true) . + * . + * . + * int64 of tuple[num_rows_read_at_once-1] + * (no padding) + * values (aligned) Datum[0]--Datum[num_columns-1] of tuple[0] + * Datum[0]--Datum[num_columns-1] of tuple[1] + * . + * . + * Datum[0]--Datum[num_columns-1] of tuple[num_..._once-1] + * (padding if necessary) + * column 0 area aligned data are stored when the element size is + * (aligned) larger than sizeof(Datum). Each datum are pointed + * by Datum[0] of tuples in the upper "data" area. + * The size for the area is calculated using worst + * case size. + * (free space) + * (padding if necessary) + * column 1 area aligned data are stored when the element size is + * (aligned) larger than sizeof(Datum). Each datum are pointed + * by Datum[1] of tuples in the upper "data" area. + * The size for the area is calculated using worst + * case size. + * . + * . 
+ * (free space) + * (padding if necessary) + * column (num_columns-1) area + * (aligned) + * + * + * usage: + * + * ---- in backend process ---- + * vci_CSQueryContext queryContext = vci_CSCreateQueryContext( mainRelationOid, + * numReadColumns, attrNum, sharedMemCtx, recoveryInProgress, estimatingLocalROSSize); + * + * Size localRosSize = vci_CSEstimateLocalRosSize(queryContext); + * if (limitLocalRos <= localRosSize) + * goto PostgreSQLQueryExecution; + * vci_local_ros_t *localRos = vci_CSGenerateLocalRos(queryContext); + * + * vci_CSFetchContext fetchContext = vci_CSCreateFetchContext( queryContext, + * numRowsReadAtOnce, + * numReadColumns, attrNum, + * useColumnStore, + * returnTid, returnCrid); + * Size fetchContextSize = vci_CSGetFetchContextSize(fetchContext); + * if (limitFetchContext <= sumOfFetchContextSize) + * goto PostgreSQLQueryExecution; + * + * ---- in background worker ---- + * int lenVector = vci_CSGetActualNumRowsReadAtOnce(fetchContext); + * vci_CSFetchContext localContext = vci_CSLocalizeFetchContext(fetchContext, memoryContext); + * vci_virtual_tuples_t *vTuples = vci_CSCreateVirtualTuples(localContext); + * + * ** here you can make pointers to vTuples from PostgreSQL virtual tuples.
** + * + * vci_extent_status_t *status = vci_CSCreateCheckExtent(fetchContext); + * + * for (extentID) + * { + * vci_CSCheckExtent(status, localContext, extentId, readMinMax); + * if (status->existence && status->visible) + * { + * ** loop of vectors and rows ** + * ** number of rows in the extent is in status->num_rows ** + * for (vectorID) + * { + * int readableRows = vci_CSFetchVirtualTuples(vTuples, + * vectorID * lenVector, + * lenVector); + * for (idInVector = 0; idInVector < readableRows; ++ idInVector) + * { + * + * ** normal style from here ** + * int8 *flags = vci_CSGetSkipOfVirtualTuple(vTuples); + * if ((* flags) & vcivtf_delete) + * continue; + * + * ** Row wise ** + * Datum *values = vci_CSGetValuesOfVirtualTuple(vTuples, + * idInVector); + * bool *isnull = vci_CSGetIsNullOfVirtualTuple(vTuples, + * idInVector); + * + * ** Column wise ** + * Datum *values = vci_CSGetValuesOfVirtualTupleColumnar(vTuples, + * columnId); + * bool *isnull = vci_CSGetIsNullOfVirtualTupleColumnar(vTuples, + * columnId); + * + * int64 *crid = vci_CSGetCridOfVirtualTuple(vTuples, + * idInVector); + * int64 *tid = vci_CSGetTidOfVirtualTuple(vTuples, + * idInVector); + * UpdateVirtualTupleLinks(); + * EvaluateQualsEtc(); + * ** normal style to here ** + * + * ** if you use fixed linked virtual tuples from here ** + * SelectPostgreSQLVirtualTuple(); + * EvaluateQualsEtc(); + * ** if you use fixed linked virtual tuples to here ** + * + * } + * } + * } + * } + * + * vci_CSDestroyCheckExtent(status); + * vci_CSDestroyVirtualTuples(vTuples); + * vci_CSDestroyFetchContext(localContext); + * + * ---- in backend process ---- + * vci_CSDestroyFetchContext(fetchContext); + * vci_CSDestroyLocalRos(localRos); + * vci_CSDestroyQueryContext(queryContext); + */ + +/** + * @brief Information to fetch data from one relation used in a query. + * + * When multiple relations are used in one query, + * multiple vci_CSQueryContextData should be created.
+ */ +typedef struct vci_CSQueryContextData +{ + /** Number of columns of the relation used in the query. */ + int num_columns; + + /** Attribute number in original PostgreSQL relation. */ + AttrNumber *volatile attr_num; + + /** Column ID in VCI main relation. */ + int16 *volatile column_id; + + /* Number of maximum WOS entries */ + int64 num_data_wos_entries; + + /* Number of maximum whiteout WOS entries */ + int64 num_whiteout_wos_entries; + + /** + * Number of entries in delete_list, just a copy of + * vci_local_ros_t.local_delete_list->num_entry. + */ + int num_delete; + + /** + * Local delete list, containing whiteout WOS. + * CAUTION : THIS POINTER VALUE IS JUST A COPY OF + * vci_local_ros_t.local_delete_list->crid_list. + * NEVER pfree(). + */ + uint64 *delete_list; + + /** + * Number of extents of local ROS. + * To keep the extnets of local ROS at reasonable size, + * they may contain fewer rows than 262,144 rows. + */ + int num_local_ros_extents; + + vci_local_ros_t *local_ros; /* pointer to the local ROS. */ + + /** Number of extents in ROS. */ + int num_ros_extents; + + /** + * Pointer to main relation information. + * The object is allocated in shared_memory_context, + * but info->rel cannot access from other process than that creates + * vci_CSFetchContext. + * In order to access main relation, open using main_relation_oid. + */ + vci_MainRelHeaderInfo *volatile info; + + /** Heap relation indexed by VCI to keep shared lock. */ + volatile Relation heap_rel; + + /** Oid of VCI main relation. */ + Oid main_relation_oid; + + uint32 num_nullable_columns; /* Number of nullable columns */ + uint32 null_width_in_byte; /* Size of null bit vector per row */ + + /** + * ROS version taken from current ROS version or last ROS version. 
+ */ + TransactionId ros_version; + + /** + * @see inclusiveXid of struct vci_RosCommandContext + */ + TransactionId inclusive_xid; + + /** + * @see exclusiveXid of struct vci_RosCommandContext + */ + TransactionId exclusive_xid; + + uint32 tid_crid_diff_sel; /* Selection of TID CRID difference. */ + + /** + * Memory context where all the shared data are allocate, + * including the elements in this sturcture. + */ + MemoryContext shared_memory_context; + + /** lockmode of index relation (main relation) */ + LOCKMODE lockmode; + +} vci_CSQueryContextData; +typedef vci_CSQueryContextData *vci_CSQueryContext; + +/** + * @brief Buffer for decompression, + * + * and concatenate data separated into multiple pages. + */ +typedef struct vci_seq_scan_buffer +{ + int num_buffers; +} vci_seq_scan_buffer_t; + +/** + * @brief Context to fetch vectors. + * + * Vector itself is in vci_virtual_tuples_t, + * and the running parameters are kept in it. + * A master instance of vci_CSFetchContextData is created by backend process, + * then background workers copy to have locally. + * Some member variables in local copy is over-written, marked as + * \b LOCALIZED \b VARIABLE . + */ +typedef struct vci_CSFetchContextData +{ + uint32 size; /* Size of this structure. */ + + int32 extent_id; /* The extent ID of stored virtual tuples. */ + uint16 num_rows; /* Number of stored virtual tuples. */ + + int16 num_columns; /* Number of columns to fetch in this context. */ + + /** + * Number of rows for the context to read at once. + * The fetcher read multiple lines at once and store them into the + * virtual tuple storage. + */ + uint32 num_rows_read_at_once; + + bool use_column_store; /* Store data in columnar style (true) or + * not. */ + + bool need_crid; /* Fetch CRID or not. */ + bool need_tid; /* Fetch TID or not. */ + + /** Used in decompression or data concatenation. 
*/ + vci_seq_scan_buffer_t *buffer; + + /** \b LOCALIZED \b VARIABLE \n + * The ROS data fetched are stored in this context. + * virtual tuple storage is located here. + */ + MemoryContext local_memory_context; + + /** The size of virtual Tuple storage. + * This is sum of size_values, size_flags, and sizes of area pointed by + * vci_virtual_tuples_t->column_info[columnId].al_area. + */ + Size size_vector_memory_context; + + /** area where Datum and pointers are stores */ + Size size_values; + + /** The area where nulls, skip information, local skip information, + * TIDs, CRIDs, dictionaries, compression workarea and temporay + * area for wor-wise mode are placed. + * The amount of dictionary sizes is in size_dictionary_area. + * The workarea size for compression and decompression is in + * size_decompression_area. + */ + Size size_flags; + + /** The memory size for dictionaries + * This is included in size_flags. + */ + Size size_dictionary_area; + + /** Workarea size to decompress one VCI_COMPACTION_UNIT_ROW. + * The size is calculated as + * MAXALIGN(VCI_MAX_PAGE_SPACE * VCI_COMPACTION_UNIT_ROW) + * when size_dictionary_area != 0, or zero. + * This is included in size_flags. + */ + Size size_decompression_area; + + /** The query context this fetch context belongs to. */ + vci_CSQueryContext query_context; + + /** \b LOCALIZED \b VARIABLE \n + * VCI main relation information used in localized fetch. + * Since the file discriptor or Relation structure must be obtained + * in each process, the main relation information also calculated in + * each process. + */ + vci_MainRelHeaderInfo *info; + + /** \b LOCALIZED \b VARIABLE \n + * Relations of the delete vector. + */ + vci_ColumnRelations rel_delete; + + /** \b LOCALIZED \b VARIABLE \n + * Relations of the null bit vector. + */ + vci_ColumnRelations rel_null; + + /** \b LOCALIZED \b VARIABLE \n + * Relations of the TID vector. 
+ */ + vci_ColumnRelations rel_tid; + + /** \b LOCALIZED \b VARIABLE \n + * Pointer to the array of relations of normal columns. + */ + vci_ColumnRelations *rel_column; + + /** + * The column ID translation table. + * Since the column IDs in fetch vector are differ from those of + * VCI main relations, + * we have the translation table from the former to the latter here. + */ + int16 column_link[1]; /* VARIABLE LENGTH ARRAY */ +} vci_CSFetchContextData; /* VARIABLE LENGTH STRUCT */ +typedef vci_CSFetchContextData *vci_CSFetchContext; + +/** + * @brief Structure to keep minimum and maximum value for a column. + */ +typedef struct vci_minmax +{ + bool valid; /* min and max are meaningful (true) or not + * (false). */ + char min[VCI_MAX_MIN_MAX_SIZE]; /* Minimum value. */ + char max[VCI_MAX_MIN_MAX_SIZE]; /* Maximum value. */ +} vci_minmax_t; + +/** + * @brief The extent information which is obtained before fetching the + * extent itself. + * + * It has information of existence, visibility of the extent, + * number of rows in the extent, + * and the minimum and maximum values of the extent. + */ +typedef struct vci_extent_status +{ + uint32 size; /* Size of this structure. */ + uint32 num_rows; /* Number of rows in the extent. */ + bool existence; /* Existence of the extent. */ + bool visible; /* Visibility of the extent. */ + + /** The minimum and the maximum values of columns to be fetched. */ + vci_minmax_t minmax[1]; /* VARIABLE LENGTH ARRAY */ +} vci_extent_status_t; /* VARIABLE LENGTH STRUCT */ + +/** + * @brief The status after reading vector. + */ +typedef enum vci_read_vector_status_t +{ + vcirvs_read_whole, /* Whole the data, that are required, are + * read. */ + vcirvs_out_of_memory, /* Partially read since out of memory. */ + vcirvs_end_of_extent, /* Reaches the end of extent. */ + + /** Failed to read since the parameter is out of range. */ + vcirvs_out_of_range, + + vcirvs_not_visible, /* Failed to read since the extent is + * invisible. 
*/ + vcirvs_not_exist, /* The specified extent is not exists. */ +} vci_read_vector_status_t; + +/** + * @brief Information of a fetched column in virtual tuple. + */ +typedef struct vci_virtual_tuples_column_info +{ + char *area; /* Aligned pointer of al_area. NEVER pfree() */ + + /** Allocated pointer, actual palloced() address is kept. */ + char *al_area; + + int32 null_bit_id; /* Null bit ID in null bit vector. */ + uint32 max_column_size; /* The maximum size of data in the column. */ + + /** true when the value is passed by the pointer (datum by reference). + * false when the value itself is contained in Datum (datum by value). + */ + bool strict_datum_type; + + vcis_compression_type_t comp_type; /* Compression method used. */ + Oid atttypid; /* Type ID of attribute. */ + bool *isnull; /* Pointer to the isnull flag area. */ + Datum *values; /* Pointer to the Datum array area. */ + + /** The information of the dictionary of LZVF compression. */ + vci_DictInfo *dict_info; +} vci_virtual_tuples_column_info_t; + +/** + * @brief Information of virtual tuple, a set of fetched data. + * + * In the form, both colum-wise and row-wise are supported. + */ +typedef struct vci_virtual_tuples +{ + uint32 size; /* Size of this instance. */ + uint16 num_columns; /* Number of columns to store. */ + int32 extent_id; /* The extent ID of stored data. */ + + /** Physically recorded number of rows in the target extent. */ + uint32 num_rows_in_extent; + + /** The row ID in extent of the stored first datum. */ + uint32 row_id_in_extent; + + uint32 num_rows; /* Number of stored rows in this structure. */ + + uint32 buffer_capacity; /* Capacity in unit of rows in this + * structure. */ + + vci_read_vector_status_t status; /* Read status. */ + + /** + * This keeps the position of first tuple of vector, + * since the first virtual tuple of the vector is not always the first + * entry of stored data. 
+ * At present, the upstream users requre that always the first data + * to be placed at the same address, this member variable is always + * set to zero. + */ + uint32 offset_of_first_tuple_of_vector; + + /** + * Number of rows for the context to read at once. + * The fetcher read multiple lines at once and store them into the + * virtual tuple storage. + */ + uint32 num_rows_read_at_once; + + /** The fetch context for this virtual tuple. */ + vci_CSFetchContext fetch_context; + + /** True for store in column-wise style. False for row-wise. */ + bool use_column_store; + + /** + * The size of virtual Tuple storage. + * This is sum of size_values, size_flags, and sizes of area pointed by + * vci_virtual_tuples_t->column_info[columnId].al_area. + */ + Size size_vector_memory_context; + + /** The size of the area where Datum and pointers are stores. */ + Size size_values; + + /** + * The size of the area where nulls, skip information, + * local skip information, TIDs, CRIDs, dictionaries, + * compression workarea and temporay area for wor-wise mode are placed. + * The amount of dictionary sizes is in size_dictionary_area. + * The workarea size for compression / decompression is in + * size_decompression_area. + */ + Size size_flags; + + /** + * The memory size for dictionaries. + * This is included in size_flags. + */ + Size size_dictionary_area; + + /** + * Workarea size to decompress one VCI_COMPACTION_UNIT_ROW. + * The size is calculated as + * MAXALIGN(VCI_MAX_PAGE_SPACE * VCI_COMPACTION_UNIT_ROW) + * when size_dictionary_area != 0, or zero. + * This is included in size_flags. + */ + Size size_decompression_area; + + int64 *crid; /* Aligned pointer to CRID list in al_flags */ + + /** Aligned pointer to TID list in al_flags. + * ItemPointerData are wrtten. + */ + int64 *tid; + + /** Aligned pointer to skip list. */ + uint16 *skip; + + /** Aligned pointer to skip list for local ROS. 
*/ + uint16 *local_skip; + + /** Aligned pointer to the area for isnull of all columns. */ + bool *isnull; + + /** + * In row-wise mode, the vector in local ROS is once built here. + * The area is allocated in local_memory_context. + * The size is + * num_rows_read_at_once * num_columns * (sizeof(Datum) + sizeof(bool)) + */ + char *row_wise_local_ros; + + /** + * Workarea to decompress data. + * Dictionaries follow work_decompression + */ + char *work_decompression; + + /** Aligned pointer to the area for values of all columns in al_values. */ + Datum *values; + + /** Aligned pointer to the area for meta information like skip, TID, + * NULL, and so on. + */ + char *flags; + + char *al_values; /* Allocated pointer for values. */ + char *al_flags; /* Allocated pointer for flags. */ + + /** Array of column informations. */ + vci_virtual_tuples_column_info_t column_info[1]; /* VARIABLE LENGTH ARRAY */ +} vci_virtual_tuples_t; /* VARIABLE LENGTH STRUCT */ + +extern PGDLLEXPORT vci_CSQueryContext vci_CSCreateQueryContextWLockMode(Oid mainRelationOid, + int numReadColumns, + /* attribute number in original relation */ + AttrNumber *attrNum, + MemoryContext sharedMemCtx, + LOCKMODE lockmode); + +/** + * @brief Create query context. + * + * @param[in] mainRelationOid Oid of VCI main relation. + * @param[in] numReadColumns The number of read columns in the part of query. + * @param[in] attrNum The attribute numbers in the original heap relation, + * not those of the VCI main relation. + * @param[in] sharedMemCtx The shared memory context to keep elements of + * query context, fetch context, local ROS. + * @param[in] recoveryInProgress true if recovery is still in progress. + * @param[in] estimatingLocalROSSize true if creating a local ROS. + * @return The pointer to the allocated vci_CSQueryContext. 
+ */ +static inline vci_CSQueryContext +vci_CSCreateQueryContext(Oid mainRelationOid, + int numReadColumns, + AttrNumber *attrNum, + /* attribute number in original relation */ + MemoryContext sharedMemCtx, + bool recoveryInProgress, + bool estimatingLocalROSSize) +{ + /* + * ShareUpdateExclusiveLock is used for creating local ROS. But on the + * standby, AccessShareLock is used because queries on the standby can be + * used only RowExclusiveLock or weaker ones. + */ + LOCKMODE lockmode = (recoveryInProgress || estimatingLocalROSSize) ? AccessShareLock : ShareUpdateExclusiveLock; + + return vci_CSCreateQueryContextWLockMode(mainRelationOid, numReadColumns, + attrNum, sharedMemCtx, lockmode); +} + +extern PGDLLEXPORT void vci_CSDestroyQueryContext(vci_CSQueryContext queryContext); + +/* obtain the worst size of local ROS to be estimated */ +extern Size vci_CSEstimateLocalRosSize(vci_CSQueryContext queryContext); + +extern PGDLLEXPORT vci_local_ros_t *vci_CSGenerateLocalRos(vci_CSQueryContextData *queryContext); + +/** + * @brief Entry point to destroy local ROS. + * + * @param[in] localRos Local ROS to be destroyed. + */ +static inline void +vci_CSDestroyLocalRos(vci_local_ros_t *localRos) +{ + vci_DestroyLocalRos(localRos); +} + +extern PGDLLEXPORT vci_CSFetchContext vci_CSCreateFetchContextBase( + vci_CSQueryContext queryContext, + uint32 numRowsReadAtOnce, + int16 numReadColumns, + /* attribute number in original relation */ + AttrNumber *attrNum, + bool useColumnStore, + bool returnTid, + bool returnCrid, + bool useCompression); + +#define VCI_MAX_NUM_ROW_TO_FETCH (65536 - VCI_COMPACTION_UNIT_ROW) + +/** + * @brief The entry point to the function creating fetch context. 
+ * + * The actual number of rows read at once is quantized + * by VCI_COMPACTION_UNIT_ROW by the formula, + * actualNumRowsReadAtOnce + * = TYPEALIGN(VCI_COMPACTION_UNIT_ROW, numRowsReadAtOnce), + * and since numRowsReadAtOnce is an unsigned 16-bit integer, it must be smaller than + * or equal to VCI_MAX_NUM_ROW_TO_FETCH. Otherwise, it returns NULL. + * + * @param[in] queryContext The query context. + * @param[in] numRowsReadAtOnce The number of rows which are read at once and + * stored in the virtual tuples. + * @param[in] numReadColumns The number of columns to be read. + * @param[in] attrNum The pointer to the array which has the attribute numbers + * of the original heap relation, not VCI main relation. + * @param[in] useColumnStore True for column-wise store. False for row-wise. + * @param[in] returnTid True to get TID in virtual tuples. + * @param[in] returnCrid True to get CRID in virtual tuples. + * @return The pointer to the created fetch context. + * NULL if some parameters are invalid, in which case no fetch context is created.
+ */ +static inline vci_CSFetchContext +vci_CSCreateFetchContext(vci_CSQueryContext queryContext, + uint16 numRowsReadAtOnce, + int16 numReadColumns, + /* attribute number in original relation */ + AttrNumber *attrNum, + bool useColumnStore, + bool returnTid, + bool returnCrid) +{ + return vci_CSCreateFetchContextBase(queryContext, + numRowsReadAtOnce, + numReadColumns, + attrNum, + useColumnStore, + returnTid, + returnCrid, + false); +} + +extern PGDLLEXPORT void vci_CSDestroyFetchContext(vci_CSFetchContext fetchContext); +extern PGDLLEXPORT vci_CSFetchContext vci_CSLocalizeFetchContext( + vci_CSFetchContext fetchContext, + MemoryContext memoryContext); +extern PGDLLEXPORT vci_extent_status_t *vci_CSCreateCheckExtent( + vci_CSFetchContext localContext); +extern PGDLLEXPORT void vci_CSDestroyCheckExtent(vci_extent_status_t *status); +extern PGDLLEXPORT void vci_CSCheckExtent(vci_extent_status_t *status, + vci_CSFetchContext fetchContext, + int32 extentId, + bool readMinMax); + +extern PGDLLEXPORT vci_virtual_tuples_t *vci_CSCreateVirtualTuplesWithNumRows(vci_CSFetchContext fetchContext, uint32 numRows); + +/** + * @brief Create virtual tuples according to the context. + * + * @param[in] localContext The localized fetch context. + * @return The created virtual tuples. + */ +static inline vci_virtual_tuples_t * +vci_CSCreateVirtualTuples(vci_CSFetchContext localContext) +{ + return vci_CSCreateVirtualTuplesWithNumRows(localContext, + localContext->num_rows_read_at_once); +} + +extern PGDLLEXPORT void vci_CSDestroyVirtualTuples(vci_virtual_tuples_t *vTuples); + +/** + * @brief Get the address of the area where Datum of the specified column + * is stored. + * + * At present, the upstream requester requires the start address fixed. + * For better performance, it is better that the start address is modifiable, + * to fetch many rows at once, or to use local ROS directly. + * + * @param[in] vTuples The virtual tuples. + * @param[in] columnId Target column ID. 
+ * @return The pointer to the Datum array. + */ +static inline Datum * +vci_CSGetValueAddrFromVirtualTuplesColumnwise(vci_virtual_tuples_t *vTuples, uint16 columnId) +{ + return vTuples->column_info[columnId].values; +} + +/** + * @brief Get the address of the area where isnull of the specified column + * is stored. + * + * At present, the upstream requester requires the start address fixed. + * For better performance, it is better that the start address is modifiable, + * to fetch many rows at once, or to use local ROS directly. + * + * @param[in] vTuples The virtual tuples. + * @param[in] columnId Target column ID. + * @return The pointer to the bool array. + */ +static inline bool * +vci_CSGetIsNullAddrFromVirtualTuplesColumnwise(vci_virtual_tuples_t *vTuples, uint16 columnId) +{ + return vTuples->column_info[columnId].isnull; +} + +/** + * @brief Get the address of the skip information of the specified column + * is stored. + * + * @param[in] vTuples The virtual tuples. + * @return The pointer to the skip information array. + */ +static inline uint16 * +vci_CSGetSkipAddrFromVirtualTuples(vci_virtual_tuples_t *vTuples) +{ + return vTuples->skip; +} + +/** + * @brief Get the vector of specified skip information. + * + * @param[in] vTuples The virtual tuples. + * @return The pointer to the array of skip information. + * + * @note The instrtuction is the same as + * vci_CSGetValuesOfVirtualTupleColumnar(). + */ +static inline uint16 * +vci_CSGetSkipFromVirtualTuples(vci_virtual_tuples_t *vTuples) +{ +#ifdef CHECK_VTUPLE_GET_RANGE + Assert((0 <= vTuples->offset_of_first_tuple_of_vector) && + (vTuples->offset_of_first_tuple_of_vector < vTuples->num_rows)); +#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */ + + return &(vTuples->skip[vTuples->offset_of_first_tuple_of_vector]); +} + +/** + * @brief Get the vector of TID. + * + * @param[in] vTuples The virtual tuples. + * @return The pointer to the array of TID information in int64* form. 
+ * + * @note This function is available when the fetch context is created + * with the option returnTid is true. + * This function can be available independent of useColumnStore option. + */ +/* Cast please */ +static inline int64 * +vci_CSGetTidFromVirtualTuples(vci_virtual_tuples_t *vTuples) +{ +#ifdef CHECK_VTUPLE_GET_RANGE + Assert((0 <= vTuples->offset_of_first_tuple_of_vector) && + (vTuples->offset_of_first_tuple_of_vector < vTuples->num_rows)); +#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */ + + return &(vTuples->tid[vTuples->offset_of_first_tuple_of_vector]); +} + +/** + * @brief Get the TID of specified tuple. + * + * @param[in] vTuples The virtual tuples. + * @param[in] offsetInVector offset in the vector. + * @return TID information. + * + * @note The instruction is the same as vci_GetTidFromVirtualTuples(). + */ +#ifdef __s390x__ +static inline ItemPointerData +vci_CSGetTidInItemPointerFromVirtualTuples(vci_virtual_tuples_t *vTuples, + int offsetInVector) +{ + ItemPointerData ipd; + int64 result = (vci_CSGetTidFromVirtualTuples(vTuples))[offsetInVector]; +#ifdef WORDS_BIGENDIAN + result = result << 16; +#else +#endif + ipd = *((ItemPointer) &result); + return ipd; +} +#else +static inline ItemPointer +vci_CSGetTidInItemPointerFromVirtualTuples(vci_virtual_tuples_t *vTuples, + int offsetInVector) +{ + return (ItemPointer) &(vci_CSGetTidFromVirtualTuples(vTuples) + [offsetInVector]); +} +#endif + +extern PGDLLEXPORT int vci_CSFetchVirtualTuples(vci_virtual_tuples_t *vTuples, + int64 cridStart, + uint32 numReadRows); + +/** + * @brief Get the tuple specified. + * + * @param[in] vTuples The virtual tuples. + * @param[in] offsetInVector offset in the vector. + * @return The pointer to the array of Datum. + * + * @note This function can be used when the fetch context is created in + * row-wise mode, i.e. useColumnStore = false. + * The column fetcher is read rows in unit of VCI_COMPACTION_UNIT_ROW. 
+ * Therefore, at the start address of the buffer does not always have + * the specified data. + * The specified data is pointed by the offset of + * vTuples->offset_of_first_tuple_of_vector, actually. + * To have the data at the start address, always read rows of multiples + * of VCI_COMPACTION_UNIT_ROW at once. + * For example, when VCI_COMPACTION_UNIT_ROW = 128, then + * read 128 rows at once from the row ID in the extent, 0, 128, 256, 384, .... + * Or, read 256 rows at once from the row ID in the extent, 0, 256, 512, ... + */ +static inline Datum * +vci_CSGetValuesOfVirtualTuple(vci_virtual_tuples_t *vTuples, + uint32 offsetInVector) +{ + offsetInVector += vTuples->offset_of_first_tuple_of_vector; + +#ifdef CHECK_VTUPLE_GET_RANGE + Assert(!vTuples->use_column_store); + Assert((0 <= offsetInVector) && (offsetInVector < vTuples->num_rows)); +#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */ + + return &(vTuples->values[vTuples->num_columns * offsetInVector]); +} + +/** + * @brief Get the isnull of specified tuple. + * + * @param[in] vTuples The virtual tuples. + * @param[in] offsetInVector offset in the vector. + * @return The pointer to the array of bool. + * + * @note See instruction of vci_CSGetValuesOfVirtualTuple(). + */ +static inline bool * +vci_CSGetIsNullOfVirtualTuple(vci_virtual_tuples_t *vTuples, + int32 offsetInVector) +{ + offsetInVector += vTuples->offset_of_first_tuple_of_vector; + +#ifdef CHECK_VTUPLE_GET_RANGE + Assert(!vTuples->use_column_store); + Assert((0 <= offsetInVector) && ((uint32) offsetInVector < vTuples->num_rows)); +#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */ + + return &(vTuples->isnull[vTuples->num_columns * offsetInVector]); +} + +/** + * @brief Get the vector of specified column data. + * + * @param[in] vTuples The virtual tuples. + * @param[in] columnId The column ID. + * @return The pointer to the array of Datum. + * + * @note This function can be used when the fetch context is created in + * column-wise mode, i.e. 
useColumnStore = true. + * The other instruction is the same as vci_CSGetValuesOfVirtualTuple(). + */ +static inline Datum * +vci_CSGetValuesOfVirtualTupleColumnar(vci_virtual_tuples_t *vTuples, uint16 columnId) +{ +#ifdef CHECK_VTUPLE_GET_RANGE + Assert(vTuples->use_column_store); + Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < vTuples->num_columns)); +#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */ + + return &(vTuples->column_info[columnId].values + [vTuples->offset_of_first_tuple_of_vector]); +} + +/** + * @brief Get the vector of specified isnull information. + * + * @param[in] vTuples The virtual tuples. + * @param[in] columnId The column ID. + * @return The pointer to the array of bool. + * + * @note The instrtuction is the same as + * vci_CSGetValuesOfVirtualTupleColumnar(). + */ +static inline bool * +vci_CSGetIsNullOfVirtualTupleColumnar(vci_virtual_tuples_t *vTuples, uint16 columnId) +{ +#ifdef CHECK_VTUPLE_GET_RANGE + Assert(vTuples->use_column_store); + Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < vTuples->num_columns)); +#endif /* #ifdef CHECK_VTUPLE_GET_RANGE */ + + return &(vTuples->column_info[columnId].isnull[vTuples->offset_of_first_tuple_of_vector]); +} + +/** + * @brief Obtains the column ID in the VCI main relation from the serial number + * in a set of read columns listed in vci_CSFetchContext. + * + * @param[in] fetchContext The fetch context. + * @param[in] serialNumber The serial number in a set of read columns. + * @return the columnID in the VCI main relation. 
+ */ +static inline int16 +vci_GetColumnIdFromFetchContext(vci_CSFetchContext fetchContext, + int16 serialNumber) +{ + int cId; + + Assert((0 <= serialNumber) && (serialNumber < fetchContext->num_columns)); + cId = fetchContext->column_link[serialNumber]; + Assert((0 <= cId) && (cId < fetchContext->query_context->num_columns)); + + return fetchContext->query_context->column_id[cId]; +} + +extern void vci_FillCridInVirtualTuples(vci_virtual_tuples_t *vTuples); +extern void + vci_FillFixedWidthColumnarFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + int16 columnId, + RosChunkStorage *rosChunkStorage); +extern void + vci_FillVariableWidthColumnarFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + int16 columnId, + RosChunkStorage *rosChunkStorage); +extern int16 *vci_GetNullableColumnIds(vci_virtual_tuples_t *vTuples); + +#endif /* VCI_FETCH_H */ diff --git a/contrib/vci/include/vci_fetch_row_store.h b/contrib/vci/include/vci_fetch_row_store.h new file mode 100644 index 000000000000..841eebd8c78c --- /dev/null +++ b/contrib/vci/include/vci_fetch_row_store.h @@ -0,0 +1,22 @@ +/*------------------------------------------------------------------------- + * vci_fetch_row_store.h + * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_fetch_row_store.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_FETCH_ROW_STORE_H +#define VCI_FETCH_ROW_STORE_H + +#include "access/heapam.h" + +struct VciScanState; + +extern void VciExecAssignScanProjectionInfo(struct VciScanState *node); +extern HeapTuple vci_heap_getnext(struct VciScanState *scanstate, HeapScanDesc scan, ScanDirection direction); + +#endif /* VCI_FETCH_ROW_STORE_H */ diff --git a/contrib/vci/include/vci_freelist.h b/contrib/vci/include/vci_freelist.h new file mode 100644 index 000000000000..8cdfec715ad2 --- /dev/null +++ b/contrib/vci/include/vci_freelist.h @@ -0,0 +1,75 @@ 
+/*------------------------------------------------------------------------- + * + * vci_freelist.h + * Definitions and declarations of Free space link list + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_freelist.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_FREELIST_H +#define VCI_FREELIST_H + +#include "postgres.h" + +#include "vci.h" +#include "vci_columns.h" +#include "vci_ros.h" + +#define VCI_FREESPACE_ITEM_ID FirstOffsetNumber + +typedef struct vcis_free_space +{ + uint32 size; + + vcis_extent_type_t type; + + BlockNumber prev_pos; + + BlockNumber next_pos; +} vcis_free_space_t; + +#define vci_hasFreeLinkNode(freespace) \ + ((vcis_free_space == (freespace)->type) \ + || (vcis_tidcrid_type_pagetag == (freespace)->type)) /* outer parentheses keep the expansion a single operand */ + +extern PGDLLEXPORT vcis_free_space_t *vci_GetFreeSpace(vci_RelationPair *relPair, BlockNumber blk); + +extern int32 vci_MakeFreeSpace(vci_RelationPair *relPair, + BlockNumber startBlockNumber, + BlockNumber *newFSBlockNumber, + vcis_free_space_t *newFS, + bool coalesce); + +extern void vci_AppendFreeSpaceToLinkList(vci_RelationPair *relPair, + BlockNumber startBlockNumber, + BlockNumber prevFreeBlockNumber, + BlockNumber nextFreeBlockNumber, + BlockNumber size); + +extern BlockNumber vci_FindFreeSpaceForExtent(vci_RelationPair *relPair, + BlockNumber requiredSize); + +extern void vci_RemoveFreeSpaceFromLinkList(vci_RelationPair *relPair, + BlockNumber startBlockNumber, + BlockNumber numExtentPages); + +/* *************** */ +/* Recovery */ +/* *************** */ + +extern void vci_InitRecoveryRecordForFreeSpace(vci_MainRelHeaderInfo *info); + +extern void vci_WriteRecoveryRecordForFreeSpace(vci_RelationPair *relPair, + int16 colId, + int16 dictId, + BlockNumber StartBlockNumber, + vcis_free_space_t *FS); + +extern void vci_RecoveryFreeSpace(vci_MainRelHeaderInfo *info, vci_ros_command_t command); + +#endif /*
VCI_FREELIST_H */ diff --git a/contrib/vci/include/vci_mem.h b/contrib/vci/include/vci_mem.h new file mode 100644 index 000000000000..3f455d5b99e8 --- /dev/null +++ b/contrib/vci/include/vci_mem.h @@ -0,0 +1,177 @@ +/*------------------------------------------------------------------------- + * + * vci_mem.h + * Definitions of on-memory structures + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_mem.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_MEM_H +#define VCI_MEM_H + +#include "postgres.h" + +#include <limits.h> + +#include "lib/ilist.h" +#include "portability/instr_time.h" +#include "storage/lwlock.h" +#include "utils/palloc.h" + +#include "vci.h" +#include "vci_ros.h" +#include "vci_memory_entry.h" + +/*------------------------------------------------------------------------- + * START: Copied from include/vci_port.h + *------------------------------------------------------------------------- + */ + +#ifndef VCI_PORT_H +#define VCI_PORT_H + +/* + * key for vci_devload_t + */ +#define VCI_PSEUDO_UNMONITORED_DEVICE "" + +#ifndef WIN32 +#define VCI_PATH_MAX PATH_MAX +#else +#define VCI_PATH_MAX MAX_PATH +#endif + +/* + * Memory entries on each device + * + * * head is the actual list, link is used to track unused entries + */ +typedef struct +{ + dlist_head head; + dlist_node link; +} vci_memory_entry_list_t; + +/* + * IO statistics, mount information, etc. for each device + */ +typedef struct +{ + char devname[VCI_PATH_MAX]; + + vci_memory_entry_list_t *memory_entry_queue; + + /* + * Next position when memory entry would be traced. NULL means there are + * no entries to be seen.
+ */ + dlist_node *memory_entry_pos; +} vci_devload_t; + +#endif /* VCI_PORT_H */ + +/*------------------------------------------------------------------------- + * END: Copied from include/vci_port.h + *------------------------------------------------------------------------- + */ + +typedef struct VciGucStruct +{ + bool have_loaded_postgresql_conf; + + bool enable; + + bool log_query; + + int cost_threshold; + + int table_scan_policy; + + /* GUC parameters read from postgresql.conf */ + int maintenance_work_mem; + int max_devices; /* max device num for storage */ + + /* ROS control worker/daemon */ + int control_max_workers; + int control_naptime; + + /* command thresholds */ + int wosros_conv_threshold; + int cdr_threshold; + + /* for custom plan execution */ + int max_local_ros_size; + + /* for parallel processing */ + int table_rows_threshold; + + bool enable_seqscan; + bool enable_indexscan; + bool enable_bitmapheapscan; + bool enable_sort; + bool enable_hashagg; + bool enable_sortagg; + bool enable_plainagg; + bool enable_hashjoin; + bool enable_nestloop; + + /* GUC parameters for internal use */ + bool enable_ros_control_daemon; + +} VciGucStruct; + +extern PGDLLEXPORT VciGucStruct VciGuc; + +/* + * Data structure on shared memory + * + * The instance would be allocated on the shared memory and can be accessed via + * VciShmemAddr. + */ +typedef struct VciShmemStruct +{ + /* --- ROS Control Daemon --- */ + + /* Array of arguments passed to workers */ + + vci_wosros_conv_worker_arg_t *worker_args_array; + + /** vci_memory_entries_t is defined in vci_memory_entry.h + * That keeps information of VCI indices kept in memory. + * The life is the same with PostgreSQL instance.
+ */ + vci_memory_entries_t *memory_entries; + + dlist_head memory_entry_device_unknown_list; + + /* Standby server controller */ + LWLock *standby_exec_loc; + int num_standby_exec_queries; + + /* IO statistics */ + + vci_devload_t *devload_array; + + vci_memory_entry_list_t *memory_entry_queue_array; + + dlist_head free_memory_entry_queue_list; /**list of memory_entry_queue_array */ + int num_devload_info; /* monitored device numbers + 1(for + * unmonitored devices) */ + int max_devices; /* max device num for storage */ + int translated_dev_pos; /* index of a device VCIs on which is to + * be translated */ + LWLock *io_load_lock; + + /* Additional Lwlocks used by various modules */ + LWLock *vci_memory_entries_lock; + LWLock *vci_query_context_lock; + LWLock *vci_mnt_point2dev_lock; +} VciShmemStruct; + +extern PGDLLEXPORT VciShmemStruct *VciShmemAddr; + +#endif /* VCI_MEM_H */ diff --git a/contrib/vci/include/vci_memory_entry.h b/contrib/vci/include/vci_memory_entry.h new file mode 100644 index 000000000000..7aba17e382d6 --- /dev/null +++ b/contrib/vci/include/vci_memory_entry.h @@ -0,0 +1,118 @@ +/*------------------------------------------------------------------------- + * + * vci_memory_entry.h + * Definitions and declarations of on-memory structures per VCI index + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_memory_entry.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_MEMORY_ENTRY_H +#define VCI_MEMORY_ENTRY_H + +#include "lib/ilist.h" +#include "storage/lwlock.h" + +#include "vci_ros.h" + +/** + * the key when searching a vci_memory_entry_t type value from its set. 
+ */ +typedef struct +{ + Oid oid; /* Oid of VCI main relation */ + Oid dbid; /* Oid of database where the VCI main relation + * belongs */ +} vci_id_t; + +/** + * VCI index placeholder to determine the target of ROS command by ROS daemon + */ +typedef struct +{ + vci_id_t id; /* identifier of vci_memory_entry_t */ + Oid tsid; /* Oid of tablespace where the VCI main relation + * belongs */ + + /** + * If tsid is equal to InvalidOid, the Oid corresponding to default table + * space. Otherwise, this is equal to tsid. + */ + Oid real_tsid; + + /** + * Timestamp used for least recent update. + * We do nothing for the wraparound effect, aka "wraparound failures" in + * the PostgreSQL manual. + */ + int32 time_stamp; + + /** + * flag to force the ROS control daemon to do WOS->ROS conversion + * at next WOS->ROS conversion stage regardless of the WOS size. + * + * This flag is set to true when a local WOS->ROS conversion fails + * on account of out-of-memory error. This flag is set to false when + * WOS->ROS conversion is done. + */ + bool force_next_wosros_conv; + + dlist_node link; /* links of vci indexes on the same device */ + +} vci_memory_entry_t; + +/** + * @brief Contains the pointer to the array of vci_memory_entry_t, + * and a lock. + * + * The lock must be used when the array is exclusively accessed, say + * add / remove entries to / from the array, or so. + * + * The instance of vci_memory_entries_t and the array of entries must + * be allocated in shared memory living throughout the PostgreSQL instance. + */ +typedef struct +{ + /** + * Lock to update member variables of vci_memory_entries_t. + */ + LWLock *lock; + + /** + * Number of allocated vci_memory_entry_t pointed by data[]. + */ + uint32 capacity_hash_entries; + + /** + * Current time stamp value, used for the least-recently-updated method. + * Instances of vci_memory_entry_t have the timestamp of last access, + * for which we ignore the wraparound effect, aka "wraparound failures" in + * the PostgreSQL manual.
+ */ + int32 time_stamp; + + /** + * Pointer to the array of vci_memory_entry_t. + */ + vci_memory_entry_t data[1]; /* VARIABLE LENGTH ARRAY */ + +} vci_memory_entries_t; + +extern Size vci_GetSizeOfMemoryEntries(void); +extern void vci_InitMemoryEntries(void); + +extern void vci_TouchMemoryEntry(vci_id_t *vciid, Oid tsid); +extern bool vci_GetWosRosConvertingVCI(vci_wosros_conv_worker_arg_t *vci_info); +extern void vci_freeMemoryEntry(vci_id_t *vciid); + +extern void vci_update_memoryentry_in_devloadinfo(void); +extern void vci_MoveTranslatedVCI2Tail(void); +extern void vci_ResetDevloadCurrentPos(void); +extern void vci_RemoveMemoryEntryOnDroppedDatabase(void); +extern void vci_SetForceNextWosRosConvFlag(vci_id_t *vciid, bool value); + +#endif /* VCI_MEMORY_ENTRY_H */ diff --git a/contrib/vci/include/vci_planner.h b/contrib/vci/include/vci_planner.h new file mode 100644 index 000000000000..bf2b4c49291c --- /dev/null +++ b/contrib/vci/include/vci_planner.h @@ -0,0 +1,151 @@ +/*------------------------------------------------------------------------- + * + * vci_planner.h + * Data struct definitions needed for analysis to rewrite plans + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_planner.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_PLANNER_H +#define VCI_PLANNER_H + +#include "access/attnum.h" +#include "nodes/execnodes.h" +#include "nodes/plannodes.h" + +/** + * Types for internal use only by planners + * + * NestLoop and HashJoin do not actually replace VCI Plan. 
+ * So only record the possibility of including NestLoop and HashJoin + * in the parallel plan group + */ +typedef enum +{ + VCI_INNER_PLAN_TYPE_NONE = 0, + VCI_INNER_PLAN_TYPE_SCAN, + VCI_INNER_PLAN_TYPE_SORT, + VCI_INNER_PLAN_TYPE_AGG, + VCI_INNER_PLAN_TYPE_HASHJOIN, + VCI_INNER_PLAN_TYPE_NESTLOOP, + VCI_INNER_PLAN_TYPE_REDIST, +} vci_inner_plan_type_t; + +/** + * Whether plan node is suitable for VCI execution + */ +typedef enum +{ + VCI_PLAN_COMPAT_OK = 0, + VCI_PLAN_COMPAT_FORBID_TYPE, /* VCI execution prohibited type */ + VCI_PLAN_COMPAT_UNSUPPORTED_OBJ, +} vci_plan_compat_t; + +typedef struct +{ + /** + * VCI Plan Type + */ + vci_inner_plan_type_t plan_type; + + AttrNumber scan_plan_no; + + int preset_eflags; + + Bitmapset *def_param_ids; + Bitmapset *use_param_ids; + + vci_plan_compat_t plan_compat; + +} vci_plan_attr_t; + +typedef enum +{ + VCI_PARAM_EXEC_UNKNOWN = 0, + + VCI_PARAM_EXEC_NESTLOOP, + + VCI_PARAM_EXEC_INITPLAN, + + VCI_PARAM_EXEC_SUBPLAN, +} vci_param_exec_type_t; + +typedef struct +{ + vci_param_exec_type_t type; + Bitmapset *def_plan_nos; + int num_def_plans; + Bitmapset *use_plan_nos; + int num_use_plans; + int plan_id; +} vci_param_exec_attr_t; + +typedef enum +{ + VCI_SUBPLAN_UNKNOWN = 0, + VCI_SUBPLAN_INITPLAN, + VCI_SUBPLAN_SUBPLAN, +} vci_subplan_type_t; + +typedef struct +{ + Plan *topmostplan; /** Topmost Plan */ + vci_subplan_type_t type; + Bitmapset *plan_ids; + + bool has_analyzed_parallel; +} vci_subplan_attr_t; + +typedef struct +{ + PlannedStmt *plannedstmt; + + EState *estate; + + vci_subplan_attr_t *subplan_attr_map; + + int max_subplan_attrs; + + int *subplan_order_array; + + vci_plan_attr_t *plan_attr_map; + + int max_plan_attrs; + + AttrNumber last_plan_no; + + vci_param_exec_attr_t *param_exec_attr_map; + + int current_plan_id; + + AttrNumber current_plan_no; + + bool forbid_parallel_exec; + + bool suppress_vp; + + struct + { + List *main_plan_list; + + Bitmapset *plan_group; + + Bitmapset *correlated_subplans; + +
Bitmapset *local_param_ids; + } parallel; + +} vci_rewrite_plan_context_t; + +extern bool vci_preanalyze_plan_tree(PlannedStmt *target, vci_rewrite_plan_context_t *rp_context, int eflags, bool *isGather); +extern void vci_register_plan_id(Plan *plan, int plan_id, void *context); +extern void vci_expand_plan_attr_map(vci_rewrite_plan_context_t *rp_context); +extern vci_inner_plan_type_t vci_get_inner_plan_type(vci_rewrite_plan_context_t *context, const Plan *plan); +extern AttrNumber vci_get_inner_scan_plan_no(vci_rewrite_plan_context_t *context, const Plan *plan); +extern void vci_set_inner_plan_type_and_scan_plan_no(vci_rewrite_plan_context_t *context, Plan *plan, vci_inner_plan_type_t plan_type, AttrNumber scan_plan_no); + +#endif /* VCI_PLANNER_H */ diff --git a/contrib/vci/include/vci_ros.h b/contrib/vci/include/vci_ros.h new file mode 100644 index 000000000000..16f561bec12e --- /dev/null +++ b/contrib/vci/include/vci_ros.h @@ -0,0 +1,1085 @@ +/*------------------------------------------------------------------------- + * + * vci_ros.h + * Definitions and declarations of VCI main relation + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_ros.h + * + *------------------------------------------------------------------------- + */ + +/**************************************************************************** + * ** CAUTION: THE STRUCTURES DEFINED IN THIS HEADER FILE WITH THE PREFIX ** + * ** OF "vcis_" AND vci_MainRelVar, vcis_Crid DEFINE THE FORMAT OF THE ROS ** + * ** DATA. ANY MODIFICATION ON THEM MAY CAUSE FORMAT INCOMPATIBILITY. ** + * ** PLEASE BE SURE TO CHANGE THE VALUE OF EITHER MACRO ** + * ** VCI_ROS_VERSION_MAJOR OR VCI_ROS_VERSION_MINOR, TO DETECT FORMAT ** + * ** INCOMPATIBILITY. 
** + * ************************************************************************** + */ + +#ifndef VCI_ROS_H +#define VCI_ROS_H + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "c.h" +#include "catalog/pg_attribute.h" +#include "catalog/pg_class.h" +#include "nodes/execnodes.h" +#include "storage/block.h" +#include "storage/buf.h" +#include "storage/bufmgr.h" +#include "storage/itemptr.h" +#include "storage/lock.h" +#include "storage/off.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + +#include "vci.h" + +#include "vci_utils.h" + +#if (!defined(WIN32)) +#define UINT uint +#endif + +#define VCI_ROS_VERSION_MAJOR ((uint32) 0x00000000) +#define VCI_ROS_VERSION_MINOR ((uint32) 0x0000000D) + +/** + * @brief IDs of ROS commands. + */ +typedef enum vci_ros_command +{ + vci_rc_invalid = -11, /* Invalid case. */ + + /** For vacuum with vci_mrlm_read_write_exclusive. */ + vci_rc_vacuum = -10, + + /** For normal query with vci_mrlm_read_share. */ + vci_rc_query = -9, + + /** For DROP command with vci_mrlm_read_write_exclusive. */ + vci_rc_drop_index = -8, + + /** For DELETE or UPDATE commands with vci_mrlm_read_share. */ + vci_rc_wos_delete = -7, + + /** For INSERT or UPDATE commands with vci_mrlm_read_share. */ + vci_rc_wos_insert = -6, + + /** For recovering ROS with vci_mrlm_read_share, assumed that this command + * is used in vci_mrlm_write_exclusive lock of ROS commands. */ + vci_rc_recovery = -5, + + /** For collecting VCI information with vci_mrlm_read_share. + * This is also used by vci_KeepMainRelHeader() and + * vci_KeepMainRelHeaderWOVersionCheck() automatically. + * */ + vci_rc_probe = -4, + + /** For building ROS in initial index building with + * vci_mrlm_read_write_exclusive. */ + vci_rc_wos_ros_conv_build = -3, + + /** For building local ROS with vci_mrlm_read_write_exclusive, to serialize + * ROS commands. 
+ */ + vci_rc_generate_local_ros = -2, + + /** For COPY command with vci_mrlm_write_share. */ + vci_rc_copy_command = -1, + + /** For WOS -> ROS conversion with vci_mrlm_write_exclusive */ + vci_rc_wos_ros_conv = 0, + + /** For updating delete vector with vci_mrlm_write_exclusive */ + vci_rc_update_del_vec, + + /** For collecting deleted rows with vci_mrlm_write_exclusive */ + vci_rc_collect_deleted, + + /** For collecting deleted extents, unable to access anymore, + * with vci_mrlm_write_exclusive + */ + vci_rc_collect_extent, + + /** For updating TID -> CRID relations with vci_mrlm_write_exclusive */ + vci_rc_update_tid_crid, + + /** For compaction with vci_mrlm_write_exclusive */ + /* vci_rc_compaction, */ + + num_vci_rc, /* anchor */ +} vci_ros_command_t; + +/** + * @brief function to obtain the size of the varlena headers. + * + * @param[in] ptr Pointer to the varlena. + * @return Header size of given varlena. + */ +static inline int32 +vci_VARHDSZ_ANY(void *ptr) +{ + return VARATT_IS_1B_E(ptr) ? VARHDRSZ_EXTERNAL + : ((VARATT_IS_1B(ptr) ? 
VARHDRSZ_SHORT : VARHDRSZ)); +} + +/** taken from src/backend/utils/adt/tid.c */ +#define DatumGetItemPointer(X) ((ItemPointer) DatumGetPointer(X)) +/** taken from src/backend/utils/adt/tid.c */ +#define ItemPointerGetDatum(X) PointerGetDatum(X) + +typedef uint32 vci_offset_in_extent_t; /* offset to data */ + +/** bit width of maximum number of row ID in an extent */ +#define VCI_CRID_ROW_ID_BIT_WIDTH (18) + +/** Calculate CRID in int64 format from extentID and rowID in extent; the shift is done on uint64 because extentId may be negative */ +static inline int64 +vci_CalcCrid64(int32 extentId, uint32 rowIdInExtent) +{ + return (int64) (((uint64) (int64) extentId << VCI_CRID_ROW_ID_BIT_WIDTH) | + (rowIdInExtent & ((UINT64CONST(1) << VCI_CRID_ROW_ID_BIT_WIDTH) - 1))); +} + +/** Calculate extentID from CRID in int64 format */ +static inline int32 +vci_CalcExtentIdFromCrid64(int64 crid64) +{ + return (int32) (crid64 >> VCI_CRID_ROW_ID_BIT_WIDTH); +} + +/** Calculate rowID in extent from CRID in int64 format */ +static inline uint32 +vci_CalcRowIdInExtentFromCrid64(int64 crid64) +{ + return (uint32) (crid64 & ((UINT64CONST(1) << VCI_CRID_ROW_ID_BIT_WIDTH) - 1)); +} + +/** Maximum number of rows in an extent. (256 * 1024) for 18 bits */ +#define VCI_NUM_ROWS_IN_EXTENT (1 << VCI_CRID_ROW_ID_BIT_WIDTH) + +#define VCI_MAX_NUMBER_UNCONVERTED_ROS (128) + +#define VCI_INVALID_CRID_IN_48_BIT (UINT64CONST(0xFFFF800000000000)) +#define VCI_INVALID_CRID VCI_INVALID_CRID_IN_48_BIT + +#define VCI_MOVED_CRID_IN_48_BIT (UINT64CONST(0xFFFFC00000000000)) +#define VCI_MOVED_CRID VCI_MOVED_CRID_IN_48_BIT + +/** Value indicating invalid extent. The value is 0xE0000000 */ +#define VCI_INVALID_EXTENT_ID \ + ((int32) (VCI_INVALID_CRID_IN_48_BIT >> VCI_CRID_ROW_ID_BIT_WIDTH)) + +/** ID of the first extent stored in the storage. */ +#define VCI_FIRST_NORMAL_EXTENT_ID (0) + +/** Value indicating invalid dictionary. The value is -1 */ +#define VCI_INVALID_DICTIONARY_ID (-1) + +/** The number of rows converted at once by WOS->ROS converter.
+ * Offset is assigned every VCI_COMPACTION_UNIT_ROW rows. + */ +#define VCI_COMPACTION_UNIT_ROW (128) + +/** The ratio to keep usage of work area in safe level */ +#define VCI_WOS_ROS_WORKAREA_SAFE_RATIO (0.5) + +/** Base alignment in storage. + * In the storage, normally VCI uses four-byte integers. + * Thus, we align the data in the storage by four bytes. + */ +#define VCI_DATA_ALIGNMENT_IN_STORAGE (4) + +/** Aligned values, rounded up */ +#define vci_RoundUpValue(value, unit) \ + ((((value) + (unit) - 1) / (unit)) * (unit)) +/** Aligned values, rounded down */ +#define vci_RoundDownValue(value, unit) \ + (((value) / (unit)) * (unit)) + +/** Get byte size of data in an item when a page contains multiple items. + * @param[in] numItem Number of items in a page. + * @return The size of data in an item in bytes. + */ +#define VCI_ITEM_SPACE(numItem) \ + ((((BLCKSZ - offsetof(PageHeaderData, pd_linp) \ + - ((numItem) * (sizeof(HeapTupleHeaderData) + sizeof(ItemIdData)))) \ + / (numItem)) / VCI_DATA_ALIGNMENT_IN_STORAGE) \ + * VCI_DATA_ALIGNMENT_IN_STORAGE) + +/** Get byte size of an item including the item header, + * when a page contains multiple items. + * @param[in] numItem Number of items in a page. + * @return The size of an item in bytes. + */ +#define VCI_ITEM_SIZE(numItem) \ + (VCI_ITEM_SPACE(numItem) + sizeof(HeapTupleHeaderData)) + +/** Minimum header space in DB page with one item, normally 52 bytes */ +#define VCI_MIN_PAGE_HEADER \ + (SizeOfPageHeaderData + sizeof(HeapTupleHeaderData) \ + + sizeof(ItemIdData)) + +/** Available area in DB page with one item, normally 8140 bytes */ +#define VCI_MAX_PAGE_SPACE (BLCKSZ - VCI_MIN_PAGE_HEADER) + +/** + * @brief Return ID of the target page and offset in the target page + * calculated from the position. + * + * The position and offsetInPage are measured in data area in DB pages. We do + * not count the header of DB page in this function. + * + * @param[out] blockNumber Block number for the given position.
+ * @param[out] offsetInPage Offset in page in bytes, ignoring page header, + * for the given position. + * @param[in] position Byte offset in area formed by multiple DB pages. + */ +static inline void +vci_GetBlockNumberAndOffsetInPage(BlockNumber *blockNumber, + uint32 *offsetInPage, + uint32 position) +{ + *blockNumber = position / VCI_MAX_PAGE_SPACE; + *offsetInPage = position - (*blockNumber * VCI_MAX_PAGE_SPACE); +} + +/** + * @brief Get number of pages to write given data size. + * + * @param[in] size The data size. + * @return Number of pages to write; MaxBlockNumber is passed through unchanged. + */ +static inline uint32 +vci_GetNumBlocks(Size size) +{ + if (size == MaxBlockNumber) + return MaxBlockNumber; + + return (size + VCI_MAX_PAGE_SPACE - 1) / VCI_MAX_PAGE_SPACE; +} + +/** Maximum data size of maximum and minimum values in extents. */ +#define VCI_MAX_MIN_MAX_SIZE (16) + +/* Accessing VCI main relation header + * Because the header of VCI main relation spans several pages, we can not map + * one structure of C on the header pages simply. + * Instead, we use access functions. + * + * To do so, first prepare a variable to keep page info and call the + * initialize function, with relation opened already. + * vci_InitMainRelHeaderInfo(info, rel) + * + * Next, use one of these two functions. + * vci_KeepReadingMainRelHeader() + * Read header pages for reading, pin and lock them. + * vci_KeepWritingMainRelHeader() + * Read header pages for writing, pin and lock them. + * + * We have to repair all VCI relations, if some of them are broken. + * Just call the next for the purpose. + * vci_RecoverOneVCIIfNecessary() + * + * Then, use the following two functions, + * + * vci_SetMainRelVar() + * To set the value to the field. + * vci_GetMainRelVar() + * To get the value of the field. + * + * Or, if you access column_info, use + * vci_GetMColumn() + * which gives the pointer to the vcis_m_column_t on the DB buffer directly. + * + * The field is defined in enum vci_MainRelVar.
+ * + * + * To write the updated data, use the function + * vci_WriteMainRelVar() + * + * After accessing the header, release the DB pages with the following + * function. + * + * vci_ReleaseMainRelHeader() + * Release header pages, pins and locks. + */ + +/** + * @brief Field names and addresses of VCI main relation. + * + * These enum values have the page ID at upper 16 bits, and offset for the + * field at lower 16 bits. + * The offset is measured from the top of DB page, not after the page header. + * + * This is for struct vcis_main_t. + * Because the header of VCI main relation spans several pages, we can not map + * one structure of C on the header pages. + * + * Minimum header in DB page is 52 bytes (0x34) + */ +typedef enum vci_MainRelVar +{ + /* page 0 */ + vcimrv_data_wos_oid = 0x00000034, + vcimrv_whiteout_wos_oid = 0x00000038, + /* vcimrv_cdr_tid_crid_data_oid = 0x0000003C, //reserved */ + vcimrv_tid_crid_meta_oid = 0x00000040, + vcimrv_tid_crid_data_oid = 0x00000044, + vcimrv_tid_crid_update_oid_0 = 0x00000048, + vcimrv_tid_crid_update_oid_1 = 0x0000004C, + /* vcimrv_tid_crid_write_oid = 0x00000050, //reserved */ + vcimrv_delete_meta_oid = 0x00000054, + vcimrv_delete_data_oid = 0x00000058, + vcimrv_null_meta_oid = 0x0000005C, + vcimrv_null_data_oid = 0x00000060, + vcimrv_tid_meta_oid = 0x00000064, + vcimrv_tid_data_oid = 0x00000068, + vcimrv_ros_version_major = 0x0000006C, /** MUST BE 0x0000006C */ + vcimrv_ros_version_minor = 0x00000070, /** MUST BE 0x00000070 */ + vcimrv_num_nullable_columns = 0x00000074, + vcimrv_null_width_in_byte = 0x00000078, /** byte size of null bit vector for one row.
*/ + vcimrv_column_info_offset = 0x0000007C, + vcimrv_num_columns = 0x00000080, + vcimrv_extent_info_offset = 0x00000084, + /* page 0 to 2 */ + vcimrv_column_info = 0x00000088, + /* page 3 */ + vcimrv_size_mr = 0x00030034, /** @todo Maybe not needed */ + vcimrv_size_mr_old = 0x00030038, /** @todo Maybe not needed */ + vcimrv_current_ros_version = 0x0003003C, + vcimrv_last_ros_version = 0x00030040, + vcimrv_tid_crid_diff_sel = 0x00030044, + vcimrv_tid_crid_diff_sel_old = 0x00030048, + vcimrv_xid_generation = 0x0003004C, + vcimrv_xid_gen_update_xid = 0x00030050, + /* vcimrv_xgen_tid_crid_write = 0x00030054, //reserved */ + /* vcimrv_num_tid_crid_update_oid_0 = 0x00030058, //reserved */ + /* vcimrv_num_tid_crid_update_oid_1 = 0x0003005C, //reserved */ + vcimrv_ros_command = 0x00030060, + /* vcimrv_ros_conv_extent_id = 0x00030064, //reserved */ + /* vcimrv_ros_conv_common_dict_id = 0x00030068, //reserved */ + vcimrv_old_extent_id = 0x0003006C, + vcimrv_new_extent_id = 0x00030070, + vcimrv_working_column_id = 0x00030074, + vcimrv_working_dictionary_id = 0x00030078, + vcimrv_tid_crid_operation = 0x0003007C, + vcimrv_tid_crid_target_blocknumber = 0x00030080, + vcimrv_tid_crid_target_info = 0x00030084, + vcimrv_tid_crid_free_blocknumber = 0x00030088, + /* vcimrv_compaction_colmn_id = 0x0003007C, //reserved */ + /* vcimrv_compaction_extent_id = 0x00030080, //reserved */ + /* vcimrv_compaction_old_block_number = 0x00030084, //reserved */ + /* vcimrv_compaction_new_block_number = 0x00030088, //reserved */ + vcimrv_num_unterminated_copy_cmd = 0x0003008C, + vcimrv_tid_crid_tag_bitmap = 0x00030090, + /* vcimrv_num_request_cdr = 0x00030090, //reserved */ + /* vcimrv_num_appendable_extents = 0x00030094, //reserved */ + /* vcimrv_num_compaction = 0x00030098, //reserved */ + /* vcimrv_extent_id_to_write = 0x0003009C, //reserved */ + vcimrv_num_extents = 0x000300A0, + vcimrv_num_extents_old = 0x000300A4, + vcimrv_extent_info = 0x000300A8, + + /* error code */ + vcimrv_invalid
= 0xFFFFFFFF, +} vci_MainRelVar; + +/** mask data to get offset for fields in VCI main relation header in DB page */ +#define VCI_MRV_MASK_OFFSET (0xFFFF) +/** bit to shift to get DB page ID for fields in VCI main relation header */ +#define VCI_MRV_PAGE_SHIFT (16) + +/** + * @brief Get block number for given field of main relation header. + * + * @param[in] value value defined in vci_MainRelVar. + * @return Block number containing given field. + */ +#define vci_MRVGetBlockNumber(value) ((value) >> VCI_MRV_PAGE_SHIFT) + +/** + * @brief Get offset in DB page for given field of main relation header. + * + * @param[in] value value defined in vci_MainRelVar. + * @return Offset of the given field from page top including header. + */ +#define vci_MRVGetOffset(value) ((value) & VCI_MRV_MASK_OFFSET) + +/** Number of header pages of VCI main relation */ +#define VCI_NUM_MAIN_REL_HEADER_PAGES (4) + +/** Struct to keep pointers to the header pages of VCI main relation */ +typedef struct vci_MainRelHeaderInfo +{ + Relation rel; /* Relation of VCI main relation */ + + /* + * VCI main relation header pages should be initialized with InvalidBuffer + */ + Buffer buffer[VCI_NUM_MAIN_REL_HEADER_PAGES]; /* Buffers for the main + * relation header + * pages. */ + vci_ros_command_t command; /* Command using this structure. */ + + /** number of extents that have the area to store their vcis_m_extent_t + * in main relation. + * This field is used in query execution, otherwise it has "-1". + */ + int32 num_extents_allocated; + /** To create VCI on more than 32 columns, creating TupleDesc by copying table's + * one is required. However, it is too heavy to repeat. So cache the created + * one to cached_tupledesc in initctx context. + */ + MemoryContext initctx; + TupleDesc cached_tupledesc; +} vci_MainRelHeaderInfo; + +/** Minimum size of an extent + * The extents of fixed field length columns have this size. + * The extents of the other types have larger size.
+ * Use vci_GetExtentFixedLengthRawDataHeaderSize() or something to obtain + * the size actually. + */ +#define VCI_EXTENT_HEADER_SIZE (offsetof(vcis_extent_t, dict_body)) + +/** This function returns the size of header of extent for fixed field length + * data. The size can be calculated from the format and the number of rows + * in an extent. Actually, it is independent of the number of rows, but that + * of variable length depends. + * @param[in] numRowsInExtent The number of rows in the extent. + * @return The size of extent header. + */ +#define vci_GetExtentFixedLengthRawDataHeaderSize(numRowsInExtent) \ + VCI_EXTENT_HEADER_SIZE + +/** Function to calculate necessary number of offset data to the chunks + * of VCI_COMPACTION_UNIT_ROW in ROS. + * @param[in] numRowsInExtent Number of rows in the extent. + * @return Number of necessary offsets. + */ +#define vci_GetOffsetArrayLength(numRowsInExtent) \ + (1 + (((numRowsInExtent) + VCI_COMPACTION_UNIT_ROW - 1) \ + / VCI_COMPACTION_UNIT_ROW)) + +/** Function to calculate data size of necessary offset data to the chunks + * of VCI_COMPACTION_UNIT_ROW in ROS. + * @param[in] numRowsInExtent Number of rows in the extent. + * @return Necessary data size in bytes. + */ +#define vci_GetOffsetArraySize(numRowsInExtent) \ + (vci_GetOffsetArrayLength(numRowsInExtent) \ + * sizeof(vci_offset_in_extent_t)) + +/** This function returns the size of header of extent for variable field + * length data, and compressed data. + * The size can be calculated from the format and the number of rows + * in an extent; it grows with the number of rows because of the offset + * array. + * @param[in] numRowsInExtent The number of rows in the extent. + * @return The size of extent header.
+ */ +#define vci_GetExtentVariableLengthRawDataHeaderSize(numRowsInExtent) \ + (VCI_EXTENT_HEADER_SIZE + vci_GetOffsetArraySize(numRowsInExtent)) + +/** One entry of column_info in VCI main relation + */ +typedef struct vcis_m_column +{ + Oid meta_oid; /** OID of metadata relation */ + Oid data_oid; /** OID of data relation */ + + /* + * int16 max_columns_size; + */ + /** AttrNumber original_attribute_number; */ + int16 max_columns_size; + int16 comp_type; /** vcis_compression_type_t */ +} vcis_m_column_t; + +/** One entry of extent_info in VCI main relation + */ +typedef struct vcis_m_extent +{ + /** number of rows recorded, including those marked as deleted. */ + uint32 num_rows; + uint32 num_deleted_rows; /* number of rows marked as deleted. */ + uint32 num_deleted_rows_old; /* num_deleted_rows for recovery */ + TransactionId xgen; /* like xmin */ + TransactionId xdel; /* like xmax */ + + uint16 flags; + uint16 recovered_colid; +} vcis_m_extent_t; + +#define VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID (0x0001) + +/** + * @brief VCI main relation header area to store by vci_WriteMainRelVar(). + * + * vci_wmrv_all is used when the VCI relation is built, since first two or + * three pages are defined at build time, then not modified at all. + * The last page has ROS command, current ROS version, and extent information + * so will be updated after creation. vci_wmrv_update is used when the last + * page is updated. + */ +typedef enum vci_wmrv_t +{ + vci_wmrv_update, /** Only the last header page will be written to storage */ + vci_wmrv_all, /** All the header pages will be written to storage */ +} vci_wmrv_t; + +/** I categorized ROS data like TID, NULL bit vector, normal column data + * as shown below.
+ */ +typedef enum vcis_attribute_type_t +{ + vcis_attribute_type_main = 0, /* data only */ + vcis_attribute_type_data_wos, /* data only */ + vcis_attribute_type_whiteout_wos, /* data only */ + vcis_attribute_type_tid_crid, /* special type, meta and data */ + vcis_attribute_type_tid_crid_update, /* data only */ /* two elements */ + vcis_attribute_type_delete_vec, /* normal column type */ + vcis_attribute_type_null_vec, /* normal column type */ + vcis_attribute_type_tid, /* normal column type */ + vcis_attribute_type_pgsql, /* normal column type */ + /* number of attribute types (anchor) */ + num_vcis_attribute_type, +} vcis_attribute_type_t; + +/** + * @brief Gives how many columns or data belong to the given category. + * + * Some categories, defined in vcis_attribute_type_t, have multiple elements. + * For example, vcis_attribute_type_pgsql category contains all the columns + * given in CREATE INDEX command. This function gives how many columns or data + * belong to the given category. + * + * @param[in] attrType Attribute type defined in vcis_attribute_type_t. + * For normal columns, it takes vcis_attribute_type_pgsql. + * @param[in] numColumns The number of columns, which is returned when + * attrType is vcis_attribute_type_pgsql. + */ +static inline int +vci_GetNumIndexForAttributeType(vcis_attribute_type_t attrType, + int16 numColumns) +{ + return (vcis_attribute_type_pgsql == attrType) ? numColumns + : ((vcis_attribute_type_tid_crid_update == attrType) ? 2 + : ((0 <= attrType) && (attrType < num_vcis_attribute_type)) ?
1 + : 0); +} + +extern PGDLLEXPORT int vci_GetSumOfAttributeIndices(int16 numColumns); +extern PGDLLEXPORT void vci_GetAttrTypeAndIndexFromSumOfIndices( + vcis_attribute_type_t *attrType, + int *index, + int16 numColumns, + int sumOfIndex); + +typedef enum vcis_compression_type_t +{ + vcis_compression_type_invalid = -1, + vcis_compression_type_fixed_raw = 0, + vcis_compression_type_variable_raw, + vcis_compression_type_fixed_comp, /* reserved */ + vcis_compression_type_auto, /* reserved */ + num_vcis_compression_type, +} vcis_compression_type_t; + +typedef enum vcis_extent_type_t +{ + /** initial value is zero, since newly created DB page is filled with zero. + */ + vcis_undef_space = 0, + + vcis_extent_type_data, + vcis_extent_type_dict, + vcis_free_space, + + vcis_tidcrid_type_leaf, + vcis_tidcrid_type_trunk, + vcis_tidcrid_type_pagetag, + + num_vcis_extent_type, +} vcis_extent_type_t , +vcis_tidcrid_item_type_t; + +/** Type(s) of dictionary. + */ +typedef enum vcis_dict_type_t +{ + /** initial value is zero, since newly created DB page is filled with zero. + */ + vcis_dict_type_none = 0, + vcis_dict_type_lzvf, + num_vcis_dict_type, +} vcis_dict_type_t; + +/** Type(s) of operations in updating TID-CRID tree. 
+ */ +typedef enum +{ + vcis_tid_crid_op_none = 0, + vcis_tid_crid_op_trunk, + vcis_tid_crid_op_leaf_add, + vcis_tid_crid_op_leaf_remove, +} vcis_tid_crid_op_type_t; + +#define vci_GetBlockNumberFromUint64(tId) \ + ((tId) >> (BITS_PER_BYTE * sizeof(OffsetNumber))) +#define vci_GetOffsetFromUint64(tId) \ + ((tId) & ((1U << (BITS_PER_BYTE * sizeof(OffsetNumber))) - 1)) +#define vci_MakeUint64FromBlockNumberAndOffset(blockNumber, offset) \ + (((uint64) (blockNumber) << (BITS_PER_BYTE * sizeof(OffsetNumber))) | (offset)) + +/** Local delete list */ +typedef struct vci_local_delete_list +{ + uint32 num_entry; /* the number of CRID stored */ + uint32 length; /* capacity of crid_list */ + uint64 *crid_list; /* actual values taken from whiteout WOS */ +} vci_local_delete_list; + +struct vci_CSFetchContextData; + +/** Local ROS */ +typedef struct vci_local_ros +{ + vci_local_delete_list local_delete_list; + + /** Number of extents of local ROS. + * The minimum extent ID of the local ROS is (-num_local_extents). + */ + uint32 num_local_extents; + + /** Pointer of the array of pointers to extent data. + * When release the data, first pfree(extent[i]) where i is from zero + * to (num_local_extents - 1), then pfree(extent). + */ + struct vci_virtual_tuples **extent; + + /* Memory context to store local ROS data */ + MemoryContext memory_context; + + /* not localized one */ + /** this fetch_context is allocated in shared memory context created + * in vci_GenerateLocalRos(), and destructed in vci_DestroyLocalRos(). + * In the latter function, the fetch_context is freed automatically. 
+ */ + struct vci_CSFetchContextData *fetch_context; +} vci_local_ros_t; + +typedef struct vci_RelationPair +{ + vci_MainRelHeaderInfo *info; + + Relation meta; + Relation data; + + Buffer bufMeta; + Buffer bufData; +} vci_RelationPair; + +extern PGDLLEXPORT void vci_InitMainRelHeaderInfo(vci_MainRelHeaderInfo *info, + Relation rel, + vci_ros_command_t command); +extern void vci_KeepMainRelHeaderWithoutVersionCheck(vci_MainRelHeaderInfo *info); +extern PGDLLEXPORT void vci_KeepMainRelHeader(vci_MainRelHeaderInfo *info); +extern void vci_ChangeCommand(vci_MainRelHeaderInfo *info, vci_ros_command_t command); + +extern PGDLLEXPORT void vci_ReleaseMainRelHeader(vci_MainRelHeaderInfo *info); + +extern void vci_SetMainRelVar(vci_MainRelHeaderInfo *info, + vci_MainRelVar var, + int elemId, + uint32 value); +extern PGDLLEXPORT uint32 vci_GetMainRelVar(vci_MainRelHeaderInfo *info, + vci_MainRelVar var, + int elemId); +extern void vci_WriteMainRelVar(vci_MainRelHeaderInfo *info, + vci_wmrv_t writeArea); + +extern void vci_InitPageCore(Buffer buffer, int16 numItem, bool locked); +extern void vci_InitPage(Relation rel, BlockNumber blockNumber, int16 numItem); + +extern Buffer vci_ReadBufferWithPageInit(Relation reln, BlockNumber blockNumber); +extern Buffer vci_ReadBufferWithPageInitDelVec(Relation reln, BlockNumber blockNumber); + +/* + * In order to keep the heap tuple plane, set 'p' to attstorage in + * FormData_pg_attribute. 
+ */ + +extern PGDLLEXPORT vci_MainRelVar vci_GetMColumnPosition(int16 columnId); +extern PGDLLEXPORT vcis_m_column_t *vci_GetMColumn(vci_MainRelHeaderInfo *info, int16 columnId); +extern PGDLLEXPORT vcis_m_extent_t *vci_GetMExtent(Buffer *buffer, vci_MainRelHeaderInfo *info, int32 extentId); + +extern void vci_GetExtentInfoPosition(BlockNumber *blockNumber, + OffsetNumber *offset, + int32 extentId); +extern bool vci_ExtentInfoExists(vci_MainRelHeaderInfo *info, int32 extentId); +extern bool vci_ExtentIsVisible(vcis_m_extent_t *mExtent, TransactionId xid); +extern bool vci_ExtentIsCollectable(vcis_m_extent_t *mExtent, TransactionId wos2rosXid); +extern bool vci_ExtentIsFree(vcis_m_extent_t *extentInfo); + +extern uint32 vci_GetFreeExtentId(vci_MainRelHeaderInfo *info); +extern PGDLLEXPORT int16 vci_GetColumnWorstSize(Form_pg_attribute attr); + +/* ************************************** + * ** CAUTION: AttrNumber is 1 origin. ** + * ************************************** + */ +extern Size vci_GetColumnIdsAndSizes(AttrNumber *heapAttrNumList, + int16 *indxColumnIdList, + int16 *columnSizeList, + int numColumn, + vci_MainRelHeaderInfo *info, + Oid heapOid); +extern void vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict( + vci_MainRelHeaderInfo *info, + int32 extentId, + int32 dictionaryId, + TransactionId xid, + vci_ros_command_t command); + +static inline void +vci_WriteExtentInfoInMainRosForWriteExtent(vci_MainRelHeaderInfo *info, + int32 extentId, + TransactionId xid, + vci_ros_command_t command) +{ + vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict(info, extentId, + VCI_INVALID_DICTIONARY_ID, + xid, command); +} + +static inline void +vci_SetItemPointerFromTid64(ItemPointer item, uint64 tId) +{ + ItemPointerSet(item, + vci_GetBlockNumberFromUint64(tId), + vci_GetOffsetFromUint64(tId)); +} + +static inline uint64 +vci_GetTid64FromItemPointer(ItemPointer item) +{ + uint64 blockNumber; + + Assert(NULL != item); + blockNumber = 
BlockIdGetBlockNumber(&(item->ip_blkid)); + + return vci_MakeUint64FromBlockNumberAndOffset(blockNumber, item->ip_posid); +} + +/* ************************************** + * ** CAUTION: AttrNumber is 1 origin. ** + * ************************************** + */ +extern Buffer vci_WriteOnePageIfNecessaryAndGetBuffer(Relation relation, + BlockNumber blockNumber, + BlockNumber blockNumberOld, + Buffer buffer); +extern void vci_WriteExtentInfo(vci_MainRelHeaderInfo *info, + int32 extentId, + uint32 numRows, + uint32 numDeletedRows, + uint32 numDeletedRowsOld, + TransactionId xgen, + TransactionId xdel); + +/* + * ********************************************************* + * functions to recover ROS + * ********************************************************* + */ +extern void vci_RecoverOneVCIIfNecessary(vci_MainRelHeaderInfo *info); + +extern PGDLLEXPORT void + vci_PreparePagesIfNecessaryCore(Relation rel, + BlockNumber blockNumber, + uint16 numItems, + bool forceInit, + bool logItems); + +/** + * @brief This function checks if the relation has the DB page with the page ID + * blockNumber. + * + * When it does not exists, the function extends the relation and initialize + * extended pages with one item per page. + * + * @param[in] rel The relation. + * @param[in] blockNumber The block number to be examined. + * @param[in] numItems The number of items the page is initialized with. 
+ */ +static inline void +vci_FormatPageWithItems(Relation rel, BlockNumber blockNumber, int16 numItems) +{ + vci_PreparePagesIfNecessaryCore(rel, blockNumber, numItems, true, false); +} + +static inline void +vci_PreparePagesIfNecessary(Relation rel, BlockNumber blockNumber, uint16 numItems) +{ + vci_PreparePagesIfNecessaryCore(rel, blockNumber, numItems, false, false); +} + +extern PGDLLEXPORT void vci_WriteItem(Relation rel, + Buffer buffer, + OffsetNumber itemId); + +extern void + vci_UpdateOldFieldsInMetaHeader(Relation rel, TransactionId xId); +extern PGDLLEXPORT uint16 + vci_GetFixedColumnSize(vci_MainRelHeaderInfo *info, int16 columnId); +extern PGDLLEXPORT void + vci_GetPositionForFixedColumn(BlockNumber *blockNumber, + uint32 *offset, + vci_MainRelHeaderInfo *info, + int16 columnId, + int32 extentId, + uint32 rowIdInExtent, + bool atEnd); + +extern int vci_GetNumberOfNullableColumn(TupleDesc tupleDesc); +extern PGDLLEXPORT int16 vci_GetBitIdInNullBits(TupleDesc tupleDesc, int16 columnId); + +extern PGDLLEXPORT Snapshot vci_GetCurrentSnapshot(void); +extern void vci_FinalizeCopyCommand(void); + +struct vci_CSQueryContextData; +extern struct vci_local_ros *vci_GenerateLocalRos( + struct vci_CSQueryContextData *queryContext, + + /* maximum memory size to generate and keep local ROS */ + Size workareaSize, + + /* the number of rows from data WOS to local ROS */ + int64 numDataWosRows, + + /* the number of rows from whiteout WOS to local delete list */ + int64 numWhiteoutWosRows); + +static inline unsigned int +vci_GetNumRowsInLocalRosExtent(int numColumns) +{ + unsigned int numRowsInExtent = MaxAllocSize / Max( + + /* + * The size of area to store pointers to larger data or values of small + * fixed length directly, say each size is smaller than or equal to + * sizeof(Datum). We allocate one are for all columns to support both row + * wise and column wise access. 
+ */ + sizeof(Datum) * numColumns, + + /* + * The size of area to store with larger size than sizeof(Datum). The data + * in the area is pointed from pointers stored in above area, so we can + * allocate separately. + */ + MaxHeapTupleSize); + + return 1U << vci_GetHighestBit(Min(numRowsInExtent, VCI_NUM_ROWS_IN_EXTENT)); +} + +extern void vci_DestroyLocalRos(vci_local_ros_t *localRos); + +#define vci_WriteExtentInfoInMainRosForWosRosConvInit(info, extentId, xid) \ + vci_WriteExtentInfoInMainRosForWriteExtent((info), \ + (extentId), \ + (xid), \ + vci_rc_wos_ros_conv) + +#define vci_WriteExtentInfoInMainRosForCopyInit(info, extentId, xid) \ + vci_WriteExtentInfoInMainRosForWriteExtent((info), \ + (extentId), \ + (xid), \ + vci_rc_copy_command) + +/* + * + */ +static inline void +vci_PreparePagesWithOneItemIfNecessary(Relation relation, + BlockNumber blockNumber) +{ + vci_PreparePagesIfNecessary(relation, blockNumber, 1); +} + +/* this function set the dirty bit, and write all the items in the page + * to the WAL. 
+ * arguments + * Relation rel + * Buffer buffer + */ +static inline void +vci_WriteOneItemPage(Relation rel, + Buffer buffer) +{ + vci_WriteItem(rel, buffer, FirstOffsetNumber); +} + +/* Initialize a DB page with one item format + * arguments + * Relation relation + * BlockNumber blockNumber + */ +static inline void +vci_InitOneItemPage(Relation relation, BlockNumber blockNumber) +{ + vci_InitPage(relation, blockNumber, 1); +} + +/* Shorthand for vci_FormatPageWithItems() with exactly one item per page. */ +static inline void +vci_FormatPageWithOneItem(Relation rel, BlockNumber blockNumber) +{ + vci_FormatPageWithItems(rel, blockNumber, 1); +} + +/* + * Return the size of the varlena datum at ptr: VARSIZE_4B() of its header + * when the datum does not have a 1-byte header, VARSIZE_ANY() otherwise. + * + * In the 4-byte case the length word is copied into a local varattrib_4b + * (presumably because ptr may be unaligned -- confirm with callers) and + * decoded from there. Only the header word is copied: VARSIZE_4B() reads + * nothing else, and copying the whole sizeof(varattrib_4b) could read past + * the end of a datum that consists of the header alone. + */ +static inline uint32 +vci_VarSizeAny(char *ptr) +{ + if (!VARATT_IS_1B(ptr)) + { + varattrib_4b tmp; + + MemCpy(&tmp, ptr, sizeof(tmp.va_4byte.va_header)); + + return VARSIZE_4B(&tmp); + } + + return VARSIZE_ANY(ptr); +} + +/* + * Return true when attr->attlen is larger than sizeof(Datum), or when it is + * 8 in a build without USE_FLOAT8_BYVAL. + */ +static inline bool +vci_PassByRefForFixed(Form_pg_attribute attr) +{ +#ifndef USE_FLOAT8_BYVAL + if (8 == attr->attlen) + return true; +#endif /* #ifndef USE_FLOAT8_BYVAL */ + + return sizeof(Datum) < (unsigned long) attr->attlen; +} + +/* repalloc() that also accepts a NULL ptr, in which case it calls palloc(). */ +static inline void * +vci_repalloc(void *ptr, size_t size) +{ + return ptr ? repalloc(ptr, size) : palloc(size); +} + +/* + * Return bit bitId of bitArray. Bits are numbered from the least significant + * bit of byte 0; byte index is bitId / 8. + */ +static inline bool +vci_GetBit(uint8 *bitArray, int bitId) +{ + return (bitArray[bitId >> 3] >> (bitId & 7)) & 1; +} + +typedef struct vci_DictInfo +{ + /* + * Memory area to read dictionary. This is not used when creating new + * dictionaries. + */ + unsigned char *dictionary_storage; + + Size storage_size; /* byte size of dictionary_storage */ + + /* + * The extent ID for individual dictionary. VCI_INVALID_EXTENT_ID for + * common dictionaries. 
+ */ + int32 extent_id; + + /* VCI_INVALID_DICTIONARY_ID for individual dictionary */ + int16 common_dict_id; + + vcis_dict_type_t dict_type; + +} vci_DictInfo; + +Buffer + vci_WriteDataIntoMultiplePages(Relation rel, + BlockNumber *blockNumber, + BlockNumber *blockNumberOld, + uint32 *offsetInPage, + Buffer buffer, + const void *data_, + Size size); + +typedef struct vci_meta_item_scanner +{ + bool inited; + + Relation rel; + int index; + + BlockNumber end_block; /* inclusive */ + BlockNumber start_block; + + Buffer buffer; + BlockNumber current_block; + + int max_item; + int max_item_in_page; + int item_size; + + int buf_lockmode; + +} vci_meta_item_scanner_t; + +typedef struct +{ + Oid oid; /* Oid of VCI main relation */ + Oid dbid; /* Oid of database to which a VCI main + * relation belongs */ + bool force_next_wosros_conv; /* flag to force WOS->ROS conversion + * on next time */ +} vci_wosros_conv_worker_arg_t; + +extern vcis_m_extent_t *vci_GetMExtentNext(vci_MainRelHeaderInfo *info, vci_meta_item_scanner_t *scan); +extern vci_meta_item_scanner_t *vci_BeginMetaItemScan(Relation rel, int buf_lock); +extern void vci_EndMetaItemScan(vci_meta_item_scanner_t *scan); + +/* recovery functions for command */ +extern void vci_UpdateLastRosVersionAndOthers(vci_MainRelHeaderInfo *info); +extern void vci_RecoveryDone(vci_MainRelHeaderInfo *info); +extern void vci_WriteRecoveryRecordDone(vci_MainRelHeaderInfo *info, vci_ros_command_t command, TransactionId xid); + +extern void vci_WriteRecoveryRecordForExtentInfo(vci_MainRelHeaderInfo *info, + int32 newExtentId, int32 oldExtentId); +extern void vci_RecoveryExtentInfo(vci_MainRelHeaderInfo *info, vci_ros_command_t command); + +extern void vci_WriteRecoveryRecordForUpdateDelVec(vci_MainRelHeaderInfo *info); +extern void vci_RecoveryUpdateDelVec(vci_MainRelHeaderInfo *info); +extern const char *vci_GetRosCommandName(vci_ros_command_t command); + +/* ---------------- + * vci_index.c + * ---------------- + */ + +extern bool 
vci_isVciAdditionalRelation(Relation rel); +extern bool vci_isVciAdditionalRelationTuple(Oid reloid, Form_pg_class reltuple); + +/* ---------------- + * vci_internal_view.c + * ---------------- + */ + +extern void vci_check_prohibited_operation(Node *parseTree, bool *creating_vci_extension); + +#endif /* VCI_ROS_H */ diff --git a/contrib/vci/include/vci_ros_command.h b/contrib/vci/include/vci_ros_command.h new file mode 100644 index 000000000000..8c2cb5cdc4b7 --- /dev/null +++ b/contrib/vci/include/vci_ros_command.h @@ -0,0 +1,214 @@ +/*------------------------------------------------------------------------- + * + * vci_ros_command.h + * Definitions and declarations of ROS control commands + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_ros_command.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_ROS_COMMAND_H +#define VCI_ROS_COMMAND_H + +#include "postgres.h" +#include "c.h" +#include "utils/tuplesort.h" +#include "access/genam.h" + +#include "vci_ros.h" +#include "vci_chunk.h" + +typedef struct +{ + ItemPointerData *orig_tids; + + ItemPointerData *wos_tids; + + int max; + + int num; + + int offset; +} vci_tid_array_t; + +typedef struct +{ + BlockNumber *orig_blknos; + + int max; + + int num; + +} vci_blk_array_t; + +/** + * @brief Context for ROS commands, containing TID list read from data WOS or + * whiteout WOS, data read from the PostgreSQL heap relation or from ROS, + * related attribute numbers, OIDs, number of rows, and so on. + */ +typedef struct vci_RosCommandContext +{ + vci_ros_command_t command; /* command using this context */ + + RosChunkBuffer buffer; /* data are stored primary here */ + RosChunkStorage storage; /* data are compacted and copied here */ + vci_MainRelHeaderInfo info; /* VCI main relation header */ + + /** numRowsToConvert is something tricky. + * set VCI_NUM_ROWS_IN_EXTENT in index building phase. 
+ * set number of rows (up to VCI_NUM_ROWS_IN_EXTENT) to convert after + * building. + */ + int numRowsToConvert; + + int numRowsAtOnce; /* maximum number of rows in a chunk */ + Relation heapRel; /* the original relation indexed by VCI */ + Oid heapOid; /* the original relation indexed by VCI */ + Oid indexOid; /* the VCI indexed relation */ + + int numColumns; /* number of columns in VCI index */ + + /** the processing extent ID. negative IDs for local ROSes */ + int32 extentId; + int32 extentIdSrc; /* source extentId in copy operation (wos2ros, + * cdr) */ + + struct vci_local_ros *local_ros; /* local ROS */ + + /** list of worst case column size */ + int16 *columnSizeList; + + /** attribute number (1-origin) in the original relation */ + AttrNumber *heapAttrNumList; + + /** index ID (0-origin) in the VCI relation */ + int16 *indxColumnIdList; + + /** transaction ID using this context */ + TransactionId xid; + + TransactionId oldestXmin; + + TransactionId wos2rosXid; + + TransactionId inclusiveXid; + + TransactionId exclusiveXid; + + vci_tid_array_t wos2ros_array; + + vci_tid_array_t delvec_array; + + vci_blk_array_t utility_array; + + /** + * TID on "WOS Relation" list to convert in Item Pointer format + */ + + bool done; /* true if all records are read */ + + /** + * Number of rows in the relation estimated by analyze or vacuum command. + * This is used to build ROS in CREATE INDEX command. + */ + double estimatedNumRows; + + /** + * Number of converted rows. + * This is used to build ROS in CREATE INDEX command. + */ + uint64 numConvertedRows; + + /** + * The name of index relation built. + * This is used to build ROS in CREATE INDEX command. + */ + char relName[NAMEDATALEN]; + + /** + * scan context. + * This is used only in initial building to scan the original relation + * sequentially. 
+ */ + HeapScanDesc scan; + + TupleDesc tid_tid_tupdesc; + + TupleTableSlot *tid_tid_slot; + + /** + * a sorted TID list to be converted into ROS extents + */ + Tuplesortstate *wos2ros_tid_list; + int64 num_wos2ros_tids; + + /** + * a sorted TID list to be converted into a delete vector + */ + Tuplesortstate *delvec_tid_list; + int64 num_delvec_tids; + + Tuplesortstate *data_wos_del_list; + + Tuplesortstate *whiteout_wos_del_list; + +} vci_RosCommandContext; + +typedef struct +{ + int32 num_fit_extents; + int32 best_extent_id; +} vci_target_extent_info_t; + +/* + * ********************************************************* + * Conversion Context operation + * ********************************************************* + */ +extern void vci_InitRosCommandContext0(vci_RosCommandContext *context, + Relation rel, vci_ros_command_t command); +extern void vci_InitRosCommandContext1(vci_RosCommandContext *comContext, + Size workareaSize, + int numInsertRows, + int numDeleteRows, + bool readOriginalData); +extern void vci_InitRosCommandContext2(vci_RosCommandContext *comContext, Size workareaSize); + +extern void vci_InitRosChunkStroageAndBuffer(vci_RosCommandContext *comContext, bool forAppending); + +extern void vci_CleanRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite); +extern void vci_FinRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite); + +extern void vci_ReleaseMainRelInCommandContext(vci_RosCommandContext *comContext); +extern void vci_CloseHeapRelInCommandContext(vci_RosCommandContext *comContext); + +/* + * ********************************************************* + * Functions for ROS command + * ********************************************************* + */ +extern PGDLLEXPORT int vci_ConvertWos2Ros(Relation mainRel, Size workareaSize, int numRows); +extern double vci_ConvertWos2RosForBuild(Relation mainRel, Size workarea, IndexInfo *indexInfo); +extern PGDLLEXPORT int vci_UpdateDelVec(Relation mainRel, Size workareaSize, 
int numRows); +extern PGDLLEXPORT int vci_CollectDeletedRows(Relation mainRel, Size workareaSize, int32 extentId); +extern PGDLLEXPORT int vci_UpdateTidCrid(Relation mainRel, Size workareaSize, int numPages); +extern PGDLLEXPORT int vci_CollectUnusedExtent(Relation mainRel, Size workareaSize); + +extern void vci_VacuumRos(Relation mainRel, IndexVacuumInfo *vacuumInfo); + +/* + * ********************************************************* + * Probing functions to decided whether to execute the command + * ********************************************************* + */ +extern PGDLLEXPORT uint32 vci_CountFreezedInDataWos(Relation mainRel, Size workarea); +extern PGDLLEXPORT uint32 vci_CountFreezedInWhiteoutWos(Relation mainRel, Size workarea); +extern PGDLLEXPORT vci_target_extent_info_t vci_CountDeletedRowsInROS(Relation mainRel, uint32 threshold); +extern vci_target_extent_info_t vci_CountUnusedExtents(Relation mainRel); +extern int32 vci_CountTidCridUpdateListLength(Relation mainRel, Size workarea); + +#endif /* #ifndef VCI_ROS_COMMAND_H */ diff --git a/contrib/vci/include/vci_ros_daemon.h b/contrib/vci/include/vci_ros_daemon.h new file mode 100644 index 000000000000..8def778b48e5 --- /dev/null +++ b/contrib/vci/include/vci_ros_daemon.h @@ -0,0 +1,69 @@ +/*------------------------------------------------------------------------- + * + * vci_ros_daemon.h + * Definitions and declarations of ROS Control Daemon and Worker + * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_ros_daemon.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_ROS_DAEMON_H +#define VCI_ROS_DAEMON_H + +#include "postgres.h" + +#include "lib/ilist.h" +#include "postmaster/bgworker.h" +#include "utils/relcache.h" + +#include "vci_ros.h" + +/** + * The threshold of tid->crid update list item coutns to execute tid->crid update + */ +#define VCI_UPDATE_TIDCRID_THRESHOLD (1024) + 
+/** + * The threshold of Whiteout WOS rows to update Delete Vector + */ +#define VCI_UPDATE_DELVEC_THRESHOLD (256 * 1024) + +/** + * @see src/backend/postmaster/bgworker.c + */ +struct BackgroundWorkerHandle +{ + int slot; + uint64 generation; +}; + +typedef struct vci_workerslot +{ + pid_t pid; + + BackgroundWorkerHandle handle; + + Oid dbid; + Oid oid; +} vci_workerslot_t; + +/* ************************* */ +/* daemon functions */ +/* ************************* */ + +extern void vci_ROS_control_daemon_setup(void); +PGDLLEXPORT void vci_ROS_control_daemon_main(Datum main_arg); + +extern PGDLLEXPORT vci_workerslot_t vci_LaunchROSControlWorker(vci_wosros_conv_worker_arg_t *vciinfo, int slot_id); +PGDLLEXPORT void vci_ROS_control_worker_main(Datum main_arg); + +extern BackgroundWorkerHandle vci_LaunchROSControlMaintainer(int mode); +extern void vci_ROS_control_maintainer_main(Datum main_arg); + +extern void vci_InitDbPriorityList(void); + +#endif /* VCI_ROS_DAEMON_H */ diff --git a/contrib/vci/include/vci_supported_oid.h b/contrib/vci/include/vci_supported_oid.h new file mode 100644 index 000000000000..504de68d06ea --- /dev/null +++ b/contrib/vci/include/vci_supported_oid.h @@ -0,0 +1,34 @@ +/*------------------------------------------------------------------------- + * + * vci_supported_oid.h + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_supported_oid.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_SUPPORTED_OID_H +#define VCI_SUPPORTED_OID_H + +#include "utils/snapshot.h" + +#define VCI_MAX_APPLICABLE_UDFS (32) + +typedef struct +{ + int num_applicable_udfs; + Oid applicable_udfs[VCI_MAX_APPLICABLE_UDFS]; + Oid vci_runs_in_plan_funcoid; + Oid vci_always_return_true_funcoid; +} vci_special_udf_info_t; + +extern vci_special_udf_info_t vci_special_udf_info; + +extern bool vci_is_supported_type(Oid oid); +extern bool 
vci_is_supported_function(Oid oid); +extern void vci_register_applicable_udf(Snapshot snapshot); + +#endif /* VCI_SUPPORTED_OID_H */ diff --git a/contrib/vci/include/vci_tidcrid.h b/contrib/vci/include/vci_tidcrid.h new file mode 100644 index 000000000000..6728a606e2e6 --- /dev/null +++ b/contrib/vci/include/vci_tidcrid.h @@ -0,0 +1,344 @@ +/*------------------------------------------------------------------------- + * + * vci_tidcrid.h + * Definitions and Declarations of TIDCRID update list and + * TIDCRID Tree relation + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_tidcrid.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_TIDCRID_H +#define VCI_TIDCRID_H + +#include "postgres.h" + +#include "utils/tuplesort.h" + +#include "vci.h" +#include "vci_ros.h" +#include "vci_chunk.h" + +/** header page ID of TID->CRID update (differential) list */ +#define VCI_TID_CRID_UPDATE_HEADER_PAGE_ID (0) + +/** first body page ID of TID->CRID update (differential) list */ +#define VCI_TID_CRID_UPDATE_BODY_PAGE_ID (1) + +/** First page of tidcrid tree meta relation */ +#define VCI_TID_CRID_META_FIRST_PAGE_ID (0) + +/** First page of tidcrid tree data relation */ +#define VCI_TID_CRID_DATA_FIRST_PAGE_ID (0) + +/** Number of items per page in the tidcrid tree relation */ +#define VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE (18) + +/** Offset number of page tag */ +#define VCI_TID_CRID_PAGETAG_ITEM_ID (VCI_FREESPACE_ITEM_ID) + +/** log2 of the tidcrid leaf node capacity (number of index bits) */ +#define VCI_TID_CRID_LEAF_CAPACITY_BITS (6) + +/** Capacity of tidcrid leaf node in entries (1 << 6 = 64) */ +#define VCI_TID_CRID_LEAF_CAPACITY (1 << VCI_TID_CRID_LEAF_CAPACITY_BITS) + +/** log2 of the tidcrid trunk node capacity (number of index bits) */ +#define VCI_TID_CRID_TRUNK_CAPACITY_BITS (6) + +/** Capacity of tidcrid trunk node in entries (1 << 6 = 64) */ +#define VCI_TID_CRID_TRUNK_CAPACITY (1 << VCI_TID_CRID_TRUNK_CAPACITY_BITS) + +/** Index of trunk node 
*/ +#define VCI_TID_CRID_TRUNKNODE (-1) + +/** The number of items in DB page of TID-CRID Update List, normally 678 */ +#define VCI_TID_CRID_UPDATE_PAGE_ITEMS (VCI_MAX_PAGE_SPACE / sizeof(vcis_tidcrid_pair_item_t)) + +/** Available area in DB page of TID-CRID Update List, normally 8136 bytes */ +#define VCI_TID_CRID_UPDATE_PAGE_SPACE (VCI_TID_CRID_UPDATE_PAGE_ITEMS * sizeof(vcis_tidcrid_pair_item_t)) + +#define VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES (1353) + +/* + * On-disk data structure for CRID + * + * vci_GetUint64FromCrid() can be used to convert to uint64, and + * vci_GetCridFromUint64() to convert back. v0 holds bits 0-15, v1 bits 16-31 + * and v2 bits 32-47 of the uint64 form. + * + * Some values of v2 mark a special CRID: 0x8000 stands for VCI_INVALID_CRID + * and 0xc000 for VCI_MOVED_CRID. + */ +typedef struct vcis_Crid +{ + uint16 v0; + uint16 v1; + uint16 v2; +} +#ifdef __arm__ + __attribute__((packed)) +#endif +vcis_Crid; + +/* + * Convert vcis_Crid (on-disk structure) to uint64 (in-memory form). + * + * Returns VCI_INVALID_CRID or VCI_MOVED_CRID when crid.v2 carries the + * corresponding special value; otherwise the 48-bit value assembled from + * v2:v1:v0. + */ +static inline uint64 +vci_GetUint64FromCrid(vcis_Crid crid) +{ + /* Handle special values */ + if (crid.v2 == 0x8000) + return VCI_INVALID_CRID; + if (crid.v2 == 0xc000) + return VCI_MOVED_CRID; + + return ((uint64) crid.v2 << 32) | ((uint64) crid.v1 << 16) | crid.v0; +} + +/* + * Convert uint64 (in-memory form) to vcis_Crid (on-disk structure). + * + * Only the low 48 bits of crid_uint64 are kept; bits 48 and above are + * discarded. + */ +static inline vcis_Crid +vci_GetCridFromUint64(uint64 crid_uint64) +{ + vcis_Crid crid; + + crid.v0 = crid_uint64 & ((uint64) 0xFFFF); + crid.v1 = (crid_uint64 >> 16) & ((uint64) 0xFFFF); + crid.v2 = (crid_uint64 >> 32) & ((uint64) 0xFFFF); + + return crid; +} + +/* + * TID-CRID tree relation + * + * The relation for the TID-CRID tree adds 18 tuples per page. In more detail, + * each tuple can use only 424 bytes. + * + * Each node of the tree has 64 slots, and each slot has 6 bytes, so 384 bytes + * are used to represent the tree. The remaining part is used for maintenance. + * Also, the initial tuple of each page is used for maintaining the page. 
+ */ + +/* + * Entries of flexible array in vcis_tidcrid_meta + */ +typedef struct vcis_tidcrid_meta_item +{ + BlockNumber block_number; /* block number in TID-CRID tree relation */ + BlockNumber block_number_old; /* previous block_number, used for + * recovery purpose */ + int16 item_id; /* item id on TID-CRID tree relation */ + int16 item_id_old; /* previous item_id, used for recovery purpose */ +} vcis_tidcrid_meta_item_t; + +/* + * Meta relation for TID-CRID tree + * + * XXX: Several arrtibutes are not used but retained, to be consistent with + * Column Meta Relation. + */ +typedef struct vcis_tidcrid_meta +{ + vcis_attribute_type_t vcis_attr_type; /* Attribute type */ + + Oid pgsql_atttypid; /* taken from FormData_pg_attribute.atttypid */ + int16 pgsql_attnum; /* taken from FormData_pg_attribute.attnum */ + int16 pgsql_attlen; /* taken from FormData_pg_attribute.attlen */ + int32 pgsql_atttypmod; /* taken from + * FormData_pg_attribute.atttypmod */ + uint32 num_extents; /* number of extents (for debug) */ + uint32 num_extents_old; /* previous number of extents (for + * recovery) */ + + BlockNumber free_page_begin_id; /* page ID of the first free area */ + BlockNumber free_page_begin_id_old; /* previous free_page_begin_id (for + * recovery) */ + + BlockNumber free_page_end_id; /* page ID of the last free area */ + BlockNumber free_page_end_id_old; /* previous free_page_end_id (for + * recovery) */ + + /** + * The DB page ID of free area that located in front of the added or + * deleted extent by the ROS command. (for recovery) + * This is used to recover free area list. + */ + BlockNumber free_page_prev_id; + + /** + * Same as free_page_prev_id, but just behind the added or deleted extent. 
+ */ + BlockNumber free_page_next_id; + + /** + * The freespace size of added or deleted extent by the ROS command (for recovery) + */ + uint32 free_page_old_size; + + /** + * The freespace position of added or deleted extent in BlockNumber + * by the ROS command (for recovery) + */ + BlockNumber new_data_head; + BlockNumber new_freespace_head; /* @todo unused field */ + + BlockNumber num_free_pages; /* number of free DB pages in the listed free + * area */ + BlockNumber num_free_pages_old; /* for recovery */ + BlockNumber num_free_page_blocks; /* number of free areas, not number of + * free DB pages */ + BlockNumber num_free_page_blocks_old; /* for recovery */ + + /*--- Above must be same as column Meta ---*/ + + BlockNumber num; /* number of Stored items */ + BlockNumber num_old; /* previous num, used for recovery purpose */ + BlockNumber free_block_number; /* number of free blocks */ + int32 offset; /* Offset from the head */ + vcis_tidcrid_meta_item_t body[1]; /* Flexible array of + * vcis_tidcrid_meta_item_t */ +} vcis_tidcrid_meta_t; + +/* + * Metadata at the initial tuple + */ +typedef struct vcis_tidcrid_pagetag +{ + uint32 size; + vcis_extent_type_t type; + BlockNumber prev_pos; + BlockNumber next_pos; + + uint32 num; + uint32 free_size; + uint32 bitmap; + char rsv[4]; +} vcis_tidcrid_pagetag_t; + +/* + * Leaf in the TID-CRID tree + */ +typedef struct vcis_tidcrid_leaf +{ + uint32 size; + vcis_tidcrid_item_type_t type; + + uint64 bitmap; + uint64 unused; + + /* Sum of above must be less than 40 bytes */ + + vcis_Crid crid[VCI_TID_CRID_LEAF_CAPACITY]; /* CRIDs related with TID */ +} vcis_tidcrid_leaf_t; + +/* + * Intermediate (trunk) node in TID-CRID tree + */ +typedef struct vcis_tidcrid_trunk +{ + uint32 size; + vcis_tidcrid_item_type_t type; + + uint64 bitmap; + uint64 unused; + + /* Sum of above must be less than 40 bytes */ + + ItemPointerData leaf_item[VCI_TID_CRID_TRUNK_CAPACITY]; /* Pointer to the leaf */ +} vcis_tidcrid_trunk_t; + +/* + * TID-CRID 
pair used for TIDCRID update list + */ +typedef struct vcis_tidcrid_pair_item +{ + ItemPointerData page_item_id; /* TID on the original relation */ + vcis_Crid crid; /* CRID */ +} vcis_tidcrid_pair_item_t; + +/* + * TID-CRID Update List + */ +typedef struct vcis_tidcrid_pair_list +{ + uint64 num; /* Number of items in the list */ + + uint16 blocks_per_samp; /* Number of blocks each entries in + * samples_tids[] handles */ + uint16 num_samples; /* Number of entries in samples_tids[] */ + + /* + * TID samples from update list. Sampling condition: + * + * 1. Initial entries in each blocks_per_samp blocks 2. Final entry + */ + ItemPointerData sample_tids[VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES + 1]; + + vcis_tidcrid_pair_item_t body[1]; /* Flexible array of + * vcis_tidcrid_pair_item_t */ +} vcis_tidcrid_pair_list_t; + +typedef struct vci_TidCridUpdateListContext +{ + vci_MainRelHeaderInfo *info; /* Parent VCI main relation */ + + Relation rel; + + /* Number of vcis_tidcrid_pair_item_t entries in the rel */ + uint64 count; + + /* Number of blocks of the rel */ + BlockNumber nblocks; + + /* Head pointer to the TID-CRID Update List */ + vcis_tidcrid_pair_list_t header; + +} vci_TidCridUpdateListContext; + +typedef vci_RelationPair vci_TidCridRelations; + +/* initialize function */ +extern void vci_InitializeTidCridUpdateLists(vci_MainRelHeaderInfo *info); +extern void vci_InitializeTidCridTree(vci_MainRelHeaderInfo *info); + +/* TIDCRID Update List access functions */ + +extern PGDLLEXPORT vci_TidCridUpdateListContext *vci_OpenTidCridUpdateList(vci_MainRelHeaderInfo *info, int sel); +extern PGDLLEXPORT void vci_CloseTidCridUpdateList(vci_TidCridUpdateListContext *context); + +extern PGDLLEXPORT void vci_ReadOneBlockFromTidCridUpdateList(vci_TidCridUpdateListContext *context, BlockNumber blkno, vcis_tidcrid_pair_item_t *array); + +extern int32 vci_GetTidCridUpdateListLength(vci_MainRelHeaderInfo *info, int sel); +extern void 
vci_MergeAndWriteTidCridUpdateList(vci_MainRelHeaderInfo *info, int newSel, int oldSel, Tuplesortstate *newList, vcis_Crid crid); + +/* TIDCRID Tree access functions */ +extern void vci_OpenTidCridRelations(vci_TidCridRelations *rel, + vci_MainRelHeaderInfo *info, + LOCKMODE lockmode); +extern void vci_CloseTidCridRelations(vci_TidCridRelations *rel, LOCKMODE lockmode); + +extern void vci_GetTidCridSubTree(vci_TidCridRelations *relPair, BlockNumber blkOrig, + ItemPointer retPtr); +extern void vci_CreateTidCridSubTree(vci_TidCridRelations *relPair, BlockNumber blkOrig, + ItemPointer retPtr); +extern void vci_UpdateTidCridSubTree(vci_TidCridRelations *relPair, ItemPointer trunkPtr, + vcis_tidcrid_pair_list_t *newItems); + +/* TID->CRID Conversion */ +extern PGDLLEXPORT uint64 vci_GetCridFromTid(vci_TidCridUpdateListContext *context, ItemPointer tId, bool *fromTree); + +/* Recovery functions */ + +extern void vci_RecoveryFreeSpaceForTidCrid(vci_MainRelHeaderInfo *info); +extern void vci_RecoveryTidCrid(vci_MainRelHeaderInfo *info); +extern void vci_InitRecoveryRecordForTidCrid(vci_MainRelHeaderInfo *info); + +extern void vci_AddTidCridUpdateList(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int32 extentId); + +#endif /* VCI_TIDCRID_H */ diff --git a/contrib/vci/include/vci_utils.h b/contrib/vci/include/vci_utils.h new file mode 100644 index 000000000000..1095e8d3a7c4 --- /dev/null +++ b/contrib/vci/include/vci_utils.h @@ -0,0 +1,238 @@ +/*------------------------------------------------------------------------- + * + * vci_utils.h + * Debugging functions and macros + * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_utils.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_DEBUG_H +#define VCI_DEBUG_H + +#include "postgres.h" + +#include +#include +#include + +#include "nodes/nodes.h" + +#include "vci.h" + +/* obtain the node name of 
type */ +extern PGDLLEXPORT const char *VciGetNodeName(NodeTag type); + +/** + * @brief Thin inline wrapper around memcpy(); it has no size threshold. + * + * @param[out] dst_ The data are copied to the memory area pointed to by dst_. + * @param[in] src_ The source data address. + * @param[in] len The length of the data in bytes. + * @return dst_ itself. + * + * XXX: This is just a wrapper of memcpy() now, retained for backward + * compatibility. + */ +static inline void * +MemCpy(void *dst_, const void *src_, Size len) +{ + return memcpy(dst_, src_, len); +} + +/** + * @brief Find a value in an unsorted array of int16 and return its position. + * @param[in] array Pointer to the array of int16. + * @param[in] len Length of the array. + * @param[in] value The value to look for. + * @return The position of the first matching element. + * When the value is not found, -1 is returned. + */ +static inline int +FindInt16(int16 *array, int len, int16 value) +{ + int ptr; + + for (ptr = 0; ptr < len; ++ptr) + if (array[ptr] == value) + return ptr; + return -1; +} + +/** + * @brief Pfree and make the pointer null. + * + * @param[in, out] ptr Address of the pointer to free; must not be NULL itself. + * When *ptr is not NULL, *ptr is pfree'd and then set to NULL. + * Usage: vci_PfreeAndNull(&pointer). + */ +static inline void +vci_PfreeAndNull(void *ptr) +{ + Assert(ptr); + if (NULL == *(void **) ptr) + return; + pfree(*(void **) ptr); + *(void **) ptr = NULL; +} + +/** + * @brief Allocate memory area with given size, and copy the given source + * to the area newly allocated. + * + * @param[in] src Pointer to the array of byte data to be copied. + * @param[in] size Size in bytes of the source data. Returns the palloc'd copy, or NULL when src is NULL or size is 0. + */ +static inline void * +vci_AllocateAndCopy(const void *src, Size size) +{ + if (src != NULL && size > 0) + { + void *dst = palloc(size); + + MemCpy(dst, src, size); + return dst; + } + return NULL; +} + +/** + * @brief A part of GetHighestBit(). + * + * @note Do not use this function directly. 
+ */ +static inline void +vci_GetHighestBitSub(int *result, uint64 *value, uint64 mask, int inc) +{ + if (mask & *value) + { + *result += inc; + *value &= mask; + } + else + *value &= ~mask; +} + +/** + * @brief Get the ID of the most significant bit set to 1 in the given uint64 value. + * + * It should be 63 - CLZ(value) (Count Leading Zero). + * If the given value is 0, returns -1. + * Same as + * \code{.c} + * if (value & 0x8000000000000000) return 63; + * if (value & 0x4000000000000000) return 62; + * ... + * if (value & 0x0000000000000002) return 1; + * if (value & 0x0000000000000001) return 0; + * return -1; + * \endcode + * + * @param[in] value Value to examine. + * @return The bit ID of MSB set, in a manner of zero-origin. + * If no bit has 1, -1 is returned. + * + * @note Better to use count leading zero (CLZ), if possible. + */ +static inline int +vci_GetHighestBit(uint64 value) +{ + int result = 0; + + if (0 == value) + return -1; + + vci_GetHighestBitSub(&result, &value, UINT64CONST(0xFFFFFFFF00000000), 32); + vci_GetHighestBitSub(&result, &value, UINT64CONST(0xFFFF0000FFFF0000), 16); + vci_GetHighestBitSub(&result, &value, UINT64CONST(0xFF00FF00FF00FF00), 8); + vci_GetHighestBitSub(&result, &value, UINT64CONST(0xF0F0F0F0F0F0F0F0), 4); + vci_GetHighestBitSub(&result, &value, UINT64CONST(0xCCCCCCCCCCCCCCCC), 2); + vci_GetHighestBitSub(&result, &value, UINT64CONST(0xAAAAAAAAAAAAAAAA), 1); + + return result; +} + +/** + * @brief Calculate the ID of the least significant bit (LSB) set to 1. + * + * Same as + * \code{.c} + * if (value & 0x0000000000000001) return 0; + * if (value & 0x0000000000000002) return 1; + * ... + * if (value & 0x4000000000000000) return 62; + * if (value & 0x8000000000000000) return 63; + * return -1; + * \endcode + * + * @param[in] value Value to examine. + * @return The bit ID of LSB set, in a manner of zero-origin. + * If no bit has 1, -1 is returned. + * + * @note Better to use count leading zero (CLZ) and reverse bit, if possible. 
+ */ +static inline int +vci_GetLowestBit(uint64 value) +{ + int result = 63; + + if (0 == value) + return -1; + + vci_GetHighestBitSub(&result, &value, ~UINT64CONST(0xFFFFFFFF00000000), -32); + vci_GetHighestBitSub(&result, &value, ~UINT64CONST(0xFFFF0000FFFF0000), -16); + vci_GetHighestBitSub(&result, &value, ~UINT64CONST(0xFF00FF00FF00FF00), -8); + vci_GetHighestBitSub(&result, &value, ~UINT64CONST(0xF0F0F0F0F0F0F0F0), -4); + vci_GetHighestBitSub(&result, &value, ~UINT64CONST(0xCCCCCCCCCCCCCCCC), -2); + vci_GetHighestBitSub(&result, &value, ~UINT64CONST(0xAAAAAAAAAAAAAAAA), -1); + return result; +} + +/** + * @brief Count the number of bits set to 1 (population count). + * + * Same as + * \code{.c} + * uint64 count = 0; + * if (value & 0x0000000000000001) ++ count; + * if (value & 0x0000000000000002) ++ count; + * ... + * if (value & 0x4000000000000000) ++ count; + * if (value & 0x8000000000000000) ++ count; + * return count; + * \endcode + * + * @param[in] value Value to examine. + * @return The number of bits set. + */ +static inline int +vci_GetBitCount(uint64 value) +{ + uint64 count = 0; + + count = (value & UINT64CONST(0x5555555555555555)) + ((value >> 1) & UINT64CONST(0x5555555555555555)); + count = (count & UINT64CONST(0x3333333333333333)) + ((count >> 2) & UINT64CONST(0x3333333333333333)); + count = (count & UINT64CONST(0x0f0f0f0f0f0f0f0f)) + ((count >> 4) & UINT64CONST(0x0f0f0f0f0f0f0f0f)); + count = (count & UINT64CONST(0x00ff00ff00ff00ff)) + ((count >> 8) & UINT64CONST(0x00ff00ff00ff00ff)); + count = (count & UINT64CONST(0x0000ffff0000ffff)) + ((count >> 16) & UINT64CONST(0x0000ffff0000ffff)); + count = (count & UINT64CONST(0x00000000ffffffff)) + ((count >> 32) & UINT64CONST(0x00000000ffffffff)); + return (int) count; +} + +/** + * @brief Set specified bit in char array to 1. + * + * @param[in, out] bitData Pointer to an array of char. + * @param[in] bitId Zero-origin bit ID to set. 
+ */ +static inline void +vci_SetBit(char *bitData, uint16 bitId) +{ + bitData[bitId >> 3] |= 1 << (bitId & 7); +} + +#endif /* VCI_DEBUG_H */ diff --git a/contrib/vci/include/vci_wos.h b/contrib/vci/include/vci_wos.h new file mode 100644 index 000000000000..7bc302b91379 --- /dev/null +++ b/contrib/vci/include/vci_wos.h @@ -0,0 +1,29 @@ +/*------------------------------------------------------------------------- + * + * vci_wos.h + * Declarations of WOS functions + * + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_wos.h + * + *------------------------------------------------------------------------- + */ +#ifndef VCI_WOS_H +#define VCI_WOS_H + +#include "postgres.h" + +#include "storage/itemptr.h" +#include "lib/rbtree.h" +#include "utils/relcache.h" +#include "utils/snapshot.h" + +extern Snapshot vci_GetSnapshotForWos2Ros(void); +extern Snapshot vci_GetSnapshotForLocalRos(TransactionId inclusive_xid, TransactionId exclusive_xid); + +extern PGDLLEXPORT uint64 vci_EstimateNumEntriesInHeapRelation(Oid oid); + +#endif /* VCI_WOS_H */ diff --git a/contrib/vci/include/vci_xact.h b/contrib/vci/include/vci_xact.h new file mode 100644 index 000000000000..67fe1e4960d7 --- /dev/null +++ b/contrib/vci/include/vci_xact.h @@ -0,0 +1,39 @@ +/*------------------------------------------------------------------------- + * + * vci_xact.h + * Transaction control + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/include/vci_xact.h + * + *------------------------------------------------------------------------- + */ + +#ifndef VCI_XACT_H +#define VCI_XACT_H + +#include "access/xact.h" + +struct vci_MainRelHeaderInfo; + +/* + * States of transactions + */ +enum vci_xact_status_kind +{ + VCI_XACT_INVALID, /* invalid transaction ID */ + VCI_XACT_SELF, /* my transaction */ + VCI_XACT_IN_PROGRESS, /* in-progress transaction (not mine) */ + 
VCI_XACT_DID_COMMIT, /* committed transaction */ + VCI_XACT_DID_ABORT, /* aborted transaction */ + VCI_XACT_DID_CRASH /* a crash happened during the transaction */ +}; + +extern enum vci_xact_status_kind vci_transaction_get_type(TransactionId xid); + +extern int64 vci_GenerateXid64(TransactionId target_xid, struct vci_MainRelHeaderInfo *info); +extern void vci_UpdateXidGeneration(struct vci_MainRelHeaderInfo *info); + +#endif /* VCI_XACT_H */ diff --git a/contrib/vci/meson.build b/contrib/vci/meson.build new file mode 100644 index 000000000000..130075b037bc --- /dev/null +++ b/contrib/vci/meson.build @@ -0,0 +1,67 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +subdir('executor') +subdir('storage') +subdir('utils') + + +vci_sources = files( + 'vci_main.c', + 'vci_read_guc.c', + 'vci_shmem.c', + 'vci_supported_funcs.c', + 'vci_supported_types.c', +) + +vci_sources += vci_executor_sources + +vci_sources += vci_storage_sources + +vci_sources += vci_utils_sources + +if host_system == 'windows' + vci_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'vci', + '--FILEDESC', 'vci - vertical clustered index',]) +endif + +if host_system == 'solaris' + ldflags += ['-lc -lkstat'] +endif + +vci_cflags = [] + +vci_cflags += var_cflags_sl + +vci = shared_module('vci', + vci_sources, + c_args : vci_cflags, + include_directories : include_directories('../../contrib/vci/include'), + link_args: ldflags, + kwargs: contrib_mod_args, +) + +install_data( + 'vci.control', + 'vci--1.0.sql', + kwargs:contrib_data_args, +) + +tests += { + 'name': 'vci', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'vci', + 'bugs', + ], + 'regress_args': ['--temp-config', files('vci.conf')], + + # runningcheck is disabled because a running instance needs to have + # "shared_preload_libraries=vci", but typical runningcheck users + # (e.g. build farm clients) won't have that setting so they would fail. 
+ # Note: This is copied from contrib/pg_stat_statements/meson.build + 'runningcheck' : false, + } +} diff --git a/contrib/vci/sql/bugs.sql b/contrib/vci/sql/bugs.sql new file mode 100644 index 000000000000..6b77938ed8af --- /dev/null +++ b/contrib/vci/sql/bugs.sql @@ -0,0 +1,87 @@ +-- Bug reported by Japin Li that caused a vci_beginscan PANIC +-- See https://www.postgresql.org/message-id/ME0P300MB04457E24CA8965F008FB2CDBB648A%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM + +CREATE TABLE t1 (id int, info text); +CREATE INDEX t1_id_idx ON t1 USING vci (id); +INSERT INTO t1 SELECT id, md5(id::text) FROM generate_series(1, 1000) id; +SET enable_seqscan TO off; +SELECT * FROM t1 WHERE id = 100; +DROP TABLE t1; + +-- Bug reported by Japin Li that VACUUM caused a TRAP +-- See https://www.postgresql.org/message-id/SY8P300MB0442BEC3F5CF432F0121ACC4B642A%40SY8P300MB0442.AUSP300.PROD.OUTLOOK.COM + +CREATE TABLE t2 (id int, info text) WITH (autovacuum_enabled = off); +CREATE INDEX t2_id_idx ON t2 USING vci (id); +INSERT INTO t2 SELECT id, 'test' || id FROM generate_series(1, 1000) id; +DELETE FROM t2 WHERE id % 10 = 0; +VACUUM t2; +DROP TABLE t2; + +-- Bug reported by Japin Li that caused a Segmentation Violation attempting to REFRESH a VCI internal relation +-- See https://www.postgresql.org/message-id/ME0P300MB0445EBA04D6947DD717074DFB65CA%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM + +CREATE TABLE t3 (id int, info text); +CREATE INDEX ON t3 USING vci (id); +SELECT relname FROM pg_class WHERE relname ~ '^pg_vci_*' LIMIT 1 \gset +SELECT * FROM :relname; +\d+ :relname +REFRESH MATERIALIZED VIEW :relname; +DROP TABLE t3; + +-- Bug missing logic. Ensure VCI internal relations get removed when the TABLE is dropped. 
+ +CREATE TABLE t4 (id int, info text); +CREATE INDEX t4_idx ON t4 USING vci (id); +SELECT relname FROM pg_class WHERE relname ~ '^pg_vci_*' ORDER BY relname; +DROP TABLE t4; +SELECT relname FROM pg_class WHERE relname ~ '^pg_vci_*'; + +-- Bug reported by Japin Li that REINDEX forgot to restore security context +-- See https://www.postgresql.org/message-id/ME0P300MB0445827B6E9CC04E0FAEE446B624A%40ME0P300MB0445.AUSP300.PROD.OUTLOOK.COM + +CREATE TABLE t5 (id int, info text); +CREATE INDEX t5_idx ON t5 USING vci (id); +REINDEX TABLE t5; +REINDEX TABLE t5; +DROP TABLE t5; + +-- InstrStartNode bug: +-- Unexpected error "InstrStartNode called twice in a row" +-- NOTE -Change the EXPLAIN below to use TIMING TRUE reproduce the bug, +-- otherwise leave it FALSE so timings don't cause 'make check' to fail. + +CREATE TABLE t6(id int, info text); +CREATE INDEX t6_id_idx ON t6 USING vci (id); +INSERT INTO t6 SELECT id, 'info' || id FROM generate_series(1, 500) id; +ANALYZE t6; +EXPLAIN (ANALYZE, COSTS FALSE, BUFFERS FALSE, TIMING FALSE, SUMMARY FALSE) SELECT max(id) FROM t6; +DROP TABLE t6; + +-- Bug reported by Timur: VCI Sort does not work on top of a non-VCI join +-- See https://www.postgresql.org/message-id/a27f68845af78d404459fcab940bfae2ec7755e5.camel%40postgrespro.ru + +CREATE TABLE main (id BIGSERIAL PRIMARY KEY); +CREATE TABLE secondary (id BIGSERIAL PRIMARY KEY, main_id BIGINT REFERENCES main (id), val INTEGER); + +CREATE INDEX main_vci ON main USING vci (id); +CREATE INDEX sec_vci ON secondary USING vci (id, main_id, val); + +-- Check VCI Sort is not put on top of non-VCI join +EXPLAIN (ANALYZE, COSTS FALSE, BUFFERS FALSE, TIMING FALSE, SUMMARY FALSE) +SELECT * + FROM main m + JOIN secondary s + ON m.id = s.main_id + WHERE s.val in ( + SELECT MAX(val) + FROM secondary s2 + WHERE s2.main_id = m.id) + ORDER BY s.val; + +-- Check VCI Sort is used if suitable +EXPLAIN (ANALYZE, COSTS FALSE, BUFFERS FALSE, TIMING FALSE, SUMMARY FALSE) +SELECT * FROM secondary s ORDER BY 
s.val; + +DROP TABLE secondary; +DROP TABLE main; diff --git a/contrib/vci/sql/vci.sql b/contrib/vci/sql/vci.sql new file mode 100644 index 000000000000..3fec0b396af9 --- /dev/null +++ b/contrib/vci/sql/vci.sql @@ -0,0 +1,108 @@ +CREATE EXTENSION vci; +SELECT amname, amhandler, amtype FROM pg_am WHERE amname = 'vci'; + +SET vci.table_rows_threshold = 0; + +CREATE TABLE testtable ( + key int, + cond int, + c01a bool, + c01b bool, + c02 bytea, + c03 "char", + c05 int8, + c06 int2, + c07 int4, + c08 text, + c09 float4, + c10 float8, + c13 interval, + c15 money, + c16 bpchar, + c17 varchar, + c18 date, + c19 time, + c20 timetz, + c21 timestamp, + c22 timestamptz, + c23a bit, + c23b bit, + c24a varbit, + c24b varbit, + c25 numeric, + c26 uuid); + +-- Input data +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) +SELECT + i % 10, -- key int + i % 21.000000000000, -- cond int + (i % 3.000000000000) > 0, -- c01a bool + (i % 11.000000000000) > 0, -- c01b bool + CAST(to_char((i % 1001.000000000000), '9999') AS bytea), -- c02 bytea + CAST(CAST((i % 249.000000000000) AS character varying) AS "char"), -- c03 "char" + i % 651.000000000000 + i % 350, -- c05 int8 + i % 1001.000000000000, -- c06 in2 + i % 1001.000000000000, -- c07 int4 + i % 1001.000000000000, -- c08 text + i % 1001.000000000000, -- c09 float4 + i % 1001.000000000000, -- c10 float8 + i % 1001.000000000000 * interval '1h', -- c13 interval + (i % 1001.000000000000)::integer::money, -- c15 money + i % 1001.000000000000, -- c16 bpchar + i % 1001.000000000000, -- c17 varchar + date '2015-12-21' + 1 % 1001.000000000000 * interval '1d', -- c18 date + TIMESTAMP '2015-12-21' + (i % 1001.000000000000) * interval '1h', -- c19 time + TIMESTAMP WITH TIME ZONE '2015-12-21 10:00:00+09' + (i % 1001.000000000000) * interval '1h', -- c20 timetz + TIMESTAMP '2015-12-21' + (i % 1001.000000000000) * interval '1h', -- c21 
timestamp + TIMESTAMP WITH TIME ZONE '2015-12-21 10:00:00+09' + (i % 1001.000000000000) * interval '1h', -- c22 timestamptz + ((i % 3.000000000000)>0)::integer::bit(1), -- c23a bit + ((i % 11.000000000000)>0)::integer::bit(1), -- c23b bit + (i % 1001.000000000000)::integer::bit(10), -- c24a varbit + (i % 999.000000000000)::integer::bit(10), -- c24b varbit + i % 1001.000000000000, -- c25 numeric + 'a0eebc99-9c0b-4ef8-bb6d-6bb9bd380a11'::uuid -- c26 uuid +FROM generate_series(1, 10000) AS i; + +-- Testcase: insert with some NULL values +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) SELECT i, 1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM generate_series(1, 9) AS i; + +-- Testcase: insert with some special values +INSERT INTO testtable (key, cond, c09, c10, c18, c21, c22) VALUES (7, 1, 'Infinity', 'Infinity', 'Infinity', 'infinity', 'infinity'); +INSERT INTO testtable (key, cond, c09, c10, c18, c21, c22) VALUES (8, 1, 'Infinity', 'Infinity', 'Infinity', 'infinity', 'infinity'); +INSERT INTO testtable (key, cond, c09, c10, c18, c21, c22) VALUES (8, 1, '-Infinity', '-Infinity', '-Infinity', '-infinity', '-infinity'); +INSERT INTO testtable (key, cond, c09, c10) VALUES (9, 1, 'NaN', 'NaN'); + +-- Testcase: NaN only +INSERT INTO testtable (key, cond, c09, c10) VALUES (10, 1, 'NaN', 'NaN'); +INSERT INTO testtable (key, cond, c09, c10) VALUES (10, 1, 'NaN', 'NaN'); +INSERT INTO testtable (key, cond, c05) VALUES (10, 1, 1); + +-- Testcase: Timestamp with timezone +INSERT INTO testtable (key, cond, c18, c22) VALUES (11, 1, TIMESTAMP WITH TIME ZONE '2004-10-19 01:00:00+01', TIMESTAMP WITH TIME ZONE '2004-10-19 02:00:00+01'); +INSERT INTO testtable (key, cond, c18, c22) VALUES (11, 1, TIMESTAMP WITH TIME ZONE '2004-10-19 02:00:00+02', TIMESTAMP WITH 
TIME ZONE '2004-10-19 02:00:00+02'); +INSERT INTO testtable (key, cond, c18, c22) VALUES (11, 1, TIMESTAMP WITH TIME ZONE '2004-10-19 01:00:00+02', TIMESTAMP WITH TIME ZONE '2004-10-19 01:00:00+02'); +INSERT INTO testtable (key, cond, c05) VALUES (11, 1, 1); + +-- Testcase: few attributes are valid +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) SELECT 98, 1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM generate_series(1, 50) AS i; + +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25) VALUES (98, 1, true, true, 'text', 1::char, 1, 1, 1, 'text', 1.0, 1.0, 1, 'text', 'text', timestamp '2015-12-22', timestamp '2015-12-22', timestamp with time zone '2015-12-22 10:23:54+02', timestamp '2015-12-22', timestamp with time zone '2015-12-22 10:23:54+02', 1::bit(1), 1::bit(1), 1::bit(10), 1::bit(10), 1); + +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) SELECT 98, 1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM generate_series(1, 50) AS i; + +INSERT INTO testtable (key, cond, c01a, c01b, c02, c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26) SELECT 99, 1, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL FROM generate_series(1, 100) AS i; + +-- Create an index which uses VCI index access method +CREATE INDEX testindex ON testtable USING vci (key, cond, c01a, c01b, c02, 
c03, c05, c06, c07, c08, c09, c10, c13, c15, c16, c17, c18, c19, c20, c21, c22, c23a, c23b, c24a, c24b, c25, c26); + +-- We expect VCI plans are chosen here +EXPLAIN (ANALYZE, TIMING OFF, COSTS OFF, SUMMARY OFF, BUFFERS OFF) +SELECT key, count(*) AS count_star, count(c05) AS count_c05 FROM testtable WHERE NOT cond = 0 GROUP BY key ORDER BY key; + +-- Confirms the aggregation can work. The first column indicates whether the +-- VCI scan was used. +SELECT vci_runs_in_query() AS vci_runs_in_query, key, count(*) AS count_star, count(c05) AS count_c05 FROM testtable WHERE NOT cond = 0 GROUP BY key ORDER BY key; + +-- cleanup +DROP TABLE testtable; diff --git a/contrib/vci/storage/Makefile b/contrib/vci/storage/Makefile new file mode 100644 index 000000000000..332e9a529413 --- /dev/null +++ b/contrib/vci/storage/Makefile @@ -0,0 +1,34 @@ +# contrib/vci/storage/Makefile + +SUBOBJS = \ + vci_chunk.o \ + vci_columns.o \ + vci_columns_data.o \ + vci_fetch.o \ + vci_freelist.o \ + vci_index.o \ + vci_internal_view.o \ + vci_low_utils.o \ + vci_memory_entry.o \ + vci_ros.o \ + vci_ros_command.o \ + vci_ros_daemon.o \ + vci_tidcrid.o \ + vci_wos.o \ + vci_xact.o + +EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) + +PG_CPPFLAGS = -I $(top_srcdir)/contrib/vci/include + +ifdef USE_PGXS +PGXS := $(shell pg_config --pgxs) +include $(PGXS) +else +subdir = contrib/vci/storage +top_builddir = ../../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +override CFLAGS += $(CFLAGS_SL) diff --git a/contrib/vci/storage/meson.build b/contrib/vci/storage/meson.build new file mode 100644 index 000000000000..247acb057638 --- /dev/null +++ b/contrib/vci/storage/meson.build @@ -0,0 +1,19 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +vci_storage_sources = files( + 'vci_chunk.c', + 'vci_columns.c', + 'vci_columns_data.c', + 'vci_fetch.c', + 'vci_freelist.c', + 'vci_index.c', + 'vci_internal_view.c', + 'vci_low_utils.c', + 'vci_memory_entry.c', + 'vci_ros.c', + 'vci_ros_command.c', + 'vci_ros_daemon.c', + 'vci_tidcrid.c', + 'vci_wos.c', + 'vci_xact.c', +) diff --git a/contrib/vci/storage/vci_chunk.c b/contrib/vci/storage/vci_chunk.c new file mode 100644 index 000000000000..62cb8b19a052 --- /dev/null +++ b/contrib/vci/storage/vci_chunk.c @@ -0,0 +1,616 @@ +/*------------------------------------------------------------------------- + * + * vci_chunk.c + * Buffering mechanism used for WOS->ROS conversion + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_chunk.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "miscadmin.h" +#include "utils/snapmgr.h" +#include "utils/timestamp.h" +#include "utils/uuid.h" + +#include "vci.h" +#include "vci_chunk.h" +#include "vci_columns.h" +#include "vci_columns_data.h" +#include "vci_ros.h" + +static void +InitOneRosChunkBufferCore(RosChunkBuffer *rosChunkBuffer, + int numRowsAtOnce, + int16 *columnIdList, + int16 *columnSizeList, + bool useTid, + bool useDeleteVector, + vci_MainRelHeaderInfo *info) +{ + int16 colId; + int16 nullBitId; + char *bufferIndex; + char *bufferData; + int sizeIndexArray; + Size sizeTuple = 0; + + const int16 numColumns = rosChunkBuffer->numColumns; + const int16 numNullableColumns = 
rosChunkBuffer->numNullableColumns; + + rosChunkBuffer->numColumnsWithIndex = 0; + rosChunkBuffer->nullWidthInByte = (numNullableColumns + BITS_PER_BYTE - 1) / BITS_PER_BYTE; + rosChunkBuffer->numFilled = 0; + rosChunkBuffer->compType = palloc(sizeof(vcis_compression_type_t) * + numColumns); + rosChunkBuffer->nullBitId = palloc(sizeof(int16) * numColumns); + rosChunkBuffer->columnSizeList = palloc(sizeof(int16) * numColumns); + rosChunkBuffer->data = palloc0(sizeof(void *) * numColumns * 2); + rosChunkBuffer->dataOffset = (vci_offset_in_extent_t **) + &(rosChunkBuffer->data[numColumns]); + MemCpy(rosChunkBuffer->columnSizeList, + columnSizeList, + sizeof(int16) * numColumns); + rosChunkBuffer->nullData = palloc(rosChunkBuffer->nullWidthInByte * + numRowsAtOnce); + rosChunkBuffer->tidData = useTid ? palloc(sizeof(ItemPointerData) * numRowsAtOnce) + : NULL; + rosChunkBuffer->deleteData = useDeleteVector + ? palloc(vci_RoundUpValue(numRowsAtOnce, 8)) + : NULL; + + for (colId = VCI_FIRST_NORMALCOLUMN_ID, nullBitId = 0; colId < numColumns; ++colId) + { + vcis_compression_type_t compType; + + compType = vci_GetMColumn(info, columnIdList ? columnIdList[colId] : colId) + ->comp_type; + rosChunkBuffer->compType[colId] = compType; + switch (compType) + { + case vcis_compression_type_fixed_raw: + rosChunkBuffer->dataOffset[colId] = NULL; + break; + case vcis_compression_type_variable_raw: + + /* + * we put the value 1 in rosChunkBuffer->dataOffset[colId] as + * a mark that later in this function the memory area to keep + * offsets should be allocated. 
+ */ + rosChunkBuffer->dataOffset[colId] = (vci_offset_in_extent_t *) 1; + ++(rosChunkBuffer->numColumnsWithIndex); + break; + default: + elog(ERROR, "unsupported compression type"); /* FIXME */ + } + sizeTuple += columnSizeList[colId]; + if (0 < numNullableColumns) + rosChunkBuffer->nullBitId[colId] = nullBitId++; + } + Assert(nullBitId == numNullableColumns); + + sizeIndexArray = sizeof(vci_offset_in_extent_t) * + rosChunkBuffer->numColumnsWithIndex; + rosChunkBuffer->dataAllocPtr = palloc((sizeIndexArray * + (numRowsAtOnce + 1)) + + (sizeTuple * numRowsAtOnce) + + (VCI_DATA_ALIGNMENT_IN_STORAGE + * numColumns)); + bufferIndex = rosChunkBuffer->dataAllocPtr; + bufferData = &(bufferIndex[sizeIndexArray * (numRowsAtOnce + 1)]); + for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < numColumns; ++colId) + { + int colSize = vci_RoundUpValue(columnSizeList[colId] * numRowsAtOnce, + VCI_DATA_ALIGNMENT_IN_STORAGE); + + rosChunkBuffer->data[colId] = bufferData; + bufferData += colSize; + + /* + * we put 1 in rosChunkBuffer->dataOffset[colId] for those columns to + * need offset data. + */ + if (rosChunkBuffer->dataOffset[colId]) + { + rosChunkBuffer->dataOffset[colId] = + (vci_offset_in_extent_t *) bufferIndex; + bufferIndex += sizeof(vci_offset_in_extent_t) * + (numRowsAtOnce + 1); + rosChunkBuffer->dataOffset[colId][0] = 0; + } + } +} + +/** + * @brief Initialize a buffer to keep a chunk for ROS. + * + * The buffer initialized by this function must be destroyed by + * vci_DestroyOneRosChunkBuffer(). + * + * @param[out] rosChunkBuffer data in rosChunkBuffer is initialized. + * @param[in] numRowsAtOnce number of rows to be stored in a chunk. + * @param[in] columnSizeList worst-case column sizes. + * @param[in] numColumns number of columns. + * @param[in] info VCI main relation header information. 
+ */ +void +vci_InitOneRosChunkBuffer(RosChunkBuffer *rosChunkBuffer, + int numRowsAtOnce, + int16 *columnSizeList, + int numColumns, + bool useDeleteVector, + vci_MainRelHeaderInfo *info) +{ + rosChunkBuffer->numColumns = numColumns; + rosChunkBuffer->numNullableColumns = vci_GetNumberOfNullableColumn( + vci_GetTupleDescr(info)); + InitOneRosChunkBufferCore(rosChunkBuffer, + numRowsAtOnce, + NULL, + columnSizeList, + true, + useDeleteVector, + info); +} + +/** + * @brief Destroy chunk buffer. + * + * @param[in] rosChunkBuffer target to destroy. + */ +void +vci_DestroyOneRosChunkBuffer(RosChunkBuffer *rosChunkBuffer) +{ + Assert(rosChunkBuffer); + + if (NULL == rosChunkBuffer->compType) + return; + + vci_PfreeAndNull(&(rosChunkBuffer->compType)); + vci_PfreeAndNull(&(rosChunkBuffer->nullBitId)); + vci_PfreeAndNull(&(rosChunkBuffer->columnSizeList)); + vci_PfreeAndNull(&(rosChunkBuffer->data)); + vci_PfreeAndNull(&(rosChunkBuffer->nullData)); + vci_PfreeAndNull(&(rosChunkBuffer->tidData)); + vci_PfreeAndNull(&(rosChunkBuffer->deleteData)); + vci_PfreeAndNull(&(rosChunkBuffer->dataAllocPtr)); + rosChunkBuffer->numColumns = 0; +} + +/** + * @brief Initialize a RosChunkStorage, which holds multiple + * RosChunkBuffer. + * + * @param[out] rosChunkStorage pointer to the target RosChunkStorage that + * is initialized. + * @param[in] numRowsAtOnce number of rows to be stored in a chunk + * @param[in] forAppending false for normal ROS creation. + * make true only for collect-deleted-rows with appending new data. + * + * @note The instance should be destroyed by vci_DestroyRosChunkStorage(). 
+ */ +void +vci_InitRosChunkStorage(RosChunkStorage *rosChunkStorage, + int numRowsAtOnce, + bool forAppending) +{ + Assert(rosChunkStorage); + rosChunkStorage->numChunks = (VCI_NUM_ROWS_IN_EXTENT + numRowsAtOnce - 1) / + numRowsAtOnce; + + if ((rosChunkStorage->forAppending = forAppending)) /* pgr0011 */ + rosChunkStorage->numChunks *= 2; + + rosChunkStorage->numFilled = 0; + rosChunkStorage->numTotalRows = 0; + rosChunkStorage->chunk = palloc0(sizeof(void *) * + rosChunkStorage->numChunks); +} + +/** + * @brief Reset RosChunkStorage to reuse it for new extent creation. + * + * The RosChunkBuffer's held by the storage are destroyed. + * + * @param[in] rosChunkStorage the target to be reset. + */ +void +vci_ResetRosChunkStorage(RosChunkStorage *rosChunkStorage) +{ + int cId; + + for (cId = 0; cId < rosChunkStorage->numFilled; ++cId) + { + vci_DestroyOneRosChunkBuffer(rosChunkStorage->chunk[cId]); + rosChunkStorage->chunk[cId] = NULL; + } + rosChunkStorage->numFilled = 0; + rosChunkStorage->numTotalRows = 0; +} + +/** + * @brief Destroy RosChunkStorage. + * + * The RosChunkBuffer's held by the storage are also destroyed. + * + * @param[in] rosChunkStorage the target to be destroyed. + */ +void +vci_DestroyRosChunkStorage(RosChunkStorage *rosChunkStorage) +{ + int cId; + + Assert(rosChunkStorage); + if (NULL == rosChunkStorage->chunk) + return; + + for (cId = 0; cId < rosChunkStorage->numFilled; ++cId) + vci_DestroyOneRosChunkBuffer(rosChunkStorage->chunk[cId]); + pfree(rosChunkStorage->chunk); + rosChunkStorage->chunk = NULL; + rosChunkStorage->numChunks = 0; + rosChunkStorage->numFilled = 0; +} + +/** + * @brief Fill one tuple in a RosChunkBuffer. + * + * @param[in] rosChunkBuffer the buffer where the tuple is stored into. + * @param[in] info VCI main relation header information. + * @param[in] tid the tid to be stored. + * @param[in] tuple the tuple to be stored. + * @param[in] dstColumnIdList the target column IDs in the VCI. 
+ * @param[in] heapAttrNumList attribute numbers of the target columns
+ * in the original heap tuple.
+ * @param[in] tupleDesc the tuple descriptor of the original heap
+ * relation.
+ */
+void
+vci_FillOneRowInRosChunkBuffer(RosChunkBuffer *rosChunkBuffer,
+							   vci_MainRelHeaderInfo *info,
+							   ItemPointer tid,
+							   HeapTuple tuple,
+							   int16 *dstColumnIdList,
+							   AttrNumber *heapAttrNumList,
+							   TupleDesc tupleDesc)
+{
+	int16 colId;
+	int offset = (rosChunkBuffer->numFilled)++;	/* index of the row being filled; numFilled is bumped here */
+	int nullWidthInByte = rosChunkBuffer->nullWidthInByte;
+	char *nullData = (NULL == rosChunkBuffer->nullData) ? NULL :
+		&(rosChunkBuffer->nullData[nullWidthInByte * offset]);	/* this row's null bits, or NULL if none are kept */
+
+	if (nullData)
+		MemSet(nullData, 0, nullWidthInByte);	/* start with every null bit cleared */
+
+	if (rosChunkBuffer->tidData)
+		MemCpy(&(rosChunkBuffer->tidData[sizeof(ItemPointerData) * offset]), tid, sizeof(ItemPointerData));
+
+	for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < rosChunkBuffer->numColumns; ++colId)
+	{
+		bool isnull;
+		Datum datum = heap_getattr(tuple,
+								   heapAttrNumList[colId],
+								   tupleDesc,
+								   &isnull);
+
+		if (isnull)
+		{
+			Assert((VCI_FIRST_NORMALCOLUMN_ID <= dstColumnIdList[colId]) &&
+				   (dstColumnIdList[colId] <
+					vci_GetMainRelVar(info, vcimrv_num_columns, 0)));
+			if (nullData)
+				vci_SetBit(nullData,
+						   rosChunkBuffer->nullBitId[dstColumnIdList[colId]]);
+
+			switch (rosChunkBuffer->compType[colId])	/* a NULL still occupies a slot in the column data */
+			{
+				case vcis_compression_type_fixed_raw:
+					{
+						int size = rosChunkBuffer->columnSizeList[colId];
+						char *ptr;
+
+						ptr = &(rosChunkBuffer->data[colId][offset * size]);
+						if (0 == offset)
+							MemSet(ptr, 0, size);	/* first row: store zeros */
+						else
+							MemCpy(ptr, &(ptr[-size]), size);	/* otherwise repeat the previous row's bytes */
+					}
+					break;
+				case vcis_compression_type_variable_raw:
+					{
+						static struct varlena datumNull;	/* placeholder written for NULLs, built once */
+						static vci_offset_in_extent_t size = 0;	/* 0 means "not initialized yet" */
+						vci_offset_in_extent_t curOffset;
+
+						if (size == 0)
+						{
+							/* One-time initialization */
+
+							MemSet(&datumNull, 0, sizeof(datumNull));
+
+							/*
+							 * A varlena for external data is of type 1B_E and has
+							 * a length of zero. We must give a length of 1 or
+							 * larger to normal varlena data.
+							 */
+							SET_VARSIZE_SHORT(&datumNull, 1);
+							size = 1;
+						}
+						curOffset = rosChunkBuffer->dataOffset[colId][offset];
+						rosChunkBuffer->dataOffset[colId][offset + 1] =
+							curOffset + size;	/* the next row starts right after the placeholder */
+						MemCpy(&(rosChunkBuffer->data[colId][curOffset]),
+							   &datumNull,
+							   size);
+					}
+					break;
+				default:
+					elog(ERROR, "unsupported compression type");	/* FIXME */
+
+			}
+		}
+		else
+		{
+			switch (rosChunkBuffer->compType[colId])
+			{
+				case vcis_compression_type_fixed_raw:
+					{
+						int size = rosChunkBuffer->columnSizeList[colId];
+						char *ptr;
+
+						ptr = &(rosChunkBuffer->data[colId][offset * size]);
+						if (size <= sizeof(Datum))	/* value is taken from the Datum itself */
+						{
+							switch (size)
+							{
+								case 1:
+									*ptr = DatumGetUInt8(datum);
+									break;
+								case 2:
+									{
+										uint16 val = DatumGetUInt16(datum);
+
+										MemCpy(ptr, &val, sizeof(uint16));
+									}
+									break;
+								case 4:
+									{
+										uint32 val = DatumGetUInt32(datum);
+
+										MemCpy(ptr, &val, sizeof(uint32));
+									}
+									break;
+								case 8:
+									{
+										uint64 val = DatumGetInt64(datum);	/* NOTE(review): signed getter stored in a uint64; only the bytes are copied */
+
+										MemCpy(ptr, &val, sizeof(uint64));
+									}
+									break;
+								default:
+									elog(ERROR, "unsupported fixed length");
+							}
+						}
+						else	/* value is copied from the memory the Datum points to */
+						{
+							size = rosChunkBuffer->columnSizeList[colId];
+
+							/* FIXME */
+
+							/*
+							 * sizeof(TimeTzADT) is 16, 4 bytes are padding,
+							 * so we cannot use (sizeof(TimeTzADT) == size).
+							 * Instead, (12U == size). Can we use better way?
+							 */
+							Assert((12U == size) ||
+								   (sizeof(Interval) == size) ||
+								   (UUID_LEN == size) ||
+								   (NAMEDATALEN == size));
+							MemCpy(ptr, DatumGetPointer(datum), size);
+						}
+					}
+					break;
+
+					/* FIXME */
+
+					/*
+					 * We need to fill variable length data into fixed length
+					 * area in order to reduce the space for the offsets and
+					 * headers.
+					 */
+				case vcis_compression_type_variable_raw:
+					{
+						vci_offset_in_extent_t curOffset;
+						vci_offset_in_extent_t size = VARSIZE_ANY(DatumGetPointer(datum));	/* total bytes including the varlena header */
+
+						/* Check worst size. */
+						Assert(size <= rosChunkBuffer->columnSizeList[colId]);
+
+						curOffset = rosChunkBuffer->dataOffset[colId][offset];
+						rosChunkBuffer->dataOffset[colId][offset + 1] =
+							curOffset + size;	/* offset where the next row's data will begin */
+						MemCpy(&(rosChunkBuffer->data[colId][curOffset]),
+							   DatumGetPointer(datum),
+							   size);
+					}
+					break;
+				default:
+					elog(ERROR, "unsupported compression type");	/* FIXME */
+			}
+		}
+	}
+}
+
+/**
+ * @brief Reset counter of a RosChunkBuffer.
+ *
+ * @param[in] buffer the target RosChunkBuffer.
+ */
+void
+vci_ResetRosChunkBufferCounter(RosChunkBuffer *buffer)
+{
+	int colId;
+
+	for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < buffer->numColumns; ++colId)
+	{
+		if (NULL != buffer->dataOffset[colId])	/* only columns that keep an offset array */
+			buffer->dataOffset[colId][0] = 0;	/* the next fill starts again at offset 0 */
+	}
+	buffer->numFilled = 0;	/* buffered bytes are left in place; only the counters are reset */
+}
+
+/**
+ * @brief Create a copy of a RosChunkBuffer.
+ *
+ * Memory is allocated only for the capacity that is actually needed.
+ *
+ * @param[in] src the original RosChunkBuffer.
+ * @return pointer to the copy.
+ *
+ * @note The created RosChunkBuffer should be destroyed by
+ * vci_DestroyOneRosChunkBuffer().
+ */
+static RosChunkBuffer *
+vci_CopyRosChunkBuffer(RosChunkBuffer *src)
+{
+	Size totalSize;
+	int16 colId;
+	char *bufferIndex;	/* write cursor for the copied offset arrays */
+	char *bufferData;	/* write cursor for the copied column data */
+	int sizeIndexArray;
+	int16 numColumns = src->numColumns;
+	int numFilled = src->numFilled;
+	RosChunkBuffer *dst;
+
+	CHECK_FOR_INTERRUPTS();
+
+	dst = vci_AllocateAndCopy(src, sizeof(RosChunkBuffer));	/* copy of the struct; pointer members are replaced below */
+
+	dst->compType = vci_AllocateAndCopy(src->compType,
+										sizeof(vcis_compression_type_t) * numColumns);
+	dst->nullBitId = vci_AllocateAndCopy(src->nullBitId,
+										 sizeof(int16) * numColumns);
+	dst->columnSizeList = vci_AllocateAndCopy(src->columnSizeList,
+											  sizeof(int16) * numColumns);
+	dst->nullData = vci_AllocateAndCopy(src->nullData,
+										src->nullWidthInByte * numFilled);	/* only the filled rows are copied */
+	dst->tidData = vci_AllocateAndCopy(src->tidData,
+									   sizeof(ItemPointerData) * numFilled);
+	dst->deleteData = NULL;
+	if (src->deleteData)
+		dst->deleteData = vci_AllocateAndCopy(src->deleteData,
+											  vci_RoundUpValue(numFilled, 8));
+
+	CHECK_FOR_INTERRUPTS();
+
+	dst->data = palloc(sizeof(void *) * numColumns * 2);	/* one allocation holds both pointer arrays */
+	dst->dataOffset = (vci_offset_in_extent_t **) &(dst->data[numColumns]);	/* second half of that allocation */
+
+	sizeIndexArray = sizeof(vci_offset_in_extent_t) * src->numColumnsWithIndex;	/* presumably one offset entry per column that has an offset array -- TODO confirm */
+	totalSize = sizeIndexArray * (numFilled + 1);	/* pgr0062 */
+	for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < numColumns; ++colId)
+	{
+		if (NULL == src->dataOffset[colId])
+			totalSize += src->columnSizeList[colId] * numFilled;	/* no offset array: fixed width per row */
+		else
+			totalSize += src->dataOffset[colId][numFilled] -
+				src->dataOffset[colId][0];	/* bytes between the first and last offsets */
+	}
+	dst->dataAllocPtr = palloc(totalSize);	/* single area: offset arrays first, column data after them */
+	bufferIndex = dst->dataAllocPtr;
+	bufferData = &(bufferIndex[sizeIndexArray * (numFilled + 1)]);
+	for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < numColumns; ++colId)
+	{
+		if (0 == (colId & 3))	/* check for interrupts once every four columns */
+			CHECK_FOR_INTERRUPTS();
+
+		if (NULL == src->dataOffset[colId])
+		{
+			Size copySize = src->columnSizeList[colId] * numFilled;
+
+			dst->data[colId] = bufferData;
+			MemCpy(bufferData, src->data[colId], copySize);
+			bufferData += copySize;
+			dst->dataOffset[colId] = NULL;
+		}
+		else
+		{
+			Size copySize = src->dataOffset[colId][numFilled] -
+				src->dataOffset[colId][0];
+
+			dst->data[colId] = bufferData;
+			MemCpy(bufferData, src->data[colId], copySize);
+			bufferData += copySize;
+			copySize = sizeof(vci_offset_in_extent_t) * (numFilled + 1);	/* reused: now the size of the offset array */
+			dst->dataOffset[colId] = (vci_offset_in_extent_t *) bufferIndex;
+			MemCpy(bufferIndex, src->dataOffset[colId], copySize);	/* offsets are copied verbatim */
+			bufferIndex += copySize;
+		}
+	}
+
+	return dst;
+}
+
+/**
+ * @brief Register a RosChunkBuffer to a RosChunkStorage.
+ *
+ * The storage keeps a copy made by vci_CopyRosChunkBuffer(), not src itself.
+ *
+ * @param[in] rosChunkStorage the holder of RosChunkBuffer.
+ * @param[in] src the RosChunkBuffer to be registered.
+ */
+void
+vci_RegisterChunkBuffer(RosChunkStorage *rosChunkStorage, RosChunkBuffer *src)
+{
+	Assert(rosChunkStorage->numFilled < rosChunkStorage->numChunks);	/* capacity is only checked in assert-enabled builds */
+	rosChunkStorage->chunk[rosChunkStorage->numFilled] =
+		vci_CopyRosChunkBuffer(src);
+	++(rosChunkStorage->numFilled);
+	rosChunkStorage->numTotalRows += src->numFilled;
+}
+
+/**
+ * @brief Calculate the data size of the specified column
+ * in the RosChunkStorage.
+ *
+ * @param[in] src the target RosChunkStorage to be inspected.
+ * @param[in] columnId the ID of the target column.
+ * @param[in] asFixed true to treat a variable-field-length column as a
+ * fixed-field-length column.
+ */
+Size
+vci_GetDataSizeInChunkStorage(RosChunkStorage *src, int columnId, bool asFixed)	/* NOTE(review): asFixed is not referenced in this body */
+{
+	Size dataSize;
+	int chunkId;
+
+	if (src->numFilled < 1)	/* no chunk registered yet */
+		return 0;
+
+	Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < src->chunk[0]->numColumns));
+
+	switch (src->chunk[0]->compType[columnId])	/* the first chunk's type is taken as the column's type */
+	{
+		case vcis_compression_type_fixed_raw:
+			return src->numTotalRows * src->chunk[0]->columnSizeList[columnId];	/* rows times fixed width */
+
+		default:
+			;
+	}
+
+	dataSize = 0;
+	for (chunkId = 0; chunkId < src->numFilled; ++chunkId)
+	{
+		RosChunkBuffer *chunk = src->chunk[chunkId];
+		vci_offset_in_extent_t *dataOffset = chunk->dataOffset[columnId];
+
+		dataSize += dataOffset[chunk->numFilled] - dataOffset[0];	/* bytes this chunk holds for the column */
+	}
+
+	return dataSize;	/* sum over all registered chunks */
+}
diff --git a/contrib/vci/storage/vci_columns.c b/contrib/vci/storage/vci_columns.c
new file mode 100644
index 000000000000..122ad8729ff5
--- /dev/null
+++ b/contrib/vci/storage/vci_columns.c
@@ -0,0 +1,1163 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_columns.c
+ * Column store which consists of ROS
+ *
+ * Column store consists of a main and a meta relation. Main relation
+ * consists of some extents and dictionaries. This file contains their
+ * handlings.
+ *
+ * The delete vector is also handled here.
+ * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_columns.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include +#include + +#include "access/xact.h" +#include "catalog/index.h" + +#include "postgresql_copy.h" + +#include "vci.h" +#include "vci_chunk.h" + +#include "vci_columns.h" +#include "vci_columns_data.h" + +#include "vci_freelist.h" +#include "vci_fetch.h" +#include "vci_mem.h" +#include "vci_ros.h" +#include "vci_tidcrid.h" + +#define VCI_LIMIT_INEFFICIENT_COUNT (10) +#define GROWTH_NODE (10) + +#define VCI_MINIMUM_DATA_AMOUNT_FOR_COMMON_DICT (64 * 1024 * 1024) + +/** + * function to cast from Page to (vcis_column_meta_t *). + */ +#define vci_GetColumnMetaT(page) \ + ((vcis_column_meta_t *) &((page)[VCI_MIN_PAGE_HEADER])) + +static void + UpdateInfoInMetaForFixedLengthRawData(vci_ColumnRelations *rel, + int numExtentPages); + +static uint32 GetVarlenAHeader(Datum *header, + Buffer *buffer, + BlockNumber *currentBlockNumber, + uint32 offsetInPage, + Relation rel); + +typedef struct vci_CmpInfo +{ + vci_DictInfo dict_info; + + /* + * pointer to compressed data. NULL if no compressed data, or the size of + * compressed data is larger than or equal to that of raw. In this case, + * the memory areas pointed by compressed_data and compressed_offset + * should be freed, and compressed_num_offset should be zero. 
+ */ + char *compressed_data; + + vci_offset_in_extent_t *compressed_offset; /* offset of each + * VCI_COMPACTION_UNIT_ROW */ + uint32 compressed_num_offset; /* number of offset */ +} vci_CmpInfo; + +static void +InitializeCmpInfo(vci_CmpInfo *cmpInfo) +{ + Assert(cmpInfo); + vci_InitializeDictInfo(&(cmpInfo->dict_info)); + cmpInfo->compressed_data = NULL; + cmpInfo->compressed_offset = NULL; + cmpInfo->compressed_num_offset = 0; +} + +/* *************************** + * Extent operation function + * *************************** + */ +void +vci_WriteRawDataExtentInfo(Relation rel, + int32 extentId, + uint32 startBlockNumber, + uint32 numBlocks, + char *minData, + char *maxData, + bool validMinMax, + bool checkOverwrite) +{ + Buffer bufMeta; + Buffer buffer; + BlockNumber blockNumber; + vcis_c_extent_t *columnExtent; + vcis_column_meta_t *columnMeta = vci_GetColumnMeta(&bufMeta, rel); + + Assert(false == validMinMax); + + columnExtent = vci_GetColumnExtent(&buffer, + &blockNumber, + rel, + extentId); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + /* vci_MinMaxTypeInfo *mmti = vci_GetMinMaxTypeInfo(attr); */ + if (checkOverwrite) + if (columnExtent->enabled && + (0 != columnExtent->block_number)) + elog(ERROR, "overwrite column meta data"); /* FIXME */ + + columnExtent->block_number = startBlockNumber; + columnExtent->num_blocks = numBlocks; + columnExtent->enabled = (startBlockNumber != InvalidBlockNumber); + columnExtent->valid_min_max = validMinMax; + + if (minData) + MemCpy(columnExtent->min, minData, columnMeta->min_max_content_size); + + if (maxData) + MemCpy(&(columnExtent->min[columnMeta->min_max_field_size]), + maxData, columnMeta->min_max_content_size); + + vci_WriteOneItemPage(rel, buffer); + UnlockReleaseBuffer(buffer); + ReleaseBuffer(bufMeta); +} + +static void +WriteFixedLengthRawData(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int16 columnId, + int extentId) +{ + vci_ColumnRelations rel; + int16 columnSize; + Size dataSize; + int 
extentHeaderSize; + int numExtentPages; + char minData[VCI_MAX_MIN_MAX_SIZE]; + char maxData[VCI_MAX_MIN_MAX_SIZE]; + BlockNumber startBlockNumber; + BlockNumber blockNumber; + uint32 offsetInPage; + int chunkId; + Buffer buffer = InvalidBuffer; + Page page = NULL; /* invalid page */ + LOCKMODE lockmode = RowExclusiveLock; + bool fixedPages = true; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + + Assert(info); + Assert(src); + Assert(0 < src->numFilled); + Assert(src->chunk[0]); + Assert((VCI_COLUMN_ID_TID == columnId) || + (VCI_COLUMN_ID_NULL == columnId) || + (columnId < src->chunk[0]->numColumns)); + + vci_OpenColumnRelations(&rel, info, columnId, lockmode); + + columnSize = vci_GetFixedColumnSize(info, columnId); + + dataSize = (Size) columnSize * VCI_NUM_ROWS_IN_EXTENT; + extentHeaderSize = vci_GetExtentFixedLengthRawDataHeaderSize( + VCI_NUM_ROWS_IN_EXTENT); + numExtentPages = vci_GetNumBlocks(dataSize + extentHeaderSize); + + startBlockNumber = extentId * numExtentPages; + if (VCI_FIRST_NORMALCOLUMN_ID <= columnId) + { + vcis_m_column_t *colInfo = vci_GetMColumn(info, columnId); + + switch (colInfo->comp_type) + { + case vcis_compression_type_fixed_raw: + vci_WriteRawDataExtentInfo(rel.meta, + extentId, + startBlockNumber, + numExtentPages, + NULL, + NULL, + false, + true); + UpdateInfoInMetaForFixedLengthRawData(&rel, + numExtentPages); + break; + default: + Assert(false); + elog(ERROR, "internal error"); + } + } + + vci_PreparePagesWithOneItemIfNecessary(rel.data, + startBlockNumber + numExtentPages - 1); + + vci_GetBlockNumberAndOffsetInPage(&blockNumber, + &offsetInPage, + extentHeaderSize); + blockNumber += startBlockNumber; + buffer = vci_ReadBufferWithPageInit(rel.data, blockNumber); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buffer); + + { + vcis_extent_t *extent = vci_GetExtentT(page); + + extent->size = numExtentPages * VCI_MAX_PAGE_SPACE; + extent->type = vcis_extent_type_data; + extent->id = extentId; + 
extent->comp_type = vcis_compression_type_fixed_raw; + extent->offset_offset = 0; + extent->offset_size = 0; + extent->data_offset = extentHeaderSize; + extent->data_size = dataSize; + extent->compressed = 0; + extent->dict_offset = 0; + extent->dict_size = 0; + extent->dict_type = vcis_dict_type_none; + } + + for (chunkId = 0; chunkId < src->numFilled; ++chunkId) + { + RosChunkBuffer *chunk = src->chunk[chunkId]; + int written; + int size; + + Assert(chunk); + + size = chunk->numFilled * columnSize; + for (written = 0; written < size;) + { + int writeSize; + + if (VCI_MAX_PAGE_SPACE <= offsetInPage) + { + if (BufferIsValid(buffer)) + { + vci_WriteOneItemPage(rel.data, buffer); + UnlockReleaseBuffer(buffer); + } + ++blockNumber; + /* FIXME */ + + /* + * To obtain better performance, each DB page should be + * initialized only when it is accessed for the first time. + */ + offsetInPage = 0; + buffer = ReadBuffer(rel.data, blockNumber); + vci_InitPageCore(buffer, 1, false); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buffer); + } + + writeSize = Min(VCI_MAX_PAGE_SPACE - offsetInPage, size - written); /* pgr0063 */ + switch (columnId) + { + case VCI_COLUMN_ID_TID: + MemCpy(&(((char *) page)[VCI_MIN_PAGE_HEADER + offsetInPage]), + &(chunk->tidData[written]), + writeSize); + break; + case VCI_COLUMN_ID_NULL: + MemCpy(&(((char *) page)[VCI_MIN_PAGE_HEADER + offsetInPage]), + &(chunk->nullData[written]), + writeSize); + break; + default: + MemCpy(&(((char *) page)[VCI_MIN_PAGE_HEADER + offsetInPage]), + &(chunk->data[columnId][written]), + writeSize); + break; + } + written += writeSize; + offsetInPage += writeSize; + } + } + if (BufferIsValid(buffer)) + { + vci_WriteOneItemPage(rel.data, buffer); + UnlockReleaseBuffer(buffer); + ++blockNumber; + + if (fixedPages) + { + for (; blockNumber < (startBlockNumber + numExtentPages); + ++blockNumber) + { + buffer = vci_ReadBufferWithPageInit(rel.data, blockNumber); + LockBuffer(buffer, 
BUFFER_LOCK_EXCLUSIVE); + vci_WriteOneItemPage(rel.data, buffer); + UnlockReleaseBuffer(buffer); + } + } + } + + vci_WriteRawDataExtentInfo(rel.meta, + extentId, + startBlockNumber, + numExtentPages, + minData, + maxData, + false, + false); + + vci_CloseColumnRelations(&rel, lockmode); +} + +static void +WriteVariableLengthRawData(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int columnId, + int extentId, + TransactionId xId) +{ + vci_ColumnRelations rel; + Size dataSize; + int extentHeaderSize; + int numExtentPages; + int numCommonDictPages; + char minData[VCI_MAX_MIN_MAX_SIZE]; + char maxData[VCI_MAX_MIN_MAX_SIZE]; + BlockNumber startBlockNumber; + BlockNumber blockNumber; + BlockNumber blockNumberOld = InvalidBlockNumber; + uint32 offsetInPage; + int chunkId; + Buffer buffer = InvalidBuffer; + LOCKMODE lockmode = RowExclusiveLock; + vcis_extent_t *extent; + vcis_compression_type_t compType; + vci_CmpInfo cmpInfo; + + vcis_free_space_t *FS; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + Assert(info); + Assert(src); + Assert(0 < src->numFilled); + Assert(src->chunk[0]); + Assert(columnId < src->chunk[0]->numColumns); + + InitializeCmpInfo(&cmpInfo); + + vci_OpenColumnRelations(&rel, info, columnId, lockmode); + + { + vcis_m_column_t *colInfo = vci_GetMColumn(info, columnId); + + compType = colInfo->comp_type; + } + Assert(compType == vcis_compression_type_variable_raw); + + dataSize = vci_GetDataSizeInChunkStorage(src, columnId, false); + + extentHeaderSize = vci_GetExtentVariableLengthRawDataHeaderSize( + src->numTotalRows); + + numExtentPages = vci_GetNumBlocks(dataSize + extentHeaderSize); + numCommonDictPages = 0; + + startBlockNumber = vci_FindFreeSpaceForExtent(&rel, numExtentPages + numCommonDictPages); + FS = vci_GetFreeSpace((vci_RelationPair *) &rel, startBlockNumber); + vci_WriteRecoveryRecordForFreeSpace(&rel, columnId, cmpInfo.dict_info.common_dict_id, + startBlockNumber, FS); + ReleaseBuffer(rel.bufData); + + 
vci_RemoveFreeSpaceFromLinkList(&rel, + startBlockNumber, + numExtentPages + numCommonDictPages); + + vci_WriteRawDataExtentInfo(rel.meta, + extentId, + startBlockNumber, + numExtentPages + numCommonDictPages, + NULL, /* min */ + NULL, /* max */ + false, + true); + + /* write the header part of extent data in data relation */ + blockNumberOld = blockNumber = startBlockNumber + numCommonDictPages; + buffer = vci_ReadBufferWithPageInit(rel.data, blockNumber); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + extent = vci_GetExtentT(BufferGetPage(buffer)); + extent->size = numExtentPages * VCI_MAX_PAGE_SPACE; + extent->type = vcis_extent_type_data; + extent->id = extentId; + extent->comp_type = compType; + extent->offset_offset = offsetof(vcis_extent_t, dict_body); + extent->offset_size = vci_GetOffsetArraySize(src->numTotalRows); + extent->data_offset = extentHeaderSize; + extent->data_size = dataSize; + extent->compressed = 0; + extent->dict_offset = VCI_INVALID_DICTIONARY_ID; + extent->dict_size = 0; + extent->dict_type = vcis_dict_type_none; + + /* write offset data */ + /*************** + * ** CAUTION ** + * ************* + * Here, we only record pointers of the head of each + * VCI_COMPACTION_UNIT_ROW entries. 
+ */ + vci_GetBlockNumberAndOffsetInPage(&blockNumber, + &offsetInPage, + extent->offset_offset); + blockNumber += startBlockNumber + numCommonDictPages; + + { /* raw data */ + /* + * Make offset data + */ +#ifdef USE_ASSERT_CHECKING + uint32 numRowSamples = vci_GetOffsetArrayLength(src->numTotalRows); +#endif /* #ifdef USE_ASSERT_CHECKING */ + uint32 offsetSize = vci_GetOffsetArraySize(src->numTotalRows); + vci_offset_in_extent_t *offset = palloc(offsetSize); + int rowId = 0; + int globalOffset = 0; + int offsetPtr = 0; + + for (chunkId = 0; chunkId < src->numFilled; ++chunkId) + { + RosChunkBuffer *chunk = src->chunk[chunkId]; + vci_offset_in_extent_t *dataOffset = chunk->dataOffset[columnId]; + int elemId = rowId % VCI_COMPACTION_UNIT_ROW; + + for (; elemId < chunk->numFilled; + elemId += VCI_COMPACTION_UNIT_ROW) + { + offset[offsetPtr++] = globalOffset + dataOffset[elemId]; + } + rowId += chunk->numFilled; + globalOffset += dataOffset[chunk->numFilled] - dataOffset[0]; + } + Assert(rowId == src->numTotalRows); + Assert(globalOffset == vci_GetDataSizeInChunkStorage(src, columnId, + false)); + Assert(offsetPtr == (numRowSamples - 1)); + offset[offsetPtr] = globalOffset; + + buffer = vci_WriteDataIntoMultiplePages(rel.data, + &blockNumber, + &blockNumberOld, + &offsetInPage, + buffer, + offset, + offsetSize); + pfree(offset); + } + + /* write data */ + vci_GetBlockNumberAndOffsetInPage(&blockNumber, + &offsetInPage, + extent->data_offset); + blockNumber += startBlockNumber + numCommonDictPages; + + { + for (chunkId = 0; chunkId < src->numFilled; ++chunkId) + { + RosChunkBuffer *chunk = src->chunk[chunkId]; + vci_offset_in_extent_t *dataOffset = chunk->dataOffset[columnId]; + int size = dataOffset[chunk->numFilled] - dataOffset[0]; + + Assert(chunk); + + buffer = vci_WriteDataIntoMultiplePages(rel.data, + &blockNumber, &blockNumberOld, &offsetInPage, + buffer, + chunk->data[columnId], size); + } + } + + if (BufferIsValid(buffer)) + { + 
vci_WriteOneItemPage(rel.data, buffer); + UnlockReleaseBuffer(buffer); + } + + vci_WriteRawDataExtentInfo(rel.meta, + extentId, + startBlockNumber + numCommonDictPages, + numExtentPages, + minData, + maxData, + false, + false); + + vci_CloseColumnRelations(&rel, lockmode); +} + +static void +WriteDeleteVector(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int extentId) +{ + vci_ColumnRelations rel; + int chunkId; + LOCKMODE lockmode = RowExclusiveLock; + int numExtentPages = VCI_NUM_PAGES_IN_EXTENT_FOR_DELETE; + BlockNumber startBlockNumber = numExtentPages * extentId; + + Buffer buffer; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + + vci_OpenColumnRelations(&rel, info, VCI_COLUMN_ID_DELETE, lockmode); + + vci_WriteRawDataExtentInfo(rel.meta, + extentId, + startBlockNumber, + numExtentPages, + NULL, + NULL, + false, + false /* don't check ovwerwrite */ ); + UpdateInfoInMetaForFixedLengthRawData(&rel, + numExtentPages); + { + int rId; + + for (rId = 0; rId < numExtentPages; ++rId) + { + vci_PreparePagesIfNecessaryCore(rel.data, + startBlockNumber + rId, + VCI_ITEMS_IN_PAGE_FOR_DELETE, + true, + true); + + buffer = ReadBuffer(rel.data, startBlockNumber + rId); + vci_InitPageCore(buffer, VCI_ITEMS_IN_PAGE_FOR_DELETE, false); + ReleaseBuffer(buffer); + } + } + + for (chunkId = 0; chunkId < src->numFilled; ++chunkId) + { + RosChunkBuffer *chunk = src->chunk[chunkId]; + + if (chunk->deleteData) + { + abort(); /* FIXME */ + } + } + + vci_WriteRawDataExtentInfo(rel.meta, + extentId, + startBlockNumber, + numExtentPages, + NULL, + NULL, + false, + false); + + vci_CloseColumnRelations(&rel, lockmode); +} + +void +vci_WriteOneExtent(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int extentId, + TransactionId xgen, /* xgen in extent info */ + TransactionId xdel, /* xdel in extent info */ + TransactionId xid) /* in tuple header */ +{ + int16 colId; + + Assert(src); + if (src->numTotalRows < 1) + return; + Assert(src->chunk[0]); + 
Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + + WriteDeleteVector(info, src, extentId); + WriteFixedLengthRawData(info, src, VCI_COLUMN_ID_TID, extentId); + WriteFixedLengthRawData(info, src, VCI_COLUMN_ID_NULL, extentId); + for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < src->chunk[0]->numColumns; ++colId) + { + CHECK_FOR_INTERRUPTS(); + + switch (src->chunk[0]->compType[colId]) + { + case vcis_compression_type_fixed_raw: + WriteFixedLengthRawData(info, src, colId, extentId); + break; + case vcis_compression_type_variable_raw: + WriteVariableLengthRawData(info, src, colId, extentId, xid); + break; + default: + elog(ERROR, "unsupported compression type"); /* FIXME */ + } + } + vci_WriteExtentInfo(info, + extentId, + src->numTotalRows, + 0, + 0, + xgen, + xdel); +} + +void +vci_InitializeDictInfo(vci_DictInfo *dictInfo) +{ + Assert(dictInfo); + dictInfo->dictionary_storage = NULL; + dictInfo->storage_size = 0; + dictInfo->extent_id = VCI_INVALID_EXTENT_ID; + dictInfo->common_dict_id = VCI_INVALID_DICTIONARY_ID; + dictInfo->dict_type = vcis_dict_type_none; +} + +vcis_c_extent_t * +vci_GetColumnExtent(Buffer *buffer, + BlockNumber *blockNumber, + Relation rel, + int32 extentId) +{ + Page page; + vcis_column_meta_t *columnMeta = vci_GetColumnMeta(buffer, rel); + + /* vci_MinMaxTypeInfo *mmti = vci_GetMinMaxTypeInfo(attr); */ + int columnExtentSize; + int headerSize = offsetof(vcis_column_meta_t, common_dict_info) + + (sizeof(vcis_c_common_dict_t) * columnMeta->num_common_dicts); + int numExtentsInFirstPage; + int numExtentsInPage; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + + *blockNumber = VCI_NUM_COLUMN_META_HEADER_PAGE - 1; + columnExtentSize = offsetof(vcis_c_extent_t, min) + (2 * columnMeta->min_max_field_size); + numExtentsInFirstPage = (VCI_MAX_PAGE_SPACE - headerSize) / + columnExtentSize; + if (extentId < numExtentsInFirstPage) + return (vcis_c_extent_t *) &(((char *) columnMeta) + [headerSize + (extentId * columnExtentSize)]); + 
ReleaseBuffer(*buffer); + + extentId -= numExtentsInFirstPage; + numExtentsInPage = VCI_MAX_PAGE_SPACE / columnExtentSize; + *blockNumber = extentId / numExtentsInPage; + extentId -= *blockNumber * numExtentsInPage; + *blockNumber += VCI_NUM_COLUMN_META_HEADER_PAGE; + vci_PreparePagesWithOneItemIfNecessary(rel, *blockNumber); + *buffer = ReadBuffer(rel, *blockNumber); + page = BufferGetPage(*buffer); + + return (vcis_c_extent_t *) &(((char *) page) + [VCI_MIN_PAGE_HEADER + (extentId * columnExtentSize)]); +} + +vcis_column_meta_t * +vci_GetColumnMeta(Buffer *buffer, Relation rel) +{ + Page page; + + *buffer = vci_ReadBufferWithPageInit(rel, VCI_COLUMN_META_HEADER_PAGE_ID); + page = BufferGetPage(*buffer); + + return vci_GetColumnMetaT(page); +} + +static void +GetColumnOids(Oid *metaOid, + Oid *dataOid, + vci_MainRelHeaderInfo *info, + int16 columnId) +{ + switch (columnId) + { + case VCI_COLUMN_ID_DELETE: + *metaOid = vci_GetMainRelVar(info, vcimrv_delete_meta_oid, 0); + *dataOid = vci_GetMainRelVar(info, vcimrv_delete_data_oid, 0); + break; + case VCI_COLUMN_ID_CRID: + *metaOid = InvalidOid; + *dataOid = InvalidOid; + break; + case VCI_COLUMN_ID_TID: + *metaOid = vci_GetMainRelVar(info, vcimrv_tid_meta_oid, 0); + *dataOid = vci_GetMainRelVar(info, vcimrv_tid_data_oid, 0); + break; + case VCI_COLUMN_ID_NULL: + *metaOid = vci_GetMainRelVar(info, vcimrv_null_meta_oid, 0); + *dataOid = vci_GetMainRelVar(info, vcimrv_null_data_oid, 0); + break; + default: + { + vcis_m_column_t *colInfo = vci_GetMColumn(info, columnId); + + *metaOid = colInfo->meta_oid; + *dataOid = colInfo->data_oid; + break; + } + } +} + +void +vci_OpenColumnRelations(vci_ColumnRelations *rel, + vci_MainRelHeaderInfo *info, + int16 columnId, + LOCKMODE lockmode) +{ + Oid metaOid; + Oid dataOid; + + GetColumnOids(&metaOid, &dataOid, info, columnId); + rel->meta = table_open(metaOid, lockmode); + rel->data = table_open(dataOid, lockmode); + + rel->info = info; +} + +void 
+vci_CloseColumnRelations(vci_ColumnRelations *rel, LOCKMODE lockmode) +{ + if (rel) + { + if (RelationIsValid(rel->data)) + table_close(rel->data, lockmode); + if (RelationIsValid(rel->meta)) + table_close(rel->meta, lockmode); + } +} + +static void +UpdateInfoInMetaForFixedLengthRawData(vci_ColumnRelations *rel, + int numExtentPages) +{ + vcis_column_meta_t *columnMeta; + + if (0 == numExtentPages) + return; + columnMeta = vci_GetColumnMeta(&rel->bufMeta, rel->meta); + if (0 < numExtentPages) /* an extent added */ + { + ++(columnMeta->num_extents); + if (columnMeta->num_free_pages < numExtentPages) + columnMeta->num_free_pages = 0; + else + columnMeta->num_free_pages = columnMeta->num_free_pages - + numExtentPages; + if (0 < columnMeta->num_free_page_blocks) + --(columnMeta->num_free_page_blocks); + } + else /* an extent deleted */ + { + Assert(0 < columnMeta->num_extents); + --(columnMeta->num_extents); + columnMeta->num_free_pages -= numExtentPages; + ++(columnMeta->num_free_page_blocks); + } + + LockBuffer(rel->bufMeta, BUFFER_LOCK_EXCLUSIVE); + vci_WriteOneItemPage(rel->meta, rel->bufMeta); + UnlockReleaseBuffer(rel->bufMeta); +} + +static uint32 +GetVarlenAHeader(Datum *header, + Buffer *buffer, + BlockNumber *currentBlockNumber, + uint32 offsetInPage, + Relation rel) +{ + Page page = BufferGetPage(*buffer); + char *curPtr = &(page[VCI_MIN_PAGE_HEADER + offsetInPage]); + int len = VCI_MAX_PAGE_SPACE - offsetInPage; + int reqLen = vci_VARHDSZ_ANY(curPtr); + + if (reqLen <= len) + { + MemCpy(header, curPtr, reqLen); + + return offsetInPage + reqLen; + } + + MemCpy(header, curPtr, len); + ReleaseBuffer(*buffer); + ++*currentBlockNumber; + *buffer = vci_ReadBufferWithPageInit(rel, *currentBlockNumber); + page = BufferGetPage(*buffer); + MemCpy(&(((char *) header)[len]), + &(page[VCI_MIN_PAGE_HEADER]), + reqLen - len); + + return reqLen - len; +} + +void +vci_GetElementPosition(uint32 *offset, /* not array */ + BlockNumber *blockNumberBase, + uint32 *dataOffset, 
+ vci_ColumnRelations *rel, + int32 extentId, + uint32 rowIdInExtent, + Form_pg_attribute attr) +{ + uint32 offset_[2]; + Size totalSize; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + vci_GetChunkPositionAndSize(offset_, + &totalSize, + blockNumberBase, + dataOffset, + rel, + extentId, + rowIdInExtent, + 1, + attr); + + *offset = offset_[0] + *dataOffset; + + { + uint32 rowIdInChunk = rowIdInExtent % VCI_COMPACTION_UNIT_ROW; + BlockNumber curBN = (*offset) / VCI_MAX_PAGE_SPACE; + BlockNumber oldBN = InvalidBlockNumber; + uint32 offsetInPage; + Buffer buffer = InvalidBuffer; + uint32 rowId; + + offsetInPage = (*offset) - (curBN * VCI_MAX_PAGE_SPACE); + curBN += *blockNumberBase; + + for (rowId = 0; rowId < rowIdInChunk; ++rowId) + { + Datum datum; + + if (oldBN != curBN) + { + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); + buffer = vci_ReadBufferWithPageInit(rel->data, curBN); + oldBN = curBN; + } + + GetVarlenAHeader(&datum, + &buffer, + &curBN, + offsetInPage, + rel->data); + + { + uint32 size = VARSIZE_ANY(&datum); + + (*offset) += size; + offsetInPage += size; + if (VCI_MAX_PAGE_SPACE <= offsetInPage) + { + offsetInPage -= VCI_MAX_PAGE_SPACE; + if (oldBN == curBN) + ++curBN; + else + oldBN = curBN; + Assert(offsetInPage < VCI_MAX_PAGE_SPACE); + } + } + } + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); + } + *offset -= *dataOffset; + Assert((*offset) < offset_[1]); +} + +void +vci_GetChunkPositionAndSize(uint32 *offset, + Size *totalSize, + BlockNumber *blockNumberBase, + uint32 *dataOffset, + vci_ColumnRelations *rel, + int32 extentId, + uint32 rowIdInExtent, + int32 numUnit, + Form_pg_attribute attr) +{ + uint32 offsetUnit; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + { + Buffer buffer; + Buffer bufData; + BlockNumber blockNumber; + Page page; + uint32 unitId = rowIdInExtent / VCI_COMPACTION_UNIT_ROW; + vcis_c_extent_t *cExtent = vci_GetColumnExtent(&buffer, + &blockNumber, + rel->meta, + extentId); + vcis_extent_t *extent; + + 
*blockNumberBase = cExtent->enabled ? cExtent->block_number : InvalidBlockNumber; + bufData = vci_ReadBufferWithPageInit(rel->data, *blockNumberBase); + page = BufferGetPage(bufData); + extent = vci_GetExtentT(page); + *dataOffset = extent->data_offset; + offsetUnit = (sizeof(uint32) * unitId) + extent->offset_offset; + ReleaseBuffer(bufData); + ReleaseBuffer(buffer); + } + + { + BlockNumber blockNumber; + uint32 offsetPtr; + Buffer buffer; + Page page; + int aId; + + vci_GetBlockNumberAndOffsetInPage(&blockNumber, + &offsetPtr, + offsetUnit); + blockNumber += *blockNumberBase; + buffer = vci_ReadBufferWithPageInit(rel->data, blockNumber); + page = BufferGetPage(buffer); + for (aId = 0; aId <= numUnit; ++aId) + { + offset[aId] = *(uint32 *) &(page[offsetPtr + VCI_MIN_PAGE_HEADER]); + offsetPtr += sizeof(uint32); + if (VCI_MAX_PAGE_SPACE <= offsetPtr) + { + ReleaseBuffer(buffer); + ++blockNumber; + buffer = vci_ReadBufferWithPageInit(rel->data, blockNumber); + page = BufferGetPage(buffer); + offsetPtr = 0; + } + } + *totalSize = offset[numUnit] - offset[0]; /* pgr0063 */ + ReleaseBuffer(buffer); + } +} + +/** + * @brief Get byte size of an entry in a column with fixed field length. + * + * @param[in] info pointer to the target vci_MainRelHeaderInfo. + * @param[in] columnId column ID in the VCI index. + * @return byte size of an entry in the column. + */ +uint16 +vci_GetFixedColumnSize(vci_MainRelHeaderInfo *info, int16 columnId) +{ + switch (columnId) + { + case VCI_COLUMN_ID_TID: + + return sizeof(ItemPointerData); + case VCI_COLUMN_ID_NULL: + + return vci_GetMainRelVar(info, vcimrv_null_width_in_byte, 0); + case VCI_COLUMN_ID_DELETE: + + return 1; + default:; + } + + { + vcis_m_column_t *colInfo; + + Assert(VCI_FIRST_NORMALCOLUMN_ID <= columnId); + colInfo = vci_GetMColumn(info, columnId); + + return colInfo->max_columns_size; + } +} + +/** + * @brief Get the position of the target entry in the relation of the column + * with fixed field. 
+ * + * @param[out] blockNumber block number of the target entry. + * @param[out] offset offset in the block where the target is written. + * @param[in] info pointer to the target vci_MainRelHeaderInfo. + * @param[in] columnId column ID in the VCI index. + * @param[in] extentId extent ID of the target entry + * @param[in] rowIdInExtent entry ID in the extent. + */ +void +vci_GetPositionForFixedColumn(BlockNumber *blockNumber, + uint32 *offset, + vci_MainRelHeaderInfo *info, + int16 columnId, + int32 extentId, + uint32 rowIdInExtent, + bool atEnd) +{ + uint32 columnSize = vci_GetFixedColumnSize(info, columnId); + Size dataSize = (Size) columnSize * VCI_NUM_ROWS_IN_EXTENT; + int32 extentHeaderSize = vci_GetExtentFixedLengthRawDataHeaderSize( + VCI_NUM_ROWS_IN_EXTENT); + uint32 numExtentPages = vci_GetNumBlocks(dataSize + extentHeaderSize); + + /* + * The start block number of extents can be directly calculated in the + * case of Fixed field length. + */ + uint32 startBlockNumber = extentId * numExtentPages; + uint32 extraOffset = extentHeaderSize + (rowIdInExtent * columnSize); + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + + if (atEnd) + extraOffset += columnSize - 1; + vci_GetBlockNumberAndOffsetInPage(blockNumber, offset, extraOffset); + *blockNumber += startBlockNumber; +} + +static void +InitColumnMetaRelation(vci_ColumnRelations *relPair, + Form_pg_attribute attr, + vcis_compression_type_t compType, + TupleDesc heapTupleDesc) +{ + vcis_column_meta_t *columnMeta; + BlockNumber firstBlockNumber = VCI_COLUMN_DATA_FIRST_PAGE_ID; + + vci_FormatPageWithOneItem(relPair->meta, VCI_COLUMN_META_HEADER_PAGE_ID); + + columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta); + LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE); + + if (attr) + { /* normal columns */ + columnMeta->pgsql_atttypid = attr->atttypid; + columnMeta->pgsql_attnum = vci_GetAttNum(heapTupleDesc, NameStr(attr->attname)); + columnMeta->pgsql_attlen = attr->attlen; + 
columnMeta->pgsql_atttypmod = attr->atttypmod; + + if (InvalidAttrNumber == columnMeta->pgsql_attnum) + ereport(ERROR, (errmsg("column missed in VCI index creation"), + errhint("This must never happen. " + "Give up to use VCI index."))); + } + else + { /* delete, null, or tid */ + columnMeta->pgsql_atttypid = InvalidOid; + columnMeta->pgsql_attlen = 0; + columnMeta->pgsql_atttypmod = 0; + } + + columnMeta->num_extents = 0; + columnMeta->num_extents_old = 0; + columnMeta->free_page_begin_id = firstBlockNumber; + columnMeta->free_page_end_id = firstBlockNumber; + columnMeta->free_page_prev_id = InvalidBlockNumber; + columnMeta->free_page_next_id = InvalidBlockNumber; + columnMeta->num_free_pages = 1; + columnMeta->num_free_pages_old = 1; + columnMeta->num_free_page_blocks = 1; + columnMeta->num_free_page_blocks_old = 1; + columnMeta->min_max_field_size = 0; + columnMeta->min_max_content_size = 0; + columnMeta->latest_common_dict_id = VCI_INVALID_DICTIONARY_ID; + + columnMeta->num_common_dicts = 0; + columnMeta->common_dict_info_offset = 0; + columnMeta->block_number_extent_offset = offsetof(vcis_column_meta_t, + common_dict_info); + + vci_WriteColumnMetaDataHeader(relPair->meta, relPair->bufMeta); + UnlockReleaseBuffer(relPair->bufMeta); +} + +static void +InitDeleteVectorRelation(vci_ColumnRelations *relPair) +{ + OffsetNumber oNum; + + vci_FormatPageWithItems(relPair->data, + VCI_COLUMN_DATA_FIRST_PAGE_ID, + VCI_ITEMS_IN_PAGE_FOR_DELETE); + relPair->bufData = ReadBuffer(relPair->data, VCI_COLUMN_DATA_FIRST_PAGE_ID); + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + + for (oNum = FirstOffsetNumber; + oNum <= VCI_ITEMS_IN_PAGE_FOR_DELETE; + ++oNum) + vci_WriteItem(relPair->data, relPair->bufData, oNum); + + UnlockReleaseBuffer(relPair->bufData); +} + +static void +InitColumnDataRelation(vci_ColumnRelations *relPair) +{ + vcis_free_space_t *freeSpace; + + vci_FormatPageWithOneItem(relPair->data, VCI_COLUMN_DATA_FIRST_PAGE_ID); + + freeSpace = 
vci_GetFreeSpace((vci_RelationPair *) relPair, VCI_COLUMN_DATA_FIRST_PAGE_ID); + freeSpace->size = MaxBlockNumber; + freeSpace->type = vcis_free_space; + freeSpace->prev_pos = InvalidBlockNumber; + freeSpace->next_pos = InvalidBlockNumber; + + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + vci_WriteOneItemPage(relPair->data, relPair->bufData); + UnlockReleaseBuffer(relPair->bufData); +} + +void +vci_InitializeColumnRelations(vci_MainRelHeaderInfo *info, + TupleDesc tupdesc, + Relation heapRel) +{ + const LOCKMODE lockmode = ShareLock; + int16 colId; + TupleDesc heapTupleDesc = RelationGetDescr(heapRel); + + Assert((INT64CONST(0xFFFFFFFFFFFF0000) & tupdesc->natts) == 0); + + for (colId = VCI_COLUMN_ID_DELETE; colId < (int16) tupdesc->natts; ++colId) + { + vci_ColumnRelations relPairData; + vci_ColumnRelations *relPair = &relPairData; + + Form_pg_attribute attr; + vcis_compression_type_t compType; + + if (colId >= VCI_FIRST_NORMALCOLUMN_ID) + { + attr = TupleDescAttr(tupdesc, colId); + compType = vci_GetMColumn(info, colId)->comp_type; + } + else + { + attr = NULL; + compType = vcis_compression_type_fixed_raw; + } + + vci_OpenColumnRelations(relPair, info, colId, lockmode); + InitColumnMetaRelation(relPair, attr, compType, heapTupleDesc); + + if (colId == VCI_COLUMN_ID_DELETE) + { + InitDeleteVectorRelation(relPair); + } + else + { + InitColumnDataRelation(relPair); + } + vci_CloseColumnRelations(relPair, lockmode); + } +} diff --git a/contrib/vci/storage/vci_columns_data.c b/contrib/vci/storage/vci_columns_data.c new file mode 100644 index 000000000000..1d3a4d5b6242 --- /dev/null +++ b/contrib/vci/storage/vci_columns_data.c @@ -0,0 +1,232 @@ +/*------------------------------------------------------------------------- + * + * vci_columns_data.c + * Definitions of functions to check which columns are indexed. 
+ * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_columns_data.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup_details.h" +#include "access/reloptions.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" +#include "catalog/objectaccess.h" +#include "funcapi.h" +#include "nodes/makefuncs.h" +#include "nodes/nodes.h" +#include "storage/lmgr.h" +#include "storage/lock.h" +#include "utils/builtins.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/varlena.h" + +#include "vci.h" +#include "vci_columns.h" +#include "vci_columns_data.h" +#include "vci_ros.h" + +static Bitmapset *parseVciColumnsIds(const char *vci_column_ids); + +/* Convert comma-separated column ids to Bitmapset */ +static Bitmapset * +parseVciColumnsIds(const char *vci_column_ids) +{ + List *columnlist; + ListCell *l; + + /* SplitIdentifierString can destroy the first argument. */ + char *copied_ids = pstrdup(vci_column_ids); + Bitmapset *indexedAttids = NULL; + int attid = 0; + + if (!SplitIdentifierString(copied_ids, ',', &columnlist)) + ereport(ERROR, (errmsg("internal error. failed to split"))); + + foreach(l, columnlist) + { + char *number_str = (char *) lfirst(l); + + /* The max id is '1600' -> 4 digits. 
*/ + int attid_diff = pg_strtoint32(number_str); + + attid += attid_diff; + + if (attid >= MaxHeapAttributeNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("invalid attribute number %d", attid + 1))); + + if (bms_is_member(attid, indexedAttids)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + (errmsg("duplicated columns in vci index creation")), + errhint("duplicated columns are specified"))); + + indexedAttids = bms_add_member(indexedAttids, attid); + } + + pfree(copied_ids); + + return indexedAttids; +} + +/* + * vci_ConvertAttidBitmap2String -- Convert a Bitmapset that represents which + attids are targets to comma separated string + */ +char * +vci_ConvertAttidBitmap2String(Bitmapset *attid_bitmap) +{ + int attid; + int preAttid = 0; + StringInfo buf = makeStringInfo(); + + attid = -1; + while ((attid = bms_next_member(attid_bitmap, attid)) >= 0) + { + if (attid >= MaxHeapAttributeNumber) + ereport(ERROR, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("invalid attribute number %d", attid + 1))); + + if (buf->len == 0) + appendStringInfo(buf, "%d", attid - preAttid); + else + appendStringInfo(buf, ",%d", attid - preAttid); + + preAttid = attid; + } + return buf->data; +} + +/* + * vci_ExtractColumnDataUsingIds -- returns TupleDesc that contains indexed columns + * information. + * + * The vci_GetTupleDescr() requires a prebuilt vci_MainRelHeaderInfo. So please use + * this when building a VCI because the structure is in the process of building. 
+ */ +TupleDesc +vci_ExtractColumnDataUsingIds(const char *vci_column_ids, Relation heapRel) +{ + int i; + int attid; + TupleDesc heapTupDesc; + TupleDesc result; + Bitmapset *indexedAttids = NULL; /* for duplication check */ + + heapTupDesc = RelationGetDescr(heapRel); + indexedAttids = parseVciColumnsIds(vci_column_ids); + result = CreateTemplateTupleDesc(bms_num_members(indexedAttids)); + + attid = -1; + i = 0; + while ((attid = bms_next_member(indexedAttids, attid)) >= 0) + { + if (attid >= heapTupDesc->natts) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid attribute number %d", attid + 1))); + + TupleDescCopyEntry(result, i + 1, heapTupDesc, attid + 1); + i++; + } + + bms_free(indexedAttids); + + return result; +} + +/* + * vci_GetTupleDescr -- returns TupleDesc that contains indexed columns + * information from vci_MainRelHeaderInfo. + */ +TupleDesc +vci_GetTupleDescr(vci_MainRelHeaderInfo *info) +{ + MemoryContext oldcontext; + + if (info->cached_tupledesc) + return info->cached_tupledesc; + + oldcontext = MemoryContextSwitchTo(info->initctx); + + info->cached_tupledesc = RelationGetDescr(info->rel); + + MemoryContextSwitchTo(oldcontext); + + return info->cached_tupledesc; +} + +Bitmapset * +vci_MakeIndexedColumnBitmap(Oid mainRelationOid, + MemoryContext sharedMemCtx, + LOCKMODE lockmode) +{ + Relation main_rel; + Bitmapset *result = NULL; + vci_MainRelHeaderInfo *info; + + info = MemoryContextAllocZero(sharedMemCtx, + sizeof(vci_MainRelHeaderInfo)); + main_rel = relation_open(mainRelationOid, lockmode); + vci_InitMainRelHeaderInfo(info, main_rel, vci_rc_query); + vci_KeepMainRelHeader(info); + + { + int32 indexNumColumns = vci_GetMainRelVar(info, + vcimrv_num_columns, 0); + int aId; + + for (aId = 0; aId < indexNumColumns; ++aId) + { + vcis_m_column_t *mColumn = vci_GetMColumn(info, aId); + LOCKMODE lockmode_for_meta = AccessShareLock; + Relation column_meta_rel = table_open(mColumn->meta_oid, lockmode_for_meta); + Buffer 
buffer; + vcis_column_meta_t *metaHeader = vci_GetColumnMeta(&buffer, column_meta_rel); + + Assert(metaHeader->pgsql_attnum > InvalidAttrNumber); + result = bms_add_member(result, metaHeader->pgsql_attnum); + ReleaseBuffer(buffer); + table_close(column_meta_rel, lockmode_for_meta); + } + } + + vci_ReleaseMainRelHeader(info); + relation_close(main_rel, lockmode); + + return result; +} + +/** + * @brief Get attribute number from the name. + * + * @param[in] desc The tuple descriptor of the relation. + * @param[in] name The name of attribute. + * @return The attribute number. + * If the name is not found in the descriptor, InvalidAttrNumber is returned. + */ +AttrNumber +vci_GetAttNum(TupleDesc desc, const char *name) +{ + int aId; + + for (aId = 0; aId < desc->natts; ++aId) + { + if (strcmp(name, NameStr(TupleDescAttr(desc, aId)->attname)) == 0) + return aId + 1; + } + + return InvalidAttrNumber; +} diff --git a/contrib/vci/storage/vci_fetch.c b/contrib/vci/storage/vci_fetch.c new file mode 100644 index 000000000000..588ae59fa433 --- /dev/null +++ b/contrib/vci/storage/vci_fetch.c @@ -0,0 +1,2497 @@ +/*------------------------------------------------------------------------- + * + * vci_fetch.c + * Column fetch store + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_fetch.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/heapam_xlog.h" +#include "access/xact.h" +#include "catalog/index.h" +#include "miscadmin.h" +#include "storage/lmgr.h" +#include "storage/procarray.h" /* for TransactionIdIsInProgress() */ +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +#include "vci.h" +#include "vci_ros.h" +#include "vci_columns.h" +#include "vci_columns_data.h" + +#include "vci_fetch.h" +#include "vci_ros.h" +#include "vci_utils.h" +#include 
"vci_wos.h" +#include "vci_tidcrid.h" +#include "vci_xact.h" + +static void ChangeLockModeInQueryContext(vci_CSQueryContext queryContext, LOCKMODE new_lockmode); + +static int +CompAttrNumber(const void *a, const void *b) +{ + AttrNumber aA = *(const AttrNumber *) a; + AttrNumber aB = *(const AttrNumber *) b; + + return aA - aB; +} + +static bool +NeedDatumPointer(vci_MainRelHeaderInfo *info, int16 columnId) +{ + TupleDesc tupDesc; + + if (columnId < VCI_FIRST_NORMALCOLUMN_ID) + return false; + tupDesc = vci_GetTupleDescr(info); + + return vci_PassByRefForFixed(TupleDescAttr(tupDesc, columnId)); +} + +/** + * @brief Create query context. + * + * @param[in] mainRelationOid Oid of VCI main relation. + * @param[in] numReadColumns The number of read columns in the part of query. + * @param[in] attrNum The attribute numbers in the original heap relation, + * not those of the VCI main relation. + * @param[in] sharedMemCtx The shared memory context to keep elements of + * query context, fetch context, local ROS. + * @param[in] lockmode lockmode set after local ROS is generated. + * @return The pointer to the allocated vci_CSQueryContext. 
/**
 * @brief Create query context.
 *
 * The context and everything hanging off it (the sorted attribute-number
 * array, the column-id array and the vci_MainRelHeaderInfo) are allocated in
 * sharedMemCtx.  The heap relation is opened with AccessShareLock and the VCI
 * main relation with the given lockmode; both stay open until
 * vci_CSDestroyQueryContext() is called.
 *
 * @param[in] mainRelationOid Oid of VCI main relation.
 * @param[in] numReadColumns The number of read columns in the part of query.
 * @param[in] attrNum The attribute numbers in the original heap relation,
 * not those of the VCI main relation.  The array is copied and the copy is
 * sorted; the caller's array is left untouched.
 * @param[in] sharedMemCtx The shared memory context to keep elements of
 * query context, fetch context, local ROS.
 * @param[in] lockmode lock mode used to open the VCI main relation; it is
 * remembered in the context.
 * @return The pointer to the allocated vci_CSQueryContext.
 */
vci_CSQueryContext
vci_CSCreateQueryContextWLockMode(Oid mainRelationOid,
								  int numReadColumns,
								  AttrNumber *attrNum,
 /* attribute number in original relation */
								  MemoryContext sharedMemCtx,
								  LOCKMODE lockmode)
{
	TransactionId curRosVer;
	TransactionId lastRosVer;
	vci_CSQueryContext result;
	Relation	rel;

	result = MemoryContextAllocZero(sharedMemCtx, sizeof(vci_CSQueryContextData));
	result->shared_memory_context = sharedMemCtx;

	result->lockmode = lockmode;

	result->main_relation_oid = mainRelationOid;

	/* The heap relation is found through the index's pg_index entry. */
	result->heap_rel = relation_open(IndexGetRelation(mainRelationOid, false),
									 AccessShareLock);

	/* Keep a private, ascending-sorted copy of the requested attributes. */
	result->num_columns = numReadColumns;
	result->attr_num = MemoryContextAllocZero(sharedMemCtx,
											  sizeof(AttrNumber) * numReadColumns);
	MemCpy(result->attr_num, attrNum, sizeof(AttrNumber) * numReadColumns);
	qsort(result->attr_num, numReadColumns, sizeof(AttrNumber), CompAttrNumber);
	result->column_id = MemoryContextAllocZero(sharedMemCtx,
											   sizeof(int16) * numReadColumns);

	/* No local ROS and no delete list yet. */
	result->num_local_ros_extents = 0;
	result->local_ros = NULL;

	result->num_delete = 0;
	result->delete_list = NULL;

	/* Open the main relation and pin its header for the context's lifetime. */
	result->info = MemoryContextAllocZero(sharedMemCtx,
										  sizeof(vci_MainRelHeaderInfo));
	rel = relation_open(mainRelationOid, result->lockmode);
	vci_InitMainRelHeaderInfo(result->info, rel, vci_rc_query);
	vci_KeepMainRelHeader(result->info);
	result->num_nullable_columns = vci_GetMainRelVar(result->info,
													 vcimrv_num_nullable_columns,
													 0);
	result->null_width_in_byte = vci_GetMainRelVar(result->info,
												   vcimrv_null_width_in_byte,
												   0);
	result->num_ros_extents = vci_GetMainRelVar(result->info,
												vcimrv_num_extents,
												0);

	curRosVer = vci_GetMainRelVar(result->info, vcimrv_current_ros_version, 0);
	lastRosVer = vci_GetMainRelVar(result->info, vcimrv_last_ros_version, 0);

	/*
	 * Choose which ROS version this query reads, depending on the state of
	 * the transaction recorded as the current ROS version.
	 */
	switch (vci_transaction_get_type(curRosVer))
	{
			/* Current version is usable: read it. */
		case VCI_XACT_DID_COMMIT:
		case VCI_XACT_SELF:
			result->ros_version = curRosVer;
			result->inclusive_xid = curRosVer;
			result->exclusive_xid = InvalidTransactionId;
			result->tid_crid_diff_sel = vci_GetMainRelVar(result->info, vcimrv_tid_crid_diff_sel, 0);
			break;

			/* Current version is not usable: fall back to the last one. */
		case VCI_XACT_IN_PROGRESS:
		case VCI_XACT_DID_CRASH:
		case VCI_XACT_DID_ABORT:
			result->ros_version = lastRosVer;
			result->inclusive_xid = InvalidTransactionId;
			result->exclusive_xid = curRosVer;
			result->tid_crid_diff_sel = vci_GetMainRelVar(result->info, vcimrv_tid_crid_diff_sel_old, 0);
			break;

		case VCI_XACT_INVALID:
			elog(ERROR, "current ROS version is invalid");	/* @todo */
			break;
	}

	Assert(TransactionIdIsValid(result->ros_version));

	{
		int32		indexNumColumns = vci_GetMainRelVar(result->info,
														vcimrv_num_columns, 0);
		AttrNumber *indexAttNums = palloc(sizeof(AttrNumber) * indexNumColumns);
		int			aId;
		TupleDesc	descHeap = RelationGetDescr(result->heap_rel);

		/*
		 * Collect, for every VCI column, the heap attribute number stored in
		 * the header of its column meta relation.
		 */
		for (aId = 0; aId < indexNumColumns; ++aId)
		{
			vcis_m_column_t *mColumn = vci_GetMColumn(result->info, aId);
			LOCKMODE	lockmode_asl = AccessShareLock;
			Relation	mcol_rel = table_open(mColumn->meta_oid, lockmode_asl);
			Buffer		buffer;
			vcis_column_meta_t *metaHeader = vci_GetColumnMeta(&buffer, mcol_rel);

			indexAttNums[aId] = metaHeader->pgsql_attnum;
			ReleaseBuffer(buffer);
			table_close(mcol_rel, lockmode_asl);
		}

		/*
		 * Translate each requested heap attribute into the VCI column ID,
		 * i.e. its position in indexAttNums.
		 */
		for (aId = 0; aId < numReadColumns; ++aId)
		{
			AttrNumber	attNum = TupleDescAttr(descHeap, result->attr_num[aId] - 1)
				->attnum;

			/* AttrNumber is 1 origin. We use 0 origin value. */
			result->column_id[aId] = FindInt16(indexAttNums, indexNumColumns,
											   attNum);
			/* NOTE(review): presumably FindInt16 returns a negative value on a miss -- confirm */
			Assert(0 <= result->column_id[aId]);
		}

		pfree(indexAttNums);
	}

	/* Entry-count estimates for the data WOS and whiteout WOS relations. */
	result->num_data_wos_entries =
		vci_EstimateNumEntriesInHeapRelation(vci_GetMainRelVar(result->info, vcimrv_data_wos_oid, 0));

	result->num_whiteout_wos_entries =
		vci_EstimateNumEntriesInHeapRelation(vci_GetMainRelVar(result->info, vcimrv_whiteout_wos_oid, 0));

	return result;
}
+ * + * @param[in] queryContext Pointer to the target context to be destroy. + */ +void +vci_CSDestroyQueryContext(vci_CSQueryContext queryContext) +{ + vci_MainRelHeaderInfo *info; + Relation heapRel; + Relation indexRel; + + Assert(queryContext != NULL); + Assert(queryContext->info != NULL); + + info = queryContext->info; + + heapRel = queryContext->heap_rel; + indexRel = info->rel; + + vci_ReleaseMainRelHeader(info); + + if (RelationIsValid(indexRel)) + table_close(indexRel, queryContext->lockmode); + + if (RelationIsValid(heapRel)) + table_close(heapRel, AccessShareLock); + + if (queryContext->column_id) + pfree(queryContext->column_id); + + if (queryContext->attr_num) + pfree(queryContext->attr_num); + + pfree(info); + pfree(queryContext); +} + +static void +ChangeLockModeInQueryContext(vci_CSQueryContext queryContext, LOCKMODE new_lockmode) +{ + Assert(queryContext); + + if (queryContext->lockmode != new_lockmode) + { + Assert(queryContext->info); + Assert(queryContext->info->rel); + LockRelation(queryContext->info->rel, new_lockmode); + UnlockRelation(queryContext->info->rel, queryContext->lockmode); + queryContext->lockmode = new_lockmode; + } +} + +/** + * @brief Collect information of the specified column and the maximum size of + * tuples. + * + * @param info[in] Pointer to the VCI master information. + * @param fetchContext[in] Pointer to the fetch context. + * @param columnId[in] column ID in VCI main relation. + * @param datumSize[in,out] If not NULL, the datum size is written. + * @param maxElemSize[in,out] If not NULL, the maximum size of an element is written. + * @param maxDictSize[in,out] If not NULL, the maximum size of dictionary is written. + * @param nullBitId[in,out] If not NULL, the null bit Id is written. + * @param compType[in,out] If not NULL, the compression type is written. + * @param atttypid[in,out] If not NULL, the atttypid in PostgreSQL is written. 
+ * @param strictDatumType[in,out] If not NULL, the flag is returned, indicating + * true as Datum has only the pointer to the real value, or false that + * Datum has the value itself. + * @return the maximum size of elements, the same value in maxElemSize. + */ +static int +SizeOfElementAndPointer(vci_MainRelHeaderInfo *info, + vci_CSFetchContext fetchContext, + int16 columnId, /* the column ID in VCI main relation */ + uint32 *datumSize, + uint32 *maxElemSize, + uint32 *maxDictSize, + int32 *nullBitId, + vcis_compression_type_t *compType, + Oid *atttypid, + bool *strictDatumType) +{ + int maxElementSize; + + if (nullBitId) + *nullBitId = -1; /* default is not nullable */ + + if (compType) + *compType = vcis_compression_type_fixed_raw; + + if (atttypid) + *atttypid = InvalidOid; + + if (strictDatumType) + *strictDatumType = false; + + if (maxDictSize) + *maxDictSize = 0; + + if (VCI_FIRST_NORMALCOLUMN_ID <= columnId) + { + TupleDesc desc = vci_GetTupleDescr(info); + vcis_m_column_t *mColumn = vci_GetMColumn(info, columnId); + + Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < desc->natts)); + + maxElementSize = mColumn->max_columns_size; + if (maxElemSize) + *maxElemSize = maxElementSize; + + if (compType) + *compType = mColumn->comp_type; + + if (nullBitId) + *nullBitId = vci_GetBitIdInNullBits(desc, columnId); + + if (atttypid) + *atttypid = TupleDescAttr(desc, columnId)->atttypid; + + switch (mColumn->comp_type) + { + case vcis_compression_type_fixed_raw: + { + if (vci_PassByRefForFixed(TupleDescAttr(desc, columnId))) + { + if (strictDatumType) + *strictDatumType = true; + if (datumSize) + *datumSize = sizeof(Datum); + } + else + { + if (strictDatumType) + *strictDatumType = false; + if (datumSize) + *datumSize = maxElementSize; + } + } + break; + case vcis_compression_type_variable_raw: + if (strictDatumType) + *strictDatumType = true; + if (datumSize) + *datumSize = sizeof(Datum); + break; + /* for compressions */ + default: + ereport(ERROR, 
(errmsg("internal error: unsupported compression type"), errhint("Disable VCI by 'SELECT vci_disable();'"))); + } + + /* + * we put large data in some area, and Datum have the pointer + */ + if (NeedDatumPointer(info, columnId)) + maxElementSize = MAXALIGN(maxElementSize) + sizeof(Datum); + + return maxElementSize; + } + + switch (columnId) + { + case VCI_COLUMN_ID_TID: + maxElementSize = sizeof(int64); + break; + + case VCI_COLUMN_ID_NULL: + maxElementSize = sizeof(bool) * fetchContext->num_columns; + break; + + case VCI_COLUMN_ID_DELETE: + maxElementSize = sizeof(uint16); + break; + + case VCI_COLUMN_ID_CRID: + maxElementSize = sizeof(uint64); + break; + + default: + abort(); + } + + if (datumSize) + *datumSize = maxElementSize; + + if (maxElemSize) + *maxElemSize = maxElementSize; + + return maxElementSize; +} + +static vci_MainRelHeaderInfo * +GetMainRelHeaderInfoFromFetchContext(vci_CSFetchContext fetchContext) +{ + return (fetchContext->info) ? fetchContext->info + : fetchContext->query_context->info; +} + +/** + * @brief Obtain tuple size where each attribute is aligned by MAXALIGN. 
+ */ +static void +GetWorstCaseTupleSize(vci_CSFetchContext fetchContext, + Size *sumWorstCaseDictionarySize_, + Size *sumWorstCaseValueSize_, + Size *sumWorstCaseFlagSize_, + Size *sumWorstCaseAreaSize_, + int16 numReadColumns, + /* attribute number in original relation */ + AttrNumber *attrNum, + bool returnTid, + bool returnCrid) +{ + vci_CSQueryContext queryContext = fetchContext->query_context; + vci_MainRelHeaderInfo *info = GetMainRelHeaderInfoFromFetchContext(fetchContext); + int aId; + uint32 datumSize; + uint32 maxElemSize; + uint32 maxDictSize; + bool strictDatumType; + Size sumWorstCaseDictionarySize = 0; + Size sumWorstCaseValueSize = 0; + Size sumWorstCaseFlagSize = 0; + Size sumWorstCaseAreaSize = 0; + + for (aId = 0; aId < numReadColumns; ++aId) + { + int16 colId = fetchContext->column_link[aId]; + + SizeOfElementAndPointer(info, fetchContext, + queryContext->column_id[colId], + &datumSize, &maxElemSize, &maxDictSize, + NULL, NULL, NULL, &strictDatumType); + sumWorstCaseValueSize += TYPEALIGN(sizeof(Datum), datumSize); + sumWorstCaseDictionarySize += MAXALIGN(maxDictSize); + if (strictDatumType) + sumWorstCaseAreaSize += MAXALIGN(maxElemSize); + } + + SizeOfElementAndPointer(info, fetchContext, VCI_COLUMN_ID_NULL, + &datumSize, &maxElemSize, &maxDictSize, + NULL, NULL, NULL, &strictDatumType); + sumWorstCaseFlagSize += MAXALIGN(maxElemSize); + sumWorstCaseDictionarySize += MAXALIGN(maxDictSize); + + SizeOfElementAndPointer(info, fetchContext, VCI_COLUMN_ID_DELETE, + &datumSize, &maxElemSize, &maxDictSize, + NULL, NULL, NULL, &strictDatumType); + sumWorstCaseFlagSize += MAXALIGN(maxElemSize); + sumWorstCaseDictionarySize += MAXALIGN(maxDictSize); + + if (fetchContext->need_tid) + { + SizeOfElementAndPointer(info, fetchContext, VCI_COLUMN_ID_TID, + &datumSize, &maxElemSize, &maxDictSize, + NULL, NULL, NULL, &strictDatumType); + sumWorstCaseFlagSize += MAXALIGN(maxElemSize); + sumWorstCaseDictionarySize += MAXALIGN(maxDictSize); + } + + if 
(fetchContext->need_crid) + { + SizeOfElementAndPointer(info, fetchContext, VCI_COLUMN_ID_CRID, + &datumSize, &maxElemSize, &maxDictSize, + NULL, NULL, NULL, &strictDatumType); + sumWorstCaseFlagSize += MAXALIGN(maxElemSize); + sumWorstCaseDictionarySize += MAXALIGN(maxDictSize); + } + + sumWorstCaseFlagSize += sizeof((((vci_virtual_tuples_t *) NULL)->skip)[0]) + + sizeof((((vci_virtual_tuples_t *) NULL)->local_skip)[0]); + + if (sumWorstCaseDictionarySize_) + *sumWorstCaseDictionarySize_ = sumWorstCaseDictionarySize; + if (sumWorstCaseValueSize_) + *sumWorstCaseValueSize_ = sumWorstCaseValueSize; + if (sumWorstCaseFlagSize_) + *sumWorstCaseFlagSize_ = sumWorstCaseFlagSize; + if (sumWorstCaseAreaSize_) + *sumWorstCaseAreaSize_ = sumWorstCaseAreaSize; +} + +/** + * @brief The base function of creating an instance of \c vci_CSFetchContext. + * + * This function is normally called via vci_CSCreateFetchContext(). + * + * @param[in] queryContext The query context. + * @param[in] numRowsReadAtOnce The number of rows which read at once and + * stored in the virtual tuples. + * @param[in] numReadColumns The number of columns to be read. + * @param[in] attrNum The pointer to the array which has the attribute numbers + * of the original heap relation, not VCI main relation. + * @param[in] useColumnStore True for column-wise store. False for row-wise. + * @param[in] returnTid True to get TID in virtual tuples. + * @param[in] returnCrid True to get CRID in virtual tuples. + * @param[in] useCompression True to use compression. + * @return The pointer to the created fetch context. + * NULL if some parameters are invald resulting no fetch context is created. + * @note This function registers CurrentMemoryContext as local_memory_context + * in \c vci_CSFetchContext. 
+ */ +vci_CSFetchContext +vci_CSCreateFetchContextBase(vci_CSQueryContext queryContext, + uint32 numRowsReadAtOnce, + int16 numReadColumns, + /* attribute number in original relation */ + AttrNumber *attrNum, + bool useColumnStore, + bool returnTid, + bool returnCrid, + bool useCompression) +{ + Size size = sizeof(vci_CSFetchContextData) + + ((numReadColumns - 1) * sizeof(int16)); + vci_CSFetchContext result; + + Assert(useCompression == false); /* Compression code has been extracted + * from the contrib/vci module */ + + result = MemoryContextAllocZero(queryContext->shared_memory_context, size); + + result->query_context = queryContext; + result->size = size; + + /* + * The master copy does not have own vci_MainRelHeaderInfo. The localized + * copies have their own vci_MainRelHeaderInfo, which will be created in + * vci_CSLocalizeFetchContext(). + */ + result->info = NULL; + + result->num_columns = numReadColumns; + result->num_rows_read_at_once = TYPEALIGN(VCI_COMPACTION_UNIT_ROW, + numRowsReadAtOnce); + result->use_column_store = useColumnStore; + result->need_crid = returnCrid; + result->need_tid = returnTid; + result->buffer = MemoryContextAllocZero(queryContext->shared_memory_context, + sizeof(vci_seq_scan_buffer_t)); + result->local_memory_context = CurrentMemoryContext; + + result->extent_id = VCI_INVALID_EXTENT_ID; + result->num_rows = 0; + + { + int aId; + Size sumWorstCaseAreaSize; + Size valueSizePerTuple; + Size flagSizePerTuple; + Size flagSizeBaseline; + + for (aId = 0; aId < numReadColumns; ++aId) + { + int16 colId; + + Assert(0 < attrNum[aId]); + colId = FindInt16(queryContext->attr_num, + queryContext->num_columns, + attrNum[aId]); + Assert(VCI_FIRST_NORMALCOLUMN_ID <= colId); + result->column_link[aId] = colId; + } + + /* Should we use faster way? 
*/ + GetWorstCaseTupleSize(result, + &(result->size_dictionary_area), + &valueSizePerTuple, + &flagSizePerTuple, + &sumWorstCaseAreaSize, + numReadColumns, + attrNum, + returnTid, + returnCrid); + + result->size_dictionary_area = useCompression + ? result->size_dictionary_area : 0; + result->size_decompression_area = (result->size_dictionary_area) + ? MAXALIGN(VCI_MAX_PAGE_SPACE * VCI_COMPACTION_UNIT_ROW) : 0; + + if (!(result->use_column_store)) + flagSizePerTuple += result->num_columns * (sizeof(Datum) + sizeof(bool)); + + flagSizeBaseline = sizeof((((vci_virtual_tuples_t *) NULL)->skip)[0]) + + sizeof((((vci_virtual_tuples_t *) NULL)->local_skip)[0]) + + result->size_dictionary_area + + result->size_decompression_area; + +recalculation: + /* The skip information has additional one elements at the tail. */ + result->size_flags = flagSizeBaseline + flagSizePerTuple * result->num_rows_read_at_once; + result->size_values = valueSizePerTuple * result->num_rows_read_at_once; + + /* add padding space for each MemoryContextAlloc() */ + result->size_vector_memory_context = result->size_values + + result->size_flags + sumWorstCaseAreaSize + + ((2 * result->num_columns) * MAXIMUM_ALIGNOF); + + if (MaxAllocSize < result->size_vector_memory_context) + { + uint32 new_num_rows_read_at_once; + + new_num_rows_read_at_once = + (MaxAllocSize - (flagSizeBaseline + sumWorstCaseAreaSize + 2 * result->num_columns * MAXIMUM_ALIGNOF)) + / (flagSizePerTuple + valueSizePerTuple); + + if (new_num_rows_read_at_once > VCI_COMPACTION_UNIT_ROW) + new_num_rows_read_at_once = TYPEALIGN(VCI_COMPACTION_UNIT_ROW, + new_num_rows_read_at_once - VCI_COMPACTION_UNIT_ROW + 1); + + if (new_num_rows_read_at_once == 0) + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + + result->num_rows_read_at_once = new_num_rows_read_at_once; + + goto recalculation; + } + } + + /* + * The following fields are filled by vci_CSLocalizeFetchContext() + */ + result->rel_column = NULL; + 
vci_Initvci_ColumnRelations(&(result->rel_delete)); + vci_Initvci_ColumnRelations(&(result->rel_null)); + vci_Initvci_ColumnRelations(&(result->rel_tid)); + + return result; +} + +/** + * @brief Destroy the given instance of \c vci_CSFetchContext. + * + * The opened relations registerd in the target are all closed and + * related \c vci_ReleaseMainRelHeader is released. + * + * @param[in] fetchContext target to destroy. + */ +void +vci_CSDestroyFetchContext(vci_CSFetchContext fetchContext) +{ + LOCKMODE lockmode = AccessShareLock; + + vci_CloseColumnRelations(&(fetchContext->rel_delete), lockmode); + vci_CloseColumnRelations(&(fetchContext->rel_null), lockmode); + vci_CloseColumnRelations(&(fetchContext->rel_tid), lockmode); + if (fetchContext->rel_column) + { + int cId; + + for (cId = 0; cId < fetchContext->num_columns; ++cId) + vci_CloseColumnRelations(&(fetchContext->rel_column[cId]), lockmode); + pfree(fetchContext->rel_column); + } + + if (fetchContext->info) + { + Relation rel = fetchContext->info->rel; + + vci_ReleaseMainRelHeader(fetchContext->info); + + table_close(rel, lockmode); + } + + pfree(fetchContext->buffer); + pfree(fetchContext); +} + +/* call this function using local memory context */ +/** + * @brief Make a local copy of \c fetchContext. + * + * Because file handles are unable to be shared among processes, + * and \c fetchContext has many file handles, we need a copy of + * \c fetchContext made by each process. + * + * @param[in] fetchContext source of copy. + * @param[in] memoryContext the memory context where the copy is written. + * @return copy of given \c fetchContext. 
+ */ +vci_CSFetchContext +vci_CSLocalizeFetchContext(vci_CSFetchContext fetchContext, + MemoryContext memoryContext) +{ + LOCKMODE lockmode = AccessShareLock; + int columnId; + MemoryContext oldMemCtx = MemoryContextSwitchTo(memoryContext); + Relation rel = relation_open(fetchContext->query_context->main_relation_oid, + AccessShareLock); + vci_CSFetchContext result = palloc0(fetchContext->size); + + MemCpy(result, fetchContext, fetchContext->size); + result->local_memory_context = memoryContext; + result->buffer = MemoryContextAllocZero(result->local_memory_context, + sizeof(vci_seq_scan_buffer_t)); + + Assert(NULL == result->info); + result->info = MemoryContextAllocZero(result->local_memory_context, + sizeof(vci_MainRelHeaderInfo)); + vci_InitMainRelHeaderInfo(result->info, rel, + fetchContext->query_context->info->command); + vci_KeepMainRelHeader(result->info); + + result->rel_column = MemoryContextAllocZero(result->local_memory_context, + sizeof(vci_ColumnRelations) * result->num_columns); + + vci_OpenColumnRelations(&(result->rel_delete), result->info, VCI_COLUMN_ID_DELETE, lockmode); + vci_OpenColumnRelations(&(result->rel_null), result->info, VCI_COLUMN_ID_NULL, lockmode); + vci_OpenColumnRelations(&(result->rel_tid), result->info, VCI_COLUMN_ID_TID, lockmode); + + for (columnId = VCI_FIRST_NORMALCOLUMN_ID; columnId < result->num_columns; ++columnId) + { + int16 cId = vci_GetColumnIdFromFetchContext(fetchContext, columnId); + + vci_OpenColumnRelations(&(result->rel_column[columnId]), + result->info, cId, lockmode); + } + + MemoryContextSwitchTo(oldMemCtx); + + return result; +} + +/** + * @brief Create an instance of \c vci_extent_status_t where the status + * of extents are written. + * + * @param[in] fetchContext the \c fetchContext of the target VCI relation + * in the process, i.e. localized one. + * @return the pointer of newly created instance of \c vci_extent_status_t. 
+ */
+vci_extent_status_t *
+vci_CSCreateCheckExtent(vci_CSFetchContext fetchContext)
+{
+	vci_extent_status_t *status;
+	uint32		allocSize;
+
+	/*
+	 * One vci_minmax_t per column; the "- 1" presumably accounts for the
+	 * element already declared inside vci_extent_status_t.
+	 */
+	allocSize = sizeof(vci_extent_status_t) +
+		((fetchContext->num_columns - 1) * sizeof(vci_minmax_t));
+
+	Assert(fetchContext->local_memory_context);
+	status = MemoryContextAllocZero(fetchContext->local_memory_context,
+									allocSize);
+
+	/* start out as "no such extent" */
+	status->visible = false;
+	status->existence = false;
+	status->num_rows = 0;
+	status->size = allocSize;
+
+	return status;
+}
+
+/**
+ * @brief Destroy given instance of \c vci_extent_status_t.
+ *
+ * @param[in] status target to destroy, as returned by
+ * \c vci_CSCreateCheckExtent().
+ */
+void
+vci_CSDestroyCheckExtent(vci_extent_status_t *status)
+{
+	pfree(status);
+}
+
+/*
+ * Mark the min-max entry of every column of the fetch context as invalid.
+ */
+static void
+SetMinMaxInvalid(vci_extent_status_t *status, vci_CSFetchContext fetchContext)
+{
+	int			column = fetchContext->num_columns;
+
+	while (0 < column--)
+		status->minmax[column].valid = false;
+}
+
+/**
+ * @brief Get the status of an extent.
+ *
+ * @details Check if the extent is visible, based on the relation among the
+ * current ROS version and \c Xgen, \c Xdel of the extent.
+ *
+ * The current ROS version is obtained from
+ * \c fetchContext->query_context->ros_version.
+ *
+ * @param[in,out] status the status of the extent is written in \c *status.
+ * @param[in] fetchContext the \c fetchContext of the target VCI relation
+ * in the process, i.e. localized one.
+ * @param[in] extentId the extent ID to probe.
+ * @param[in] readMinMax obtain min-max information if true.
+ */
+void
+vci_CSCheckExtent(vci_extent_status_t *status,
+				  vci_CSFetchContext fetchContext,
+				  int32 extentId,
+				  bool readMinMax)
+{
+	vci_MainRelHeaderInfo *info = GetMainRelHeaderInfoFromFetchContext(
+																		 fetchContext);
+
+	/*
+	 * Note: readMinMax is not consulted at present; this function only ever
+	 * invalidates the min-max entries, it never fills them in.
+	 */
+
+	if (extentId < VCI_FIRST_NORMAL_EXTENT_ID)
+	{
+		/*
+		 * Extent of the local ROS.  Local extents are addressed by IDs
+		 * below VCI_FIRST_NORMAL_EXTENT_ID and stored at index
+		 * (-1 - extentId).
+		 */
+		bool		existence;
+
+		Assert(fetchContext->query_context);
+		existence = -(fetchContext->query_context->num_local_ros_extents) <= extentId;
+		status->existence = existence;
+		status->visible = existence;
+		status->num_rows = 0;
+
+		/*
+		 * Only touch the extent array when the ID is in range; an ID
+		 * beyond the local extents would otherwise index past its end.
+		 */
+		if (existence)
+		{
+			vci_local_ros_t *localRos = fetchContext->query_context->local_ros;
+			vci_virtual_tuples_t *extent;
+
+			Assert(localRos->num_local_extents == fetchContext->query_context->num_local_ros_extents);
+			extent = localRos->extent[-1 - extentId];
+			status->num_rows = extent->num_rows;
+		}
+
+		SetMinMaxInvalid(status, fetchContext);
+
+		return;
+	}
+
+	/* check if the VCI index has the extent. */
+	if (!vci_ExtentInfoExists(info, extentId))
+	{
+		status->num_rows = 0;
+		status->existence = false;
+		status->visible = false;
+		SetMinMaxInvalid(status, fetchContext);
+
+		return;
+	}
+
+	{
+		/* check information of extent in main relation */
+		Buffer		buffer = InvalidBuffer;
+		vcis_m_extent_t *mExtent;
+
+		mExtent = vci_GetMExtent(&buffer, info, extentId);
+
+		/* read the extent entry under a shared content lock */
+		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+
+		status->existence = TransactionIdIsValid(mExtent->xgen) ||
+			TransactionIdIsValid(mExtent->xdel);
+
+		status->visible = vci_ExtentIsVisible(mExtent,
+											  fetchContext->query_context->ros_version);
+
+		status->num_rows = mExtent->num_rows;
+
+		UnlockReleaseBuffer(buffer);
+	}
+
+	/* min-max data of an invisible extent must not be trusted */
+	if (!status->visible)
+		SetMinMaxInvalid(status, fetchContext);
+}
+
+/**
+ * @brief Entry point to generate local ROS.
+ *
+ * @param[in] queryContext Query context the local ROS is generated for.
+ * @return The pointer to the local ROS information.
+ */
+vci_local_ros_t *
+vci_CSGenerateLocalRos(vci_CSQueryContextData *queryContext)
+{
+	vci_local_ros_t *localRos;
+	int64		dataRows = queryContext->num_data_wos_entries;
+	int64		whiteoutRows = queryContext->num_whiteout_wos_entries;
+
+	/* both row counts handed to the generator are at least one */
+	if (dataRows < 1)
+		dataRows = 1;
+	if (whiteoutRows < 1)
+		whiteoutRows = 1;
+
+	/* the GUC value is multiplied by 1024 before being passed on */
+	localRos = vci_GenerateLocalRos(queryContext,
+									VciGuc.max_local_ros_size * INT64CONST(1024),
+									dataRows,
+									whiteoutRows);
+
+	/* once the local ROS is built, continue with AccessShareLock */
+	ChangeLockModeInQueryContext(queryContext, AccessShareLock);
+
+	return localRos;
+}
+
+/**
+ * @brief Estimate the size of local ROS.
+ *
+ * @details We have some ways to estimate the number of rows in data WOS.
+ * In the system catalog, we have two values.
+ * One is pg_class.reltuples, which is updated on VACUUM.
+ * The other is pg_stat_user_tables.n_live_tup, which is
+ * updated on COMMIT.
+ * Neither can count the rows INSERTed in the same
+ * transaction.
+ * To obtain the actual count, it seems necessary to count
+ * rows in data WOS.
+ * And we have the count itself in memory object,
+ * vci_memory_entry_t.tid_tree->number_of_nodes_in_wos.
+ *
+ * For local delete list, we estimate the number of entries, i.e. deleted
+ * rows, from the number of DB pages in the whiteout WOS.
+ *
+ * We also use memory for chunk storage in building Local ROS,
+ * but that can be counted in WOS -> ROS conversion memory.
+ *
+ * @param[in] queryContext estimate the size of local ROS for the given
+ * query context.
+ * @return estimated size of local ROS.
+ */ +Size +vci_CSEstimateLocalRosSize(vci_CSQueryContext queryContext) +{ + Size result = 0; + int numRowsInExtent; + int64 numDataWosRows; + int64 numWhiteoutWosRows; + int64 numLocalDeleteListRows; + int numRowsAtOneFetch; + Size oneFetchMemorySize; + + Assert(queryContext); + Assert(queryContext->info); + + numRowsInExtent = vci_GetNumRowsInLocalRosExtent(queryContext->num_columns); + + numDataWosRows = queryContext->num_data_wos_entries = + vci_EstimateNumEntriesInHeapRelation( + vci_GetMainRelVar(queryContext->info, vcimrv_data_wos_oid, 0)); + + numWhiteoutWosRows = queryContext->num_whiteout_wos_entries = + vci_EstimateNumEntriesInHeapRelation( + vci_GetMainRelVar(queryContext->info, vcimrv_whiteout_wos_oid, 0)); + + numLocalDeleteListRows = numDataWosRows + numWhiteoutWosRows; + + if (VCI_NUM_ROWS_IN_EXTENT * VCI_MAX_NUMBER_UNCONVERTED_ROS < numDataWosRows) + return (Size) -1; + + /* + * Calculate the size of memory area to store multiple + * vci_virtual_tuples_t. We assume to use column store for local ROS. + */ + { + vci_CSFetchContext fetchContext; + + /* + * We are estimating the data size of local ROS, for which we do not + * use data compression. 
+ */ + fetchContext = vci_CSCreateFetchContextBase(queryContext, + Min(numRowsInExtent, numDataWosRows), + queryContext->num_columns, + queryContext->attr_num, + true, /* useColumnStore */ + true, + true, + false); /* no compression */ + + numRowsAtOneFetch = fetchContext->num_rows_read_at_once; + + oneFetchMemorySize = fetchContext->size_vector_memory_context; + + vci_CSDestroyFetchContext(fetchContext); + } + + if (numDataWosRows <= numRowsAtOneFetch) + { + result = oneFetchMemorySize; + } + else + { + int numFullFetches; + int numRemainedRows; + + numFullFetches = numDataWosRows / numRowsAtOneFetch; + numRemainedRows = numDataWosRows - (numFullFetches * numRowsAtOneFetch); + + result = numFullFetches * oneFetchMemorySize; + + if (0 < numRemainedRows) + { + vci_CSFetchContext fetchContext; + + fetchContext = vci_CSCreateFetchContextBase(queryContext, + numRemainedRows, + queryContext->num_columns, + queryContext->attr_num, + true, /* useColumnStore */ + true, + true, + false); /* no compression */ + + result += fetchContext->size_vector_memory_context; + + vci_CSDestroyFetchContext(fetchContext); + } + } + + /* + * Calculate the size of local delete list + */ + result += numLocalDeleteListRows * sizeof(queryContext->delete_list[0]); + + return result; +} + +static void +RefillPointersOfVirtualTuples(vci_virtual_tuples_t *vTuples, bool keepStatus) +{ + vci_CSFetchContext fetchContext = vTuples->fetch_context; + uint32 numRows = vTuples->num_rows_read_at_once; + vci_MainRelHeaderInfo *info; + int columnId; + char *ptr; + + info = GetMainRelHeaderInfoFromFetchContext(fetchContext); + + vTuples->buffer_capacity = numRows; + + vTuples->values = (Datum *) MAXALIGN(vTuples->al_values); + vTuples->flags = (char *) MAXALIGN(vTuples->al_flags); + + ptr = vTuples->flags; + + vTuples->crid = NULL; + if (fetchContext->need_crid) + { + vTuples->crid = (int64 *) ptr; + ptr = (char *) &(vTuples->crid[numRows]); + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= 
vTuples->size_flags); + } + + vTuples->tid = NULL; + if (fetchContext->need_tid) + { + vTuples->tid = (int64 *) ptr; + ptr = (char *) &(vTuples->tid[numRows]); + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= vTuples->size_flags); + } + + vTuples->skip = (uint16 *) ptr; + ptr = (char *) &(vTuples->skip[TYPEALIGN(8, numRows) + 1]); + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= vTuples->size_flags); + + vTuples->local_skip = (uint16 *) ptr; + ptr = (char *) &(vTuples->local_skip[numRows + 1]); + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= vTuples->size_flags); + + vTuples->isnull = (bool *) ptr; + ptr = (char *) &(vTuples->isnull[numRows * vTuples->num_columns]); + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= vTuples->size_flags); + + vTuples->row_wise_local_ros = NULL; + if (!(vTuples->use_column_store)) + { + vTuples->row_wise_local_ros = (char *) MAXALIGN(ptr); + ptr += numRows * vTuples->num_columns * (sizeof(Datum) + sizeof(bool)); + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= vTuples->size_flags); + } + + vTuples->work_decompression = NULL; + if (0 < vTuples->size_decompression_area) + { + vTuples->work_decompression = ptr; + ptr += vTuples->size_decompression_area; + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= vTuples->size_flags); + } + + for (columnId = VCI_FIRST_NORMALCOLUMN_ID; columnId < vTuples->num_columns; ++columnId) + { + uint32 datumSize; + uint32 maxDictSize; + int16 cId = vci_GetColumnIdFromFetchContext(fetchContext, columnId); + vci_virtual_tuples_column_info_t *colInfo; + + colInfo = &(vTuples->column_info[columnId]); + SizeOfElementAndPointer(info, + fetchContext, + cId, + &datumSize, + &(colInfo->max_column_size), + &maxDictSize, + &(colInfo->null_bit_id), + &(colInfo->comp_type), + &(colInfo->atttypid), + &(colInfo->strict_datum_type)); + colInfo->dict_info = NULL; + if (0 < maxDictSize) + { + colInfo->dict_info = (vci_DictInfo *) MAXALIGN(ptr); + ptr += 
sizeof(vci_DictInfo); + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= vTuples->size_flags); + if (!keepStatus) + { + vci_InitializeDictInfo(colInfo->dict_info); + colInfo->dict_info->dictionary_storage = (unsigned char *) ptr; + colInfo->dict_info->storage_size = maxDictSize; + } + ptr += maxDictSize; + Assert((uintptr_t) ptr - (uintptr_t) (vTuples->flags) <= vTuples->size_flags); + } + + if (vTuples->use_column_store) + { + colInfo->isnull = &(vTuples->isnull[numRows * columnId]); + colInfo->values = &(vTuples->values[numRows * columnId]); + } + else + { + colInfo->isnull = NULL; + colInfo->values = NULL; + } + + colInfo->area = (char *) MAXALIGN(colInfo->al_area); + } +} + +/** + * @brief Create an instance of \c vci_virtual_tuples_t where the read + * ROS is stored. + * + * @details The function \c vci_CSFetchVirtualTuples() reads and stores + * multiple rows at once from the specified columns and rows. + * Users need to have enough area to store the maximum number of rows. + * To prepare the area, the maximum number of rows should be passed via + * the parameter \c numRows of this function. + * + * @param[in] fetchContext the fetch context. + * @param[in] numRows required number of rows read at once. + * @return the pointer to the created \c vci_virtual_tuples_t. 
+ */ +vci_virtual_tuples_t * +vci_CSCreateVirtualTuplesWithNumRows(vci_CSFetchContext fetchContext, + uint32 numRows) +{ + MemoryContext mctx; + vci_virtual_tuples_t *result; + int32 size; + vci_MainRelHeaderInfo *info; + + Assert(fetchContext); + mctx = fetchContext->local_memory_context; + Assert(mctx); + info = GetMainRelHeaderInfoFromFetchContext(fetchContext); + + size = sizeof(vci_virtual_tuples_t) + ((fetchContext->num_columns - 1) * + sizeof(vci_virtual_tuples_column_info_t)); + + result = MemoryContextAllocZero(mctx, size); + result->size = size; + result->num_columns = fetchContext->num_columns; + result->extent_id = VCI_INVALID_EXTENT_ID; + result->num_rows_in_extent = 0; + result->row_id_in_extent = -1; + result->num_rows = 0; + result->buffer_capacity = 0; + result->offset_of_first_tuple_of_vector = 0; + result->num_rows_read_at_once = numRows; + result->fetch_context = fetchContext; + result->use_column_store = fetchContext->use_column_store; + result->status = vcirvs_out_of_range; + + result->size_vector_memory_context = fetchContext->size_vector_memory_context; + Assert(result->num_rows_read_at_once <= fetchContext->num_rows_read_at_once); + result->size_values = fetchContext->size_values; + result->size_flags = fetchContext->size_flags; + result->size_dictionary_area = fetchContext->size_dictionary_area; + result->size_decompression_area = fetchContext->size_decompression_area; + + result->al_values = MemoryContextAlloc(mctx, + result->size_values + MAXIMUM_ALIGNOF); + + result->al_flags = MemoryContextAlloc(mctx, + result->size_flags + MAXIMUM_ALIGNOF); + + { + int columnId; + + for (columnId = VCI_FIRST_NORMALCOLUMN_ID; columnId < result->num_columns; ++columnId) + { + int16 cId = vci_GetColumnIdFromFetchContext(fetchContext, columnId); + + result->column_info[columnId].al_area = NULL; + if (NeedDatumPointer(info, cId)) + { + uint32 maxElemSize; + + SizeOfElementAndPointer(info, fetchContext, cId, NULL, + &maxElemSize, NULL, NULL, NULL, NULL, 
NULL);
+				result->column_info[columnId].al_area =
+					MemoryContextAlloc(mctx,
+									   MAXIMUM_ALIGNOF +
+									   (MAXALIGN(maxElemSize) * numRows));
+			}
+		}
+	}
+
+	RefillPointersOfVirtualTuples(result, false);
+
+	return result;
+}
+
+/**
+ * @brief Destroy given instance of \c vci_virtual_tuples_t.
+ *
+ * Frees the per-column areas that were allocated, the value and flag
+ * areas, and finally the instance itself.
+ *
+ * @param[in] vTuples target to be destroyed.
+ */
+void
+vci_CSDestroyVirtualTuples(vci_virtual_tuples_t *vTuples)
+{
+	int			columnId;
+
+	for (columnId = VCI_FIRST_NORMALCOLUMN_ID; columnId < vTuples->num_columns; ++columnId)
+	{
+		/* al_area is only set for columns that needed a data area */
+		if (vTuples->column_info[columnId].al_area)
+			pfree(vTuples->column_info[columnId].al_area);
+	}
+
+	pfree(vTuples->al_values);
+	pfree(vTuples->al_flags);
+	pfree(vTuples);
+}
+
+/**
+ * @brief Fill CRIDs in the given \c vci_virtual_tuples_t.
+ *
+ * Writes consecutive CRIDs, starting from the CRID of
+ * (\c extent_id, \c row_id_in_extent), into \c vTuples->crid for
+ * \c num_rows rows.
+ *
+ * @param[in] vTuples target virtual tuples.
+ */
+void
+vci_FillCridInVirtualTuples(vci_virtual_tuples_t *vTuples)
+{
+	int			aId;
+	int			aIdRem = (vTuples->num_rows) & 7;
+	int64	   *dst = vTuples->crid;
+	int64		crid = vci_CalcCrid64(vTuples->extent_id,
+									  vTuples->row_id_in_extent);
+
+	/* first the (num_rows % 8) leading rows, one at a time */
+	for (aId = 0; aId < aIdRem; ++aId, ++crid)
+		dst[aId] = crid;
+
+	/* the remaining count is a multiple of eight: fill eight per iteration */
+	for (; aId < vTuples->num_rows; aId += 8, crid += 8)
+	{
+		dst[aId + 0] = crid + 0;
+		dst[aId + 1] = crid + 1;
+		dst[aId + 2] = crid + 2;
+		dst[aId + 3] = crid + 3;
+		dst[aId + 4] = crid + 4;
+		dst[aId + 5] = crid + 5;
+		dst[aId + 6] = crid + 6;
+		dst[aId + 7] = crid + 7;
+	}
+}
+
+static void
+FillSkipLoadFillBitImage(uint64 *dst, uint8 data)
+{
+	/*
+	 * Expand the eight bits of data into eight 16-bit values, each 0 or 1,
+	 * stored in dst[0] and dst[1].  For the little endian case the result
+	 * is the same as:
+	 *
+	 *     ((int16 *) dst)[0] = (data >> 0) & 1;
+	 *     ((int16 *) dst)[1] = (data >> 1) & 1;
+	 *     ((int16 *) dst)[2] = (data >> 2) & 1;
+	 *     ((int16 *) dst)[3] = (data >> 3) & 1;
+	 *     ((int16 *) dst)[4] = (data >> 4) & 1;
+	 *     ((int16 *) dst)[5] = (data >> 5) & 1;
+	 *     ((int16 *) dst)[6] = (data >> 6) & 1;
+	 *     ((int16 *) dst)[7] = (data >> 7) & 1;
+	 *
+	 * The multiplication places copies of data at several bit offsets so
+	 * that a mask (and a shift by four for the second word) picks one bit
+	 * per 16-bit lane.
+	 */
+#ifdef WORDS_BIGENDIAN
+	uint64		value = UINT64CONST(0x0001000080004000) * data;
+
+	value += (UINT64CONST(0x2000) * data) >> 16;
+	
dst[0] = value & UINT64CONST(0x0001000100010001); + dst[1] = (value >> 4) & UINT64CONST(0x0001000100010001); +#else + uint64 value = UINT64CONST(0x0000200040008001) * data; + + dst[0] = value & UINT64CONST(0x0001000100010001); + dst[1] = (value >> 4) & UINT64CONST(0x0001000100010001); +#endif /* #ifdef WORDS_BIGENDIAN */ +} + +static void +FillSkipLoadBody(uint16 **dst_, + int *startOf, + Page page, + OffsetNumber oNum, + int numRows) +{ + ItemId itemId = PageGetItemId(page, oNum); + HeapTupleHeader hTup = (HeapTupleHeader) PageGetItem(page, itemId); + uint8 *data = &(((uint8 *) hTup)[hTup->t_hoff + *startOf]); + int aId; + int aIdMax = numRows / 8; + uint64 *dst = (uint64 *) *dst_; + + *dst_ += numRows; + *startOf = 0; + for (aId = 0; aId < aIdMax; aId += 4) + { + FillSkipLoadFillBitImage(dst + 0, data[0]); + FillSkipLoadFillBitImage(dst + 2, data[1]); + FillSkipLoadFillBitImage(dst + 4, data[2]); + FillSkipLoadFillBitImage(dst + 6, data[3]); + dst += 8; + data += 4; + } +} + +static void +FillSkipLoad(vci_virtual_tuples_t *vTuples) +{ + int64 cridStart = vci_CalcCrid64(vTuples->extent_id, + vTuples->row_id_in_extent); + int64 cridEnd = cridStart + vTuples->num_rows - 1; + vci_ColumnRelations rel = vTuples->fetch_context->rel_delete; + int initCorr = vTuples->row_id_in_extent % VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE; + + BlockNumber startBN = vci_CalcBlockNumberFromCrid64ForDelete(cridStart); + OffsetNumber startON = vci_CalcOffsetNumberFromCrid64ForDelete(cridStart); + int startOf = vci_CalcByteFromCrid64ForDelete(cridStart); + + BlockNumber endBN = vci_CalcBlockNumberFromCrid64ForDelete(cridEnd); + OffsetNumber endON = vci_CalcOffsetNumberFromCrid64ForDelete(cridEnd); + int endOf = vci_CalcByteFromCrid64ForDelete(cridEnd); + + BlockNumber bNum; + uint16 *dst = vTuples->skip; +#ifdef USE_ASSERT_CHECKING + uint16 *dstSave = dst; +#endif /* #ifdef USE_ASSERT_CHECKING */ + + /* + * We always expand eight bits in a byte. 
So the first row ID should be a + * multiple of eight. + */ + Assert(0 == (vTuples->row_id_in_extent & 7)); + + for (bNum = startBN; bNum < endBN; ++bNum) + { + Buffer buffer = ReadBuffer(rel.data, bNum); + Page page = BufferGetPage(buffer); + OffsetNumber oNum; + + for (oNum = startON; + oNum < (VCI_ITEMS_IN_PAGE_FOR_DELETE + FirstOffsetNumber); + ++oNum) + { + FillSkipLoadBody(&dst, + &startOf, + page, + oNum, + VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE - initCorr); + initCorr = 0; + Assert((uintptr_t) dst <= (uintptr_t) &(dstSave[TYPEALIGN(8, vTuples->num_rows)])); + } + + ReleaseBuffer(buffer); + startON = FirstOffsetNumber; + } + + { + Buffer buffer = ReadBuffer(rel.data, bNum); + Page page = BufferGetPage(buffer); + OffsetNumber oNum; + + for (oNum = startON; oNum < endON; ++oNum) + { + FillSkipLoadBody(&dst, + &startOf, + page, + oNum, + VCI_NUM_ROWS_IN_ONE_ITEM_FOR_DELETE - initCorr); + initCorr = 0; + Assert((uintptr_t) dst <= (uintptr_t) &(dstSave[TYPEALIGN(8, vTuples->num_rows)])); + } + + FillSkipLoadBody(&dst, + &startOf, + page, + oNum, + (endOf - startOf + 1) * 8); + Assert((uintptr_t) dst <= (uintptr_t) &(dstSave[TYPEALIGN(8, vTuples->num_rows)])); + + ReleaseBuffer(buffer); + } +} + +static int64 +GetPrevIdInDeleteList(vci_CSQueryContext queryContext, uint64 crid) +{ + if (queryContext->num_delete < 16) + { + int64 result; + + for (result = queryContext->num_delete; result--;) + { + if (queryContext->delete_list[result] <= crid) + return result; + } + + return -1; + } + else + { + uint64 tgtBit; + uint64 result = 0; + int shiftBit = vci_GetHighestBit(queryContext->num_delete); + + Assert(0 <= shiftBit); + for (tgtBit = UINT64CONST(1) << shiftBit; tgtBit; tgtBit >>= 1) + { + uint64 cand = result + tgtBit; + + if ((cand < queryContext->num_delete) && + (queryContext->delete_list[cand] <= crid)) + result = cand; + } + if (0 == result) + return (queryContext->delete_list[0] <= crid) ? 
0 : -1;
+
+		return result;
+	}
+}
+
+/*
+ * Mark rows of the current vector that appear in the local delete list.
+ *
+ * The delete list of the query context is searched for the CRID range
+ * [cridStart, cridStart + num_rows - 1]; for every entry in that range the
+ * corresponding element of vTuples->skip is set to 1.
+ */
+static void
+MergeLocalDeleteListToSkip(vci_virtual_tuples_t *vTuples)
+{
+	vci_CSFetchContext fetchContext = vTuples->fetch_context;
+	vci_CSQueryContext queryContext = fetchContext->query_context;
+	int64		cridStart = vci_CalcCrid64(vTuples->extent_id,
+										   vTuples->row_id_in_extent);
+	int			numRows = vTuples->num_rows;
+	int64		prevId;
+	int			startId;
+	int			endId;
+	int			aid;
+
+	if (queryContext->num_delete < 1)
+		return;
+
+	/*
+	 * Search only once: Max() is a macro that evaluates its arguments
+	 * twice, so passing the call directly would run the search twice.
+	 */
+	prevId = GetPrevIdInDeleteList(queryContext, cridStart);
+	startId = (prevId < 0) ? 0 : (int) prevId;
+
+	/* the entry found may lie before the vector; step to the first inside */
+	if (queryContext->delete_list[startId] < cridStart)
+		++startId;
+
+	/* last entry that is not beyond the final row of the vector */
+	endId = GetPrevIdInDeleteList(queryContext, cridStart + numRows - 1);
+
+	for (aid = startId; aid <= endId; ++aid)
+	{
+		uint64		crid = queryContext->delete_list[aid];
+		uint64		offset = crid - cridStart;
+
+		vTuples->skip[offset] = 1;
+	}
+}
+
+/*
+ * Convert the 0/1 flags in vTuples->skip into run lengths.
+ *
+ * The array is walked from the last row to the first.  For a flag of 1 the
+ * running count becomes count + 1; for a flag of 0 the term
+ * (0 - 1) * count cancels the count back to zero.  Each element therefore
+ * ends up holding the number of consecutive flagged rows starting at it.
+ */
+static void
+FillSkipCountUp(vci_virtual_tuples_t *vTuples)
+{
+	int			aId;
+	uint16	   *dst = vTuples->skip;
+	uint16		count = 0;
+
+	for (aId = vTuples->num_rows; aId--;)
+		dst[aId] = count += dst[aId] + ((dst[aId] - 1) * count);
+}
+
+/*
+ * Build vTuples->skip: load the delete flags, merge the local delete list,
+ * terminate the array with a zero entry and convert flags to run lengths.
+ */
+static void
+FillSkip(vci_virtual_tuples_t *vTuples)
+{
+	FillSkipLoad(vTuples);
+	MergeLocalDeleteListToSkip(vTuples);
+
+	vTuples->skip[vTuples->num_rows] = 0;
+
+	FillSkipCountUp(vTuples);
+}
+
+static char *
+FillFixedWidthCopyBody1(char *dstData,
+						BlockNumber startBN,
+						uint32 startOf,
+						int stepDstData,
+						int dataWidth,
+						Relation rel,
+						int numRows)
+{
+	int			aId;
+	BlockNumber bNumCur = startBN;
+	int64		offset = startOf;
+	Buffer		buffer = InvalidBuffer;
+	Page		page = NULL;
+	const Datum zero = 0;
+#ifdef WORDS_BIGENDIAN
+	const int	offsetCont = MAXALIGN(dataWidth) - dataWidth;
+#else							/* #ifdef WORDS_BIGENDIAN */
+	const int	offsetCont = MAXALIGN(dataWidth) - sizeof(Datum);
+#endif							/* #ifdef WORDS_BIGENDIAN */
+
+	if (0 < numRows)
+	{
+		buffer = ReadBuffer(rel, bNumCur);
+		page = BufferGetPage(buffer);
+	}
+
+	for (aId = 0; aId < numRows;)
+	{
+		int64		rest = VCI_MAX_PAGE_SPACE - offset;
+		int			numElem = rest /
dataWidth; + int maxBId = Min(aId + numElem, numRows); + int bId; + + Assert(0 <= rest); + for (bId = aId; bId < maxBId; ++bId) + { +#ifdef WORDS_BIGENDIAN + *(Datum *) dstData = zero; + MemCpy(&(dstData[offsetCont]), &(page[VCI_MIN_PAGE_HEADER + offset]), dataWidth); +#else /* #ifdef WORDS_BIGENDIAN */ + *(Datum *) &(dstData[offsetCont]) = zero; + MemCpy(dstData, &(page[VCI_MIN_PAGE_HEADER + offset]), dataWidth); +#endif /* #ifdef WORDS_BIGENDIAN */ + dstData += stepDstData; + offset += dataWidth; + } + + aId = maxBId; + + if (numRows <= aId) + break; + + if (offset < VCI_MAX_PAGE_SPACE) + { + int size = VCI_MAX_PAGE_SPACE - offset; + int nextOffset = dataWidth - size; + + Assert(VCI_MAX_PAGE_SPACE < (offset + dataWidth - 1)); +#ifdef WORDS_BIGENDIAN + *(Datum *) dstData = zero; + MemCpy(&(dstData[offsetCont]), &(page[VCI_MIN_PAGE_HEADER + offset]), size); +#else /* #ifdef WORDS_BIGENDIAN */ + *(Datum *) &(dstData[offsetCont]) = zero; + MemCpy(dstData, &(page[VCI_MIN_PAGE_HEADER + offset]), size); +#endif /* #ifdef WORDS_BIGENDIAN */ + ReleaseBuffer(buffer); + buffer = ReadBuffer(rel, ++bNumCur); + page = BufferGetPage(buffer); +#ifdef WORDS_BIGENDIAN + MemCpy(&(dstData[offsetCont + size]), &(page[VCI_MIN_PAGE_HEADER]), nextOffset); +#else /* #ifdef WORDS_BIGENDIAN */ + MemCpy(dstData + size, &(page[VCI_MIN_PAGE_HEADER]), nextOffset); +#endif /* #ifdef WORDS_BIGENDIAN */ + dstData += stepDstData; + offset = nextOffset; + ++aId; + } + else + { + Assert(offset == VCI_MAX_PAGE_SPACE); + offset = 0; + ReleaseBuffer(buffer); + buffer = ReadBuffer(rel, ++bNumCur); + page = BufferGetPage(buffer); + } + } + if (BufferIsValid(buffer)) + ReleaseBuffer(buffer); + + return dstData; +} + +static void +FillFixedWidth(vci_virtual_tuples_t *vTuples, + int16 columnId, + vci_ColumnRelations *rel) +{ + vci_MainRelHeaderInfo *info = GetMainRelHeaderInfoFromFetchContext(vTuples->fetch_context); + Datum *dstPtr = NULL; + char *dstData = NULL; + + int dataWidth = 0; + + int16 colId = 
columnId; + + bool passByRef = false; + + char *checkPtr PG_USED_FOR_ASSERTS_ONLY; + + BlockNumber startBN; + uint32 startOf; + + int facRow = vTuples->num_columns; + int stepDstData = sizeof(Datum) * facRow; + + if (VCI_FIRST_NORMALCOLUMN_ID <= columnId) + { + int facCol = 1; + + Assert(columnId < vTuples->num_columns); + + dataWidth = vTuples->column_info[columnId].max_column_size; + if (vTuples->use_column_store) + { + facRow = 1; + stepDstData = sizeof(Datum) * facRow; + facCol = vTuples->num_rows_read_at_once; + } + + dstData = (char *) &(vTuples->values[facCol * columnId]); + + if ((passByRef = vTuples->column_info[columnId].strict_datum_type)) /* pgr0011 */ + { + dstPtr = (Datum *) dstData; + dstData = vTuples->column_info[columnId].area; + stepDstData = MAXALIGN(dataWidth); + Assert(dstData); + } + else + Assert(NULL == vTuples->column_info[columnId].area); + colId = vci_GetColumnIdFromFetchContext(vTuples->fetch_context, + columnId); + } + else + { + Assert(VCI_COLUMN_ID_TID == columnId); + dstData = (char *) (vTuples->tid); + stepDstData = sizeof(vTuples->tid[0]); + dataWidth = sizeof(ItemPointerData); + } + + vci_GetPositionForFixedColumn(&startBN, + &startOf, + info, + colId, + vTuples->extent_id, + vTuples->row_id_in_extent, + false); + + checkPtr = FillFixedWidthCopyBody1(dstData, + startBN, + startOf, + stepDstData, + dataWidth, + rel->data, + vTuples->num_rows); + if (passByRef) + { + int rId; + + Assert(VCI_FIRST_NORMALCOLUMN_ID <= columnId); + for (rId = 0; rId < vTuples->num_rows; ++rId) + dstPtr[facRow * rId] = PointerGetDatum(&(dstData[stepDstData * rId])); + Assert((uintptr_t) (vTuples->column_info[columnId].area) <= (uintptr_t) checkPtr); + Assert((uintptr_t) checkPtr <= (uintptr_t) &(vTuples->column_info[columnId].area[vTuples->column_info[columnId].max_column_size * vTuples->num_rows_read_at_once])); + if (vTuples->use_column_store) + Assert(vTuples->column_info[columnId].values == dstPtr); + else + Assert(&(vTuples->values[columnId]) == 
dstPtr);
+	}
+	else
+	{
+		if (VCI_FIRST_NORMALCOLUMN_ID <= columnId)
+		{
+			if (vTuples->use_column_store)
+			{
+				Assert((uintptr_t) (vTuples->column_info[columnId].values) <= (uintptr_t) checkPtr);
+				Assert((uintptr_t) checkPtr <= (uintptr_t) &(vTuples->column_info[columnId].values[vTuples->num_rows_read_at_once]));
+			}
+			else
+			{
+				Assert((uintptr_t) (&(vTuples->values[columnId])) <= (uintptr_t) checkPtr);
+				Assert((uintptr_t) checkPtr <= (uintptr_t) &(vTuples->values[columnId + (vTuples->num_rows_read_at_once * vTuples->num_columns)]));
+			}
+		}
+		else
+		{
+			Assert((uintptr_t) (vTuples->tid) <= (uintptr_t) checkPtr);
+			Assert((uintptr_t) checkPtr <= (uintptr_t) &(vTuples->tid[vTuples->num_rows_read_at_once]));
+		}
+	}
+}
+
+/*
+ * Copy (len & 3) bytes, i.e. at most three, from src to dst.
+ *
+ * The bytes are copied one at a time on every platform.  Neither pointer
+ * is required to be aligned, so a wider load/store through a cast pointer
+ * would be an unaligned, type-punned access.
+ */
+static void
+Copy3(char *dst, char *src, int len)
+{
+	if (2 & len)
+	{
+		*(dst++) = *(src++);
+		*(dst++) = *(src++);
+	}
+	if (1 & len)
+		*dst = *src;
+}
+
+static uint32
+GetVarlenAHeader(Datum *header_,
+				 Buffer *buffer,
+				 BlockNumber *currentBlockNumber,
+				 uint32 offsetInPage,
+				 Relation rel)
+{
+	char	   *header = (char *) header_;
+	Page		page;
+	char	   *curPtr;
+	int			len = VCI_MAX_PAGE_SPACE - offsetInPage;
+
+	if (len <= 0)
+	{
+		Assert(BlockNumberIsValid(*currentBlockNumber));
+		if (MaxBlockNumber == *currentBlockNumber)
+			ereport(ERROR, (errmsg("relation full"), errhint("Disable VCI by 'SELECT vci_disable();'")));
+		if (BufferIsValid(*buffer))
+			ReleaseBuffer(*buffer);
+		*buffer = ReadBuffer(rel, ++*currentBlockNumber);
+		offsetInPage -= VCI_MAX_PAGE_SPACE;
+		len = VCI_MAX_PAGE_SPACE - offsetInPage;
+	}
+
+	page = BufferGetPage(*buffer);
+	curPtr = &(page[VCI_MIN_PAGE_HEADER + offsetInPage]);
+
+	Assert(0 < len);
+
+	if (VARATT_IS_1B_E(curPtr)) /* VARHDRSZ_EXTERNAL */
+ { + Assert(2 == VARHDRSZ_EXTERNAL); + if (VARHDRSZ_EXTERNAL <= len) + { + *(header++) = *(curPtr++); + *(header++) = *(curPtr++); + + return offsetInPage + VARHDRSZ_EXTERNAL; + } + + Assert(1 == len); + *(header++) = *(curPtr++); + ReleaseBuffer(*buffer); + ++*currentBlockNumber; + *buffer = ReadBuffer(rel, *currentBlockNumber); + page = BufferGetPage(*buffer); + *header = page[VCI_MIN_PAGE_HEADER]; + + return 1; + } + + if (VARATT_IS_1B(curPtr)) /* VARHDRSZ_SHORT */ + { + Assert(1 == VARHDRSZ_SHORT); + Assert(VARHDRSZ_SHORT <= len); + *header = *curPtr; + + return offsetInPage + VARHDRSZ_SHORT; + } + + /* VARHDRSZ */ + Assert(4 == VARHDRSZ); + + if (VARHDRSZ <= len) + { + *(uint32 *) header = *(uint32 *) curPtr; + + return offsetInPage + VARHDRSZ; + } + + Assert((0 <= len) && (len <= 3)); + Copy3(header, curPtr, len); + header += len; + curPtr += len; + + ReleaseBuffer(*buffer); + ++*currentBlockNumber; + *buffer = ReadBuffer(rel, *currentBlockNumber); + page = BufferGetPage(*buffer); + len = VARHDRSZ - len; + curPtr = &(page[VCI_MIN_PAGE_HEADER]); + + Assert((0 <= len) && (len <= 3)); + Copy3(header, curPtr, len); + + return len; +} + +static void +FillVariableWidth(vci_virtual_tuples_t *vTuples, + int16 columnId, + vci_ColumnRelations *rel) +{ + vci_MainRelHeaderInfo *info = GetMainRelHeaderInfoFromFetchContext(vTuples->fetch_context); + char *dstData = vTuples->column_info[columnId].area; + Datum *dstPtr = &(vTuples->values[columnId]); + int ptrStep = vTuples->num_columns; + + BlockNumber startBN; + uint32 startOf; + + if (vTuples->use_column_store) + { + dstPtr = &(vTuples->values[vTuples->num_rows_read_at_once * columnId]); + ptrStep = 1; + } + + /* This function must be called only for ROS, not local ROS. 
*/ + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= vTuples->extent_id); + + { + uint32 offset; + uint32 dataOffset; + BlockNumber blockNumberBase; + TupleDesc desc = vci_GetTupleDescr(info); + int16 cId = vci_GetColumnIdFromFetchContext(vTuples->fetch_context, + columnId); + + vci_GetElementPosition(&offset, + &blockNumberBase, + &dataOffset, + rel, + vTuples->extent_id, + vTuples->row_id_in_extent, + TupleDescAttr(desc, cId)); + vci_GetBlockNumberAndOffsetInPage(&startBN, + &startOf, + offset + dataOffset); + startBN += blockNumberBase; + } + + { + BlockNumber bNum = startBN; + Size offsetInPage = startOf; + Buffer buffer = InvalidBuffer; + Page page; + int aId; + int numWrite = 0; + int numRows = vTuples->num_rows; + + if (0 < numRows) + { + buffer = ReadBuffer(rel->data, bNum); + page = BufferGetPage(buffer); + } + + for (aId = 0; aId < numRows; ++aId) + { + offsetInPage = GetVarlenAHeader((Datum *) dstData, + &buffer, + &bNum, + offsetInPage, + rel->data); + + { + int32 copySize; + uint32 dataSize = VARSIZE_ANY_EXHDR(dstData); + uint32 headerSize = vci_VARHDSZ_ANY(dstData); + + dstPtr[ptrStep * (numWrite++)] = PointerGetDatum(dstData); + + if (VCI_MAX_PAGE_SPACE <= offsetInPage) + { + offsetInPage -= VCI_MAX_PAGE_SPACE; + Assert(offsetInPage < VCI_MAX_PAGE_SPACE); + if (0 == offsetInPage) + { + ReleaseBuffer(buffer); + buffer = ReadBuffer(rel->data, ++bNum); + } + } + page = BufferGetPage(buffer); + + copySize = Min(dataSize, VCI_MAX_PAGE_SPACE - offsetInPage); + MemCpy(&(dstData[headerSize]), + &(page[VCI_MIN_PAGE_HEADER + offsetInPage]), + copySize); + offsetInPage += copySize; + + if (copySize < dataSize) + { + ReleaseBuffer(buffer); + buffer = ReadBuffer(rel->data, ++bNum); + page = BufferGetPage(buffer); + MemCpy(&(dstData[copySize + headerSize]), + &(page[VCI_MIN_PAGE_HEADER]), + dataSize - copySize); + offsetInPage = dataSize - copySize; /* pgr0063 */ + } + dstData += MAXALIGN(dataSize + headerSize); + } + + if (VCI_MAX_PAGE_SPACE <= offsetInPage) + { + 
ReleaseBuffer(buffer);
+				buffer = ReadBuffer(rel->data, ++bNum);
+				page = BufferGetPage(buffer);
+				offsetInPage -= VCI_MAX_PAGE_SPACE;
+				Assert(offsetInPage < VCI_MAX_PAGE_SPACE);
+			}
+		}
+
+		if (BufferIsValid(buffer))
+			ReleaseBuffer(buffer);
+
+		Assert(vTuples->num_rows == numWrite);
+	}
+}
+
+/*
+ * Fill the value area of every normal column of vTuples, dispatching on
+ * the compression type recorded for the column.
+ */
+static void
+FillValues(vci_virtual_tuples_t *vTuples)
+{
+	int16		columnId;
+
+	for (columnId = VCI_FIRST_NORMALCOLUMN_ID; columnId < vTuples->num_columns; ++columnId)
+	{
+		switch (vTuples->column_info[columnId].comp_type)
+		{
+			case vcis_compression_type_fixed_raw:
+				FillFixedWidth(vTuples, columnId,
+							   &(vTuples->fetch_context->rel_column[columnId]));
+				break;
+			case vcis_compression_type_variable_raw:
+				FillVariableWidth(vTuples, columnId,
+								  &(vTuples->fetch_context->rel_column[columnId]));
+				break;
+			default:
+				/* fail the query only; abort() would take the server down */
+				elog(ERROR, "unrecognized VCI compression type: %d",
+					 (int) vTuples->column_info[columnId].comp_type);
+		}
+	}
+}
+
+/*
+ * Collect the columns of vTuples that have a null bit.
+ *
+ * On return *columnId and *nullBitId point to palloc'd arrays holding, for
+ * each such column, its index in vTuples and its null bit ID.  The caller
+ * frees both arrays.  Returns the number of entries filled in.
+ */
+static int
+GetNullableColumnInfo(uint16 **columnId, uint16 **nullBitId,
+					  vci_virtual_tuples_t *vTuples)
+{
+	int			cId;
+	int			aId;
+	vci_CSQueryContext queryContext = vTuples->fetch_context->query_context;
+
+	*columnId = palloc0(sizeof(uint16) * queryContext->num_nullable_columns);
+	*nullBitId = palloc0(sizeof(uint16) * queryContext->num_nullable_columns);
+
+	cId = 0;
+	for (aId = 0; aId < vTuples->num_columns; ++aId)
+	{
+		int			bitId = vTuples->column_info[aId].null_bit_id;
+
+		if (0 <= bitId)
+		{
+			/* check the bound before writing, not after the fact */
+			Assert(cId < queryContext->num_nullable_columns);
+			(*columnId)[cId] = aId;
+			(*nullBitId)[cId] = bitId;
+			++cId;
+		}
+	}
+
+	return cId;
+}
+
+/*
+ * Fill vTuples->isnull from the null-bit column of the extent.
+ *
+ * The null bits of num_rows rows, starting at row_id_in_extent, are read
+ * in tiles of strideR rows.  A tile that straddles a page boundary is
+ * first assembled in a scratch buffer.  For every nullable column the bit
+ * of each row is expanded into one bool.
+ */
+static void
+FillIsNull(vci_virtual_tuples_t *vTuples)
+{
+	int		   *colOffset;
+	Buffer		buffer = InvalidBuffer;
+	Page		page = NULL;
+	vci_CSQueryContext queryContext = vTuples->fetch_context->query_context;
+	vci_MainRelHeaderInfo *info = GetMainRelHeaderInfoFromFetchContext(vTuples->fetch_context);
+	const int32 strideR = 16;
+	const int32 oneBuf = sizeof(uint8) * (strideR * queryContext->null_width_in_byte);
+	uint16	   *columnId;
+	uint16	   *nullBitId;
+	uint8	   *nullCopy = palloc0(oneBuf);
+	int32		rId;
+
+	BlockNumber bNumCur;
+	uint32		offset;
+
+	Relation	rel = vTuples->fetch_context->rel_null.data;
+
+	int			numNullableColumns = GetNullableColumnInfo(&columnId, &nullBitId, vTuples);
+
+	int			facCol = 1;
+	int			facRow = vTuples->num_columns;
+	int			numRowsToClear = vTuples->num_rows;
+
+	if (vTuples->use_column_store)
+	{
+		facCol = vTuples->num_rows_read_at_once;
+		facRow = 1;
+
+		/*
+		 * In column-store layout each column occupies
+		 * num_rows_read_at_once flags, so the flags of the columns are not
+		 * contiguous when num_rows is smaller.  Clear the full stride of
+		 * every column; otherwise the flags of columns without a null bit
+		 * could be left uninitialized.
+		 */
+		numRowsToClear = vTuples->num_rows_read_at_once;
+	}
+
+	Assert(VCI_FIRST_NORMAL_EXTENT_ID <= vTuples->extent_id);
+	MemSet(vTuples->isnull, 0, vTuples->num_columns * numRowsToClear);
+
+	/*
+	 * Offset of the first flag of each nullable column.  Sized by the
+	 * actual number of nullable columns instead of a MaxAttrNumber-sized
+	 * array on the stack.
+	 */
+	colOffset = palloc(sizeof(int) * numNullableColumns);
+	{
+		int			aId;
+
+		for (aId = 0; aId < numNullableColumns; ++aId)
+			colOffset[aId] = facCol * columnId[aId];
+	}
+	vci_GetPositionForFixedColumn(&bNumCur,
+								  &offset,
+								  info,
+								  VCI_COLUMN_ID_NULL,
+								  vTuples->extent_id,
+								  vTuples->row_id_in_extent,
+								  false);
+
+	if (0 < vTuples->num_rows)
+	{
+		buffer = ReadBuffer(rel, bNumCur);
+		page = BufferGetPage(buffer);
+	}
+
+	/* This tiling is the best? */
+	for (rId = 0; rId < vTuples->num_rows; rId += strideR)
+	{
+		int32		pIdMax = Min(rId + strideR, vTuples->num_rows);
+		int			nwib = queryContext->null_width_in_byte;
+		int32		inc = (pIdMax - rId) * nwib;
+		uint32		nextOffset = offset + inc;
+		uint8	   *ptr = (uint8 *) &(page[VCI_MIN_PAGE_HEADER + offset]);
+		uint8	   *ptrSave = NULL;
+		int			cId;
+
+		Assert((0 <= offset) && (offset < VCI_MAX_PAGE_SPACE));
+		if (VCI_MAX_PAGE_SPACE < nextOffset)
+		{
+			/* the tile straddles two pages: assemble it in nullCopy */
+			int			size = VCI_MAX_PAGE_SPACE - offset;
+
+			MemCpy(nullCopy, ptr, size);
+			ReleaseBuffer(buffer);
+			buffer = ReadBuffer(rel, ++bNumCur);
+			page = BufferGetPage(buffer);
+			MemCpy(&(nullCopy[size]), &(page[VCI_MIN_PAGE_HEADER]),
+				   inc - size);
+			ptr = nullCopy;
+		}
+
+		ptrSave = ptr;
+		for (cId = 0; cId < numNullableColumns; ++cId)
+		{
+			int32		pId;
+			int			bitId = nullBitId[cId];
+			bool	   *dst = &(vTuples->isnull[colOffset[cId] + (rId * facRow)]);
+
+			ptr = ptrSave;
+			/* four rows per iteration, then the remainder one by one */
+			for (pId = rId; pId <= (pIdMax - 4); pId += 4)
+			{
+				*dst = vci_GetBit(ptr, bitId);
+				ptr += nwib;
+				dst += facRow;
+				*dst = vci_GetBit(ptr, bitId);
+				ptr += nwib;
+				dst += facRow;
+				*dst = vci_GetBit(ptr, bitId);
+				ptr += nwib;
+				dst += facRow;
+				*dst = vci_GetBit(ptr, bitId);
+				ptr += nwib;
+				dst += facRow;
+			}
+			for (; pId < pIdMax; ++pId)
+			{
+				*dst = vci_GetBit(ptr, bitId);
+				ptr += nwib;
+				dst += facRow;
+			}
+		}
+
+		offset = nextOffset;
+		if (VCI_MAX_PAGE_SPACE <= offset)
+		{
+			/*
+			 * When the tile ended exactly at the page end the next page
+			 * has not been read yet; in the straddling case it already
+			 * has.
+			 */
+			if (VCI_MAX_PAGE_SPACE == offset)
+			{
+				ReleaseBuffer(buffer);
+				buffer = ReadBuffer(rel, ++bNumCur);
+				page = BufferGetPage(buffer);
+			}
+			offset -= VCI_MAX_PAGE_SPACE;
+		}
+	}
+	if (BufferIsValid(buffer))
+		ReleaseBuffer(buffer);
+
+	pfree(colOffset);
+	pfree(nullCopy);
+	pfree(nullBitId);
+	pfree(columnId);
+}
+
+/**
+ * @brief Fetch or read data in columns specified in \c vTuples,
+ * \c numReadRows rows from \c cridStart.
+ *
+ * @details vci_CSFetchVirtualTuples returns number of rows which can be read
+ * from stored data after cridStart. For example, if cridStart = 50, but
+ * actualNumberOfRowsReadAtOnce = 128, vci_CSFetchVirtualTuples() returns 78
+ * (= 128 - 50).
+ *
+ * When all the tuples between cridStart and (cridStart + numReadRows - 1)
+ * are stored in vTuples, it does not read ROS.
+ * Otherwise, tuples at TYPEALIGN_DOWN(VCI_COMPACTION_UNIT_ROW, cridStart)
+ * and following (actualNumRowsReadAtOnce - 1) rows are read from ROS.
+ *
+ * @param[in,out] vTuples the read data are stored in the pointed area.
+ * @param[in] cridStart the data from \c cridStart row are read.
+ * @param[in] numReadRows required number of rows to be read.
+ * @return number of rows available to be read out from \c vTuples.
+ */ +int +vci_CSFetchVirtualTuples(vci_virtual_tuples_t *vTuples, + int64 cridStart, + uint32 numReadRows) +{ + const int32 extentId = vci_CalcExtentIdFromCrid64(cridStart); + const uint32 rowId = vci_CalcRowIdInExtentFromCrid64(cridStart); + + Assert(vTuples); + + vTuples->status = vcirvs_out_of_range; + if (VCI_INVALID_EXTENT_ID == extentId) + { + return 0; + } + + RefillPointersOfVirtualTuples(vTuples, true); + + /* local ROS */ + if (extentId < VCI_FIRST_NORMAL_EXTENT_ID) + { + vci_CSFetchContext fetchContext = vTuples->fetch_context; + vci_CSQueryContext queryContext = fetchContext->query_context; + vci_local_ros_t *localRos = queryContext->local_ros; + int localRosId = -extentId - 1; + + Assert(queryContext->num_local_ros_extents == localRos->num_local_extents); + if (queryContext->num_local_ros_extents <= localRosId) + { + vTuples->status = vcirvs_not_exist; + + return 0; + } + + if (localRos->extent[localRosId]->num_rows_in_extent <= rowId) + { + vTuples->status = vcirvs_out_of_range; + + return 0; + } + + vTuples->num_rows_in_extent = localRos->extent[localRosId]->num_rows_in_extent; + vTuples->extent_id = extentId; + vTuples->num_rows = Min(numReadRows, + vTuples->num_rows_in_extent - rowId); + vTuples->offset_of_first_tuple_of_vector = 0; + + if (vTuples->tid) + MemCpy(vTuples->tid, &(localRos->extent[localRosId]->tid[rowId]), + sizeof(vTuples->tid[0]) * vTuples->num_rows); + + if (vTuples->crid) + MemCpy(vTuples->crid, &(localRos->extent[localRosId]->crid[rowId]), + sizeof(vTuples->crid[0]) * vTuples->num_rows); + + MemSet(vTuples->skip, 0, + sizeof(vTuples->skip[0]) * (vTuples->num_rows + 1)); + + if (vTuples->use_column_store) + { + int cId; + + for (cId = 0; cId < vTuples->num_columns; ++cId) + { + vci_virtual_tuples_column_info_t *dColI; + vci_virtual_tuples_column_info_t *sColI; + + dColI = &(vTuples->column_info[cId]); + sColI = &(localRos->extent[localRosId]->column_info[ + fetchContext->column_link[cId]]); + MemCpy(dColI->values, 
&(sColI->values[rowId]), + sizeof(Datum) * vTuples->num_rows); + MemCpy(dColI->isnull, &(sColI->isnull[rowId]), + sizeof(bool) * vTuples->num_rows); + } + } + else + { + int rId; + + vTuples->values = (Datum *) TYPEALIGN(sizeof(Datum), + vTuples->row_wise_local_ros); + vTuples->isnull = (bool *) &(vTuples->values[vTuples->num_rows_read_at_once * + vTuples->num_columns]); + for (rId = 0; rId < vTuples->num_rows; ++rId) + { + int offset = rId * vTuples->num_columns; + Datum *dstValues = &(vTuples->values[offset]); + bool *dstIsNull = &(vTuples->isnull[offset]); + int cId; + + for (cId = 0; cId < vTuples->num_columns; ++cId) + { + vci_virtual_tuples_column_info_t *sColI; + + sColI = &(localRos->extent[localRosId]->column_info[ + fetchContext->column_link[cId]]); + dstValues[cId] = sColI->values[rowId + rId]; + dstIsNull[cId] = sColI->isnull[rowId + rId]; + } + } + } + + vTuples->status = (localRos->extent[localRosId]->num_rows_in_extent <= + (rowId + vTuples->num_rows)) + ? vcirvs_end_of_extent : vcirvs_read_whole; + } + else + { + vTuples->status = vcirvs_read_whole; + /* use stored data */ + if ((extentId == vTuples->extent_id) && + (vTuples->row_id_in_extent <= rowId) && + ((rowId + numReadRows) <= + (vTuples->row_id_in_extent + vTuples->num_rows))) + { + vTuples->offset_of_first_tuple_of_vector = rowId - + vTuples->row_id_in_extent; + } + else + { + uint32 numRowsInExtent = vTuples->num_rows_in_extent; + + { + vci_extent_status_t status; + + vci_CSCheckExtent(&status, vTuples->fetch_context, extentId, false); + /* check if the extent is visible */ + if (!((status.existence) && (status.visible))) + { + vTuples->status = vcirvs_not_visible; + + return 0; /* not visible */ + } + } + + if (extentId != vTuples->extent_id) + { + Buffer buffer = InvalidBuffer; + vcis_m_extent_t *mExtent; + + mExtent = vci_GetMExtent(&buffer, + GetMainRelHeaderInfoFromFetchContext( + vTuples->fetch_context), + extentId); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + numRowsInExtent = 
mExtent->num_rows; + UnlockReleaseBuffer(buffer); + + vTuples->num_rows_in_extent = numRowsInExtent; + vTuples->extent_id = extentId; + vTuples->num_rows = 0; + } + + /* no such a row in the extent */ + if (numRowsInExtent <= rowId) + { + vTuples->status = vcirvs_out_of_range; + + return 0; + } + + vTuples->row_id_in_extent = TYPEALIGN_DOWN(VCI_COMPACTION_UNIT_ROW, + rowId); + vTuples->offset_of_first_tuple_of_vector = rowId - + vTuples->row_id_in_extent; + vTuples->num_rows = TYPEALIGN(VCI_COMPACTION_UNIT_ROW, + vTuples->offset_of_first_tuple_of_vector + + numReadRows); + vTuples->num_rows = Min(vTuples->num_rows, + vTuples->num_rows_read_at_once); + vTuples->num_rows = Min(vTuples->num_rows, + numRowsInExtent - vTuples->row_id_in_extent); + + if (vTuples->crid) + vci_FillCridInVirtualTuples(vTuples); + + if (vTuples->tid) + FillFixedWidth(vTuples, VCI_COLUMN_ID_TID, + &(vTuples->fetch_context->rel_tid)); + + FillSkip(vTuples); + + FillIsNull(vTuples); + FillValues(vTuples); + } + + if (vTuples->num_rows_in_extent <= (vTuples->row_id_in_extent + + vTuples->offset_of_first_tuple_of_vector + numReadRows)) + vTuples->status = vcirvs_end_of_extent; + } + + Assert(vTuples->offset_of_first_tuple_of_vector <= vTuples->num_rows); + + return Min(vTuples->num_rows - vTuples->offset_of_first_tuple_of_vector, + numReadRows); +} + +/** + * @brief Fill data of the specified fixed-field-length column in + * \c RosChunkStorage into \c vci_virtual_tuples_t. + * + * @param[in,out] vTuples the pointer of vci_virtual_tuples_t where data are + * stored. + * @param[in] columnId target column ID. + * @param[in] rosChunkStorage data source. 
+ */ +void +vci_FillFixedWidthColumnarFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + int16 columnId, + RosChunkStorage *rosChunkStorage) +{ + int16 colIdInVciMain = VCI_FIRST_NORMALCOLUMN_ID; + Datum *dstPtr = NULL; + char *dstData = NULL; + + int stepDstData = 0; + int stepSrc = 0; + + bool passByRef = false; + int offsetCont = 0; + + if (VCI_FIRST_NORMALCOLUMN_ID <= columnId) + { + vci_CSFetchContext fetchContext = vTuples->fetch_context; + + Assert(columnId < vTuples->num_columns); + colIdInVciMain = fetchContext->query_context->column_id[fetchContext->column_link[columnId]]; + dstData = (char *) &(vTuples->values[vTuples->num_rows_in_extent * columnId]); + if ((passByRef = vTuples->column_info[columnId].strict_datum_type)) /* pgr0011 */ + { + dstPtr = (Datum *) dstData; + dstData = vTuples->column_info[columnId].area; + Assert(dstData); + } + else + Assert(NULL == vTuples->column_info[columnId].area); + stepSrc = vTuples->column_info[columnId].max_column_size; + stepDstData = MAXALIGN(stepSrc); + } + else + { + Assert(VCI_COLUMN_ID_TID == columnId); + dstData = (char *) (vTuples->tid); + stepDstData = sizeof(vTuples->tid[0]); + stepSrc = sizeof(ItemPointerData); + } + + { + const Datum zero = 0; + int sId; +#ifdef WORDS_BIGENDIAN + + if (stepSrc < sizeof(Datum)) + { + /* + * if the value itself is contained in Datum. 
for example uint32 1 + * is contained in Datum + */ + /* + * the value should be 0x0000000000000001 (not 0x0001000000000000) + * so offsetCont should be 4 + */ + offsetCont = stepDstData - stepSrc; + } + else + { + offsetCont = 0; + } + +#else /* #ifdef WORDS_BIGENDIAN */ + offsetCont = stepDstData - sizeof(Datum); +#endif /* #ifdef WORDS_BIGENDIAN */ + + for (sId = 0; sId < rosChunkStorage->numFilled; ++sId) + { + RosChunkBuffer *chunk = rosChunkStorage->chunk[sId]; + int rId; + char *srcPtr = chunk->tidData; + + if (VCI_FIRST_NORMALCOLUMN_ID <= columnId) + srcPtr = chunk->data[colIdInVciMain]; + for (rId = 0; rId < chunk->numFilled; ++rId) + { +#ifdef WORDS_BIGENDIAN + *(Datum *) dstData = zero; + MemCpy(&(dstData[offsetCont]), &(srcPtr[stepSrc * rId]), stepSrc); +#else /* #ifdef WORDS_BIGENDIAN */ + *(Datum *) &(dstData[offsetCont]) = zero; + MemCpy(dstData, &(srcPtr[stepSrc * rId]), stepSrc); +#endif /* #ifdef WORDS_BIGENDIAN */ + dstData += stepDstData; + } + } + } + + if (passByRef) + { + int rId; + + dstData = vTuples->column_info[columnId].area; + for (rId = 0; rId < vTuples->num_rows; ++rId) + dstPtr[rId] = PointerGetDatum(&(dstData[stepDstData * rId])); + } +} + +/** + * @brief Fill data of the specified variable-field-length column in + * \c RosChunkStorage into \c vci_virtual_tuples_t. + * + * @param[in,out] vTuples the pointer of vci_virtual_tuples_t where data are + * stored. + * @param[in] columnId target column ID. + * @param[in] rosChunkStorage data source. 
+ */ +void +vci_FillVariableWidthColumnarFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + int16 columnId, + RosChunkStorage *rosChunkStorage) +{ + int16 colIdInVciMain = VCI_FIRST_NORMALCOLUMN_ID; + Datum *dstPtr = NULL; + char *dstData = NULL; + + Assert((VCI_FIRST_NORMALCOLUMN_ID <= columnId) && (columnId < vTuples->num_columns)); + dstData = (char *) &(vTuples->values[vTuples->num_rows_in_extent * columnId]); + Assert(vTuples->column_info[columnId].strict_datum_type); + + colIdInVciMain = vTuples->fetch_context->query_context->column_id[vTuples->fetch_context->column_link[columnId]]; + + dstPtr = (Datum *) dstData; + dstData = vTuples->column_info[columnId].area; + Assert(dstData); + + { + const Datum zero = 0; + int sId; + + for (sId = 0; sId < rosChunkStorage->numFilled; ++sId) + { + RosChunkBuffer *chunk = rosChunkStorage->chunk[sId]; + int rId; + + Assert(chunk->data[colIdInVciMain]); + Assert(chunk->dataOffset[colIdInVciMain]); + for (rId = 0; rId < chunk->numFilled; ++rId) + { + int size = chunk->dataOffset[colIdInVciMain][rId + 1] - chunk->dataOffset[colIdInVciMain][rId]; + + *(Datum *) &(dstData[TYPEALIGN_DOWN(sizeof(Datum), size - 1)]) = zero; + MemCpy(dstData, &(chunk->data[colIdInVciMain][chunk->dataOffset[colIdInVciMain][rId]]), size); + *dstPtr++ = PointerGetDatum(dstData); + dstData += TYPEALIGN(sizeof(Datum), size); + } + } + } +} + +/** + * @brief Get column IDs of nullable columns. + * + * The result is stored in a \c palloc()ed area. Thus, caller should \c pfree() + * the result after use. + * + * @param[in] vTuples target vci_virtual_tuples_t. + * @return the pointer of \c (int16 \*) where the result stored. 
+ */ +int16 * +vci_GetNullableColumnIds(vci_virtual_tuples_t *vTuples) +{ + vci_CSQueryContext queryContext = vTuples->fetch_context->query_context; + int16 *result = palloc0(sizeof(uint16) * queryContext->num_nullable_columns); + int16 aId; + int16 cId = 0; + + MemSet(result, -1, sizeof(uint16) * queryContext->num_nullable_columns); + for (aId = 0; aId < vTuples->num_columns; ++aId) + { + int bitId = vTuples->column_info[aId].null_bit_id; + + Assert((-1 <= bitId) && (bitId < (int) (queryContext->num_nullable_columns))); + if (0 <= bitId) + { + Assert(-1 == result[bitId]); + result[bitId] = aId; + ++cId; + } + } + Assert(cId <= queryContext->num_nullable_columns); + + return result; +} diff --git a/contrib/vci/storage/vci_freelist.c b/contrib/vci/storage/vci_freelist.c new file mode 100644 index 000000000000..f813053c8284 --- /dev/null +++ b/contrib/vci/storage/vci_freelist.c @@ -0,0 +1,474 @@ +/*------------------------------------------------------------------------- + * + * vci_freelist.c + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_freelist.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "vci.h" + +#include "vci_freelist.h" +#include "vci_ros.h" +#include "vci_columns.h" + +static vcis_free_space_t *GetFreeSpaceT(Page page); +static void UpdatePrevNextFreeSpace(vci_RelationPair *relPair, + BlockNumber prevFreeBlockNumber, + BlockNumber nextFreeBlockNumber, + BlockNumber prev_next, + BlockNumber next_prev, + vcis_column_meta_t *columnMeta); + +/** + * function to cast from Page to (vcis_freespace_t *) + */ +static vcis_free_space_t * +GetFreeSpaceT(Page page) +{ + HeapTupleHeader htup; + + htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, VCI_FREESPACE_ITEM_ID)); + + return (vcis_free_space_t *) ((char *) htup + htup->t_hoff); +} + +vcis_free_space_t * +vci_GetFreeSpace(vci_RelationPair *relPair, 
BlockNumber blk) +{ + Page page; + + relPair->bufData = vci_ReadBufferWithPageInit(relPair->data, blk); + page = BufferGetPage(relPair->bufData); + + return GetFreeSpaceT(page); +} + +static void +UpdatePrevNextFreeSpace(vci_RelationPair *relPair, + BlockNumber prevFreeBlockNumber, + BlockNumber nextFreeBlockNumber, + BlockNumber prev_next, + BlockNumber next_prev, + vcis_column_meta_t *columnMeta) +{ + /* update link information in previous free space */ + if (BlockNumberIsValid(prevFreeBlockNumber)) + { + vcis_free_space_t *prevFreePtr = vci_GetFreeSpace(relPair, + prevFreeBlockNumber); + + Assert(vci_hasFreeLinkNode(prevFreePtr)); + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + prevFreePtr->next_pos = prev_next; + vci_WriteOneItemPage(relPair->data, relPair->bufData); + UnlockReleaseBuffer(relPair->bufData); + } + else + columnMeta->free_page_begin_id = prev_next; + + /* update link information in next free space */ + if (BlockNumberIsValid(nextFreeBlockNumber)) + { + vcis_free_space_t *nextFreePtr = vci_GetFreeSpace(relPair, + nextFreeBlockNumber); + + Assert(vci_hasFreeLinkNode(nextFreePtr)); + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + nextFreePtr->prev_pos = next_prev; + vci_WriteOneItemPage(relPair->data, relPair->bufData); + UnlockReleaseBuffer(relPair->bufData); + } + else + columnMeta->free_page_end_id = next_prev; +} + +int32 +vci_MakeFreeSpace(vci_RelationPair *relPair, + BlockNumber startBlockNumber, + BlockNumber *newFSBlockNumber, + vcis_free_space_t *newFS, + bool coalesce) +{ + int numMerged = 0; + + vcis_free_space_t *origSpace; + vcis_free_space_t *freeSpace; + + BlockNumber freeSpacePtr; + + /* -- Start Block -- */ + origSpace = vci_GetFreeSpace(relPair, startBlockNumber); + newFS->size = origSpace->size; + ReleaseBuffer(relPair->bufData); + + newFS->type = vcis_free_space; + newFS->prev_pos = InvalidBlockNumber; + newFS->next_pos = InvalidBlockNumber; + *newFSBlockNumber = startBlockNumber; + numMerged = 1; + + 
freeSpacePtr = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta)->free_page_begin_id; + Assert(BlockNumberIsValid(freeSpacePtr)); + ReleaseBuffer(relPair->bufMeta); + + while (BlockNumberIsValid(freeSpacePtr)) + { + freeSpace = vci_GetFreeSpace(relPair, freeSpacePtr); + + Assert(freeSpacePtr != startBlockNumber); + Assert(!BlockNumberIsValid(freeSpace->next_pos) || + freeSpace->next_pos != startBlockNumber); + + if (startBlockNumber < freeSpacePtr) + { + newFS->prev_pos = InvalidBlockNumber; + newFS->next_pos = freeSpacePtr; + ReleaseBuffer(relPair->bufData); + break; + } + else if ((freeSpacePtr < startBlockNumber) && + (startBlockNumber < freeSpace->next_pos)) + { + newFS->prev_pos = freeSpacePtr; + newFS->next_pos = freeSpace->next_pos; + ReleaseBuffer(relPair->bufData); + break; + } + else if (!BlockNumberIsValid(freeSpace->next_pos)) + { + Assert(freeSpacePtr > startBlockNumber); + newFS->prev_pos = freeSpace->prev_pos; + newFS->next_pos = freeSpacePtr; + ReleaseBuffer(relPair->bufData); + break; + } + + freeSpacePtr = freeSpace->next_pos; + ReleaseBuffer(relPair->bufData); + } + + if (coalesce) + { + if (BlockNumberIsValid(newFS->prev_pos)) + { + freeSpace = vci_GetFreeSpace(relPair, newFS->prev_pos); + Assert(vci_hasFreeLinkNode(freeSpace)); + + if (newFS->prev_pos + vci_GetNumBlocks(freeSpace->size) == + *newFSBlockNumber) + { + *newFSBlockNumber = newFS->prev_pos; + + newFS->size += freeSpace->size; + newFS->prev_pos = freeSpace->prev_pos; + + numMerged++; + elog(DEBUG2, "privious FreeSpace marged ,size %d! 
", newFS->size); + } + ReleaseBuffer(relPair->bufData); + } + + if (BlockNumberIsValid(newFS->next_pos)) + { + freeSpace = vci_GetFreeSpace(relPair, newFS->next_pos); + Assert(vci_hasFreeLinkNode(freeSpace)); + + if (newFS->next_pos == + *newFSBlockNumber + vci_GetNumBlocks(newFS->size)) + { + newFS->size += freeSpace->size; + if (freeSpace->size == MaxBlockNumber) + newFS->size = MaxBlockNumber; + newFS->next_pos = freeSpace->next_pos; + + numMerged++; + elog(DEBUG2, "next FreeSpace marged ,size %d! ", newFS->size); + + } + ReleaseBuffer(relPair->bufData); + } + } + + return numMerged; +} + +void +vci_AppendFreeSpaceToLinkList(vci_RelationPair *relPair, + BlockNumber startBlockNumber, + BlockNumber prevFreeBlockNumber, + BlockNumber nextFreeBlockNumber, + BlockNumber size) +{ + vcis_column_meta_t *columnMeta; + vcis_free_space_t *freeSpace; + vcis_extent_type_t type; + + Assert(startBlockNumber != prevFreeBlockNumber); + Assert(startBlockNumber != nextFreeBlockNumber); + + columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta); + LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE); + + freeSpace = vci_GetFreeSpace(relPair, columnMeta->free_page_end_id); + Assert(vci_hasFreeLinkNode(freeSpace)); + type = freeSpace->type; + ReleaseBuffer(relPair->bufData); + + /* rebuild freespace */ + UpdatePrevNextFreeSpace(relPair, prevFreeBlockNumber, nextFreeBlockNumber, + startBlockNumber, startBlockNumber, columnMeta); + + freeSpace = vci_GetFreeSpace(relPair, startBlockNumber); + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + + freeSpace->prev_pos = prevFreeBlockNumber; + freeSpace->next_pos = nextFreeBlockNumber; + freeSpace->type = type; + freeSpace->size = size; + + vci_WriteOneItemPage(relPair->data, relPair->bufData); + UnlockReleaseBuffer(relPair->bufData); + + columnMeta->num_extents -= 1; + columnMeta->num_free_pages += vci_GetNumBlocks(size); + columnMeta->num_free_page_blocks += 1; + + vci_WriteColumnMetaDataHeader(relPair->meta, 
relPair->bufMeta); + UnlockReleaseBuffer(relPair->bufMeta); +} + +void +vci_RemoveFreeSpaceFromLinkList(vci_ColumnRelations *relPair, + BlockNumber startBlockNumber, + BlockNumber numExtentPages) +{ + vcis_column_meta_t *columnMeta; + vcis_free_space_t *freeSpace = vci_GetFreeSpace(relPair, startBlockNumber); + BlockNumber prevFreeBlockNumber = freeSpace->prev_pos; + BlockNumber nextFreeBlockNumber = freeSpace->next_pos; + uint32 size = freeSpace->size; + vcis_extent_type_t type = freeSpace->type; + + BlockNumber next_prev = prevFreeBlockNumber; + BlockNumber prev_next = nextFreeBlockNumber; + + BlockNumber numBlocksInCurrentFreeSpace = vci_GetNumBlocks(size); + + Assert(vci_hasFreeLinkNode(freeSpace)); + ReleaseBuffer(relPair->bufData); + + Assert(numExtentPages <= numBlocksInCurrentFreeSpace); + + columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta); + LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE); + + /* + * prepare new free space from tail part + */ + if (numExtentPages < numBlocksInCurrentFreeSpace) + { + vcis_free_space_t *freeSpace_new; + BlockNumber newFreeBlockNumber = startBlockNumber + numExtentPages; + + freeSpace_new = vci_GetFreeSpace(relPair, newFreeBlockNumber); + freeSpace_new->type = type; + freeSpace_new->size = size - (numExtentPages * VCI_MAX_PAGE_SPACE); + + /* it is sentinel */ + if (numBlocksInCurrentFreeSpace == MaxBlockNumber) + { + freeSpace_new->size = MaxBlockNumber; + columnMeta->num_free_pages += numExtentPages; + } + + /* construct new link */ + freeSpace_new->prev_pos = prevFreeBlockNumber; + freeSpace_new->next_pos = nextFreeBlockNumber; + prev_next = next_prev = newFreeBlockNumber; + + ++columnMeta->num_free_page_blocks; + + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + vci_WriteOneItemPage(relPair->data, relPair->bufData); + UnlockReleaseBuffer(relPair->bufData); + } + + UpdatePrevNextFreeSpace(relPair, prevFreeBlockNumber, nextFreeBlockNumber, + prev_next, next_prev, columnMeta); + + 
++(columnMeta->num_extents); + columnMeta->num_free_pages -= numExtentPages; + --(columnMeta->num_free_page_blocks); + + vci_WriteOneItemPage(relPair->meta, relPair->bufMeta); + UnlockReleaseBuffer(relPair->bufMeta); +} + +BlockNumber +vci_FindFreeSpaceForExtent(vci_RelationPair *relPair, BlockNumber requiredSize) +{ + vcis_column_meta_t *columnMeta; + vcis_free_space_t *freeSpace; + + BlockNumber freeSpacePtr; + BlockNumber found = InvalidBlockNumber; + bool is_sentinel = false; + + columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta); + freeSpacePtr = columnMeta->free_page_begin_id; + + while (BlockNumberIsValid(freeSpacePtr)) + { + freeSpace = vci_GetFreeSpace(relPair, freeSpacePtr); + Assert(vci_hasFreeLinkNode(freeSpace)); + + if (vci_GetNumBlocks(freeSpace->size) >= requiredSize) + { + found = freeSpacePtr; + if (!BlockNumberIsValid(freeSpace->next_pos)) + is_sentinel = true; + ReleaseBuffer(relPair->bufData); + break; + } + freeSpacePtr = freeSpace->next_pos; + ReleaseBuffer(relPair->bufData); + } + + if (is_sentinel) + { + /* + * vci_AppendNewPages(relPair->data, requiredSize + + * columnMeta->free_page_end_id - numRelPages + 1); + */ + int16 numItems; + + relPair->bufData = ReadBuffer(relPair->data, 0); + numItems = PageGetMaxOffsetNumber(BufferGetPage(relPair->bufData)); + ReleaseBuffer(relPair->bufData); + + vci_PreparePagesIfNecessary(relPair->data, + requiredSize + columnMeta->free_page_end_id, + numItems); + } + + ReleaseBuffer(relPair->bufMeta); + + return found; +} + +void +vci_WriteRecoveryRecordForFreeSpace(vci_RelationPair *relPair, + int16 colId, + int16 dictId, + BlockNumber StartBlockNumber, + vcis_free_space_t *FS) +{ + vcis_column_meta_t *columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta); + + Assert(!BlockNumberIsValid(FS->prev_pos) || FS->prev_pos < StartBlockNumber); + Assert(!BlockNumberIsValid(FS->next_pos) || StartBlockNumber < FS->next_pos); + + LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE); + + 
columnMeta->new_data_head = StartBlockNumber; + columnMeta->free_page_prev_id = FS->prev_pos; + columnMeta->free_page_next_id = FS->next_pos; + columnMeta->free_page_old_size = FS->size; + + columnMeta->num_extents_old = columnMeta->num_extents; + columnMeta->num_free_pages_old = columnMeta->num_free_pages; + columnMeta->num_free_page_blocks_old = columnMeta->num_free_page_blocks; + + vci_WriteOneItemPage(relPair->meta, relPair->bufMeta); + UnlockReleaseBuffer(relPair->bufMeta); + + vci_SetMainRelVar(relPair->info, vcimrv_working_column_id, 0, colId); + vci_SetMainRelVar(relPair->info, vcimrv_working_dictionary_id, 0, dictId); + vci_WriteMainRelVar(relPair->info, vci_wmrv_update); +} + +void +vci_InitRecoveryRecordForFreeSpace(vci_MainRelHeaderInfo *info) +{ + vci_SetMainRelVar(info, vcimrv_working_column_id, 0, VCI_INVALID_COLUMN_ID); +} + +void +vci_RecoveryFreeSpace(vci_MainRelHeaderInfo *info, vci_ros_command_t command) +{ + LOCKMODE lockmode = AccessShareLock; /** @todo ? */ + + int16 colId; + vci_ColumnRelations relPairData; + vci_ColumnRelations *relPair = &relPairData; + vcis_column_meta_t *columnMeta; + + BlockNumber startBlockNumber; + BlockNumber prevFreeBlockNumber; + BlockNumber nextFreeBlockNumber; + uint32 oldSize; + + int32 extentId; + + /* get last working column */ + colId = vci_GetMainRelVar(info, vcimrv_working_column_id, 0); + + if (colId != VCI_INVALID_COLUMN_ID) + { + vci_OpenColumnRelations(relPair, info, colId, lockmode); + + /* get column rel set */ + columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta); + LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE); + + /* restore from old fields */ + columnMeta->num_extents = columnMeta->num_extents_old; + columnMeta->num_free_pages = columnMeta->num_free_pages_old; + columnMeta->num_free_page_blocks = columnMeta->num_free_page_blocks_old; + + /* read free link list recovery information */ + startBlockNumber = columnMeta->new_data_head; + prevFreeBlockNumber = 
columnMeta->free_page_prev_id; + nextFreeBlockNumber = columnMeta->free_page_next_id; + oldSize = columnMeta->free_page_old_size; + + vci_WriteColumnMetaDataHeader(relPair->meta, relPair->bufMeta); + UnlockReleaseBuffer(relPair->bufMeta); + + vci_AppendFreeSpaceToLinkList(relPair, startBlockNumber, prevFreeBlockNumber, + nextFreeBlockNumber, oldSize); + + switch (command) + { + case vci_rc_wos_ros_conv: + case vci_rc_collect_deleted: + extentId = vci_GetMainRelVar(info, vcimrv_new_extent_id, 0); + break; + case vci_rc_collect_extent: + extentId = vci_GetMainRelVar(info, vcimrv_old_extent_id, 0); + break; + default: + extentId = VCI_INVALID_EXTENT_ID; + break; + } + Assert(extentId != VCI_INVALID_EXTENT_ID); + + vci_WriteRawDataExtentInfo(relPair->meta, + extentId, + InvalidBlockNumber, + 0, + NULL, /* min */ + NULL, /* max */ + false, + false); + + vci_CloseColumnRelations(relPair, lockmode); + } +} diff --git a/contrib/vci/storage/vci_index.c b/contrib/vci/storage/vci_index.c new file mode 100644 index 000000000000..85023cbaefe0 --- /dev/null +++ b/contrib/vci/storage/vci_index.c @@ -0,0 +1,2152 @@ +/*------------------------------------------------------------------------- + * + * vci_index.c + * Index Access Method + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_index.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam_xlog.h" +#include "access/htup_details.h" +#include "access/multixact.h" +#include "access/reloptions.h" +#include "access/sysattr.h" +#include "access/toast_compression.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "access/xloginsert.h" +#include "catalog/catalog.h" +#include "catalog/dependency.h" +#include "catalog/heap.h" +#include "catalog/index.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" +#include "catalog/pg_rewrite.h" +#include 
"catalog/pg_type.h" +#include "catalog/storage.h" +#include "commands/dbcommands.h" +#include "commands/defrem.h" +#include "commands/tablecmds.h" +#include "executor/executor.h" +#include "executor/nodeModifyTable.h" +#include "executor/spi.h" +#include "fmgr.h" +#include "lib/stringinfo.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "nodes/makefuncs.h" +#include "nodes/pathnodes.h" +#include "access/relation.h" +#include "port.h" +#include "rewrite/rewriteDefine.h" +#include "rewrite/rewriteRemove.h" +#include "rewrite/rewriteSupport.h" +#include "storage/bufmgr.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "storage/smgr.h" +#include "tcop/utility.h" +#include "utils/acl.h" +#include "utils/builtins.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/varlena.h" + +#include "vci.h" +#include "vci_columns.h" +#include "vci_columns_data.h" + +#include "vci_mem.h" +#include "vci_ros.h" +#include "vci_ros_command.h" +#include "vci_ros_daemon.h" +#include "vci_supported_oid.h" +#include "vci_tidcrid.h" +#include "vci_wos.h" +#include "vci_xact.h" + +#ifdef WIN32 +#define __func__ __FUNCTION__ +#endif + +#ifdef HAVE_DESIGNATED_INITIALIZERS +#define SFINIT(f, ...) f = __VA_ARGS__ +#else +#define SFINIT(f, ...) 
__VA_ARGS__ +#endif + +/** + * Data Relation + */ +#define VCI_RELTYPE_DATA ('d') + +/** + * Meta Relation + */ +#define VCI_RELTYPE_META ('m') + +/** + * WOS Relation + */ +#define VCI_RELTYPE_WOS ('W') + +/** + * ROS Relation + */ +#define VCI_RELTYPE_ROS ('R') + +/** + * TIDCRID Relation + */ +#define VCI_RELTYPE_TIDCRID ('T') + +/* local functions */ +static TupleDesc get_tuple_desc_for_build(Relation heapRel, Relation indexRel, bool isctid); +static IndexBuildResult *vci_inner_build(Relation, Relation, IndexInfo *); +static void vci_inner_buildempty(Relation indexRelation); +static bool vci_inner_insert(Relation, ItemPointer); +static bool vci_inner_insert_in_copy(Relation, ItemPointer); +static IndexBulkDeleteResult *vci_inner_vacuumcleanup(IndexVacuumInfo *, IndexBulkDeleteResult *); +static void vci_modify_column_information(bool isctid, Relation indexRel, Relation heapRel); + +IndexBuildResult *vci_build(Relation heap, Relation index, IndexInfo *indexInfo); +void vci_buildempty(Relation index); +bool vci_insert(Relation indexRel, Datum *values, bool *isnull, + ItemPointer heap_tid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo); +IndexBulkDeleteResult *vci_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state); +IndexBulkDeleteResult *vci_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats); +void vci_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, Cost *indexStartupCost, Cost *indexTotalCost, Selectivity *indexSelectivity, double *indexCorrelation, double *indexPages); +int vci_gettreeheight(Relation rel); +bytea *vci_options(Datum reloptions, bool validate); +IndexScanDesc vci_beginscan(Relation rel, int nkeys, int norderbys); +void vci_rescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys); +bool vci_validate(Oid opclassoid); +void vci_endscan(IndexScanDesc 
scan); +void vci_markpos(IndexScanDesc scan); +void vci_restrpos(IndexScanDesc scan); + +static char relNameBuf[NAMEDATALEN]; + +static bool copy_with_freeze_option; + +bool +vci_isVciAdditionalRelation(Relation rel) +{ + return vci_isVciAdditionalRelationTuple(rel->rd_id, rel->rd_rel); +} + +bool +vci_isVciAdditionalRelationTuple(Oid reloid, Form_pg_class reltuple) +{ + if (reltuple->relkind == RELKIND_MATVIEW) + { + int ret; + int dummy1; + int dummy2; + char dummy3; + + ret = sscanf(NameStr(reltuple->relname), VCI_INTERNAL_RELATION_TEMPLATE, + &dummy1, &dummy2, &dummy3); + + return (ret == 3); + } + + return false; +} + +/* custom index */ + +IndexBuildResult * +vci_build(Relation heapRel, Relation indexRel, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + vci_id_t vciid; + + if (!fullPageWrites) + { + if (vci_rebuild_command == vcirc_invalid) + /* CREATE INDEX */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not work under full_page_writes=off", VCI_STRING))); + else + /* TRUNCATE, VACUUM FULL */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not work under full_page_writes=off", VCI_STRING), + errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(indexRel)))); + } + + result = vci_inner_build(heapRel, indexRel, indexInfo); + + vciid.oid = RelationGetRelid(indexRel); + vciid.dbid = MyDatabaseId; + + vci_TouchMemoryEntry(&vciid, + get_rel_tablespace(indexRel->rd_id)); + + return result; +} + +void +vci_buildempty(Relation indexRel) +{ + vci_inner_buildempty(indexRel); + + return; +} + +/* for COPY command */ +#define EXTENT_LIST_UNIT_EXTENSION (1024) + +typedef struct CopyCommandInfo +{ + TransactionId xid; + CommandId cid; + uint64 numAppendedRows; + uint32 *extentList; + uint32 numFilledExtent; + uint32 numAllocatedExtent; +} CopyCommandInfo; + +static CopyCommandInfo copyInfo = { + SFINIT(xid, InvalidTransactionId), + SFINIT(cid, InvalidCommandId), + 
SFINIT(numAppendedRows, 0), + SFINIT(extentList, NULL), + SFINIT(numFilledExtent, 0), + SFINIT(numAllocatedExtent, 0) +}; +static vci_RosCommandContext copyConvContext; + +bool +vci_insert(Relation indexRel, Datum *values, bool *isnull, + ItemPointer heap_tid, Relation heapRel, + IndexUniqueCheck checkUnique, + bool indexUnchanged, + struct IndexInfo *indexInfo) +{ + bool result; + TransactionId xid = GetCurrentTransactionId(); + CommandId cid = GetCurrentCommandId(false); + + Assert(TransactionIdIsValid(xid)); + Assert(InvalidCommandId != cid); + + if (!fullPageWrites) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not work under full_page_writes=off", VCI_STRING), + errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(indexRel)))); + + if (ItemPointerGetOffsetNumber(heap_tid) == FirstOffsetNumber) + { + vci_id_t vciid; + + vciid.oid = RelationGetRelid(indexRel); + vciid.dbid = MyDatabaseId; + + vci_TouchMemoryEntry(&vciid, + get_rel_tablespace(indexRel->rd_id)); + } + + if (TransactionIdEquals(xid, copyInfo.xid) && (cid == copyInfo.cid)) + result = vci_inner_insert_in_copy(indexRel, heap_tid); /* LCOV_EXCL_LINE */ + else + result = vci_inner_insert(indexRel, heap_tid); + + return result; +} + +/** + * vci_bulkdelete + */ +IndexBulkDeleteResult * +vci_bulkdelete(IndexVacuumInfo *info, IndexBulkDeleteResult *stats, + IndexBulkDeleteCallback callback, void *callback_state) +{ + return stats; +} + +/** + * vci_vacuumcleanup + */ +IndexBulkDeleteResult * +vci_vacuumcleanup(IndexVacuumInfo *info, IndexBulkDeleteResult *stats) +{ + if (info->analyze_only) + return stats; + + vci_inner_vacuumcleanup(info, stats); + + return stats; +} + +/** + * vci_costestimate + */ +void +vci_costestimate(PlannerInfo *root, IndexPath *path, double loop_count, + Cost *indexStartupCost, Cost *indexTotalCost, + Selectivity *indexSelectivity, double *indexCorrelation, + double *indexPages) +{ + /* + * PlannerInfo *root = (PlannerInfo *) 
PG_GETARG_POINTER(0); IndexPath + * *path = (IndexPath *) PG_GETARG_POINTER(1); double loop_count = + * PG_GETARG_FLOAT8(2); + */ + + /* always return worst cost value */ + *indexStartupCost = DBL_MAX; + *indexTotalCost = DBL_MAX; + *indexSelectivity = 1.0; + *indexCorrelation = 0.0; + *indexPages = ((BlockNumber) 0xFFFFFFFE); /* MaxBlockNumber */ + + /** + * Disabled nodes are also a cost metric (see Commit e222534), so set a + * high value to ensure an Index Scan will not be chosen. + */ + path->path.disabled_nodes = INT_MAX; + + return; +} + +int +vci_gettreeheight(Relation rel) +{ + int result; + + result = 0; + return result; +} + +bytea * +vci_options(Datum reloptions, bool validate) +{ + return NULL; +} + +bool +vci_validate(Oid opclassoid) +{ + /* pass */ + return true; +} + +/* LCOV_EXCL_START */ +IndexScanDesc +vci_beginscan(Relation rel, int nkeys, int norderbys) +{ + IndexScanDesc result; + + /* + * Relation indexRel = (Relation) PG_GETARG_POINTER(0); int nkeys = + * PG_GETARG_INT32(1); int norderbys = PG_GETARG_INT32(2); + */ + + result = NULL; + + ereport(PANIC, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected index access method call : \"%s\" ", __func__))); + + return result; +} + +void +vci_rescan(IndexScanDesc scan, ScanKey scankey, int nscankeys, + ScanKey orderbys, int norderbys) +{ + /* + * IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); ScanKey keys + * = (ScanKey) PG_GETARG_POINTER(1); int nkeys = PG_GETARG_INT32(2); + * ScanKey orderbys = (ScanKey) PG_GETARG_POINTER(3); int norderbys = + * PG_GETARG_INT32(4); + */ + + /* pass */ + return; +} + +void +vci_endscan(IndexScanDesc scan) +{ + /* + * IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + */ + + ereport(PANIC, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected index access method call : \"%s\" ", __func__))); + + /* pass */ + return; +} + +void +vci_markpos(IndexScanDesc scan) +{ + /* + * IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + */ 
+ ereport(PANIC, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected index access method call : \"%s\" ", __func__))); + + /* pass */ + return; +} + +void +vci_restrpos(IndexScanDesc scan) +{ + /* + * IndexScanDesc scan = (IndexScanDesc) PG_GETARG_POINTER(0); + */ + ereport(PANIC, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected index access method call : \"%s\" ", __func__))); + + /* pass */ + return; +} + +/* LCOV_EXCL_STOP */ + +/* --body-- */ + +/* + * vci_create_relation -- create, or reset, one VCI internal relation. + * + * The relation is looked up by name (rel_identifier) in the namespace of + * indexRel. If it already exists, it is only given a new relfilenumber. + * Otherwise a heap relation of relkind RELKIND_MATVIEW is created whose + * columns are chosen by vci_reltype (VCI_RELTYPE_WOS, VCI_RELTYPE_ROS or + * VCI_RELTYPE_TIDCRID), entered into pg_class and pg_attribute, and + * recorded as DEPENDENCY_INTERNAL on indexRel. indexInfo is not used. + * + * Returns the OID of the relation. + */ +static Oid +vci_create_relation(const char *rel_identifier, Relation indexRel, IndexInfo *indexInfo, char vci_reltype) +{ + int natts; + + /* system catalog relation id */ + Relation pg_class; + Relation pg_attr; + + /* new rel, oid, tupdesc */ + Relation new_rel; + Oid new_oid; + TupleDesc new_tupdesc; + + /* attributes */ + Oid ownerid = GetUserId(); + + char relkind = RELKIND_MATVIEW; + + Oid new_type_oid = InvalidOid; + Oid reloftypeid = InvalidOid; + CatalogIndexState indstate; + + char relname[NAMEDATALEN]; /* max 64 characters */ + Oid reltablespace; + Oid relnamespace; + char relpersistence; + Oid accessmtd; + + /* variables for pg_class */ + Form_pg_class new_rel_reltup; + + RangeVar *relrv; + + /* Insert pg_depend table */ + ObjectAddress oaIndex; + ObjectAddress oaNewRel; + + relnamespace = indexRel->rd_rel->relnamespace; + reltablespace = indexRel->rd_rel->reltablespace; + relpersistence = indexRel->rd_rel->relpersistence; + accessmtd = HEAP_TABLE_AM_OID; + + /* function start */ + + /* + * Zero-fill the buffer, then copy with strlcpy() so that the name is + * always NUL-terminated, even if rel_identifier is too long. + */ + memset(relname, 0, sizeof(relname)); + strlcpy(relname, rel_identifier, sizeof(relname)); + + relrv = makeRangeVar(get_namespace_name(relnamespace), relname, -1); + new_oid = RangeVarGetRelid(relrv, AccessShareLock, true); + + if (OidIsValid(new_oid)) + { + new_rel = relation_open(new_oid, AccessExclusiveLock); + RelationSetNewRelfilenumber(new_rel, new_rel->rd_rel->relpersistence); + + /* + * if (new_rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED) + * heap_create_init_fork(new_rel); + */ + + 
relation_close(new_rel, NoLock); /* do not unlock till end of xact */ + + return new_oid; + } + + /* Generate Data WOS */ + pg_class = table_open(RelationRelationId, RowExclusiveLock); + + /* 4.6.1 get new Oid for new relation */ + + new_oid = GetNewRelFileNumber(reltablespace, pg_class, relpersistence); + + /* TODO */ + + /* + * Is the following call meaningful? Its result is discarded. Or should we + * remove it? + */ + get_user_default_acl(OBJECT_TABLE, ownerid, relnamespace); + + /* 4.6.1.2 create new relation cache entry */ + + /* new tuple descriptor has TID column */ + + switch (vci_reltype) + { + /* WOS */ + case VCI_RELTYPE_WOS: + natts = 2; + new_tupdesc = CreateTemplateTupleDesc(natts); /* no Oid */ + TupleDescInitEntry(new_tupdesc, (AttrNumber) 1, "original_tid", TIDOID, -1, 0); + TupleDescInitEntry(new_tupdesc, (AttrNumber) 2, "xid", INT8OID, -1, 0); + break; + + /* ROS */ + case VCI_RELTYPE_ROS: + natts = 1; + new_tupdesc = CreateTemplateTupleDesc(natts); /* no Oid */ + TupleDescInitEntry(new_tupdesc, (AttrNumber) 1, "bindata", BYTEAOID, -1, 0); /* */ + break; + + /* TID-CRID */ + case VCI_RELTYPE_TIDCRID: + natts = 1; + new_tupdesc = CreateTemplateTupleDesc(natts); /* no Oid */ + TupleDescInitEntry(new_tupdesc, (AttrNumber) 1, "bindata", BYTEAOID, -1, 0); /* */ + break; + + /* LCOV_EXCL_START */ + default: + elog(ERROR, "unexpected vci_reltype"); + break; + /* LCOV_EXCL_STOP */ + } + + /* + * Create the relcache entry (mostly dummy at this point) and the physical + * disk file. (If we fail further down, it's the smgr's responsibility to + * remove the disk file again.) 
+ */ + new_rel = RelationBuildLocalRelation(relname, + relnamespace, + new_tupdesc, + new_oid, + accessmtd, + new_oid, /* relfilenumber */ + reltablespace, + false, /* shared_relation */ + false, /* mapped_relation */ + relpersistence, + relkind); + + /* 4.6.1.3 create new storage for new relation */ + RelationCreateStorage(new_rel->rd_locator, relpersistence, true); + + Assert(new_oid == RelationGetRelid(new_rel)); + + /* 4.6.1.4 add new entry into pg_class */ + new_rel_reltup = new_rel->rd_rel; + new_rel_reltup->relpages = 0; + new_rel_reltup->reltuples = -1; + new_rel_reltup->relallvisible = 0; + new_rel_reltup->relfrozenxid = RecentXmin; + new_rel_reltup->relminmxid = GetOldestMultiXactId(); + new_rel_reltup->relowner = ownerid; + new_rel_reltup->reltype = new_type_oid; + new_rel_reltup->reloftype = reloftypeid; + + /* + * Flag the VCI internal relation MATVIEW as already populated. + * + * Users are not supposed to be querying these internal relations, but + * just in case they do, setting 'relispopulated' prevents an error saying + * the view has not been populated, hinting a "REFRESH MATERIALIZED VIEW" + * is needed. That hint only causes confusion, since the REFRESH is + * disallowed for VCI internal relations. + */ + new_rel_reltup->relispopulated = true; + + /* + * @see + * https://www.postgresql.jp/document/9.4/html/catalog-pg-rewrite.html + */ + new_rel_reltup->relhasrules = true; + + new_rel->rd_att->tdtypeid = new_type_oid; + + InsertPgClassTuple(pg_class, new_rel, new_oid, (Datum) 0, (Datum) 0); + + /* + * 4.6.1.5 -now add tuples to pg_attribute for the attributes in our new + * relation. + */ + + /* + * open pg_attribute and its indexes. + */ + pg_attr = table_open(AttributeRelationId, RowExclusiveLock); + indstate = CatalogOpenIndexes(pg_attr); + + /* + * First we add the user attributes. This is also a convenient place to + * add dependencies on their datatypes and collations. 
+ */ + for (int i = 0; i < natts; i++) + { + Form_pg_attribute attrs; + + /* [TODO] Make sure these are OK? */ + new_tupdesc->compact_attrs[i].attcacheoff = -1; + attrs = TupleDescAttr(new_tupdesc, i); + attrs->attstorage = TYPSTORAGE_PLAIN; + attrs->attcompression = InvalidCompressionMethod; + } + InsertPgAttributeTuples(pg_attr, new_tupdesc, new_oid, NULL, indstate); + + /* + * clean up pg_attribute + */ + CatalogCloseIndexes(indstate); + table_close(pg_attr, RowExclusiveLock); + + /* + * VCI internal relations are dependent on the parent index. + */ + ObjectAddressSet(oaIndex, RelationRelationId, indexRel->rd_id); + ObjectAddressSet(oaNewRel, RelationRelationId, new_oid); + recordDependencyOn(&oaNewRel, &oaIndex, DEPENDENCY_INTERNAL); + + table_close(new_rel, NoLock); /* do not unlock till end of xact */ + table_close(pg_class, RowExclusiveLock); + + return new_oid; +} + +static char * +GenRelName(Relation rel, int16 columnId, char suffix) +{ + snprintf(relNameBuf, NAMEDATALEN, VCI_INTERNAL_RELATION_TEMPLATE, RelationGetRelid(rel), + (0xFFFF & columnId), suffix); + + return relNameBuf; +} + +static void +CheckIndexedRelationKind(Relation rel) +{ + if (rel->rd_rel->relkind == RELKIND_MATVIEW) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not support index on materialized view", VCI_STRING))); + + if (rel->rd_rel->relpersistence == RELPERSISTENCE_TEMP) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not support index on temporary table", VCI_STRING))); +} + +static void +CheckIndexInfo(IndexInfo *indexInfo, Relation indexRel) +{ + int i = 0; + + /* check Concurrent option first. 
*/ + if (indexInfo->ii_Concurrent) + /* LCOV_EXCL_START */ + elog(PANIC, "should not reach here"); + /* LCOV_EXCL_STOP */ + + if (indexInfo->ii_Predicate != NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not support partial-index", VCI_STRING))); + + if (indexInfo->ii_Expressions != NIL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not support to CREATE INDEX on the expression", VCI_STRING))); + + if (indexInfo->ii_ExclusionOps != NULL || + indexInfo->ii_ExclusionProcs != NULL || + indexInfo->ii_ExclusionStrats != NULL) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not support EXCLUDE clause", VCI_STRING))); + + for (i = 0; i < indexInfo->ii_NumIndexAttrs; i++) + { + AttrNumber an = indexInfo->ii_IndexAttrNumbers[i]; + int j; + + for (j = i + 1; j < indexInfo->ii_NumIndexAttrs; j++) + { + TupleDesc tupdesc = RelationGetDescr(indexRel); + + if (an == indexInfo->ii_IndexAttrNumbers[j]) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("duplicated columns in vci index creation: %s", + NameStr(TupleDescAttr(tupdesc, an - 1)->attname)), + errhint("duplicated columns are specified"))); + } + } +} + +static void +CheckIndexColumnTypes(TupleDesc tupdesc, bool *isctid) +{ + int i; + + *isctid = false; + + for (i = 0; i < tupdesc->natts; i++) + { + Oid typeoid = TupleDescAttr(tupdesc, i)->atttypid; + + /* + * In general, the type 'tid' is not supported. However, 'ctid' column + * (that is exist in all tables) is accepted as a dummy column. In + * this case, the real columns should be registered in the + * 'vci_column_ids' option. 
+ */ + if (!vci_is_supported_type(typeoid)) + { + if (strcmp(NameStr(TupleDescAttr(tupdesc, i)->attname), "ctid") != 0) + { + HeapTuple tuple; + Form_pg_type typetuple; + + tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(typeoid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for type %u", typeoid); + + typetuple = (Form_pg_type) GETSTRUCT(tuple); + + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("data type %s is not supported for access method \"%s\"", + NameStr(typetuple->typname), VCI_STRING))); + + ReleaseSysCache(tuple); + } + else if (tupdesc->natts != 1) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot use \"ctid\" column with other columns"))); + } + *isctid = true; + } + } +} + +/* TODO - is this function needed? */ +static void +CheckColumnReloptions(Relation indexRel, bool isctid) +{ + char *ids = NULL; + bool hasoption = false; + + if (hasoption) + ereport(DEBUG2, + (errmsg_internal("vci_column_ids: %s", ids))); + + if (isctid == hasoption) + return; + else if (isctid && !hasoption) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("The \"vci_column_ids\" option is required when \"ctid\" column is specified"))); + else if (!isctid && hasoption) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("The \"vci_column_ids\" option cannot be used without \"ctid\" column"))); + else + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg_internal("unrecognized state in vci_inner_build: isctid = %d, ids = %p", isctid, ids))); +} + +/* + * get_tuple_desc_for_build -- create TupleDesc for build. + * + * The VCI can be created by 2 interfaces. The first interface is the same to + * an ordinal index (Ex. CREATE INDEX idx ON table USING vci(c1, c2)). The + * second interface is by the original function 'vci_create' (SELECT vci_create + * ('idx', 'table', ARRAY['c1', 'c2'])). 
It generates such SQL as 'CREATE + * INDEX idx ON table USING vci(ctid) WITH (vci_column_ids = '1,2')'. The + * following code distinguishes these 2 cases. + * + * XXX - function vci_create is not implemented by this OSS patch, so this + * code may be able to be further simplified. + */ +static TupleDesc +get_tuple_desc_for_build(Relation heapRel, Relation indexRel, bool isctid) +{ + if (isctid) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("\"ctid\" column is specified"))); + + return RelationGetDescr(indexRel); +} + +static void +vci_modify_column_information(bool isctid, Relation indexRel, Relation heapRel) +{ + if (!isctid) + return; + + /* + * XXX. The code which previously existed below here is now removed. It + * relied on vci_MakeDroppedColumnBitmap which asserted + * vci_IsExtendedToMoreThan32Columns, and that is no longer possible since + * the vci_create() function is not supported by this OSS patch. As a + * result this function currently does nothing in either case. + */ +} + +static IndexBuildResult * +vci_inner_build(Relation heapRel, Relation indexRel, IndexInfo *indexInfo) +{ + IndexBuildResult *result; + Oid oid; + + vci_MainRelHeaderInfo *vmr_info; + + int i; + TupleDesc tupdesc; + bool isctid; + + /* for checking type after getting 'real' TupleDesc. 
*/ + bool dummy_isctid; + + uint32 offsetToExtentInfo; + + double reltuples = -1; + + CheckIndexedRelationKind(heapRel); + CheckIndexInfo(indexInfo, indexRel); + CheckIndexColumnTypes(RelationGetDescr(indexRel), &isctid); + CheckColumnReloptions(indexRel, isctid); + + vci_modify_column_information(isctid, indexRel, heapRel); + + /* create VCI main relation */ + vmr_info = (vci_MainRelHeaderInfo *) palloc0(sizeof(vci_MainRelHeaderInfo)); + vci_InitMainRelHeaderInfo(vmr_info, indexRel, vci_rc_wos_ros_conv_build); + + if (RelationGetNumberOfBlocks(indexRel) != 0) + elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(indexRel)); + + /* create blank page * VCI_NUM_MAIN_REL_HEADER_PAGES */ + vci_PreparePagesWithOneItemIfNecessary(indexRel, + lengthof(vmr_info->buffer) - 1); + + vci_KeepMainRelHeaderWithoutVersionCheck(vmr_info); + + /* write ROS format version */ + vci_SetMainRelVar(vmr_info, vcimrv_ros_version_major, 0, + VCI_ROS_VERSION_MAJOR); + vci_SetMainRelVar(vmr_info, vcimrv_ros_version_minor, 0, + VCI_ROS_VERSION_MINOR); + + /* create WOS relations */ + /* register WOS relation's OID to VCI Main relation */ + + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_DATA_WOS, VCI_RELTYPE_DATA), indexRel, indexInfo, VCI_RELTYPE_WOS); + vci_SetMainRelVar(vmr_info, vcimrv_data_wos_oid, 0, oid); + + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_WHITEOUT_WOS, VCI_RELTYPE_DATA), indexRel, indexInfo, VCI_RELTYPE_WOS); + vci_SetMainRelVar(vmr_info, vcimrv_whiteout_wos_oid, 0, oid); + + /* create ROS relations */ + + /* TID */ + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_TID, VCI_RELTYPE_DATA), indexRel, indexInfo, VCI_RELTYPE_ROS); + vci_SetMainRelVar(vmr_info, vcimrv_tid_data_oid, 0, oid); + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_TID, VCI_RELTYPE_META), indexRel, indexInfo, VCI_RELTYPE_ROS); + vci_SetMainRelVar(vmr_info, vcimrv_tid_meta_oid, 0, oid); + + /* NUll */ + oid = 
vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_NULL, VCI_RELTYPE_DATA), indexRel, indexInfo, VCI_RELTYPE_ROS); + vci_SetMainRelVar(vmr_info, vcimrv_null_data_oid, 0, oid); + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_NULL, VCI_RELTYPE_META), indexRel, indexInfo, VCI_RELTYPE_ROS); + vci_SetMainRelVar(vmr_info, vcimrv_null_meta_oid, 0, oid); + + /* Delete Vector */ + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_DELETE, VCI_RELTYPE_DATA), indexRel, indexInfo, VCI_RELTYPE_ROS); + vci_SetMainRelVar(vmr_info, vcimrv_delete_data_oid, 0, oid); + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_DELETE, VCI_RELTYPE_META), indexRel, indexInfo, VCI_RELTYPE_ROS); + vci_SetMainRelVar(vmr_info, vcimrv_delete_meta_oid, 0, oid); + + /* Column Stores */ + tupdesc = get_tuple_desc_for_build(heapRel, indexRel, isctid); + CheckIndexColumnTypes(tupdesc, &dummy_isctid); + + /* + * When using 'vci_create', PostgreSQL registers only a 'ctid' column as + * as a dependency object. So self registration is required in such case. + * + * Note: A tupdesc->attrs[i]->attnum doesn't point an attribute number of + * the heap but is a sequential number in the index. 
+ */ + if (isctid) + { + ObjectAddress myself, + referenced; + TupleDesc heapTupleDesc; + ObjectAddresses *addrs; + + heapTupleDesc = RelationGetDescr(heapRel); + + addrs = new_object_addresses(); + + ObjectAddressSet(myself, RelationRelationId, RelationGetRelid(indexRel)); + ObjectAddressSet(referenced, RelationRelationId, RelationGetRelid(heapRel)); + + for (i = 0; i < tupdesc->natts; i++) + { + referenced.objectSubId = vci_GetAttNum(heapTupleDesc, + NameStr(TupleDescAttr(tupdesc, i)->attname)); + + add_exact_object_address(&referenced, addrs); + } + + record_object_address_dependencies(&myself, addrs, DEPENDENCY_AUTO); + free_object_addresses(addrs); + } + + vci_SetMainRelVar(vmr_info, vcimrv_num_columns, 0, tupdesc->natts); + for (i = 0; i < tupdesc->natts; i++) + { + Oid column_store_oid; + Oid column_meta_oid; + vcis_m_column_t *columnPointer; + + column_store_oid = vci_create_relation(GenRelName(indexRel, i, VCI_RELTYPE_DATA), indexRel, indexInfo, VCI_RELTYPE_ROS); + column_meta_oid = vci_create_relation(GenRelName(indexRel, i, VCI_RELTYPE_META), indexRel, indexInfo, VCI_RELTYPE_ROS); + + /* set ROS column pointer, */ + columnPointer = vci_GetMColumn(vmr_info, i); + + columnPointer->meta_oid = column_meta_oid; + columnPointer->data_oid = column_store_oid; + columnPointer->max_columns_size = vci_GetColumnWorstSize(TupleDescAttr(tupdesc, i)); + if (TupleDescAttr(tupdesc, i)->attlen == -1) + { + columnPointer->comp_type = vcis_compression_type_variable_raw; + } + else if (TupleDescAttr(tupdesc, i)->attlen > 0) + { + columnPointer->comp_type = vcis_compression_type_fixed_raw; + } + else + { + Assert(false); + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("unexpected attribute length"))); + } + /* put default extent(free_page) to each columns */ + } + vci_SetMainRelVar(vmr_info, vcimrv_num_nullable_columns, 0, + vci_GetNumberOfNullableColumn(tupdesc)); + vci_SetMainRelVar(vmr_info, vcimrv_null_width_in_byte, 0, + 
(vci_GetNumberOfNullableColumn(tupdesc) + BITS_PER_BYTE - 1) / + BITS_PER_BYTE); + + /* create TID-CRID relations */ + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_TID_CRID, VCI_RELTYPE_META), indexRel, indexInfo, VCI_RELTYPE_TIDCRID); + vci_SetMainRelVar(vmr_info, vcimrv_tid_crid_meta_oid, 0, oid); + + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_TID_CRID, VCI_RELTYPE_DATA), indexRel, indexInfo, VCI_RELTYPE_TIDCRID); + vci_SetMainRelVar(vmr_info, vcimrv_tid_crid_data_oid, 0, oid); + + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_TID_CRID_UPDATE, '0'), indexRel, indexInfo, VCI_RELTYPE_TIDCRID); + vci_SetMainRelVar(vmr_info, vcimrv_tid_crid_update_oid_0, 0, oid); + + oid = vci_create_relation(GenRelName(indexRel, VCI_COLUMN_ID_TID_CRID_UPDATE, '1'), indexRel, indexInfo, VCI_RELTYPE_TIDCRID); + vci_SetMainRelVar(vmr_info, vcimrv_tid_crid_update_oid_1, 0, oid); + + /* other variables */ + vci_SetMainRelVar(vmr_info, vcimrv_column_info_offset, 0, vcimrv_column_info - VCI_MIN_PAGE_HEADER); + + offsetToExtentInfo = (vci_MRVGetBlockNumber(vcimrv_extent_info) * VCI_MAX_PAGE_SPACE) + + vci_MRVGetOffset(vcimrv_extent_info) - VCI_MIN_PAGE_HEADER; + + vci_SetMainRelVar(vmr_info, vcimrv_extent_info_offset, 0, offsetToExtentInfo); + vci_SetMainRelVar(vmr_info, vcimrv_size_mr, 0, offsetToExtentInfo); + vci_SetMainRelVar(vmr_info, vcimrv_size_mr_old, 0, offsetToExtentInfo); + + vci_SetMainRelVar(vmr_info, vcimrv_current_ros_version, 0, FrozenTransactionId); + vci_SetMainRelVar(vmr_info, vcimrv_last_ros_version, 0, FrozenTransactionId); + vci_SetMainRelVar(vmr_info, vcimrv_tid_crid_diff_sel, 0, 0); + vci_SetMainRelVar(vmr_info, vcimrv_tid_crid_diff_sel_old, 0, 0); + + vci_SetMainRelVar(vmr_info, vcimrv_xid_generation, 0, 1); /* xid generation starts + * from 1 */ + vci_SetMainRelVar(vmr_info, vcimrv_xid_gen_update_xid, 0, GetCurrentTransactionId()); + + vci_SetMainRelVar(vmr_info, vcimrv_ros_command, 0, vci_rc_invalid); + 
vci_SetMainRelVar(vmr_info, vcimrv_num_unterminated_copy_cmd, 0, 0); + + vci_SetMainRelVar(vmr_info, vcimrv_num_extents, 0, 0); + vci_SetMainRelVar(vmr_info, vcimrv_num_extents_old, 0, 0); + + /* flush */ + vci_WriteMainRelVar(vmr_info, vci_wmrv_all); + + /* initialize meta data relations and data relations */ + vci_InitializeColumnRelations(vmr_info, tupdesc, heapRel); + + /* initialize meta data relations and data relations */ + vci_InitializeTidCridUpdateLists(vmr_info); + vci_InitializeTidCridTree(vmr_info); + + /* unlock */ + vci_ReleaseMainRelHeader(vmr_info); + pfree(vmr_info); + + /* convert data in the relations */ + if (vcirc_truncate != vci_rebuild_command && + indexRel->rd_rel->relpersistence != RELPERSISTENCE_UNLOGGED) + reltuples = vci_ConvertWos2RosForBuild(indexRel, + VciGuc.maintenance_work_mem * (Size) 1024, indexInfo); + + /* + * create statistics for return to caller + */ + result = (IndexBuildResult *) palloc0(sizeof(IndexBuildResult)); + result->heap_tuples = reltuples; + result->index_tuples = -1; + + return result; +} + +/* + * Put or Copy page into INIT_FORK. + * If valid page is given, that page will be put into INIT_FORK. + * If invalid page (NULL pointer) is given, MAIN_FORK page will be copied. + */ +static void +vci_putInitPage(Oid oid, Page page, BlockNumber blkno) +{ + Relation rel; + Page pageCopyFrom; + Buffer buffer = InvalidBuffer; + + rel = relation_open(oid, AccessExclusiveLock); + + /* + * If there is no INIT_FORK, create it. VCI Main Relation may have, but + * others may not have. 
+ */ + + if (!smgrexists(RelationGetSmgr(rel), INIT_FORKNUM)) + smgrcreate(RelationGetSmgr(rel), INIT_FORKNUM, false); + + pageCopyFrom = page; + + /* No page supplied: copy block blkno of the relation itself. */ + if (pageCopyFrom == NULL) + { + buffer = ReadBuffer(rel, blkno); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + pageCopyFrom = BufferGetPage(buffer); + } + + /* + * NOTE(review): when page is NULL, pageCopyFrom points into a buffer that + * is only share-locked; confirm that setting the checksum in place is + * safe here. + */ + PageSetChecksumInplace(pageCopyFrom, blkno); + smgrwrite(RelationGetSmgr(rel), INIT_FORKNUM, blkno, + (char *) pageCopyFrom, true); + + /* + * Go through RelationGetSmgr() instead of reading rel->rd_smgr directly, + * as the other smgr calls in this function do. + */ + if (XLogIsNeeded()) + log_newpage(&RelationGetSmgr(rel)->smgr_rlocator.locator, INIT_FORKNUM, + blkno, pageCopyFrom, false); + + smgrimmedsync(RelationGetSmgr(rel), INIT_FORKNUM); + + if (buffer != InvalidBuffer) + UnlockReleaseBuffer(buffer); + relation_close(rel, AccessExclusiveLock); +} + +static void +vci_inner_buildempty(Relation indexRel) +{ + Oid oid; + Page tmpPage; + BlockNumber blkno; + TupleDesc itupDesc; + int attn; + + IndexInfo *indexInfo; + + vci_MainRelHeaderInfo vmr_infoData; + vci_MainRelHeaderInfo *vmr_info = &vmr_infoData; + + Relation heapRel; + bool isctid; + + /* for checking type after getting 'real' TupleDesc. */ + bool dummy_isctid; + + CheckIndexColumnTypes(RelationGetDescr(indexRel), &isctid); + + /* create VCI main relation */ + vci_InitMainRelHeaderInfo(vmr_info, indexRel, vci_rc_wos_ros_conv_build); + vci_KeepMainRelHeader(vmr_info); + + /* + * WOS relation : a blank page is put again, because the ambuild data + * might have been inserted in WOS. (it may be OK, WOS can be assumed heap + * relation.) 
+ */ + + tmpPage = (Page) palloc(BLCKSZ); + PageInit(tmpPage, BLCKSZ, 0); + + oid = vci_GetMainRelVar(vmr_info, vcimrv_data_wos_oid, 0); + vci_putInitPage(oid, tmpPage, 0); + oid = vci_GetMainRelVar(vmr_info, vcimrv_whiteout_wos_oid, 0); + vci_putInitPage(oid, tmpPage, 0); + + pfree(tmpPage); + + oid = vci_GetMainRelVar(vmr_info, vcimrv_tid_meta_oid, 0); + vci_putInitPage(oid, NULL, 0); + oid = vci_GetMainRelVar(vmr_info, vcimrv_null_meta_oid, 0); + vci_putInitPage(oid, NULL, 0); + oid = vci_GetMainRelVar(vmr_info, vcimrv_delete_meta_oid, 0); + vci_putInitPage(oid, NULL, 0); + + oid = vci_GetMainRelVar(vmr_info, vcimrv_tid_data_oid, 0); + vci_putInitPage(oid, NULL, 0); + oid = vci_GetMainRelVar(vmr_info, vcimrv_null_data_oid, 0); + vci_putInitPage(oid, NULL, 0); + oid = vci_GetMainRelVar(vmr_info, vcimrv_delete_data_oid, 0); + vci_putInitPage(oid, NULL, 0); + + /* column store */ + heapRel = table_open(indexRel->rd_index->indrelid, AccessShareLock); + itupDesc = get_tuple_desc_for_build(heapRel, indexRel, isctid); + table_close(heapRel, AccessShareLock); + + CheckIndexColumnTypes(itupDesc, &dummy_isctid); + + for (attn = 0; attn < itupDesc->natts; attn++) + { + /* get ROS column pointer, */ + vcis_m_column_t *columnPointer; + + columnPointer = vci_GetMColumn(vmr_info, attn); + + vci_putInitPage(columnPointer->meta_oid, NULL, 0); + vci_putInitPage(columnPointer->data_oid, NULL, 0); + } + + oid = vci_GetMainRelVar(vmr_info, vcimrv_tid_crid_meta_oid, 0); + vci_putInitPage(oid, NULL, 0); + oid = vci_GetMainRelVar(vmr_info, vcimrv_tid_crid_data_oid, 0); + vci_putInitPage(oid, NULL, 0); + oid = vci_GetMainRelVar(vmr_info, vcimrv_tid_crid_update_oid_0, 0); + vci_putInitPage(oid, NULL, 0); + oid = vci_GetMainRelVar(vmr_info, vcimrv_tid_crid_update_oid_1, 0); + vci_putInitPage(oid, NULL, 0); + /* Copy default content into VCI Main rel INIT_FORK */ + oid = indexRel->rd_id; + for (blkno = 0; blkno < lengthof(vmr_info->buffer); blkno++) + { + vci_putInitPage(oid, NULL, blkno); 
+ } + + vci_ReleaseMainRelHeader(vmr_info); + + if (vcirc_truncate != vci_rebuild_command) + { + /* extract index key information from the index's pg_index info */ + indexInfo = BuildIndexInfo(indexRel); + vci_ConvertWos2RosForBuild(indexRel, + VciGuc.maintenance_work_mem * (Size) 1024, indexInfo); + } +} + +/* LCOV_EXCL_START */ +void +vci_set_copy_transaction_and_command_id(TransactionId xid, CommandId cid) +{ + Assert(NULL == copyInfo.extentList); + Assert(0 == copyInfo.numAllocatedExtent); + copyInfo.xid = xid; + copyInfo.cid = cid; + copyInfo.numAppendedRows = 0; + copyInfo.extentList = NULL; + copyInfo.numFilledExtent = 0; + copyInfo.numAllocatedExtent = 0; +} + +/* LCOV_EXCL_STOP */ + +static bool +vci_inner_insert(Relation indexRel, ItemPointer heap_tid) +{ + TransactionId xid = GetCurrentTransactionId(); + TupleDesc tdesc; + HeapTuple htup; + int options = 0; + + Oid data_wos_oid; + Relation data_wos_rel; + + Datum new_values[2]; + bool new_isnull[2]; + + vci_MainRelHeaderInfo *vmr_info; + + /* get Data WOS relation from vci main rel */ + vmr_info = (vci_MainRelHeaderInfo *) palloc0(sizeof(vci_MainRelHeaderInfo)); + vci_InitMainRelHeaderInfo(vmr_info, indexRel, vci_rc_wos_insert); + vci_KeepMainRelHeader(vmr_info); + data_wos_oid = (Oid) vci_GetMainRelVar(vmr_info, vcimrv_data_wos_oid, 0); + + data_wos_rel = table_open(data_wos_oid, RowExclusiveLock); + + /* get tuple desc */ + tdesc = RelationGetDescr(data_wos_rel); /* */ + + /* create new tuple for insert */ + new_values[0] = ItemPointerGetDatum(heap_tid); + new_values[1] = Int64GetDatum(vci_GenerateXid64(xid, vmr_info)); + new_isnull[0] = false; + new_isnull[1] = false; + htup = heap_form_tuple(tdesc, new_values, new_isnull); + + /* insert (+ WAL) */ + + if (copy_with_freeze_option) + options |= HEAP_INSERT_FROZEN; + + heap_insert(data_wos_rel, htup, GetCurrentCommandId(true), options, NULL); + + heap_freetuple(htup); + table_close(data_wos_rel, RowExclusiveLock); + + /* unlock */ + 
vci_ReleaseMainRelHeader(vmr_info); + + return false; +} + +/* LCOV_EXCL_START */ +static void +WriteOneExtentForCopy(Relation indexRel) +{ + const LOCKMODE lockmode = ShareUpdateExclusiveLock; + + LockRelation(indexRel, lockmode); + vci_InitMainRelHeaderInfo(&(copyConvContext.info), + indexRel, vci_rc_copy_command); + vci_KeepMainRelHeader(&(copyConvContext.info)); + /* obtain target extent ID */ + copyConvContext.extentId = vci_GetFreeExtentId(&(copyConvContext.info)); + if (copyInfo.numAllocatedExtent <= copyInfo.numFilledExtent) + { + copyInfo.numAllocatedExtent += EXTENT_LIST_UNIT_EXTENSION; + copyInfo.extentList = repalloc(copyInfo.extentList, sizeof(uint32) * copyInfo.numAllocatedExtent); + } + copyInfo.extentList[++(copyInfo.numFilledExtent)] = + copyConvContext.extentId; + + /* write one extent into ROS */ + vci_AddTidCridUpdateList(&(copyConvContext.info), + &(copyConvContext.storage), + copyConvContext.extentId); + vci_WriteOneExtent(&(copyConvContext.info), + &(copyConvContext.storage), + copyConvContext.extentId, + InvalidTransactionId, + copyConvContext.xid, + copyConvContext.xid); + /* write header of the main relation */ + vci_WriteMainRelVar(&(copyConvContext.info), + vci_wmrv_update); + UnlockRelation(indexRel, lockmode); + vci_ReleaseMainRelInCommandContext(©ConvContext); + + vci_ResetRosChunkStorage(&(copyConvContext.storage)); +} + +static bool +vci_inner_insert_in_copy(Relation indexRel, ItemPointer heap_tid) +{ + vci_MainRelHeaderInfo *vmr_info = &(copyConvContext.info); + + if (0 == copyInfo.numAppendedRows) + { + uint32 val; + + vci_InitRosCommandContext0(©ConvContext, indexRel, + vci_rc_copy_command); + vci_RecoverOneVCIIfNecessary(vmr_info); + + vci_InitRosCommandContext1(©ConvContext, + VciGuc.maintenance_work_mem * INT64CONST(1024), + VCI_NUM_ROWS_IN_EXTENT, 0, + false); + vci_ResetRosChunkStorage(&(copyConvContext.storage)); + + vci_WriteExtentInfoInMainRosForCopyInit(vmr_info, + copyConvContext.extentId, + copyConvContext.xid); + + /* 
increment number of copy commands */ + val = vci_GetMainRelVar(vmr_info, vcimrv_num_unterminated_copy_cmd, 0); + ++val; + vci_SetMainRelVar(vmr_info, vcimrv_num_unterminated_copy_cmd, 0, val); + + vci_SetMainRelVar(vmr_info, vcimrv_ros_command, 0, vci_rc_copy_command); + + /* flush */ + vci_WriteMainRelVar(vmr_info, vci_wmrv_update); + + /* unlock */ + vci_ReleaseMainRelInCommandContext(©ConvContext); + + /* close heap relation */ + vci_CloseHeapRelInCommandContext(©ConvContext); + } + + { + Relation rel = table_open(copyConvContext.heapOid, AccessShareLock); + Buffer buffer = ReadBuffer(rel, ItemPointerGetBlockNumber(heap_tid)); + Page page = BufferGetPage(buffer); + ItemId lp = PageGetItemId(page, ItemPointerGetOffsetNumber(heap_tid)); + HeapTupleData tupleData; + HeapTuple tuple = &tupleData; + + Assert(ItemIdIsNormal(lp)); + + tuple->t_data = (HeapTupleHeader) PageGetItem(page, lp); + tuple->t_len = ItemIdGetLength(lp); + tuple->t_tableOid = RelationGetRelid(rel); + tuple->t_self = *heap_tid; + + vci_FillOneRowInRosChunkBuffer(&(copyConvContext.buffer), + &(copyConvContext.info), + &tuple->t_self, + tuple, + copyConvContext.indxColumnIdList, + copyConvContext.heapAttrNumList, + vci_GetTupleDescr(vmr_info)); + + if (copyConvContext.buffer.numRowsAtOnce <= copyConvContext.buffer.numFilled) + vci_RegisterChunkBuffer(&(copyConvContext.storage), + &(copyConvContext.buffer)); + + if (copyConvContext.numRowsToConvert <= copyConvContext.storage.numTotalRows) + { + Assert(copyConvContext.numRowsToConvert == copyConvContext.storage.numTotalRows); + WriteOneExtentForCopy(indexRel); + } + + table_close(rel, AccessShareLock); + } + + return false; +} + +void +vci_FinalizeCopyCommand(void) +{ + if (0 < copyConvContext.storage.numTotalRows) + { + Relation rel = table_open(copyConvContext.indexOid, RowExclusiveLock); + + WriteOneExtentForCopy(rel); + table_close(rel, RowExclusiveLock); + } + + vci_FinRosCommandContext(©ConvContext, false); +} + +static IndexBulkDeleteResult * 
+vci_inner_vacuumcleanup(IndexVacuumInfo *info, + IndexBulkDeleteResult *stats) +{ + elog(DEBUG2, "%s is called.", __func__); + + LockRelation(info->index, ShareUpdateExclusiveLock); + + vci_VacuumRos(info->index, info); + + UnlockRelation(info->index, ShareUpdateExclusiveLock); + + return NULL; +} + +/* LCOV_EXCL_STOP */ + +/** + * vci_add_index_delete + */ +void +vci_add_index_delete(Relation heapRel, ItemPointer heap_tid, TransactionId xmin) +{ + List *indexoidlist; + ListCell *l; + + /* Fast path if definitely no indexes */ + if (!RelationGetForm(heapRel)->relhasindex) + return; + + /* + * Get cached list of index OIDs + */ + indexoidlist = RelationGetIndexList(heapRel); + + /* Iterate for indexes */ + foreach(l, indexoidlist) + { + Oid indexOid = lfirst_oid(l); + Relation indexRel; + + Oid whiteoutWosOid; + Relation whiteoutWOSRel; + Datum new_values[2]; + bool new_isnull[2]; + HeapTuple htup; + TupleDesc tdesc; + + vci_MainRelHeaderInfo vmr_info_data; + vci_MainRelHeaderInfo *vmr_info = &vmr_info_data; + + TransactionId xid; + + /* Skip if Index is NOT VCI index */ + indexRel = index_open(indexOid, RowExclusiveLock); + if (!isVciIndexRelation(indexRel)) + { + index_close(indexRel, RowExclusiveLock); + continue; + } + + if (!fullPageWrites) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not work under full_page_writes=off", VCI_STRING), + errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(indexRel)))); + + vci_InitMainRelHeaderInfo(vmr_info, indexRel, vci_rc_wos_delete); + vci_KeepMainRelHeader(vmr_info); + + /* Open Whiteout WOS */ + whiteoutWosOid = (Oid) vci_GetMainRelVar(vmr_info, vcimrv_whiteout_wos_oid, 0); + whiteoutWOSRel = table_open(whiteoutWosOid, RowExclusiveLock); + + tdesc = RelationGetDescr(whiteoutWOSRel); + + /* @see generateXidDiff() in vci_ros_command.c */ + if (!TransactionIdEquals(xmin, FrozenTransactionId)) + xid = xmin; + else + xid = GetCurrentTransactionId(); + + /* create new 
tuple for insert */ + new_values[0] = ItemPointerGetDatum(heap_tid); + new_values[1] = Int64GetDatum(vci_GenerateXid64(xid, vmr_info)); + new_isnull[0] = false; + new_isnull[1] = false; + + htup = heap_form_tuple(tdesc, new_values, new_isnull); + + /* insert TID into Whiteout WOS */ + simple_heap_insert(whiteoutWOSRel, htup); + heap_freetuple(htup); + table_close(whiteoutWOSRel, RowExclusiveLock); + + /* flush & unlock */ + vci_ReleaseMainRelHeader(vmr_info); + + index_close(indexRel, RowExclusiveLock); + } + + list_free(indexoidlist); +} + +List * +vci_add_should_index_insert(ResultRelInfo *resultRelInfo, + TupleTableSlot *slot, + ItemPointer tupleid, + EState *estate) +{ + int i; + int numIndices; + RelationPtr relationDescs; + Relation heapRelation; + IndexInfo **indexInfoArray; + ExprContext *econtext; + Datum values[INDEX_MAX_KEYS]; + bool isnull[INDEX_MAX_KEYS]; + + /* + * Get information from the result relation info structure. + */ + numIndices = resultRelInfo->ri_NumIndices; + relationDescs = resultRelInfo->ri_IndexRelationDescs; + indexInfoArray = resultRelInfo->ri_IndexRelationInfo; + heapRelation = resultRelInfo->ri_RelationDesc; + + /* + * We will use the EState's per-tuple context for evaluating predicates + * and index expressions (creating it if it's not already there). 
+ */ + econtext = GetPerTupleExprContext(estate); + + /* Arrange for econtext's scan tuple to be the tuple under test */ + econtext->ecxt_scantuple = slot; + + /* + * for each index, form and insert the index tuple + */ + for (i = 0; i < numIndices; i++) + { + Relation indexRelation = relationDescs[i]; + IndexInfo *indexInfo; + + if (indexRelation == NULL) + continue; + + /* Skip if Index is NOT VCI index */ + if (!isVciIndexRelation(indexRelation)) + continue; + + indexInfo = indexInfoArray[i]; + + /* If the index is marked as read-only, ignore it */ + if (!indexInfo->ii_ReadyForInserts) + continue; + + /* Check for partial index */ + if (indexInfo->ii_Predicate != NIL) + { + ExprState *predicate; + + /* + * If predicate state not set up yet, create it (in the estate's + * per-query context) + */ + predicate = indexInfo->ii_PredicateState; + if (predicate == NULL) + { + predicate = ExecPrepareQual(indexInfo->ii_Predicate, estate); + indexInfo->ii_PredicateState = predicate; + } + + /* Skip this index-update if the predicate isn't satisfied */ + if (!ExecQual(predicate, econtext)) + continue; + } + + /* + * FormIndexDatum fills in its values and isnull parameters with the + * appropriate values for the column(s) of the index. + */ + FormIndexDatum(indexInfo, + slot, + estate, + values, + isnull); + + index_insert(indexRelation, /* index relation */ + values, /* array of index Datums */ + isnull, /* null flags */ + tupleid, /* tid of heap tuple */ + heapRelation, /* heap relation */ + UNIQUE_CHECK_NO, /* it is ignored in VCI */ + false, /* 'logically unchanged index' hint */ + indexInfo); /* index AM may need this */ + } + + return NIL; +} + +static bool +vci_add_drop_column(const ObjectAddress *object, int flags) +{ + Relation tableRel; + + if (vci_rebuild_command != vcirc_alter_table) + return false; + + Assert(object->objectSubId != 0); + + /* + * If object->objectSubId < 0, it means that the column is a system + * column. 
Such case occurs only when OID column is modified, but this is + * checked in other places. So simply skip in this place. + */ + if (object->objectSubId < 0) + return false; + + tableRel = relation_open(object->objectId, AccessExclusiveLock); + + if (tableRel->rd_rel->relkind != RELKIND_RELATION) + { + relation_close(tableRel, AccessExclusiveLock); + return false; + } + + relation_close(tableRel, AccessExclusiveLock); + + return false; +} + +bool +vci_add_drop_relation(const ObjectAddress *object, int flags) +{ + Relation rel; + Oid ruleId; + Oid oid = object->objectId; + char relKind = get_rel_relkind(oid); + bool concurrent = ((flags & PERFORM_DELETION_CONCURRENTLY) + == PERFORM_DELETION_CONCURRENTLY); + bool concurrent_lock_mode = ((flags & PERFORM_DELETION_CONCURRENT_LOCK) != 0); + vci_id_t vciid; + + if (object->objectSubId != 0) + return vci_add_drop_column(object, flags); + + if (relKind == RELKIND_INDEX) + { + rel = relation_open(oid, AccessExclusiveLock); + + if (!isVciIndexRelation(rel)) + { + relation_close(rel, NoLock); + return false; + } + relation_close(rel, NoLock); + + /* + * Deletion of VCI index by ALTER TABLE command is not supported + * + * Ereport only if the relation is vci main relation so that it does + * not give unnecessary messages. + * + * Return true when so that the post-processing does not continue. 
+ */ + if (vci_rebuild_command == vcirc_alter_table) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot alter table because the table is indexed by VCI"), + errhint("You must drop index \"%s\" before using this command.", RelationGetRelationName(rel)))); + } + + if (concurrent) + elog(PANIC, "should not reach here"); + + index_drop(oid, concurrent, concurrent_lock_mode); + + vciid.oid = oid; + vciid.dbid = MyDatabaseId; + vci_freeMemoryEntry(&vciid); + } + else + { + rel = relation_open(oid, AccessExclusiveLock); + + if (!vci_isVciAdditionalRelation(rel)) + { + relation_close(rel, NoLock); + return false; + } + + /* + * Deletion of VCI index by ALTER TABLE command is not supported + * + * Ereport only if the relation is vci main relation so that it does + * not give unneccesary messages. + * + * Return true when so that the post-processing does not continue. + */ + if (vci_rebuild_command == vcirc_alter_table) + { + relation_close(rel, NoLock); + return true; + } + + if (concurrent) + elog(PANIC, "should not reach here"); + + /* 2.1 Is relation used? 
*/ + CheckTableNotInUse(rel, "DROP TABLE"); + CheckTableForSerializableConflictIn(rel); + + ruleId = get_rewrite_oid(oid, rel->rd_rel->relname.data, true); + + /* 2.2 Drop relation storage */ + RelationDropStorage(rel); + + relation_close(rel, NoLock); + remove_on_commit_action(oid); + + /* 2.3 release relation cache */ + RelationForgetRelation(oid); + + /* 2.4 remove statistic info */ + RemoveStatistics(oid, 0); + + /* 2.5 remove pg_rewrite entry */ + if (OidIsValid(ruleId)) + RemoveRewriteRuleById(ruleId); + + /* 2.6 remove pg_attributes entry */ + DeleteAttributeTuples(oid); + + /* 2.7 remove pg_system entry */ + DeleteRelationTuple(oid); + + } + + return true; +} + +bool +vci_add_reindex_index(Relation indexRel) +{ + bool continue_after_return; + + /* if it is not VCI relation */ + if (!isVciIndexRelation(indexRel)) + continue_after_return = true; + + /* it is the VCI indexed relation */ + else + { + switch (vci_rebuild_command) + { + case vcirc_reindex: + /* called by the command REINDEX except REINDEX INDEX */ + continue_after_return = false; + break; + + case vcirc_alter_table: + + /* + * alter table for columns indexed by vci index, it is not + * work + */ + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot alter table because the table is indexed by VCI"), + errhint("You must drop index \"%s\" before using this command.", RelationGetRelationName(indexRel)))); + /* remaining work of reindex_index() must be cancelled */ + continue_after_return = false; + break; + + case vcirc_truncate: + + /* + * This is reindex_index called in truncation Command. In this + * case, before RelationSetNewRelfilenumber(indexRel,...) we + * must drop other relations for VCI. 
+ */ + /* vci_add_drop_index(indexRel->rd_id); */ + continue_after_return = true; + break; + + case vcirc_cluster: + case vcirc_vacuum_full: + /* called by the command CLUSTER or VACUUM FULL */ + continue_after_return = true; + break; + + default: + elog(ERROR, "unexpected vci_RebuildCommand"); + break; + } + } + + return continue_after_return; +} + +bool +vci_add_skip_vci_index(Relation indexRel) +{ + return isVciIndexRelation(indexRel); +} + +bool +vci_add_alter_tablespace(Relation indexRel) +{ + if (isVciIndexRelation(indexRel)) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("ALTER INDEX SET TABLESPACE is not supported for VCI"), + errhint("DROP INDEX and CREATE INDEX instead"))); + return true; + } + else + return false; +} + +static uint32 +GetNumberOfBlocksFromOid(Oid oid) +{ + uint32 result; + Relation rel = relation_open(oid, AccessShareLock); + + result = RelationGetNumberOfBlocks(rel); + relation_close(rel, AccessShareLock); + + return result; +} + +static int64 +GetNumDBPagesOfVCIElement(vcis_attribute_type_t attrType, + int index, + vci_MainRelHeaderInfo *info) +{ +#ifdef USE_ASSERT_CHECKING + int numColumns = vci_GetMainRelVar(info, vcimrv_num_columns, 0); +#endif /* #ifdef USE_ASSERT_CHECKING */ + Oid dataOid = InvalidOid; + Oid metaOid = InvalidOid; + int64 result = 0; + + Assert((0 <= attrType) && (attrType < num_vcis_attribute_type)); + Assert((0 <= index) && (index < vci_GetNumIndexForAttributeType(attrType, numColumns))); + switch (attrType) + { + case vcis_attribute_type_main: + return RelationGetNumberOfBlocks(info->rel); + case vcis_attribute_type_data_wos: + dataOid = vci_GetMainRelVar(info, vcimrv_data_wos_oid, 0); + break; + case vcis_attribute_type_whiteout_wos: + dataOid = vci_GetMainRelVar(info, vcimrv_whiteout_wos_oid, 0); + break; + case vcis_attribute_type_tid_crid: + dataOid = vci_GetMainRelVar(info, vcimrv_tid_crid_data_oid, 0); + metaOid = vci_GetMainRelVar(info, vcimrv_tid_crid_meta_oid, 0); + break; + 
case vcis_attribute_type_tid_crid_update: + dataOid = vci_GetMainRelVar(info, vcimrv_tid_crid_update_oid_0, index); + break; + case vcis_attribute_type_delete_vec: + dataOid = vci_GetMainRelVar(info, vcimrv_delete_data_oid, 0); + metaOid = vci_GetMainRelVar(info, vcimrv_delete_meta_oid, 0); + break; + case vcis_attribute_type_null_vec: + dataOid = vci_GetMainRelVar(info, vcimrv_null_data_oid, 0); + metaOid = vci_GetMainRelVar(info, vcimrv_null_meta_oid, 0); + break; + case vcis_attribute_type_tid: + dataOid = vci_GetMainRelVar(info, vcimrv_tid_data_oid, 0); + metaOid = vci_GetMainRelVar(info, vcimrv_tid_meta_oid, 0); + break; + case vcis_attribute_type_pgsql: + { + vcis_m_column_t *mColumn; + + mColumn = vci_GetMColumn(info, index); + dataOid = mColumn->data_oid; + metaOid = mColumn->meta_oid; + break; + } + default: + elog(ERROR, "internal error. invalid attribute type"); + } + + if (OidIsValid(dataOid)) + result += GetNumberOfBlocksFromOid(dataOid); + if (OidIsValid(metaOid)) + result += GetNumberOfBlocksFromOid(metaOid); + + return result; +} + +PG_FUNCTION_INFO_V1(vci_index_size); +Datum +vci_index_size(PG_FUNCTION_ARGS) +{ + Relation rel; + uint32 numColumns; + uint32 numEntries; + uint32 aId; + int64 result = 0; + vci_MainRelHeaderInfo infoData; + vci_MainRelHeaderInfo *info = &infoData; + LOCKMODE lockmode = AccessShareLock; + + text *relname = PG_GETARG_TEXT_P(0); + + if (PG_NARGS() != 1) + ereport(ERROR, + (errmsg("vci_index_size requires 1 argument"))); + + { + RangeVar *relrv; + + relrv = makeRangeVarFromNameList(textToQualifiedNameList(relname)); + rel = relation_openrv(relrv, lockmode); + if (!isVciIndexRelation(rel)) + ereport(ERROR, + (errcode(ERRCODE_WRONG_OBJECT_TYPE), + errmsg("only VCI index is supported"))); + } + + vci_InitMainRelHeaderInfo(info, rel, vci_rc_probe); + vci_KeepMainRelHeader(info); + numColumns = vci_GetMainRelVar(info, vcimrv_num_columns, 0); + numEntries = vci_GetSumOfAttributeIndices(numColumns); + + for (aId = 0; aId < 
numEntries; ++aId) + { + vcis_attribute_type_t attrType; + int index; + + vci_GetAttrTypeAndIndexFromSumOfIndices(&attrType, + &index, + numColumns, + aId); + result += GetNumDBPagesOfVCIElement(attrType, index, info); + } + + vci_ReleaseMainRelHeader(info); + relation_close(rel, lockmode); + + PG_RETURN_INT64(result * BLCKSZ); +} + +/* + * Process Utility Hook + */ + +void +vci_process_utility(PlannedStmt *pstmt, + const char *queryString, + bool readOnlyTree, + ProcessUtilityContext context, + ParamListInfo params, + QueryEnvironment *queryEnv, + DestReceiver *dest, + QueryCompletion *qc) +{ + bool creating_vci_extension = false; + volatile bool saved_vci_is_in_vci_create_extension; + + Node *parseTree = pstmt->utilityStmt; + + vci_check_prohibited_operation(parseTree, &creating_vci_extension); + + saved_vci_is_in_vci_create_extension = vci_is_in_vci_create_extension; + + if (creating_vci_extension) + vci_is_in_vci_create_extension = true; + + vci_rebuild_command = vcirc_invalid; + copy_with_freeze_option = false; + +#define UNUSE_COPY_INSERT + + switch (nodeTag(parseTree)) + { + /* check if the statement is a "COPY table FROM ..." 
statement */ + case T_CopyStmt: + { + CopyStmt *stmt; + ListCell *lc; + +#ifndef UNUSE_COPY_INSERT + TransactionId xid = GetCurrentTransactionId(); + CommandId cid = GetCurrentCommandId(false); + + Assert(TransactionIdIsValid(xid)); + Assert(InvalidCommandId != cid); + vci_set_copy_transaction_and_command_id(xid, cid); +#endif /* #ifndef UNUSE_COPY_INSERT */ + + stmt = (CopyStmt *) parseTree; + + foreach(lc, stmt->options) + { + DefElem *defel = (DefElem *) lfirst(lc); + + if (strcmp(defel->defname, "freeze") == 0) + { + if (defGetBoolean(defel)) + { + copy_with_freeze_option = true; + break; + } + } + } + } + break; + + /* check if the statement is a TRUNCATE for VCI Indexed table */ + case T_TruncateStmt: + vci_rebuild_command = vcirc_truncate; + break; + + /* check if the statement is a REINDEX for VCI Indexed table */ + case T_ReindexStmt: + vci_rebuild_command = vcirc_reindex; + break; + + /* check if the statement is a REINDEX for VCI Indexed table */ + case T_AlterTableStmt: + vci_rebuild_command = vcirc_alter_table; + break; + + /* check if the statement is a VACUUM for VCI Indexed table */ + case T_VacuumStmt: + vci_rebuild_command = vcirc_vacuum_full; + break; + + /* check if the statement is a CLUSTER for VCI Indexed table */ + case T_ClusterStmt: + vci_rebuild_command = vcirc_cluster; + break; + + default: + break; + } + + if (creating_vci_extension) + { + PG_TRY(); + { + if (process_utility_prev != NULL) + process_utility_prev(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + } + PG_CATCH(); + { + vci_is_in_vci_create_extension = saved_vci_is_in_vci_create_extension; + + PG_RE_THROW(); + } + PG_END_TRY(); + } + else + { + if (process_utility_prev != NULL) + process_utility_prev(pstmt, queryString, readOnlyTree, + context, params, queryEnv, + dest, qc); + else + standard_ProcessUtility(pstmt, queryString, readOnlyTree, 
+ context, params, queryEnv, + dest, qc); + } + + vci_rebuild_command = vcirc_invalid; + + vci_is_in_vci_create_extension = saved_vci_is_in_vci_create_extension; + +#ifndef UNUSE_COPY_INSERT + /* check if the statement is a "COPY table FROM ..." statement */ + if (nodeTag(parseTree) == T_CopyStmt) + vci_FinalizeCopyCommand(); +#endif /* #ifndef UNUSE_COPY_INSERT */ +} + +/* + * VCI handler function: return IndexAmRoutine with access method parameters + * and callbacks. + */ +PG_FUNCTION_INFO_V1(vci_handler); + +Datum +vci_handler(PG_FUNCTION_ARGS) +{ + IndexAmRoutine *amroutine = makeNode(IndexAmRoutine); + + amroutine->amstrategies = 1; + amroutine->amsupport = 0; + amroutine->amoptsprocnum = 0; + amroutine->amcanorder = false; + amroutine->amcanorderbyop = false; + amroutine->amcanhash = false; + amroutine->amconsistentequality = false; + amroutine->amconsistentordering = false; + amroutine->amcanbackward = false; + amroutine->amcanunique = false; + amroutine->amcanmulticol = true; + amroutine->amoptionalkey = false; + amroutine->amsearcharray = false; + amroutine->amsearchnulls = false; + amroutine->amstorage = false; + amroutine->amclusterable = false; + amroutine->ampredlocks = false; + amroutine->amcanparallel = false; + amroutine->amcanbuildparallel = false; + amroutine->amcaninclude = false; + amroutine->amusemaintenanceworkmem = false; + amroutine->amsummarizing = false; + amroutine->amparallelvacuumoptions = VACUUM_OPTION_NO_PARALLEL; + amroutine->amkeytype = InvalidOid; + + amroutine->ambuild = vci_build; + amroutine->ambuildempty = vci_buildempty; + amroutine->aminsert = vci_insert; + amroutine->aminsertcleanup = NULL; + amroutine->ambulkdelete = vci_bulkdelete; + amroutine->amvacuumcleanup = vci_vacuumcleanup; + amroutine->amcanreturn = NULL; + amroutine->amcostestimate = vci_costestimate; + amroutine->amgettreeheight = vci_gettreeheight; + amroutine->amoptions = vci_options; + amroutine->amproperty = NULL; + amroutine->ambuildphasename = NULL; + 
amroutine->amvalidate = vci_validate; + amroutine->amadjustmembers = NULL; + amroutine->ambeginscan = vci_beginscan; + amroutine->amrescan = vci_rescan; + amroutine->amgettuple = NULL; + amroutine->amgetbitmap = NULL; + amroutine->amendscan = vci_endscan; + amroutine->ammarkpos = vci_markpos; + amroutine->amrestrpos = vci_restrpos; + + amroutine->amestimateparallelscan = NULL; + amroutine->aminitparallelscan = NULL; + amroutine->amparallelrescan = NULL; + + amroutine->amtranslatestrategy = NULL; + amroutine->amtranslatecmptype = NULL; + + PG_RETURN_POINTER(amroutine); +} diff --git a/contrib/vci/storage/vci_internal_view.c b/contrib/vci/storage/vci_internal_view.c new file mode 100644 index 000000000000..d5422d7ce00c --- /dev/null +++ b/contrib/vci/storage/vci_internal_view.c @@ -0,0 +1,663 @@ +/*------------------------------------------------------------------------- + * + * vci_internal_view.c + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_internal_view.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/genam.h" +#include "access/htup.h" +#include "access/htup_details.h" +#include "access/skey.h" +#include "catalog/indexing.h" +#include "catalog/namespace.h" /* for RangeVarGetRelid */ +#include "catalog/pg_am.h" +#include "catalog/pg_class.h" +#include "catalog/pg_depend.h" +#include "catalog/pg_index.h" +#include "catalog/pg_opclass.h" +#include "catalog/pg_opfamily.h" +#include "catalog/pg_namespace.h" +#include "commands/tablecmds.h" +#include "commands/defrem.h" +#include "nodes/nodes.h" +#include "nodes/parsenodes.h" +#include "nodes/primnodes.h" +#include "storage/lock.h" +#include "utils/acl.h" +#include "utils/fmgroids.h" /* for F_OIDEQ */ +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/syscache.h" + +#include "vci.h" + +#include "vci_ros.h" + +bool 
vci_is_in_vci_create_extension; + +static List *make_dependent_view_list(Oid relOid); +static void change_owner_or_schema_of_internal_view_list(List *internal_view_oid_list, Oid newOid, bool is_owner); +static void check_prohibited_operation_for_extension(const char *extname); +static void check_prohibited_operation_for_access_method(const char *amname); +static void check_prohibited_operation_for_range_var(RangeVar *rel); +static void check_prohibited_operation_for_object(ObjectType objtype, Node *object); +static void check_prohibited_operation_for_relation(Relation rel); +static bool is_vci_access_method(Oid accessMethodObjectId); + +void +vci_alter_table_change_owner(Oid relOid, char relKind, Oid newOwnerId) +{ + List *view_oid_list = NIL; + + if (relKind != RELKIND_INDEX) + return; + + view_oid_list = make_dependent_view_list(relOid); + + if (view_oid_list == NIL) + return; + + change_owner_or_schema_of_internal_view_list(view_oid_list, newOwnerId, true); + + list_free(view_oid_list); +} + +void +vci_alter_table_change_schema(Oid relOid, char relKind, Oid newNspOid) +{ + List *view_oid_list = NIL; + + if (relKind != RELKIND_INDEX) + return; + + view_oid_list = make_dependent_view_list(relOid); + + if (view_oid_list == NIL) + return; + + change_owner_or_schema_of_internal_view_list(view_oid_list, newNspOid, false); + + list_free(view_oid_list); +} + +static List * +make_dependent_view_list(Oid relOid) +{ + Relation depRel; + ScanKeyData key[2]; + SysScanDesc depScan; + HeapTuple depTup; + List *view_oid_list = NIL; + + depRel = table_open(DependRelationId, AccessShareLock); + + ScanKeyInit(&key[0], + Anum_pg_depend_refclassid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(RelationRelationId)); + ScanKeyInit(&key[1], + Anum_pg_depend_refobjid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(relOid)); + + depScan = systable_beginscan(depRel, DependReferenceIndexId, true, + NULL, 2, key); + + while (HeapTupleIsValid(depTup = systable_getnext(depScan))) 
+ { + Form_pg_depend pg_depend = (Form_pg_depend) GETSTRUCT(depTup); + + Assert(pg_depend->refclassid == RelationRelationId); + Assert(pg_depend->refobjid == relOid); + + /* Ignore dependees that aren't user columns of relations */ + /* (we assume system columns are never of rowtypes) */ + if (pg_depend->classid != RelationRelationId || + pg_depend->refobjsubid != 0) + continue; + + view_oid_list = lappend_oid(view_oid_list, pg_depend->objid); + } + + systable_endscan(depScan); + + relation_close(depRel, AccessShareLock); + + return view_oid_list; +} + +static void +change_owner_or_schema_of_internal_view_list(List *view_oid_list, Oid newOid, bool is_owner) +{ + ListCell *lc; + + foreach(lc, view_oid_list) + { + Oid childRelOid = lfirst_oid(lc); + Relation class_rel; + HeapTuple tuple; + Form_pg_class tuple_class; + + class_rel = table_open(RelationRelationId, RowExclusiveLock); + + tuple = SearchSysCache1(RELOID, ObjectIdGetDatum(childRelOid)); + if (!HeapTupleIsValid(tuple)) + elog(ERROR, "cache lookup failed for relation %u", childRelOid); + + tuple_class = (Form_pg_class) GETSTRUCT(tuple); + + if (vci_isVciAdditionalRelationTuple(childRelOid, tuple_class)) + { + Datum repl_val[Natts_pg_class]; + bool repl_null[Natts_pg_class]; + bool repl_repl[Natts_pg_class]; + Acl *newAcl; + Datum aclDatum; + bool isNull; + HeapTuple newtuple; + + memset(repl_null, false, sizeof(repl_null)); + memset(repl_repl, false, sizeof(repl_repl)); + + if (is_owner) + { + repl_repl[Anum_pg_class_relowner - 1] = true; + repl_val[Anum_pg_class_relowner - 1] = ObjectIdGetDatum(newOid); + + aclDatum = SysCacheGetAttr(RELOID, tuple, + Anum_pg_class_relacl, + &isNull); + if (!isNull) + { + newAcl = aclnewowner(DatumGetAclP(aclDatum), + tuple_class->relowner, newOid); + repl_repl[Anum_pg_class_relacl - 1] = true; + repl_val[Anum_pg_class_relacl - 1] = PointerGetDatum(newAcl); + } + } + else + { + repl_repl[Anum_pg_class_relnamespace - 1] = true; + repl_val[Anum_pg_class_relnamespace - 1] = 
ObjectIdGetDatum(newOid); + } + + newtuple = heap_modify_tuple(tuple, RelationGetDescr(class_rel), repl_val, repl_null, repl_repl); + + CatalogTupleUpdate(class_rel, &newtuple->t_self, newtuple); + + heap_freetuple(newtuple); + } + + ReleaseSysCache(tuple); + table_close(class_rel, RowExclusiveLock); + } +} + +void +vci_check_prohibited_operation(Node *parseTree, bool *creating_vci_extension) +{ + switch (nodeTag(parseTree)) + { + case T_CreateExtensionStmt: + { + CreateExtensionStmt *stmt = (CreateExtensionStmt *) parseTree; + + if (strcmp(stmt->extname, VCI_STRING) == 0) + { + ListCell *lc; + + foreach(lc, stmt->options) + { + DefElem *defel = (DefElem *) lfirst(lc); + + if (strcmp(defel->defname, "schema") == 0 + && get_namespace_oid(defGetString(defel), false) != PG_PUBLIC_NAMESPACE) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("extension \"%s\" cannot specify a schema name", VCI_STRING))); + } + } + + *creating_vci_extension = true; + } + } + break; + + case T_AlterExtensionStmt: + check_prohibited_operation_for_extension(((AlterExtensionStmt *) parseTree)->extname); + break; + + case T_AlterExtensionContentsStmt: + check_prohibited_operation_for_extension(((AlterExtensionContentsStmt *) parseTree)->extname); + break; + + case T_ViewStmt: /* CREATE (OR REPLACE) VIEW */ + check_prohibited_operation_for_range_var(((ViewStmt *) parseTree)->view); + break; + + case T_AlterTableStmt: /* ALTER VIEW */ + check_prohibited_operation_for_range_var(((AlterTableStmt *) parseTree)->relation); + break; + + case T_RuleStmt: /* CREATE RULE */ + check_prohibited_operation_for_range_var(((RuleStmt *) parseTree)->relation); + break; + + case T_CreateTrigStmt: /* CREATE TRIGGER */ + check_prohibited_operation_for_range_var(((CreateTrigStmt *) parseTree)->relation); + break; + + case T_GrantStmt: + { + GrantStmt *stmt = (GrantStmt *) parseTree; + + if ((stmt->targtype == ACL_TARGET_OBJECT) && (stmt->objtype == OBJECT_TABLE)) + { + ListCell *lc; + + 
foreach(lc, stmt->objects) + check_prohibited_operation_for_range_var((RangeVar *) lfirst(lc)); + } + } + break; + + case T_GrantRoleStmt: + break; + + case T_CreateOpClassStmt: + if (!vci_is_in_vci_create_extension) + check_prohibited_operation_for_access_method(((CreateOpClassStmt *) parseTree)->amname); + break; + + case T_CreateOpFamilyStmt: + if (!vci_is_in_vci_create_extension) + check_prohibited_operation_for_access_method(((CreateOpFamilyStmt *) parseTree)->amname); + break; + + case T_AlterOpFamilyStmt: + if (!vci_is_in_vci_create_extension) + check_prohibited_operation_for_access_method(((AlterOpFamilyStmt *) parseTree)->amname); + break; + + case T_ReindexStmt: + { + ReindexStmt *stmt = (ReindexStmt *) parseTree; + Relation rel; + + if (stmt->kind != REINDEX_OBJECT_INDEX) + break; + + rel = relation_openrv_extended(stmt->relation, AccessShareLock, true); + + if (rel == NULL) + break; + + if (isVciIndexRelation(rel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("REINDEX is not supported for VCI"), + errhint("DROP INDEX and CREATE INDEX instead"))); + + relation_close(rel, AccessShareLock); + } + break; + + case T_ClusterStmt: + { + ClusterStmt *stmt = (ClusterStmt *) parseTree; + Relation rel; + + /* + * Do nothing, if CLUSTER command issued without relation + * name. 
As this command will only cluster previously + * clustered tables, VCI indexed tables will not be clustered + * anyways + */ + if (stmt->relation == NULL) + break; + + rel = relation_openrv_extended(stmt->relation, AccessShareLock, true); + + if (rel == NULL) + break; + + if (RelationGetForm(rel)->relhasindex) + { + List *indexoidlist; + ListCell *lc; + + indexoidlist = RelationGetIndexList(rel); + + foreach(lc, indexoidlist) + { + Oid indexOid = lfirst_oid(lc); + Relation indexRel; + + indexRel = index_open(indexOid, AccessShareLock); + + if (isVciIndexRelation(indexRel)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("cannot cluster tables including %s index(es)", VCI_STRING), + errhint("Use DROP INDEX %s first", RelationGetRelationName(indexRel)))); + + index_close(indexRel, AccessShareLock); + } + } + + relation_close(rel, AccessShareLock); + } + break; + + case T_CommentStmt: /* COMMENT */ + { + CommentStmt *stmt = (CommentStmt *) parseTree; + + if (stmt->objtype == OBJECT_MATVIEW) + check_prohibited_operation_for_object(stmt->objtype, stmt->object); + } + break; + + case T_SecLabelStmt: /* SECURITY LABEL */ + { + SecLabelStmt *stmt = (SecLabelStmt *) parseTree; + + if (stmt->objtype == OBJECT_MATVIEW) + check_prohibited_operation_for_object(stmt->objtype, stmt->object); + } + break; + + case T_RenameStmt: + { + RenameStmt *stmt = (RenameStmt *) parseTree; + + switch (stmt->renameType) + { + case OBJECT_MATVIEW: + check_prohibited_operation_for_range_var(stmt->relation); + break; + + case OBJECT_OPCLASS: + case OBJECT_OPFAMILY: + check_prohibited_operation_for_object(stmt->renameType, stmt->object); + break; + default: + break; + } + } + break; + + case T_AlterObjectSchemaStmt: + { + AlterObjectSchemaStmt *stmt = (AlterObjectSchemaStmt *) parseTree; + + switch (stmt->objectType) + { + case OBJECT_MATVIEW: + check_prohibited_operation_for_range_var(stmt->relation); + break; + + case OBJECT_EXTENSION: + case OBJECT_OPCLASS: + case 
OBJECT_OPFAMILY: + check_prohibited_operation_for_object(stmt->objectType, stmt->object); + break; + + default: + break; + } + } + break; + + case T_AlterOwnerStmt: + { + AlterOwnerStmt *stmt = (AlterOwnerStmt *) parseTree; + + switch (stmt->objectType) + { + case OBJECT_OPCLASS: + case OBJECT_OPFAMILY: + check_prohibited_operation_for_object(stmt->objectType, stmt->object); + break; + + default: + break; + } + } + break; + + case T_IndexStmt: + { + IndexStmt *stmt = (IndexStmt *) parseTree; + + if (strcmp(stmt->accessMethod, VCI_STRING) == 0) + { + if (stmt->concurrent) + { + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not support concurrent index build", VCI_STRING), + errhint("Use DROP INDEX to remove an vci index and try again without CONCURRENTLY option"))); + } + } + } + break; + + case T_DropStmt: + { + DropStmt *stmt = (DropStmt *) parseTree; + + if (stmt->removeType == OBJECT_INDEX) + { + ListCell *lc; + + if (stmt->concurrent) + { + foreach(lc, stmt->objects) + { + RangeVar *range_var = makeRangeVarFromNameList((List *) lfirst(lc)); + Relation relation; + + relation = relation_openrv_extended(range_var, AccessShareLock, true); + + if (relation == NULL) + break; + + if (isVciIndexRelation(relation)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("access method \"%s\" does not support concurrent index drop", VCI_STRING), + errhint("Try again without CONCURRENTLY option"))); + + relation_close(relation, AccessShareLock); + } + } + } + } + break; + + /* + * REFRESH MATERIALIZED VIEW on a VCI internal materialized view + * is prohibited. 
+				 */
+		case T_RefreshMatViewStmt:
+			check_prohibited_operation_for_range_var(((RefreshMatViewStmt *) parseTree)->relation);
+			break;
+
+		default:
+			break;
+	}
+}
+/*
+ * Reject an operation on the VCI extension itself.
+ *
+ * Raises ERRCODE_FEATURE_NOT_SUPPORTED when extname is exactly VCI_STRING;
+ * any other name passes through silently.
+ */
+static void
+check_prohibited_operation_for_extension(const char *extname)
+{
+	if (strcmp(extname, VCI_STRING) == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("extension \"%s\" prohibits this operation", VCI_STRING)));
+}
+/*
+ * Reject an operation that names the VCI access method.
+ *
+ * Raises ERRCODE_FEATURE_NOT_SUPPORTED when amname is exactly VCI_STRING.
+ */
+static void
+check_prohibited_operation_for_access_method(const char *amname)
+{
+	if (strcmp(amname, VCI_STRING) == 0)
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("extension \"%s\" prohibits this operation on access method \"%s\"",
+						VCI_STRING, VCI_STRING)));
+}
+/*
+ * Resolve range_var to a relation and apply
+ * check_prohibited_operation_for_relation() to it.
+ *
+ * The relation is opened with AccessShareLock.  A NULL result from
+ * relation_openrv_extended() is treated as "nothing to check" (the last
+ * argument, true, is presumably missing_ok -- confirm against namespace.h).
+ * The relation is closed again, releasing the lock, when the check passes.
+ */
+static void
+check_prohibited_operation_for_range_var(RangeVar *range_var)
+{
+	Relation	rel;
+
+	rel = relation_openrv_extended(range_var, AccessShareLock, true);
+
+	if (rel == NULL)
+		return;
+
+	check_prohibited_operation_for_relation(rel);
+
+	relation_close(rel, AccessShareLock);
+}
+/*
+ * Dispatch the prohibition check for an object identified by (objtype, object).
+ *
+ * OBJECT_EXTENSION is checked by name.  OBJECT_MATVIEW, OBJECT_OPCLASS and
+ * OBJECT_OPFAMILY are first resolved with get_object_address(); if that
+ * yields no valid OID the function does nothing.  Every other object type is
+ * ignored.
+ */
+static void
+check_prohibited_operation_for_object(ObjectType objtype, Node *object)
+{
+	switch (objtype)
+	{
+		case OBJECT_EXTENSION:
+			check_prohibited_operation_for_extension(strVal(object));
+			break;
+
+		case OBJECT_MATVIEW:
+		case OBJECT_OPCLASS:
+		case OBJECT_OPFAMILY:
+			{
+				ObjectAddress address;
+				Relation	relation = NULL;	/* set by get_object_address(); closed at "done" if non-NULL */
+
+				address = get_object_address(objtype, object, &relation, AccessShareLock, true);
+
+				if (!OidIsValid(address.objectId))
+					goto done;
+
+				switch (objtype)
+				{
+					case OBJECT_MATVIEW:
+						check_prohibited_operation_for_relation(relation);
+						break;
+
+					case OBJECT_OPCLASS:
+						{
+							Relation	opclass_rel;
+							HeapTuple	opclass_tuple;
+							Form_pg_opclass opclass_form;
+
+							opclass_rel = table_open(OperatorClassRelationId, AccessShareLock);
+
+							opclass_tuple = SearchSysCache1(CLAOID, ObjectIdGetDatum(address.objectId));
+							if (!HeapTupleIsValid(opclass_tuple))	/* should not happen */
+								elog(ERROR, "cache lookup failed for opclass %u", address.objectId);
+
+
opclass_form = (Form_pg_opclass) GETSTRUCT(opclass_tuple);
+
+							if (is_vci_access_method(opclass_form->opcmethod))	/* opclass belongs to the VCI access method */
+								ereport(ERROR,
+										(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+										 errmsg("extension \"%s\" prohibits this operation on operation class \"%s\"",
+												VCI_STRING, NameStr(opclass_form->opcname))));
+
+							ReleaseSysCache(opclass_tuple);
+							table_close(opclass_rel, AccessShareLock);
+						}
+						break;
+
+					case OBJECT_OPFAMILY:
+						{
+							Relation	opfamily_rel;
+							HeapTuple	opfamily_tuple;
+							Form_pg_opfamily opfamily_form;
+
+							opfamily_rel = table_open(OperatorFamilyRelationId, AccessShareLock);
+
+							opfamily_tuple = SearchSysCache1(OPFAMILYOID, ObjectIdGetDatum(address.objectId));
+							if (!HeapTupleIsValid(opfamily_tuple))	/* should not happen */
+								elog(ERROR, "cache lookup failed for opfamily %u", address.objectId);
+
+							opfamily_form = (Form_pg_opfamily) GETSTRUCT(opfamily_tuple);
+
+							if (is_vci_access_method(opfamily_form->opfmethod))	/* opfamily belongs to the VCI access method */
+								ereport(ERROR,
+										(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+										 errmsg("extension \"%s\" prohibits this operation on operation family \"%s\"",
+												VCI_STRING, NameStr(opfamily_form->opfname))));
+
+							ReleaseSysCache(opfamily_tuple);
+							table_close(opfamily_rel, AccessShareLock);
+						}
+						break;
+
+					default:
+						break;
+				}
+
+		done:
+				if (relation != NULL)	/* only closed when get_object_address() handed one back */
+					relation_close(relation, AccessShareLock);
+			}
+			break;
+
+		default:
+			break;
+	}
+}
+/*
+ * Raise ERRCODE_FEATURE_NOT_SUPPORTED when vci_isVciAdditionalRelation()
+ * reports that rel is one of VCI's own relations; otherwise do nothing.
+ * The caller keeps ownership of rel (it is neither locked nor closed here).
+ */
+static void
+check_prohibited_operation_for_relation(Relation rel)
+{
+	if (vci_isVciAdditionalRelation(rel))
+		ereport(ERROR,
+				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
+				 errmsg("extension \"%s\" prohibits this operation on view \"%s\"",
+						VCI_STRING, NameStr(rel->rd_rel->relname))));
+}
+/*
+ * Tell whether the access method with the given pg_am OID is VCI.
+ *
+ * Looks the OID up in the AMOID syscache and compares amname with
+ * VCI_STRING.  A failed lookup is reported with a WARNING (not an ERROR)
+ * and treated as "not VCI".
+ */
+static bool
+is_vci_access_method(Oid accessMethodObjectId)
+{
+	HeapTuple	amtuple;
+	bool		result = false;
+	Form_pg_am	amform;
+
+	amtuple = SearchSysCache1(AMOID,
+							  ObjectIdGetDatum(accessMethodObjectId));
+
+	if (!HeapTupleIsValid(amtuple))
+	{
+		elog(WARNING,
+			 "cache lookup failed for access method %u", accessMethodObjectId);
+
+		return false;
+	}
+
+	amform = (Form_pg_am) GETSTRUCT(amtuple);
+
+	if (strcmp(NameStr(amform->amname), VCI_STRING) == 0)
+		result = true;
+
+	ReleaseSysCache(amtuple);	/* amform points into the cache entry; not used after this */
+
+	return result;
+}
diff --git a/contrib/vci/storage/vci_low_utils.c b/contrib/vci/storage/vci_low_utils.c
new file mode 100644
index 000000000000..fcf3d1217f36
--- /dev/null
+++ b/contrib/vci/storage/vci_low_utils.c
@@ -0,0 +1,90 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_low_utils.c
+ *	  Low-level utility functions
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/vci/storage/vci_low_utils.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include "utils/snapmgr.h"
+
+#include "vci.h"
+#include "vci_ros.h"
+
+/**
+ * @brief Copy a byte string into consecutive pages of a relation.
+ *
+ * The data is split into chunks that fit into the remaining payload space
+ * of the current page (VCI_MAX_PAGE_SPACE bytes per page, starting at
+ * VCI_MIN_PAGE_HEADER); when a page becomes full the block number is
+ * advanced and the offset restarts at 0.
+ *
+ * The last page is not flushed.
+ * So, after calling for the last data, the last page in the return value
+ * must be written by functions like vci_WriteOnePageIfNecessaryAndGetBuffer().
+ *
+ * @param[in] rel relation to store the data.
+ * @param[in, out] blockNumber on input, the first block number to write;
+ * on output, the block number at which the next data goes.
+ * @param[in, out] blockNumberOld on input, the block number of the buffer
+ * passed in the argument; on output, the block number of the returned buffer.
+ * @param[in, out] offsetInPage on input, the offset in the page to write at
+ * (must be smaller than VCI_MAX_PAGE_SPACE); on output, the offset in the
+ * page for the next data.
+ * @param[in] buffer shared buffer of *blockNumberOld.
+ * @param[in] data_ the pointer of the data to write.
+ * @param[in] size the size of the data to write, in bytes.
+ * @return the shared buffer read last, which is not written.
+ */
+Buffer
+vci_WriteDataIntoMultiplePages(Relation rel,
+							   BlockNumber *blockNumber,
+							   BlockNumber *blockNumberOld,
+							   uint32 *offsetInPage,
+							   Buffer buffer,
+							   const void *data_,
+							   Size size)
+{
+	const char *data = (const char *) data_;
+	Size		ptr;			/* number of bytes of data already copied */
+
+	Assert(*offsetInPage < VCI_MAX_PAGE_SPACE);
+	for (ptr = 0; ptr < size;)
+	{
+		Page		page;
+		uint32		writeSize;
+
+		/* copy no more than what is left in this page and in the input */
+		writeSize = Min(VCI_MAX_PAGE_SPACE - *offsetInPage, size - ptr);
+		buffer = vci_WriteOnePageIfNecessaryAndGetBuffer(rel,
+														 *blockNumber,
+														 *blockNumberOld,
+														 buffer);
+		*blockNumberOld = *blockNumber;
+		page = BufferGetPage(buffer);
+		MemCpy(&(page[VCI_MIN_PAGE_HEADER + *offsetInPage]), &(data[ptr]),
+			   writeSize);
+		ptr += writeSize;
+		*offsetInPage += writeSize;
+		if (VCI_MAX_PAGE_SPACE <= *offsetInPage)	/* page full: continue on the next block */
+		{
+			++(*blockNumber);
+			*offsetInPage = 0;
+		}
+	}
+
+	return buffer;
+}
+
+/**
+ * @brief Get active snapshot and push it, update command ID.
+ *
+ * A copy of the current active snapshot is pushed onto the active-snapshot
+ * stack and its command ID is refreshed.  Nothing is popped here, so the
+ * caller is presumably expected to pop the snapshot when done -- confirm
+ * against the callers.
+ *
+ * @return active snapshot.
+ */
+Snapshot
+vci_GetCurrentSnapshot(void)
+{
+	PushCopiedSnapshot(GetActiveSnapshot());
+	UpdateActiveSnapshotCommandId();
+
+	return GetActiveSnapshot();
+}
diff --git a/contrib/vci/storage/vci_memory_entry.c b/contrib/vci/storage/vci_memory_entry.c
new file mode 100644
index 000000000000..a69644630c73
--- /dev/null
+++ b/contrib/vci/storage/vci_memory_entry.c
@@ -0,0 +1,915 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_memory_entry.c
+ *	  Shared-memory hash table of per-VCI "memory entries"
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/vci/storage/vci_memory_entry.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "c.h"
+#include "access/sysattr.h"
+#include "access/xact.h"
+#include "access/reloptions.h"
+#include "catalog/indexing.h"
+#include "catalog/pg_database.h"
+#include "catalog/pg_tablespace.h"
+#include "miscadmin.h"
+#include "storage/lock.h"
+#include "storage/lwlock.h"
+#include "utils/builtins.h"
+#include "utils/fmgroids.h"
+#include "utils/snapmgr.h"
+
+#include "vci.h"
+
+#include "vci_mem.h"
+#include "vci_memory_entry.h"
+
+#define VCI_LOCKTAG_MEMORY_ENTRY LOCKTAG_USERLOCK
+
+static void debug_show_vciid_ts(const char *head, Oid oid, Oid tsid);
+static char *getTablespacePath(Oid tsid);
+static HeapTuple GetDatabaseTupleByOid(Oid dboid);
+static void initializeMemoryEntryCommon(vci_memory_entry_t *entry, vci_id_t *vciid, Oid tsid);
+static void initializeMemoryEntry(vci_memory_entry_t *entry, vci_id_t *vciid, Oid tsid, int32 timeStamp);
+static void resetMemoryEntry(vci_memory_entry_t *entry, vci_id_t *vciid, Oid tsid, int32 timeStamp);
+static int	findMemoryEntryLocation(vci_id_t *vciid);
+static void stampOnMemoryEntry(vci_memory_entry_t *entry, vci_memory_entries_t *entries);
+static int	determineRoomPosition(Oid oid);
+static void setLockTagPointer(LOCKTAG *locktag, void *ptr);
+static LockAcquireResult lockAcquirePointer(vci_memory_entry_t *entry, LOCKMODE lockmode, bool dontWait);
+static bool lockReleasePointer(vci_memory_entry_t *entry, LOCKMODE lockmode);
+static int	makeRoomAndGetLocation(vci_id_t *vciid, Oid tsid);
+static void registerMemoryEntry2Device(vci_memory_entry_t *entry);
+
+/**
+ * Emit a DEBUG2 log line describing a VCI and the tablespace it lives in.
+ *
+ * The tablespace path is obtained from getTablespacePath(); when that
+ * returns NULL, DataDir is printed instead.
+ *
+ * @param[in] head the string in the head of a log
+ * @param[in] oid vci oid
+ * @param[in] tsid tablespace oid (getTablespacePath() asserts it is valid)
+ */
+static void
+debug_show_vciid_ts(const char *head, Oid oid, Oid tsid)
+{
+	bool		free_flag = true;	/* false when path aliases DataDir and must not be pfree'd */
+	char	   *path = getTablespacePath(tsid);
+
+	if (path == NULL)
+	{
+		path = DataDir;
+		free_flag = false;
+	}
+	elog(DEBUG2, "%s entry oid=%u tsid=%u (%s)", head, oid, tsid, path);
+
+	if (free_flag)
+		pfree(path);
+}
+
+#define VCI_HASH_WIDTH 65536
+/*
+ * Return the number of bytes needed for the vci_memory_entries_t header
+ * plus its array of entries.  The entry count is
+ * VCI_HASH_WIDTH / NUM_BUFFER_PARTITIONS, but never less than 1; the same
+ * formula is used by vci_InitMemoryEntries().
+ */
+Size
+vci_GetSizeOfMemoryEntries(void)
+{
+	Size		result;
+	uint32		capacity;
+
+	/* LCOV_EXCL_START */
+	capacity = Max(VCI_HASH_WIDTH / NUM_BUFFER_PARTITIONS, 1);
+	/* LCOV_EXCL_STOP */
+
+	result =
offsetof(vci_memory_entries_t, data) + + sizeof(vci_memory_entry_t) * capacity; + + return result; +} + +/** + * @brief Initialize the area for VCI memory objects residing with PostgreSQL + * instance, and the hash table area. + */ +void +vci_InitMemoryEntries(void) +{ + int i; + uint32 capacity; + vci_id_t vciid; + + elog(DEBUG2, "vci_InitMemoryEntries"); + dlist_init(&VciShmemAddr->memory_entry_device_unknown_list); + + /* LCOV_EXCL_START */ + capacity = Max(VCI_HASH_WIDTH / NUM_BUFFER_PARTITIONS, 1); + elog(DEBUG2, ">>> capacity = %d", capacity); + /* LCOV_EXCL_STOP */ + + VciShmemAddr->memory_entries->capacity_hash_entries = capacity; + + VciShmemAddr->memory_entries->lock = VciShmemAddr->vci_memory_entries_lock; + + vciid.oid = InvalidOid; + vciid.dbid = InvalidOid; + + for (i = 0; i < capacity; i++) + initializeMemoryEntryCommon(&VciShmemAddr->memory_entries->data[i], &vciid, InvalidOid); +} + +static char * +getTablespacePath(Oid tsid) +{ + char *tablespace_path; + + Assert(OidIsValid(tsid)); + + if (tsid == DEFAULTTABLESPACE_OID || + tsid == GLOBALTABLESPACE_OID) + { + tablespace_path = NULL; + } + else + { + LOCAL_FCINFO(ci, 1); + + ci->nargs = 1; + ci->args[0].value = ObjectIdGetDatum(tsid); + ci->args[0].isnull = false; + + tablespace_path = text_to_cstring(DatumGetTextP(pg_tablespace_location(ci))); + } + + return tablespace_path; +} + +/* + * This function is a static function in src/backend/utils/init/postinit.c in + * PostgreSQL + * + * @param[in] dboid oid of a database + */ +static HeapTuple +GetDatabaseTupleByOid(Oid dboid) +{ + HeapTuple tuple; + Relation relation; + SysScanDesc scan; + ScanKeyData key[1]; + + /* + * form a scan key + */ + ScanKeyInit(&key[0], + Anum_pg_class_oid, + BTEqualStrategyNumber, F_OIDEQ, + ObjectIdGetDatum(dboid)); + + /* + * Open pg_database and fetch a tuple. Force heap scan if we haven't yet + * built the critical shared relcache entries (i.e., we're starting up + * without a shared relcache cache file). 
+ */ + relation = table_open(DatabaseRelationId, AccessShareLock); + scan = systable_beginscan(relation, DatabaseOidIndexId, + criticalSharedRelcachesBuilt, + NULL, + 1, key); + + tuple = systable_getnext(scan); + + /* Must copy tuple before releasing buffer */ + if (HeapTupleIsValid(tuple)) + tuple = heap_copytuple(tuple); + + /* all done */ + systable_endscan(scan); + table_close(relation, AccessShareLock); + + return tuple; +} + +/** + * DefaultTablespaceOid for dbid. + * This function must be called in a transaction + * + * @param[in] dboid oid of a database + * + * @return Oid of default tablespace + */ +static Oid +getDefaultTablespaceOid(Oid dboid) +{ + HeapTuple tuple; + Form_pg_database dbform; + + tuple = GetDatabaseTupleByOid(dboid); + if (!HeapTupleIsValid(tuple)) + { + elog(DEBUG2, + "database %u does not exist", dboid); + return InvalidOid; + } + dbform = (Form_pg_database) GETSTRUCT(tuple); + + return dbform->dattablespace; +} + +/** + * Initialize memory entry. + * If a valid vci oid is given, this memory entry is registered to the list in vci_devload_t + * for WOS->ROS transformation. 
+ * + * @param[in] entry memory entry to be initialized + * @param[in] vciid id of vci + * @param[in] tsid tablespace oid + */ +static void +initializeMemoryEntryCommon(vci_memory_entry_t *entry, vci_id_t *vciid, Oid tsid) +{ + elog(DEBUG2, "initializeMemoryEntryCommon: vciid->oid: %d, vciid->dbid: %d, tsid: %d)", vciid->oid, vciid->dbid, tsid); + + entry->id.oid = vciid->oid; + entry->id.dbid = vciid->dbid; + entry->tsid = tsid; + if (!OidIsValid(tsid) && OidIsValid(vciid->dbid)) + { + entry->real_tsid = getDefaultTablespaceOid(vciid->dbid); + if (!OidIsValid(entry->real_tsid)) + ereport(FATAL, + (errcode(ERRCODE_UNDEFINED_DATABASE), + errmsg("database %u does not exist", vciid->dbid))); + } + else + entry->real_tsid = tsid; + + if (OidIsValid(vciid->oid)) + { + dlist_push_head(&VciShmemAddr->memory_entry_device_unknown_list, &(entry->link)); + + entry->force_next_wosros_conv = false; + } +} + +/** + * Initialize memory entry + * + * @param[in] entry memory entry to be initialized + * @param[in] vciid id of vci (a pair of oid of vci and oid of database) + * @param[in] tsid oid of tablespace + * @param[in] timeStamp timestamp + */ +static void +initializeMemoryEntry(vci_memory_entry_t *entry, vci_id_t *vciid, Oid tsid, int32 timeStamp) +{ + elog(DEBUG2, "initializeMemoryEntry: vciid->oid: %d, vciid->dbid: %d, tsid: %d)", vciid->oid, vciid->dbid, tsid); + + if (OidIsValid(entry->id.oid)) + { + /* clean up previous registration. 
*/ + dlist_delete(&entry->link); + } + + initializeMemoryEntryCommon(entry, vciid, tsid); + + entry->time_stamp = timeStamp; +} + +/** + * reset memory entry + * + * @param[in] entry memory entry to be initialized + * @param[in] vciid id of vci (a pair of oid of vci and oid of database) + * @param[in] tsid oid of tablespace + * @param[in] timeStamp timestamp + */ +static void +resetMemoryEntry(vci_memory_entry_t *entry, vci_id_t *vciid, Oid tsid, int32 timeStamp) +{ + elog(DEBUG2, "reset memory object of OID %d", vciid->oid); + + initializeMemoryEntry(entry, vciid, tsid, timeStamp); +} + +#define MAKE_ROOM_MAX_SCAN_SPAN (128) +#define MAKE_ROOM_SCAN_SPAN (8) +#define MAKE_ROOM_THRESHOLD (0x10000000) + +/** + * free memory entry + * + * @param[in] vciid id of vci whose memory entry is freed + */ +void +vci_freeMemoryEntry(vci_id_t *vciid) +{ + int index; + vci_memory_entries_t *entries = VciShmemAddr->memory_entries; + vci_id_t invalid_vciid; + + LWLockAcquire(entries->lock, LW_EXCLUSIVE); + + index = findMemoryEntryLocation(vciid); + + invalid_vciid.oid = InvalidOid; + invalid_vciid.dbid = InvalidOid; + if (index != -1) + initializeMemoryEntry(&entries->data[index], &invalid_vciid, InvalidOid, entries->time_stamp); + LWLockRelease(entries->lock); + + return; +} + +/** + * @brief This function returns the position in vci_memory_entries_t.data. + * + * If some data[ptr].oid has the given oid or InvalidOid, return ptr. + * In other case, return -1, meaning no room for the OID. + * + * @param[in] vciid id of vci, information of which are stored in the found + * memory entry. + * @return The index to the found object. + * + * @note This function must be called under the lock of the + * vci_memory_entries_t is acquired exclusively. Since this function does not + * acquire any lock on the entry, user must lock the entry and check if it has + * the correct OID. 
+ */
+static int
+findMemoryEntryLocation(vci_id_t *vciid)
+{
+	vci_memory_entries_t *table = VciShmemAddr->memory_entries;
+	int			nslots = table->capacity_hash_entries;
+	int			nprobes = Min(nslots, MAKE_ROOM_MAX_SCAN_SPAN);
+	int			slot = vciid->oid % nslots;
+	int			first_free = -1;
+	int			probe = 0;
+
+	/* Linear probing from the hash position, wrapping at the table end. */
+	while (probe < nprobes)
+	{
+		vci_memory_entry_t *cur = &table->data[slot];
+
+		/* An exact (oid, dbid) match wins immediately. */
+		if (cur->id.oid == vciid->oid && cur->id.dbid == vciid->dbid)
+			return slot;
+
+		/* Otherwise remember the first unused slot seen on the way. */
+		if (first_free == -1 && !OidIsValid(cur->id.oid))
+			first_free = slot;
+
+		slot = (slot + 1 == nslots) ? 0 : slot + 1;
+		probe++;
+	}
+
+	/* No match: the first free slot, or -1 when the probed range is full. */
+	return first_free;
+}
+
+/**
+ * Copy the table-wide timestamp onto one entry, marking it as just used.
+ *
+ * @param[in] entry memory entry whose timestamp is set
+ * @param[in] entries memory entries table that supplies the timestamp
+ */
+static void
+stampOnMemoryEntry(vci_memory_entry_t *entry, vci_memory_entries_t *entries)
+{
+	int32		now = entries->time_stamp;
+
+	entry->time_stamp = now;
+}
+
+/**
+ * This function is called when no room is available in the hash table.
+ * It finds a position to store information for oid. First, it
+ * calculates the hash value (HV) by oid % capacity. Then, look into the
+ * range [HV, HV + MAKE_ROOM_MAX_SCAN_SPAN - 1]. If there are entries, whose
+ * timestamps are older by MAKE_ROOM_THRESHOLD from the current, the oldest
+ * one is selected as the result of this function. Otherwise, it repeats the
+ * sequence in the next range,
+ * [HV + MAKE_ROOM_MAX_SCAN_SPAN, HV + 2 * MAKE_ROOM_MAX_SCAN_SPAN - 1], and
+ * so on. If there is no such entry, then it returns HV itself.
+ *
+ * @param[in] oid Oid of VCI main relation, for which the hash table entry is
+ * allocated.
+ *
+ * @return position ID of hash table entry.
+ */ +static int +determineRoomPosition(Oid oid) +{ + vci_memory_entries_t *entries = VciShmemAddr->memory_entries; + int capacity = entries->capacity_hash_entries; + int32 currentStamp = entries->time_stamp; + vci_memory_entry_t *data = entries->data; + int ptr = oid % capacity; + int maxPtr = ptr; + uint32 maxDiff = 0; + int aId; + int aIdMax = Min(capacity, MAKE_ROOM_MAX_SCAN_SPAN); + + for (aId = 0; aId < aIdMax; aId += MAKE_ROOM_SCAN_SPAN) + { + int bId; + int bIdMax = Min(capacity, aId + MAKE_ROOM_SCAN_SPAN); + + for (bId = aId; bId < bIdMax; ++bId) + { + uint32 newDiff = currentStamp - data[ptr].time_stamp; + + if (maxDiff < newDiff) + { + maxDiff = newDiff; + maxPtr = ptr; + } + ptr = (ptr == (capacity - 1)) ? 0 : (1 + ptr); + } + + if (MAKE_ROOM_THRESHOLD <= maxDiff) + break; + } + + elog(DEBUG2, + "discard OID %d at position %d with time stamp difference %d" + " for OID %d under capacity %d", + data[ptr].id.oid, ptr, maxDiff, oid, capacity); + + return maxPtr; +} + +static void +setLockTagPointer(LOCKTAG *locktag, void *ptr) +{ + locktag->locktag_field1 = ((uintptr_t) ptr) & 0xFFFFFFFF; + locktag->locktag_field2 = 0; + locktag->locktag_field3 = (((uintptr_t) ptr) >> 32) & 0xFFFFFFFF; + locktag->locktag_field4 = 0; + locktag->locktag_type = VCI_LOCKTAG_MEMORY_ENTRY; + locktag->locktag_lockmethodid = DEFAULT_LOCKMETHOD; +} + +static LockAcquireResult +lockAcquirePointer(vci_memory_entry_t *entry, LOCKMODE lockmode, bool dontWait) +{ + LOCKTAG locktag; + + Assert(entry); + + setLockTagPointer(&locktag, entry); + + return LockAcquire(&locktag, lockmode, false, dontWait); +} + +static bool +lockReleasePointer(vci_memory_entry_t *entry, LOCKMODE lockmode) +{ + LOCKTAG locktag; + + setLockTagPointer(&locktag, entry); + + return LockRelease(&locktag, lockmode, false); +} + +/** + * @brief This function removes idle entry of vci_memory_entry_t + * in vci_memory_entries_t, resets the area, and return the index. 
+ *
+ * A victim slot is chosen by determineRoomPosition() and then probed with a
+ * no-wait AccessExclusiveLock.  If the lock cannot be taken, the slot is
+ * stamped with the current timestamp and the selection is repeated.  Once a
+ * slot is locked it is reset for vciid and the lock is released again.
+ *
+ * NOTE(review): the loop has no upper bound on retries -- confirm that it
+ * always terminates when many entries are locked.
+ *
+ * @param[in] vciid id of vci whose information is stored in the determined
+ * memory entry.
+ * @param[in] tsid tablespace oid
+ * @return The index of the entry that was acquired and reset.
+ *
+ * @note This function must be called while the lock of the
+ * vci_memory_entries_t is held exclusively.
+ */
+static int
+makeRoomAndGetLocation(vci_id_t *vciid, Oid tsid)
+{
+	vci_memory_entries_t *entries = VciShmemAddr->memory_entries;
+	vci_memory_entry_t *entry = NULL;
+	int			result = -1;
+
+	do
+	{
+		result = determineRoomPosition(vciid->oid);
+		Assert(0 <= result);
+		entry = &(entries->data[result]);
+
+		/* Test if the entry is really free. */
+		switch (lockAcquirePointer(entry, AccessExclusiveLock, true /* don't wait */ ))
+		{
+			case LOCKACQUIRE_OK:
+				/* It is free. */
+				break;
+
+			case LOCKACQUIRE_NOT_AVAIL:
+
+				/*
+				 * The lock should be taken in shared mode. We have to search
+				 * another entry. To skip current entry, we stamp it.
+				 */
+				stampOnMemoryEntry(entry, entries);
+				result = -1;	/* forces another iteration of the do/while */
+				break;
+
+				/* LCOV_EXCL_START */
+			case LOCKACQUIRE_ALREADY_HELD:
+				/* NEVER COME HERE */
+				ereport(ERROR,
+						(errmsg("duplicate lock detected"),
+						 errhint("Disable VCI by 'SELECT vci_disable();'")));
+				break;
+
+			default:
+				/* NEVER COME HERE */
+				ereport(ERROR,
+						(errmsg("undefined lock state"),
+						 errhint("Disable VCI by 'SELECT vci_disable();'")));
+				/* LCOV_EXCL_STOP */
+		}
+	} while (result < 0);
+
+	Assert(NULL != entry);
+
+	/* the entry is exclusively locked here: hand it over to vciid */
+	resetMemoryEntry(entry, vciid, tsid, entries->time_stamp);
+
+	/* LCOV_EXCL_START */
+	if (!lockReleasePointer(entry, AccessExclusiveLock))
+	{
+		/* NEVER COME HERE */
+		ereport(ERROR,
+				(errmsg("undefined lock state"),
+				 errhint("Disable VCI by 'SELECT vci_disable();'")));
+	}
+	/* LCOV_EXCL_STOP */
+
+	return result;
+}
+
+/**
+ * get memory entry which corresponds to vciid
+ *
+ * The returned entry is locked with AccessShareLock; release it with
+ * vci_ReleaseMemoryEntry().
+ *
+ * @param[in] vciid vci_id_t identifying vci
+ * @param[in] tsid oid of tablespace
+ *
+ * @return memory entry which corresponds to vciid
+ */
+static
vci_memory_entry_t *
+vci_GetMemoryEntry(vci_id_t *vciid, Oid tsid)
+{
+	vci_memory_entries_t *entries = VciShmemAddr->memory_entries;
+	vci_memory_entry_t *entry = NULL;
+	LockAcquireResult lockResult;
+	int			ptr;
+
+	/*
+	 * Everything done under the table lock below writes shared state: the
+	 * global timestamp, the entry itself and the lists the entry is linked
+	 * into.  findMemoryEntryLocation() and makeRoomAndGetLocation() also
+	 * document that they need the lock held exclusively, so take it in
+	 * exclusive mode and keep it across both calls.
+	 */
+	LWLockAcquire(entries->lock, LW_EXCLUSIVE);
+
+	entries->time_stamp++;
+
+	for (;;)
+	{
+		ptr = findMemoryEntryLocation(vciid);
+
+		/* neither a match nor a free slot: evict an old entry */
+		if (ptr == -1)
+			ptr = makeRoomAndGetLocation(vciid, tsid);
+
+		entry = &entries->data[ptr];
+
+		/* a free slot was returned: claim it for this VCI */
+		if (!OidIsValid(entry->id.oid))
+			initializeMemoryEntry(entry, vciid, tsid, entries->time_stamp);
+
+		LWLockRelease(entries->lock);
+
+		lockResult = lockAcquirePointer(entry, AccessShareLock, false /* wait */ );
+
+		/*
+		 * Any result other than NOT_AVAIL means this backend now holds the
+		 * lock (ALREADY_HELD only says it held it before as well), so the
+		 * entry must be either returned or released -- never just retried.
+		 */
+		if (lockResult != LOCKACQUIRE_NOT_AVAIL)
+		{
+			/*
+			 * The entry may have been handed to another VCI while we waited.
+			 * Compare the whole id: relation OIDs are only unique within
+			 * one database.
+			 */
+			if (entry->id.oid == vciid->oid && entry->id.dbid == vciid->dbid)
+				return entry;
+
+			lockReleasePointer(entry, AccessShareLock);
+		}
+
+		LWLockAcquire(entries->lock, LW_EXCLUSIVE);
+	}
+}
+
+/**
+ * @brief release memory entry
+ *
+ * Drops the AccessShareLock taken by vci_GetMemoryEntry().
+ *
+ * @param[in] entry The memory entry to be released
+ */
+static void
+vci_ReleaseMemoryEntry(vci_memory_entry_t *entry)
+{
+	lockReleasePointer(entry, AccessShareLock);
+}
+
+/*
+ * Append a memory entry to the queue of the device it lives on.
+ *
+ * Only one pseudo device (VCI_PSEUDO_UNMONITORED_DEVICE, devload_array[0])
+ * exists, so the tablespace path is looked up for logging only.
+ */
+static void
+registerMemoryEntry2Device(vci_memory_entry_t *entry)
+{
+	vci_devload_t *dload;
+	const char *devname;
+	char	   *tablespace_path;
+	bool		free_flag = true;
+
+	elog(DEBUG2, "registerMemoryEntry2Device");
+
+	tablespace_path = getTablespacePath(entry->real_tsid);
+	if (tablespace_path == NULL)
+	{
+		/* DataDir is not ours to free */
+		tablespace_path = DataDir;
+		free_flag = false;
+	}
+
+	/* OSS always uses "unmonitored" device */
+	devname = VCI_PSEUDO_UNMONITORED_DEVICE;
+	elog(DEBUG2, "vci oid %u tablespace(%s) is on a device (%s)", entry->id.oid, tablespace_path, devname);
+
+	if (free_flag)
+		pfree(tablespace_path);
+
+	Assert(VciShmemAddr->num_devload_info == 1);
+	dload = &(VciShmemAddr->devload_array[0]);
+	Assert(dload != NULL);
+	Assert(strcmp(dload->devname, VCI_PSEUDO_UNMONITORED_DEVICE) == 0);
+
+
dlist_push_head(&(dload->memory_entry_queue->head), &(entry->link));	/* newest entry goes to the front of the device queue */
+}
+
+/**
+ * Return the index of the device to examine after the given one.
+ *
+ * @param[in] index index of the device examined last
+ *
+ * @return the index to be checked
+ *
+ * XXX: Consider just removing this function, because for OSS it only returns 0.
+ */
+static int
+get_new_checked_device_index(int index)
+{
+	/*
+	 * For OSS the num_devload_info is hardwired as 1, so this function can
+	 * only return an index of 0. ([0] is the "unmonitored" device)
+	 */
+	Assert(VciShmemAddr->num_devload_info == 1);
+	Assert(index == 0);
+
+	return index;
+}
+
+/**
+ * Pick the next VCI on which WOS->ROS conversion should run.
+ *
+ * Starting at translated_dev_pos, the devices are scanned for one whose
+ * memory_entry_pos is set.  The entry at that position is reported through
+ * vciinfo and the device's position is advanced to the next queue node (or
+ * NULL at the end of the queue).
+ *
+ * @param[out] vciinfo receives dbid, oid and force_next_wosros_conv of the
+ * VCI on which WOS->ROS conversion should be done; untouched when the
+ * function returns false
+ *
+ * @return true if VCI for transformation is found, false otherwise.
+ */
+bool
+vci_GetWosRosConvertingVCI(vci_wosros_conv_worker_arg_t *vciinfo)
+{
+	int			index;
+	int			head_index;
+	vci_memory_entry_t *entry;
+	bool		found = false;
+	bool		check_started = false;
+	vci_devload_t *dl;
+
+	index = VciShmemAddr->translated_dev_pos;
+
+	head_index = index;
+
+	elog(DEBUG2, ">>> index=%d, head_index=%d, check_started=%d, found=%d", index, head_index, check_started, found);
+	dl = &(VciShmemAddr->devload_array[index]);
+	/* visit each device at most once, beginning with head_index */
+	while ((index != head_index || !check_started)
+		   && !found)
+	{
+		check_started = true;
+
+		if (dl->memory_entry_pos == NULL)
+		{
+			elog(LOG, "wos->ros translation: skip translation on device [%s] because no memory entry", dl->devname);
+			index = get_new_checked_device_index(index);
+
+			elog(LOG, ">>> vci_GetWosRosConvertingVCI: index=%d, num_devload_info=%d", index, VciShmemAddr->num_devload_info);
+			dl = &(VciShmemAddr->devload_array[index]);
+		}
+		else
+			found = true;
+
+		elog(DEBUG2, ">>> index=%d, head_index=%d, check_started=%d, found=%d", index, head_index, check_started, found);
+	}
+
+	if (!found)
+	{
+		elog(LOG, "wos->ros translation: no vci is found for translation");
+
+	}
+	else
+	{
+		dlist_node *ret;
+		dlist_head *memory_entry_queue;
+
+		elog(LOG, "dev info: [%s] ", VciShmemAddr->devload_array[index].devname);
+
+		memory_entry_queue = &(dl->memory_entry_queue->head);
+
+
Assert(dl->memory_entry_pos != NULL);
+
+		/* hand out the current node and step the cursor to its successor */
+		ret = dl->memory_entry_pos;
+		if (dlist_has_next(memory_entry_queue, ret))
+			dl->memory_entry_pos = dlist_next_node(memory_entry_queue, ret);
+		else
+			dl->memory_entry_pos = NULL;
+
+		VciShmemAddr->translated_dev_pos = get_new_checked_device_index(index);
+
+		entry = dlist_container(vci_memory_entry_t, link, ret);
+
+		vciinfo->dbid = entry->id.dbid;
+		vciinfo->oid = entry->id.oid;
+		vciinfo->force_next_wosros_conv = entry->force_next_wosros_conv;
+
+		elog(LOG, "wos->ros conversion on device (%s): vci oid=%u dbid=%u", dl->devname, vciinfo->oid, vciinfo->dbid);
+	}
+
+	return found;
+}
+
+/**
+ * Update the timestamp of the memory entry of a vci, creating the entry
+ * first when it does not exist yet.
+ *
+ * @param[in] vciid id of a vci index
+ * @param[in] tsid Oid of the tablespace of the vci index identified by vciid
+ */
+void
+vci_TouchMemoryEntry(vci_id_t *vciid, Oid tsid)
+{
+	vci_memory_entry_t *entry;
+
+	entry = vci_GetMemoryEntry(vciid, tsid);
+	entry->time_stamp = VciShmemAddr->memory_entries->time_stamp;
+	vci_ReleaseMemoryEntry(entry);
+}
+/*
+ * Move every entry from the "device unknown" list to its device queue.
+ *
+ * The list is drained under the exclusive table lock; each popped entry is
+ * passed to registerMemoryEntry2Device().
+ */
+void
+vci_update_memoryentry_in_devloadinfo(void)
+{
+	elog(DEBUG2, "vci_update_memoryentry_in_devloadinfo: start");
+
+	LWLockAcquire(VciShmemAddr->memory_entries->lock, LW_EXCLUSIVE);
+	while (!dlist_is_empty(&VciShmemAddr->memory_entry_device_unknown_list))
+	{
+		dlist_node *tmp;		/* link member of a vci_memory_entry_t */
+		vci_memory_entry_t *entry;
+
+		elog(DEBUG2, ">>> vci_update_memoryentry_in_devloadinfo: in loop");
+		tmp = dlist_pop_head_node(&VciShmemAddr->memory_entry_device_unknown_list);
+		entry = dlist_container(vci_memory_entry_t, link, tmp);
+		Assert(OidIsValid(entry->id.dbid));
+
+#if 1
+		debug_show_vciid_ts("ros extract one: ", entry->id.oid, entry->real_tsid);
+#endif
+
+		registerMemoryEntry2Device(entry);
+	}
+	LWLockRelease(VciShmemAddr->memory_entries->lock);
+}
+/*
+ * Point the cursor (memory_entry_pos) of every device at the head of its
+ * queue, or at NULL when the queue is empty.
+ */
+void
+vci_ResetDevloadCurrentPos(void)
+{
+	vci_devload_t *item;
+	int			i;
+
+	elog(DEBUG2, "vci_ResetDevloadCurrentPos: start; VciShmemAddr->num_devload_info is %d", VciShmemAddr->num_devload_info);
+	for (i = 0; i < VciShmemAddr->num_devload_info; i++)
+	{
+		dlist_head *memory_entry_queue = &(VciShmemAddr->devload_array[i].memory_entry_queue->head);
+
+		item = &(VciShmemAddr->devload_array[i]);
+
+		if (dlist_is_empty(memory_entry_queue))
+			item->memory_entry_pos = NULL;
+		else
+			item->memory_entry_pos = dlist_head_node(memory_entry_queue);
+
+	}
+}
+/*
+ * Rotate each device queue until the node at memory_entry_pos is its head.
+ *
+ * The nodes in front of the cursor (the ones already handed out by
+ * vci_GetWosRosConvertingVCI()) are moved to the tail one by one, keeping
+ * their relative order.  Queues whose cursor is NULL or already at the head
+ * are left alone.
+ */
+void
+vci_MoveTranslatedVCI2Tail(void)
+{
+	int			i;
+
+	for (i = 0; i < VciShmemAddr->num_devload_info; i++)
+	{
+		vci_devload_t *dl;
+
+		dl = &(VciShmemAddr->devload_array[i]);
+
+		{
+			dlist_head *memory_entry_queue = &(dl->memory_entry_queue->head);
+
+			if (dl->memory_entry_pos != NULL
+				&& dl->memory_entry_pos != dlist_head_node(memory_entry_queue))
+			{
+				while (dlist_head_node(memory_entry_queue) != dl->memory_entry_pos)
+				{
+					dlist_node *n;
+
+					n = dlist_pop_head_node(memory_entry_queue);
+					dlist_push_tail(memory_entry_queue, n);
+				}
+			}
+		}
+	}
+}
+
+/**
+ * Check whether the database of each VCI still exists, and
+ * remove the memory entries that belong to dropped databases.
+ */
+void
+vci_RemoveMemoryEntryOnDroppedDatabase(void)
+{
+	vci_id_t	invalid_vciid;
+	int32		time_stamp;
+	int			i;
+
+	invalid_vciid.oid = InvalidOid;
+	invalid_vciid.dbid = InvalidOid;
+	time_stamp = VciShmemAddr->memory_entries->time_stamp;
+
+	/* start transaction (the catalog lookups below need one) */
+	SetCurrentStatementStartTimestamp();
+	StartTransactionCommand();
+	PushActiveSnapshot(GetTransactionSnapshot());
+
+	/*
+	 * NOTE(review): the device queues are modified here without taking
+	 * memory_entries->lock, unlike the other list-modifying functions in
+	 * this file -- confirm that only one process can run this at a time.
+	 */
+	for (i = 0; i < VciShmemAddr->num_devload_info; i++)
+	{
+		vci_devload_t *devload = &(VciShmemAddr->devload_array[i]);
+		dlist_head *memory_entry_queue = &(devload->memory_entry_queue->head);
+		dlist_mutable_iter iter;
+
+		/*
+		 * initializeMemoryEntry() unlinks the current node, so use the
+		 * iterator that tolerates removal of the element being visited.
+		 */
+		dlist_foreach_modify(iter, memory_entry_queue)
+		{
+			vci_memory_entry_t *entry;
+			HeapTuple	tuple;
+
+			entry = dlist_container(vci_memory_entry_t, link, iter.cur);
+			tuple = GetDatabaseTupleByOid(entry->id.dbid);
+			if (HeapTupleIsValid(tuple))
+			{
+				/* database still exists: only release the copied tuple */
+				heap_freetuple(tuple);
+				continue;
+			}
+
+			elog(DEBUG2,
+				 "vci %u was dropped by DROP DATABASE", entry->id.oid);
+
+			initializeMemoryEntry(entry, &invalid_vciid, InvalidOid, time_stamp);
+		}
+
+		/* the cursor may have pointed at a removed node: restart it */
+		if (dlist_is_empty(memory_entry_queue))
+			devload->memory_entry_pos = NULL;
+		else
+			devload->memory_entry_pos = dlist_head_node(memory_entry_queue);
+	}
+
+	/* close transaction */
+	PopActiveSnapshot();
+	CommitTransactionCommand();
+}
+
+/**
+ * Set the flag to force WOS->ROS conversion next time.
+ * @param[in] vciid id of vci + * @param[in] value flag + */ +void +vci_SetForceNextWosRosConvFlag(vci_id_t *vciid, bool value) +{ + int index; + vci_memory_entries_t *entries = VciShmemAddr->memory_entries; + + LWLockAcquire(entries->lock, LW_EXCLUSIVE); + + index = findMemoryEntryLocation(vciid); + if (index != -1) + entries->data[index].force_next_wosros_conv = value; + + LWLockRelease(entries->lock); +} diff --git a/contrib/vci/storage/vci_ros.c b/contrib/vci/storage/vci_ros.c new file mode 100644 index 000000000000..869491c62ba1 --- /dev/null +++ b/contrib/vci/storage/vci_ros.c @@ -0,0 +1,1674 @@ +/*------------------------------------------------------------------------- + * + * vci_ros.c + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_ros.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include + +#include "access/heapam_xlog.h" +#include "access/xact.h" +#include "access/xloginsert.h" +#include "catalog/pg_type.h" +#include "mb/pg_wchar.h" /* for MAX_MULTIBYTE_CHAR_LEN */ +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/procarray.h" +#include "utils/lsyscache.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "utils/varbit.h" + +#include "vci.h" +#include "vci_columns.h" +#include "vci_freelist.h" +#include "vci_ros.h" +#include "vci_mem.h" +#include "vci_wos.h" + +/* + * This file has four parts. + * 1. Accessing VCI main relation header + * 2. Relation and buffer control + * 3. Attributes (columns) + * 4. 
VCI "columns"
+ */
+
+/*
+ * *********************************************************
+ * Accessing VCI main relation header
+ * *********************************************************
+ */
+/* Accessing VCI main relation header
+ * Because the header of VCI main relation has three pages, we cannot simply
+ * map a single C structure onto the header pages.
+ * Instead, we use access functions.
+ *
+ * To access the header, first call one of these two functions,
+ *
+ * vci_KeepReadingMainRelHeader()
+ *     Read header pages for reading, and pin them.
+ * vci_KeepWritingMainRelHeader()
+ *     Read header pages for writing, and pin them.
+ *
+ * Then, use the following two functions,
+ *
+ * vci_SetMainRelVar()
+ *     To set the value of a field.
+ * vci_GetMainRelVar()
+ *     To get the value of a field.
+ *
+ * The fields are defined in enum vci_MainRelVar.
+ * The format is: the page ID is in the upper 16 bits, and the offset from
+ * the page top is in the lower 16 bits.
+ *
+ * To write the header pages out to storage, use the next function.
+ *
+ * vci_WriteMainRelVar()
+ *
+ * After accessing the header, release the DB pages with the following
+ * function.
+ *
+ * vci_ReleaseMainRelHeader()
+ *     Release header pages.
+ *
+ * Other helper functions.
+ *
+ * vci_GetMColumnPosition()
+ *     Gives the position of vcis_m_column_t.
+ *
+ * vci_GetMColumn()
+ *     Gives vcis_m_column_t.
+ *
+ * vci_GetExtentInfoPosition()
+ *     Get the position of vcis_m_extent_t structure for the target
+ *     extentId.
+ *
+ * FIXME Lock check function necessary?
+ * Memo: I think functions to check the lock status of the VCI main
+ * relation may be convenient, in order to determine if it is possible to
+ * start a ROS command. They would be used to avoid conflicts between building
+ * local ROS, the vacuum operation, and other ROS commands. For other ROS
+ * commands, we do not need to use such functions, just try to lock and
+ * wait. Vacuum, too.
For local ROS conversion, we have to determine if + * other ROS command is running when we evaluate the cost of plans. + */ + +/** + * @brief Initialize the structure info to access the header of VCI main + * relation. + * + * This function "just" initializes the given object. + * To access the information in the header, keep the DB pages in buffer + * using vci_KeepMainRelHeader(). + * The accessors are vci_GetMainRelVar() and vci_SetMainRelVar(). + * After modifying the information, call vci_WriteMainRelVar() to write + * the page back to the storage. + * Finally to release the buffer, call vci_ReleaseMainRelHeader(). + * + * @param[out] info Pointer to the target vci_MainRelHeaderInfo, + * which will be initialized + * @param[in] rel VCI main relation. + * @param[in] command ROS command which uses this structure. + */ +void +vci_InitMainRelHeaderInfo(vci_MainRelHeaderInfo *info, + Relation rel, + vci_ros_command_t command) +{ + int aId; + + Assert(NULL != info); + info->rel = rel; + for (aId = 0; aId < lengthof(info->buffer); ++aId) + info->buffer[aId] = InvalidBuffer; + info->command = command; + info->num_extents_allocated = -1; + info->initctx = CurrentMemoryContext; + info->cached_tupledesc = NULL; +} + +static void +KeepMainRelHeader(vci_MainRelHeaderInfo *info) +{ + int blockNum; + + Assert(NULL != info); + Assert(NULL != info->rel); + for (blockNum = 0; blockNum < lengthof(info->buffer); ++blockNum) + info->buffer[blockNum] = vci_ReadBufferWithPageInit(info->rel, blockNum); +} + +static void +CheckRosVersion(vci_MainRelHeaderInfo *info) +{ + uint32 major = vci_GetMainRelVar(info, vcimrv_ros_version_major, 0); + uint32 minor = vci_GetMainRelVar(info, vcimrv_ros_version_minor, 0); + + if ((major == 0) && (minor == 0)) + ereport(ERROR, (errmsg("ROS has not been formatted yet."), + errhint("This might happen when CREATE INDEX fails. 
" + "\"DROP INDEX %s;\" and CREATE INDEX again may help.", + RelationGetRelationName(info->rel)))); + + if ((VCI_ROS_VERSION_MAJOR != major) || (VCI_ROS_VERSION_MINOR != minor)) + ereport(ERROR, (errmsg("incompatible VCI version: expected (%d, %d), stored (%d, %d).", VCI_ROS_VERSION_MAJOR, VCI_ROS_VERSION_MINOR, major, minor), + errhint("This can happen when accessing old database with newer VCI modules. DROP and CREATE INDEX may help."))); +} + +static int32 +GetNumberOfExtentsFromSizeOfMainRelation(Relation rel) +{ + const int headerBlockNumber = vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT; + const int maxExtentInfoInFirstPage = (BLCKSZ - + (vcimrv_extent_info & VCI_MRV_MASK_OFFSET)) / + sizeof(vcis_m_extent_t); + const int maxExtentInfoInPage = VCI_MAX_PAGE_SPACE / + sizeof(vcis_m_extent_t); + int numBlocks = RelationGetNumberOfBlocks(rel); + + if (numBlocks <= headerBlockNumber) + return -1; + + return ((numBlocks - (headerBlockNumber + 1)) * maxExtentInfoInPage) + + maxExtentInfoInFirstPage; +} + +static void +UpdateNumberOfExtentsInMainRelHeader(vci_MainRelHeaderInfo *info) +{ + if (vci_rc_query == info->command) + info->num_extents_allocated = GetNumberOfExtentsFromSizeOfMainRelation( + info->rel); + else + info->num_extents_allocated = -1; +} + +/** + * @brief Keep DB pages of VCI header in buffer. + * + * This function acquire one read lock with AccessShareLock. + * This is called only by vci_inner_build(). + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + */ +void +vci_KeepMainRelHeaderWithoutVersionCheck(vci_MainRelHeaderInfo *info) +{ + Assert(info); + Assert(RelationIsValid(info->rel)); + elog(DEBUG3, "open VCI \"%s\" ignoring ROS version", + RelationGetRelationName(info->rel)); + KeepMainRelHeader(info); +} + +/** + * @brief Change command ID stored in vci_MainRelHeaderInfo. + * + * @param[in] info pointer to the target vci_MainRelHeaderInfo. + * @param[in] command new command ID. 
+ */ +void +vci_ChangeCommand(vci_MainRelHeaderInfo *info, vci_ros_command_t command) +{ + Assert(info); + info->command = command; + UpdateNumberOfExtentsInMainRelHeader(info); +} + +/** + * @brief Keep DB pages of VCI header in buffer after checking the ROS version. + * + * This function acquire one read lock with AccessShareLock. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + */ +void +vci_KeepMainRelHeader(vci_MainRelHeaderInfo *info) +{ + Assert(info); + Assert(RelationIsValid(info->rel)); + elog(DEBUG3, "open VCI \"%s\"", + RelationGetRelationName(info->rel)); + KeepMainRelHeader(info); + CheckRosVersion(info); + UpdateNumberOfExtentsInMainRelHeader(info); +} + +/** + * @brief Write header pages of VCI main relation. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] writeArea Give vci_wmrv_update for updating the pages for + * recovery, or vci_wmrv_all for all pages. The latter should only be used in + * building the index. + */ +void +vci_WriteMainRelVar(vci_MainRelHeaderInfo *info, + vci_wmrv_t writeArea) +{ + int blockNum; + int start = 0; + + Assert(NULL != info); + Assert(NULL != info->rel); + + elog(DEBUG3, "flush header pages of VCI \"%s\" main relation", + RelationGetRelationName(info->rel)); + + switch (writeArea) + { + case vci_wmrv_update: + start = lengthof(info->buffer) - 1; + break; + case vci_wmrv_all: + start = 0; + break; + default: + ereport(ERROR, (errmsg("internal error. unsupported parameter."), errhint("Disable VCI by 'SELECT vci_disable();'"))); + } + + for (blockNum = start; blockNum < lengthof(info->buffer); ++blockNum) + { + LockBuffer(info->buffer[blockNum], BUFFER_LOCK_EXCLUSIVE); + MarkBufferDirty(info->buffer[blockNum]); + vci_WriteOneItemPage(info->rel, info->buffer[blockNum]); + LockBuffer(info->buffer[blockNum], BUFFER_LOCK_UNLOCK); + } +} + +/** + * @brief Release buffer for the VCI header. + * + * This function release one read lock with AccessShareLock. 
+ * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + */ +void +vci_ReleaseMainRelHeader(vci_MainRelHeaderInfo *info) +{ + int blockNum; + + Assert(NULL != info); + Assert(NULL != info->rel); + + elog(DEBUG3, "release VCI \"%s\"", + RelationGetRelationName(info->rel)); + for (blockNum = 0; blockNum < lengthof(info->buffer); ++blockNum) + { + ReleaseBuffer(info->buffer[blockNum]); + info->buffer[blockNum] = InvalidBuffer; + } + info->rel = NULL; + info->cached_tupledesc = NULL; +} + +/** + * @brief Set values in the header part of VCI main relation. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] var "virtual address" of the variable, defined in + * enum vci_MainRelVar. + * @param[in] elemId Give 0 normally. + * When the target variable has multiple of elements, say an array, + * the element ID should be placed. + * @param[in] value The value to write. + */ +void +vci_SetMainRelVar(vci_MainRelHeaderInfo *info, + vci_MainRelVar var, + int elemId, + uint32 value) +{ + Page page; + unsigned int blockNumber = vci_MRVGetBlockNumber(var); + unsigned int offset = vci_MRVGetOffset(var); + + Assert(blockNumber < lengthof(info->buffer)); + Assert(offset < BLCKSZ); + + page = BufferGetPage(info->buffer[blockNumber]); + ((uint32 *) &(((char *) page)[offset]))[elemId] = value; +} + +/** + * @brief Get values in the header part of VCI main relation. + * + * @param[in] info Pointer to the target vci_MainRelHeaderInfo. + * @param[in] var "virtual address" of the variable, defined in + * enum vci_MainRelVar. + * @param[in] elemId Give 0 normally. + * When the target variable has multiple of elements, say an array, + * the element ID should be placed. + * @return The gotten value. 
+ */
+uint32
+vci_GetMainRelVar(vci_MainRelHeaderInfo *info,
+				  vci_MainRelVar var,
+				  int elemId)
+{
+	unsigned int blockNumber = vci_MRVGetBlockNumber(var);
+	unsigned int offset = vci_MRVGetOffset(var);
+	char	   *pageBytes;
+	uint32	   *field;
+
+	Assert(blockNumber < lengthof(info->buffer));
+	Assert(offset < BLCKSZ);
+
+	pageBytes = (char *) BufferGetPage(info->buffer[blockNumber]);
+	field = (uint32 *) &(pageBytes[offset]);
+
+	return field[elemId];
+}
+
+/**
+ * @brief Get the position of column information in the VCI main relation.
+ *
+ * Column slots start at vcimrv_column_info; slots that do not fit in that
+ * page continue on the following pages, starting at VCI_MIN_PAGE_HEADER.
+ *
+ * @param[in] columnId The column ID in the VCI index.
+ * @return The position encoded as a vci_MainRelVar: block number shifted by
+ * VCI_MRV_PAGE_SHIFT plus the offset in the page (the offset includes the
+ * DB page header part).
+ */
+vci_MainRelVar
+vci_GetMColumnPosition(int16 columnId)
+{
+	const int	firstBlockNumber = vci_MRVGetBlockNumber(vcimrv_column_info);
+	const int	numInFirstPage = (BLCKSZ - vci_MRVGetOffset(vcimrv_column_info)) /
+		sizeof(vcis_m_column_t);
+	const int	numInPage = VCI_MAX_PAGE_SPACE / sizeof(vcis_m_column_t);
+	int			pagesPastFirst;
+	int			slotInPage;
+	int			blockNumber;
+
+	Assert(VCI_FIRST_NORMALCOLUMN_ID <= columnId);
+
+	/* The slot lives in the first page of the column area. */
+	if (columnId < numInFirstPage)
+		return (firstBlockNumber << VCI_MRV_PAGE_SHIFT) +
+			vci_MRVGetOffset(vcimrv_column_info) +
+			(columnId * sizeof(vcis_m_column_t));
+
+	/* Otherwise count whole pages past the first one. */
+	pagesPastFirst = (columnId - numInFirstPage) / numInPage;
+	slotInPage = (columnId - numInFirstPage) - (pagesPastFirst * numInPage);
+	blockNumber = pagesPastFirst + 1 + firstBlockNumber;
+	Assert(blockNumber < (VCI_NUM_MAIN_REL_HEADER_PAGES - 1));
+
+	return (blockNumber << VCI_MRV_PAGE_SHIFT) +
+		VCI_MIN_PAGE_HEADER +
+		(slotInPage * sizeof(vcis_m_column_t));
+}
+
+/**
+ * @brief Get the column information in the VCI main relation.
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] columnId The column ID in the VCI index.
+ * @return The pointer to the column information in the header page of
+ * VCI main relation.
+ * + * @note + * AFTER ACCESSING vcis_m_column_t, RELEASE BUFFER WITH ReleaseBuffer(buffer); + */ +vcis_m_column_t * +vci_GetMColumn(vci_MainRelHeaderInfo *info, int16 columnId) +{ + Page page; + vci_MainRelVar mrv = vci_GetMColumnPosition(columnId); + + page = BufferGetPage(info->buffer[vci_MRVGetBlockNumber(mrv)]); + + return (vcis_m_column_t *) &(((char *) page)[vci_MRVGetOffset(mrv)]); +} + +/** + * @brief Obtain the position of vcis_m_extent_t structure for + * the target extentId. + * + * vcis_m_extent_t is the information of extents in VCI main relation. + * + * @param[out] blockNumber The block number contains the information is written + * in * blockNumber. + * @param[out] offset The offset number contains the information is written + * in * offset. + * @param[in] extentId The target extent ID. + */ +void +vci_GetExtentInfoPosition(BlockNumber *blockNumber, + OffsetNumber *offset, + int32 extentId) +{ + const int maxExtentInfoInFirstPage = (BLCKSZ - + (vcimrv_extent_info & VCI_MRV_MASK_OFFSET)) / + sizeof(vcis_m_extent_t); + const int maxExtentInfoInPage = VCI_MAX_PAGE_SPACE / + sizeof(vcis_m_extent_t); + + Assert(blockNumber); + Assert(offset); + + if (extentId < maxExtentInfoInFirstPage) + { + *blockNumber = vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT; + *offset = (vcimrv_extent_info & VCI_MRV_MASK_OFFSET) + + (extentId * sizeof(vcis_m_extent_t)); + } + else + { + int32 extentIdRem = extentId - maxExtentInfoInFirstPage; + + *blockNumber = extentIdRem / maxExtentInfoInPage; + extentIdRem -= *blockNumber * maxExtentInfoInPage; + *blockNumber += 1 + (vcimrv_extent_info >> VCI_MRV_PAGE_SHIFT); + *offset = VCI_MIN_PAGE_HEADER + + (extentIdRem * sizeof(vcis_m_extent_t)); + } +} + +static void +WriteAllItemsInPage(Relation rel, + Buffer buffer, + uint16 numItems) +{ + uint16 iId; + + for (iId = 0; iId < numItems; ++iId) + vci_WriteItem(rel, buffer, iId + FirstOffsetNumber); +} + +/* + * ********************************************************* + * Relation and 
buffer control
+ * *********************************************************
+ */
+/*
+ * vci_PreparePagesWithOneItemIfNecessary()
+ *     This function checks if the relation has the DB page pointed to
+ *     by an argument. If it does not exist, the function extends
+ *     the relation and initializes the extended pages with one item per
+ *     page. Mind that this function does not touch existing pages.
+ *     If you need to format existing pages, use vci_InitPage().
+ *
+ * vci_InitPage()
+ *     Low level function.
+ *
+ *     This function formats the existing DB page, pointed to by
+ *     relation and page ID (block number), with empty items.
+ *     The number of items is also passed as an argument.
+ *
+ *     vci_PreparePagesWithOneItemIfNecessary() is more convenient.
+ *     For pages with one item, the macro vci_InitOneItemPage() is
+ *     defined.
+ *
+ * vci_WriteItem()
+ *     Mark the buffer dirty, and write out WAL from the pointed
+ *     item in the buffer.
+ *
+ * vci_WriteOnePageIfNecessaryAndGetBuffer()
+ *     A utility function.
+ *     This function takes a new page ID and an old page ID as
+ *     arguments. If they are different, it writes out the old page,
+ *     which is assumed to be loaded in the given buffer, and reads
+ *     the new page.
+ *     If the page IDs are the same, it does nothing.
+ *
+ */
+
+/**
+ * @brief This function checks if the relation has the DB page with the page ID
+ * blockNumber.
+ *
+ * When it does not exist, the function extends the relation and initializes
+ * the extended pages with numItems items per page.
+ *
+ * @param[in] rel The relation.
+ * @param[in] blockNumber The block number to be examined.
+ * @param[in] numItems The number of items the page is initialized with.
+ * @param[in] forceInit If true, the block is initialized anyway.
+ * @param[in] logItems If true, write all items in the pages into WAL.
+ */
+void
+vci_PreparePagesIfNecessaryCore(Relation rel,
+								BlockNumber blockNumber,
+								uint16 numItems,
+								bool forceInit,
+								bool logItems)
+{
+	BlockNumber existingPages = RelationGetNumberOfBlocks(rel);
+
+	Assert(0 < numItems);
+
+	if (!BlockNumberIsValid(blockNumber))
+		ereport(ERROR, (errmsg("data relation full"), errhint("Normally relations of VCI index are smaller than the table relation, therefore this error must not happen. Disable VCI by 'SELECT vci_disable();'")));
+
+	if (existingPages <= blockNumber)
+	{
+		BlockNumber pId;
+
+		/* Extend the relation one block at a time up to blockNumber. */
+		for (pId = existingPages; pId <= blockNumber; ++pId)
+		{
+			/* RBM_ZERO_AND_LOCK returns the new page already locked. */
+			Buffer		buffer = ReadBufferExtended(rel, MAIN_FORKNUM,
+													P_NEW, RBM_ZERO_AND_LOCK, NULL);
+
+			vci_InitPageCore(buffer, numItems, true);
+			if (logItems)
+				WriteAllItemsInPage(rel, buffer, numItems);
+			UnlockReleaseBuffer(buffer);
+		}
+	}
+	else
+	{
+		Buffer		buffer = ReadBuffer(rel, blockNumber);
+		Page		page = BufferGetPage(buffer);
+
+		if (PageIsNew(page) || forceInit)
+		{
+			/*
+			 * Hold one exclusive lock across formatting and WAL-logging, so
+			 * the freshly formatted page cannot be touched in between.
+			 */
+			LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+			vci_InitPageCore(buffer, numItems, true);
+			if (logItems)
+				WriteAllItemsInPage(rel, buffer, numItems);
+			LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+		}
+
+		ReleaseBuffer(buffer);
+	}
+}
+
+/**
+ * @brief This function writes a given number of items in the buffer.
+ *
+ * @param[in] buffer Postgres DB buffer to be initialized.
+ * @param[in] numItems The number of items the page is initialized with.
+ * @param[in] locked true if the buffer is locked, false otherwise.
+ */ +void +vci_InitPageCore(Buffer buffer, int16 numItems, bool locked) +{ + if (!locked) + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + { + uint32 size; + uint32 itemSize; + int32 aId; + Page page = BufferGetPage(buffer); + PageHeader pageHeader = (PageHeader) page; + + PageInit(page, BLCKSZ, 0); + pageHeader->pd_lower += sizeof(ItemIdData) * numItems; + size = pageHeader->pd_upper - pageHeader->pd_lower; + itemSize = vci_RoundDownValue(size / numItems, + VCI_DATA_ALIGNMENT_IN_STORAGE); + for (aId = numItems; aId--;) + { + HeapTupleHeader hTup; + + pageHeader->pd_upper -= itemSize; + pageHeader->pd_linp[aId].lp_off = pageHeader->pd_upper; + pageHeader->pd_linp[aId].lp_len = itemSize; + pageHeader->pd_linp[aId].lp_flags = LP_NORMAL; + hTup = (HeapTupleHeader) PageGetItem(page, &(pageHeader->pd_linp[aId])); + hTup->t_infomask2 = 0; + hTup->t_infomask = HEAP_XMIN_FROZEN | HEAP_XMAX_INVALID; + hTup->t_hoff = vci_RoundUpValue(offsetof(HeapTupleHeaderData, t_bits), + VCI_DATA_ALIGNMENT_IN_STORAGE); + } + MarkBufferDirty(buffer); + Assert(pageHeader->pd_lower <= pageHeader->pd_upper); + } + + if (!locked) + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); +} + +/** + * @brief This function get or newly create a DB buffer page, and put the + * header information that only one item is in the page, and the size of + * item is 8140 bytes, and the data type is bytea. + * + * @param[in] rel The relation. + * @param[in] blockNumber The block number to be initialized. + * @param[in] numItems The number of items the page is initialized with. + */ +/* + * dead code + * LCOV_EXCL_START + */ +void +vci_InitPage(Relation rel, BlockNumber blockNumber, int16 numItems) +{ + Buffer buffer; + + Assert(BlockNumberIsValid(blockNumber)); + buffer = ReadBuffer(rel, blockNumber); + vci_InitPageCore(buffer, numItems, false); + ReleaseBuffer(buffer); +} + +/* LCOV_EXCL_STOP */ + +/** + * @brief This function mark the buffer dirty, and make WAL from the item + * in the buffer. 
+ * + * We assume that the relation is only modified by ROS command exclusively. + * So, we do not put strict lock here. + * + * @param[in] rel The relation. + * @param[in] buffer PostgreSQL DB buffer having the page data. + * @param[in] numItems The number of items the page is initialized with. + */ +void +vci_WriteItem(Relation rel, + Buffer buffer, + OffsetNumber offsetNumber) +{ + Page page = BufferGetPage(buffer); + ItemId tup = PageGetItemId(page, offsetNumber); + HeapTupleHeader htup = (HeapTupleHeader) PageGetItem(page, tup); + + Assert(BufferIsValid(buffer)); + Assert(OffsetNumberIsValid(offsetNumber)); + + MarkBufferDirty(buffer); + + if (RelationNeedsWAL(rel)) + { + xl_heap_inplace xlrec; + XLogRecPtr recptr; + uint8 info = 0; + uint32 newlen; + + xlrec.offnum = offsetNumber; + xlrec.dbId = MyDatabaseId; + xlrec.tsId = MyDatabaseTableSpace; + xlrec.relcacheInitFileInval = false; + xlrec.nmsgs = 0; + + /* + * originally taken from heap_inplace_update() in + * src/backend/access/heap/heapam.c + */ + XLogBeginInsert(); + XLogRegisterData(&xlrec, MinSizeOfHeapInplace); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + newlen = VCI_ITEM_SPACE(PageGetMaxOffsetNumber(page)); + XLogRegisterBufData(0, (char *) htup + htup->t_hoff, newlen); + + START_CRIT_SECTION(); + recptr = XLogInsert(RM_HEAP_ID, XLOG_HEAP_INPLACE | info); + + PageSetLSN(page, recptr); + + END_CRIT_SECTION(); + } +} + +/** + * @brief This function first compares blockNumber and blockNumberOld. + * + * If they differ each other, write out the buffer in the DB page of + * blockNumberOld, and read the DB page of blockNumber. + * If the are same, do nothing. + * + * @param[in] relation The relation. + * @param[in] blockNumber New page ID. + * @param[in] blockNumberOld Old page ID. The data is in buffer. + * @param[in] buffer The buffer contains the old page. + * @return buffer contains new page, exclusively locked. 
+ */ +Buffer +vci_WriteOnePageIfNecessaryAndGetBuffer(Relation relation, + BlockNumber blockNumber, + BlockNumber blockNumberOld, + Buffer buffer) +{ + if (blockNumber == blockNumberOld) + return buffer; + if (BlockNumberIsValid(blockNumberOld)) + { + vci_WriteOneItemPage(relation, buffer); + UnlockReleaseBuffer(buffer); + } + buffer = vci_ReadBufferWithPageInit(relation, blockNumber); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + return buffer; +} + +/* + * ********************************************************* + * PostgreSQL Attributes (columns) + * ********************************************************* + */ + +/* + * ********************************************************* + * VCI "columns" + * Here, a "column" may have only one data relation, + * or a pair of meta data relation and data relation. + * It includes delete vector, null vector, TID relation, + * + * ********************************************************* + */ +/* + * vci_GetSumOfAttributeIndices() + * This function counts up all the VCI "columns" defined + * in num_vcis_attribute_type. + * + * vci_GetAttrTypeAndIndexFromSumOfIndices() + * Get vcis_attribute_type_t and index from given + * sequential index. + */ + +/** + * @brief This function counts up all the VCI "columns" defined + * in num_vcis_attribute_type. + * + * @param[in] numColumns Number of normal columns in VCI index. + * @return number of total columns, not only of indexed columns, but also + * auxiliary columns. + */ +int +vci_GetSumOfAttributeIndices(int16 numColumns) +{ + int result = 0; + int aId; + + for (aId = 0; aId < num_vcis_attribute_type; ++aId) + result += vci_GetNumIndexForAttributeType(aId, numColumns); + + return result; +} + +/** + * @brief Get Attribute type defined in vcis_attribute_type_t and + * index of the target category. + * + * @param[out] attrType The attribute type is wirtten in *attrType. + * @param[out] index The index is wirtten in *index. 
+ * If no corresponding attribute exists, *index set to -1. + * @param[in] numColumns The number of normal columns in VCI index. + * @param[in] sumOfIndex The sequential index of target column. + */ +void +vci_GetAttrTypeAndIndexFromSumOfIndices(vcis_attribute_type_t *attrType, + int *index, + int16 numColumns, + int sumOfIndex) +{ + int sum = 0; + + *index = 0; + for (*attrType = 0; *attrType < num_vcis_attribute_type; ++*attrType) + { + int inc = vci_GetNumIndexForAttributeType(*attrType, numColumns); + + if ((sum <= sumOfIndex) && (sumOfIndex < (sum + inc))) + { + *index = sumOfIndex - sum; + + return; + } + sum += inc; + } + *index = -1; +} + +/** + * @brief Calculate the bid ID of null bit vector for given column ID. + * + * @param[in] tupleDesc The tuple descriptor of VCI main relation. + * @param[in] columnId Target column ID. + * @return The bit ID in null bit vector. For not nullable columns, return -1. + */ +int16 +vci_GetBitIdInNullBits(TupleDesc tupleDesc, int16 columnId) +{ + return columnId; +} + +/** + * @brief Get the column widths in the worst case. + * + * @param attr Attribute information of the columns. + * @return The width in the worst case. + */ +int16 +vci_GetColumnWorstSize(Form_pg_attribute attr) +{ + if (0 <= attr->attlen) /* fixed length data */ + return attr->attlen; + + /* variable or long length data */ + if (0 <= attr->atttypmod) + { + int32 columnSize; + + switch (attr->atttypid) + { + /* for bit(n), varbit(n). */ + case BITOID: + case VARBITOID: + columnSize = VARBITTOTALLEN(attr->atttypmod); + break; + + /* for numeric(p,q), retrun 'p'+LL . 
*/ + case NUMERICOID: + columnSize = (attr->atttypmod >> 16) + VARHDRSZ; + break; + + case BPCHAROID: + case VARCHAROID: + if (attr->atttypmod < VARHDRSZ) + columnSize = (attr->atttypmod - VARHDRSZ) * MAX_MULTIBYTE_CHAR_LEN + VARHDRSZ; + else + columnSize = attr->atttypmod * MAX_MULTIBYTE_CHAR_LEN; + break; + + default: + { +#ifdef VCI_USE_COMPACT_VARLENA + if (attr->atttypmod < VARATT_SHORT_MAX) + columnSize = attr->atttypmod - VARHDRSZ + VARHDRSZ_SHORT; + else + columnSize = attr->atttypmod; +#else + columnSize = attr->atttypmod; +#endif + } + break; + } + + if (columnSize < MaxHeapTupleSize) + return (int16) columnSize; + } + + /* worst size -> MaxHeapTupleSize(8k) */ + /* unlimited data size */ + return MaxHeapTupleSize; + + /* + * Large data are externally toasted and the size of tuple including the + * large attribute is limited to TOAST_TUPLE_TARGET, which is BLCKSZ / 4 + * normally. But, UN-TOASTED -> MaxHeapTupleSize. + */ +} + +/** + * @brief from vci_MainRelHeaderInfo, column IDs in original heap relation + * and VCI index relation are collected. + * + * This function also collect the worst-case sizes of columns. + * attributes, just packed. + * + * @param[out] heapAttrNumList Pointer to an array of AttrNumber. + * The attribute numbers (column ID) in the heap relation are stored here. + * The AttrNumber is one-origin. + * The length of array must be larger than numColumns. + * + * @param[out] indxColumnIdList Pointer to an array of int16. + * The column IDs in the VCI main relation are stored here. + * This is zero-origin. + * The length of array must be larger than numColumns. + * + * @param[out] columnSizeList Pointer to an array of int16. + * The worst-case widths are stored here. + * The length of array must be larger than numColumns. + * + * @param[in] numColumn Number of columns defined in VCI index. + * @param[in] info VCI main relation header information. + * @param[in] heapOid OID of original PostgreSQL tables. 
+ * @return sum of columnSizeList.
+ */
+Size
+vci_GetColumnIdsAndSizes(AttrNumber *heapAttrNumList,
+						 int16 *indxColumnIdList,
+						 int16 *columnSizeList,
+						 int numColumn,
+						 vci_MainRelHeaderInfo *info,
+						 Oid heapOid)
+{
+	LOCKMODE	lockmode = AccessShareLock;
+	/* The heap table is taken from the index itself; heapOid is not used. */
+	Oid			tableOid = info->rel->rd_index->indrelid;
+	Relation	tableRel;
+	TupleDesc	tupleDesc;
+	Size		result = 0;
+	int			colId;
+
+	tableRel = table_open(tableOid, lockmode);
+	tupleDesc = RelationGetDescr(tableRel);
+
+	for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < numColumn; ++colId)
+	{
+		Form_pg_attribute attr;
+		vcis_m_column_t *mColumn = vci_GetMColumn(info, colId);
+		Buffer		buffer;
+		Relation	rel = table_open(mColumn->meta_oid, lockmode);
+		vcis_column_meta_t *metaHeader = vci_GetColumnMeta(&buffer, rel);
+		int16		attnum = metaHeader->pgsql_attnum;
+
+		/* attnum has been copied out, so the meta page can be let go. */
+		ReleaseBuffer(buffer);
+		table_close(rel, lockmode);
+
+		/*
+		 * Validate the stored attribute number before it is used as an
+		 * index into the tuple descriptor; it must lie in 1..natts.
+		 */
+		if (attnum <= 0 || attnum > tupleDesc->natts)
+			elog(ERROR, "column not found.");	/* FIXME */
+
+		heapAttrNumList[colId] = attnum;
+		attr = TupleDescAttr(tupleDesc, attnum - 1);
+
+		/*
+		 * Previously, "attr->attnum - 1" was used for the right value instead
+		 * of the simple sequential number, colId (The attr is extracted from
+		 * indexRel). This was for future expanding to enable to add columns
+		 * to or delete ones from VCI after creating. But this is not
+		 * implemented. And then, the attr is no longer reliable because real
+		 * columns information is stored in the vci_column_ids option not in
+		 * indexRel when using vci_create().
+		 */
+		indxColumnIdList[colId] = colId;
+
+		result += columnSizeList[colId] = vci_GetColumnWorstSize(attr);
+	}
+
+	table_close(tableRel, lockmode);
+
+	return result;
+}
+
+/**
+ * @brief Count number of nullable columns in a tuple descriptor.
+ *
+ * @param[in] tupleDesc tuple descriptor
+ * @return Number of nullable columns in the relation.
+ */ +int +vci_GetNumberOfNullableColumn(TupleDesc tupleDesc) +{ + int result = 0; + int aId; + + for (aId = 0; aId < tupleDesc->natts; ++aId) + { + Assert(!((TupleDescAttr(tupleDesc, aId)->attnotnull))); + ++result; + } + + return result; +} + +/** + * @brief Sarch for free extent and return the extent ID. + * + * This function reads extent information in the ROS main relation and checks + * if the extent has its xgen and xdel are both InvalidTransactionId. + * The check is done in vci_isFreeExtent(). + */ +static uint32 +SearchFreeExtent(vci_MainRelHeaderInfo *info) +{ + int32 numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0); + int32 extentId = numExtents; + BlockNumber blockNumber; + OffsetNumber offset; + Buffer buffer = InvalidBuffer; + Page pageHeader = NULL; + + /* search deleted extent first */ + + vcis_m_extent_t *extentInfo; + vci_meta_item_scanner_t *scan = + vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_SHARE); + + while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL) + { + if (vci_ExtentIsFree(extentInfo)) + { + extentId = scan->index; + break; + } + } + vci_EndMetaItemScan(scan); + + /* if no deleted extent, create a new extent */ + if (extentId == numExtents) + { + while (true) + { + vcis_m_extent_t *extentInfo_new; + bool extentIsFree; + + vci_GetExtentInfoPosition(&blockNumber, &offset, extentId); + vci_PreparePagesWithOneItemIfNecessary(info->rel, blockNumber); + buffer = ReadBuffer(info->rel, blockNumber); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + + pageHeader = BufferGetPage(buffer); + extentInfo_new = (vcis_m_extent_t *) &(((char *) pageHeader)[offset]); + Assert(extentInfo_new->xgen == InvalidTransactionId); + Assert((extentInfo_new->xdel == InvalidTransactionId) || (extentInfo_new->xdel == FrozenTransactionId)); + extentIsFree = vci_ExtentIsFree(extentInfo_new); + + UnlockReleaseBuffer(buffer); + + if (extentIsFree) + break; + else + ++extentId; + } + } + + return extentId; +} + +/** + * @brief Get free extent Id. 
+ *
+ * This function first check the pointer in main relation to one free extent.
+ * It it is not free extent, then scan the main relation to find free one.
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @return ID of a free extent.
+ */
+uint32
+vci_GetFreeExtentId(vci_MainRelHeaderInfo *info)
+{
+	Buffer		buffer;
+	int32		extentId;
+	vcis_m_extent_t *extentInfo;
+	bool		isFreeExtent;
+
+	/* first, check the pointed extent (the code always starts at extent ID 0) */
+	extentId = 0;
+	{
+		extentInfo = vci_GetMExtent(&buffer, info, extentId);
+
+		LockBuffer(buffer, BUFFER_LOCK_SHARE);
+		isFreeExtent = vci_ExtentIsFree(extentInfo);
+		UnlockReleaseBuffer(buffer);
+
+		if (isFreeExtent)
+			return extentId;
+	}
+
+	/* otherwise scan the VCI main relation to find a free extent */
+	extentId = SearchFreeExtent(info);
+	extentInfo = vci_GetMExtent(&buffer, info, extentId);
+
+	LockBuffer(buffer, BUFFER_LOCK_SHARE);
+	Assert(vci_ExtentIsFree(extentInfo));	/* re-check the entry SearchFreeExtent() selected */
+	UnlockReleaseBuffer(buffer);
+
+	return extentId;
+}
+
+/*
+ * *************
+ * ** CAUTION **
+ * *************
+ * USE vci_WriteExtentInfoInMainRosForWosRosConvInit() IN SOME TRANSACTION.
+ * GetCurrentTransactionId() IS USED.
+ *
+ * NOTE(review): vci_WriteExtentInfoInMainRosForWosRosConvInit() is not
+ * defined in this part of the file; presumably it is a wrapper around the
+ * function below -- confirm.
+ */
+
+/**
+ * @brief The function to call before starting WOS -> ROS conversion to write
+ * recovery information.
+ *
+ * This function records, in the header area of the ROS main relation, the
+ * number of extents (raised to extentId + 1 when extentId is not covered
+ * yet), the new current ROS version (xid) and the ROS command, and then
+ * writes the header with vci_wmrv_update.  When the number of extents is
+ * raised, the page holding the extent info of extentId is prepared first.
+ *
+ * NOTE(review): the earlier description also said that InvalidTransactionId
+ * is written to the target extent info; the function body does not touch the
+ * extent entry itself -- confirm where that happens.
+ *
+ * @param[in] info pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] extentId target extent ID.
+ * @param[in] dictionaryId target common dictionary ID (not referenced in the
+ * function body).
+ * @param[in] xid transaction ID of this write operation.
+ * @param[in] command command of this operation.
+ */
+void
+vci_WriteExtentInfoInMainRosForWriteExtentOrCommonDict(
+													   vci_MainRelHeaderInfo *info,
+													   int32 extentId,
+													   int32 dictionaryId,
+													   TransactionId xid,
+													   vci_ros_command_t command)
+{
+	int32		numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+
+	Assert(0 <= numExtents);
+	Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId);
+	if (numExtents <= extentId)
+	{
+		BlockNumber blockNumber;
+		OffsetNumber offset;
+
+		numExtents = extentId + 1;	/* grow the recorded count so that it covers extentId */
+		vci_GetExtentInfoPosition(&blockNumber, &offset, extentId);
+		vci_PreparePagesWithOneItemIfNecessary(info->rel, blockNumber);
+	}
+	vci_SetMainRelVar(info, vcimrv_num_extents, 0, numExtents);
+	vci_SetMainRelVar(info, vcimrv_current_ros_version, 0, xid);
+	vci_SetMainRelVar(info, vcimrv_ros_command, 0, command);
+	vci_WriteMainRelVar(info, vci_wmrv_update);
+}
+/*
+ * Locate the extent-info entry of extentId in the main relation.
+ *
+ * *buffer receives the buffer read for the page that contains the entry and
+ * the returned pointer points into that page.  This function does not call
+ * LockBuffer(); the callers in this file lock the buffer themselves and
+ * release it when done.
+ */
+vcis_m_extent_t *
+vci_GetMExtent(Buffer *buffer, vci_MainRelHeaderInfo *info, int32 extentId)
+{
+	BlockNumber blockNumber;
+	OffsetNumber offset;
+	Page		page;
+
+	Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId);
+	vci_GetExtentInfoPosition(&blockNumber, &offset, extentId);
+
+	/*
+	 * info->num_extents_allocated is normally -1. When vci_rc_query ==
+	 * info->command, it has the expected number of extents calcuated from
+	 * number of blocks in VCI main relation.
+	 */
+	if (info->num_extents_allocated <= extentId)
+		vci_PreparePagesWithOneItemIfNecessary(info->rel, blockNumber);
+
+	*buffer = vci_ReadBufferWithPageInit(info->rel, blockNumber);
+	page = BufferGetPage(*buffer);
+
+	return (vcis_m_extent_t *) &(((char *) page)[offset]);
+}
+
+/**
+ * @brief Advance a meta-item scan and return the next extent-info entry.
+ *
+ * On the first call the scan is initialised: the number of extents is read
+ * from the main relation header and the first extent-info page is read and
+ * locked with scan->buf_lockmode.  Every call then increments scan->index
+ * and, when the entry lives on another block, switches pages: in exclusive
+ * mode the page being left is written with vci_WriteOneItemPage() before it
+ * is released.
+ *
+ * The page of the returned entry stays pinned and locked in scan->buffer
+ * until the next call or vci_EndMetaItemScan().
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in,out] scan Scanner created by vci_BeginMetaItemScan().
+ * @return Pointer to the entry at scan->index inside the locked page, or NULL
+ * when scan->index has reached the number of extents or the page to read is
+ * still new (PageIsNew).  In the latter case the buffer has already been
+ * released and scan->buffer is InvalidBuffer.
+ */
+vcis_m_extent_t *
+vci_GetMExtentNext(vci_MainRelHeaderInfo *info, vci_meta_item_scanner_t *scan)
+{
+	OffsetNumber offset;
+	BlockNumber block;
+
+	if (!scan->inited)
+	{
+		Page		page;
+
+		scan->max_item = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+		vci_GetExtentInfoPosition(&scan->start_block, &offset, 0);
+		vci_GetExtentInfoPosition(&scan->end_block, &offset, scan->max_item);
+		scan->item_size = sizeof(vcis_m_extent_t);
+		scan->current_block = scan->start_block;
+
+		scan->buffer = ReadBuffer(scan->rel, scan->current_block);
+		LockBuffer(scan->buffer, scan->buf_lockmode);
+
+		page = BufferGetPage(scan->buffer);
+		if (PageIsNew(page))
+		{
+			UnlockReleaseBuffer(scan->buffer);
+
+			/*
+			 * Forget the buffer we have just released.  Otherwise
+			 * vci_EndMetaItemScan() would see a valid-looking buffer number
+			 * and unlock/release (and, in exclusive mode, write) it a second
+			 * time.
+			 */
+			scan->buffer = InvalidBuffer;
+			scan->current_block = InvalidBlockNumber;
+			return NULL;
+		}
+
+		Assert(scan->index == -1);
+		Assert(scan->max_item >= 0);
+
+		scan->inited = true;
+	}
+
+	scan->index++;
+
+	if (scan->index >= scan->max_item)
+		return NULL;
+
+	vci_GetExtentInfoPosition(&block, &offset, scan->index);
+
+	if (scan->current_block != block)
+	{
+		Page		page;
+
+		/*
+		 * Leave the current page.  The buffer can only be invalid here if a
+		 * previous call already released it because of a new page.
+		 */
+		if (BufferIsValid(scan->buffer))
+		{
+			if (scan->buf_lockmode == BUFFER_LOCK_EXCLUSIVE)
+				vci_WriteOneItemPage(scan->rel, scan->buffer);
+
+			UnlockReleaseBuffer(scan->buffer);
+			scan->buffer = InvalidBuffer;
+		}
+
+		scan->buffer = ReadBuffer(scan->rel, block);
+		scan->current_block = block;
+
+		LockBuffer(scan->buffer, scan->buf_lockmode);
+
+		page = BufferGetPage(scan->buffer);
+		if (PageIsNew(page))
+		{
+			UnlockReleaseBuffer(scan->buffer);
+
+			/* same as above: never leave a released buffer in the scanner */
+			scan->buffer = InvalidBuffer;
+			scan->current_block = InvalidBlockNumber;
+			return NULL;
+		}
+	}
+
+	return (vcis_m_extent_t *) &(((char *) BufferGetPage(scan->buffer))[offset]);
+}
+
+vci_meta_item_scanner_t *
+vci_BeginMetaItemScan(Relation rel, int buf_lock)
+{
+	vci_meta_item_scanner_t *scan = palloc0(sizeof(vci_meta_item_scanner_t));
+
+	Assert((buf_lock == BUFFER_LOCK_SHARE) ||
(buf_lock == BUFFER_LOCK_EXCLUSIVE));
+
+	scan->inited = false;		/* the first vci_GetMExtentNext() call completes the setup */
+
+	scan->rel = rel;
+	scan->index = -1;			/* incremented before use, so the first entry is index 0 */
+
+	scan->end_block = InvalidBlockNumber;
+	scan->start_block = InvalidBlockNumber;
+	scan->buffer = InvalidBuffer;
+	scan->current_block = InvalidBlockNumber;
+	scan->max_item = 0;
+	scan->max_item_in_page = 0;
+	scan->item_size = 0;
+	scan->buf_lockmode = buf_lock;
+
+	return scan;
+}
+/*
+ * Finish a scan started with vci_BeginMetaItemScan().
+ *
+ * If the scanner still holds a buffer, the page is written with
+ * vci_WriteOneItemPage() when the scan uses BUFFER_LOCK_EXCLUSIVE, and the
+ * buffer is unlocked and released.  The scanner itself is freed.
+ */
+void
+vci_EndMetaItemScan(vci_meta_item_scanner_t *scan)
+{
+	Assert(scan);
+
+	if (BufferIsValid(scan->buffer))
+	{
+		if (scan->buf_lockmode == BUFFER_LOCK_EXCLUSIVE)
+			vci_WriteOneItemPage(scan->rel, scan->buffer);
+
+		UnlockReleaseBuffer(scan->buffer);
+	}
+
+	pfree(scan);
+}
+/*
+ * Overwrite the extent-info entry of extentId with the given values.
+ *
+ * The entry's page is locked exclusively while num_rows, num_deleted_rows,
+ * num_deleted_rows_old, xgen and xdel are stored and flags is reset to 0;
+ * the page is then written with vci_WriteOneItemPage() and released.
+ */
+void
+vci_WriteExtentInfo(vci_MainRelHeaderInfo *info,
+					int32 extentId,
+					uint32 numRows,
+					uint32 numDeletedRows,
+					uint32 numDeletedRowsOld,
+					TransactionId xgen,
+					TransactionId xdel)
+{
+	Buffer		buffer;
+	vcis_m_extent_t *extentInfo = vci_GetMExtent(&buffer, info, extentId);
+
+	Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId);
+
+	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+	extentInfo->num_rows = numRows;
+	extentInfo->num_deleted_rows = numDeletedRows;
+	extentInfo->num_deleted_rows_old = numDeletedRowsOld;
+	extentInfo->xgen = xgen;
+	extentInfo->xdel = xdel;
+	extentInfo->flags = 0;		/* all flag bits are cleared on every write */
+	vci_WriteOneItemPage(info->rel, buffer);
+	UnlockReleaseBuffer(buffer);
+}
+
+/**
+ * @brief This function checks if the extentID is 0 <= extentID and
+ * extentID < numExtents written in header part of main relation.
+ *
+ * If it passes, check the existence of the DB page where the extent ID
+ * information is written.
+ * It might happen that the page has vanished in some trouble...?
+ * In recovery process, the record of the number of extents should be
+ * corrected. If so, elog is better...
+ *
+ * @param[in] info Pointer to the target vci_MainRelHeaderInfo.
+ * @param[in] extentId The target extent ID.
+ * @retval true The DB page is allocated for the information with given
+ * extent ID.
+ * @retval false Need to allocate new DB page for the information.
+ */
+bool
+vci_ExtentInfoExists(vci_MainRelHeaderInfo *info, int32 extentId)
+{
+	BlockNumber blockNumber;
+	OffsetNumber offset;
+	int32		numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+
+	Assert(0 <= numExtents);
+	if (numExtents <= extentId)	/* only the upper bound is tested; there is no check for extentId < 0 */
+		return false;
+
+	if (0 <= info->num_extents_allocated)	/* a non-negative cached count answers without looking at the relation size */
+		return extentId < info->num_extents_allocated;
+
+	vci_GetExtentInfoPosition(&blockNumber, &offset, extentId);
+
+	return blockNumber < RelationGetNumberOfBlocks(info->rel);	/* the entry's block must already exist in the relation */
+}
+/*
+ * Decide whether an object created by objectXidMin and deleted by
+ * objectXidMax is visible to readerXid.
+ *
+ * Returns true only when the creation is visible (first test) and the
+ * deletion is not yet visible (second test).
+ */
+static bool
+VisibilityCheck(TransactionId objectXidMin,
+				TransactionId objectXidMax,
+				TransactionId readerXid)
+{
+	/*
+	 * visibility from generation: the creating xid must be valid and either
+	 * frozen or not later than the reader
+	 */
+	bool		result = TransactionIdIsValid(objectXidMin) &&
+		(TransactionIdEquals(objectXidMin, FrozenTransactionId) ||
+	/* objectXidMin <= readerXid */
+		 TransactionIdPrecedesOrEquals(objectXidMin, readerXid));
+
+	if (!result)
+		return false;
+
+	/*
+	 * visibility from deletion: visible while there is no deleting xid, or
+	 * while the deleting xid is a normal xid that the reader precedes.  Any
+	 * other valid xdel (e.g. FrozenTransactionId) makes the object invisible.
+	 */
+	return (!TransactionIdIsValid(objectXidMax)) ||
+		(TransactionIdIsNormal(objectXidMax) &&
+		 NormalTransactionIdPrecedes(readerXid, objectXidMax));
+}
+
+/**
+ * @brief Test if the extent is visible.
+ *
+ * @param[in] mExtent Pointer to the extent information.
+ * @param[in] xid The transaction ID to access the information.
+ * @retval true Visible.
+ * @retval false Invisible.
+ */
+bool
+vci_ExtentIsVisible(vcis_m_extent_t *mExtent, TransactionId xid)
+{
+	return VisibilityCheck(mExtent->xgen, mExtent->xdel, xid);	/* xgen is the creating xid, xdel the deleting xid */
+}
+/*
+ * Test if the extent can be collected: its xdel must be valid and either
+ * FrozenTransactionId or older than wos2rosXid.
+ */
+bool
+vci_ExtentIsCollectable(vcis_m_extent_t *mExtent, TransactionId wos2rosXid)
+{
+	bool		result = false;
+
+	if (TransactionIdIsValid(mExtent->xdel))
+	{
+		result = TransactionIdEquals(mExtent->xdel, FrozenTransactionId) ||
+		/* mExtent->xdel < wos2rosXid */
+			TransactionIdPrecedes(mExtent->xdel, wos2rosXid);
+	}
+
+	return result;
+}
+/*
+ * Test if the extent entry is free: neither xdel nor xgen is a valid
+ * transaction ID.
+ */
+bool
+vci_ExtentIsFree(vcis_m_extent_t *extentInfo)
+{
+	return !TransactionIdIsValid(extentInfo->xdel) && !TransactionIdIsValid(extentInfo->xgen);
+}
+
+/* -------------------------------------------------- */
+/* Recovery functions around the VCI main relation    */
+/* -------------------------------------------------- */
+/*
+ * Copy the "current" header values (ROS version, size_mr, tid_crid_diff_sel)
+ * into their "last"/"old" counterparts and write the header.
+ */
+void
+vci_UpdateLastRosVersionAndOthers(vci_MainRelHeaderInfo *info)
+{
+	uint32		val;
+
+	val = vci_GetMainRelVar(info, vcimrv_current_ros_version, 0);
+	vci_SetMainRelVar(info, vcimrv_last_ros_version, 0, val);
+	val = vci_GetMainRelVar(info, vcimrv_size_mr, 0);
+	vci_SetMainRelVar(info, vcimrv_size_mr_old, 0, val);
+	val = vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0);
+	vci_SetMainRelVar(info, vcimrv_tid_crid_diff_sel_old, 0, val);
+
+	vci_WriteMainRelVar(info, vci_wmrv_update);
+}
+/*
+ * Inverse of vci_UpdateLastRosVersionAndOthers(): copy the "last"/"old"
+ * header values back into the current ones and write the header.
+ */
+void
+vci_RecoveryDone(vci_MainRelHeaderInfo *info)
+{
+	uint32		val;
+
+	val = vci_GetMainRelVar(info, vcimrv_last_ros_version, 0);
+	vci_SetMainRelVar(info, vcimrv_current_ros_version, 0, val);
+
+	val = vci_GetMainRelVar(info, vcimrv_size_mr_old, 0);
+	vci_SetMainRelVar(info, vcimrv_size_mr, 0, val);
+
+	val = vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel_old, 0);
+	vci_SetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0, val);
+
+	vci_WriteMainRelVar(info, vci_wmrv_update);
+}
+/*
+ * Record xid as the current ROS version and command as the ROS command in
+ * the main relation header, then write the header.
+ */
+void
+vci_WriteRecoveryRecordDone(vci_MainRelHeaderInfo *info, vci_ros_command_t command,
+							TransactionId xid)
+{
+	vci_SetMainRelVar(info, vcimrv_current_ros_version, 0, xid);
+	
vci_SetMainRelVar(info, vcimrv_ros_command, 0, command);
+	vci_WriteMainRelVar(info, vci_wmrv_update);
+}
+/*
+ * Store the old/new extent IDs of the running ROS command in the main
+ * relation header variables.  Only vci_SetMainRelVar() is called here; this
+ * function does not call vci_WriteMainRelVar() itself.
+ */
+void
+vci_WriteRecoveryRecordForExtentInfo(vci_MainRelHeaderInfo *info, int32 newExtentId, int32 oldExtentId)
+{
+	/*
+	 * Values recorded per command:
+	 *
+	 * ConvertWos2Ros:      oldExtentId = VCI_INVALID_EXTENT_ID,
+	 *                      newExtentId = New Extent
+	 *
+	 * CollectDeletedRows:  oldExtentId = Src Extent (-> Unused Extent),
+	 *                      newExtentId = New Extent
+	 *
+	 * CollectUnusedExtent: oldExtentId = Unused Extent,
+	 *                      newExtentId = VCI_INVALID_EXTENT_ID
+	 */
+	vci_SetMainRelVar(info, vcimrv_old_extent_id, 0, oldExtentId);
+	vci_SetMainRelVar(info, vcimrv_new_extent_id, 0, newExtentId);
+}
+/*
+ * Undo the extent-info changes of an unfinished ROS command, using the
+ * old/new extent IDs and the working column ID stored in the main relation
+ * header.
+ */
+void
+vci_RecoveryExtentInfo(vci_MainRelHeaderInfo *info, vci_ros_command_t command)
+{
+	int32		numExtents;
+	int32		oldExtentId;
+	int32		newExtentId;
+	Buffer		s_buffer = InvalidBuffer;
+	Buffer		d_buffer = InvalidBuffer;
+	vcis_m_extent_t *extentInfo;
+	int16		colId;
+
+	numExtents = vci_GetMainRelVar(info, vcimrv_num_extents, 0);
+	oldExtentId = vci_GetMainRelVar(info, vcimrv_old_extent_id, 0);
+	newExtentId = vci_GetMainRelVar(info, vcimrv_new_extent_id, 0);
+	colId = vci_GetMainRelVar(info, vcimrv_working_column_id, 0);
+
+	if (oldExtentId != VCI_INVALID_EXTENT_ID)
+	{
+		TransactionId recovery_xdel;
+
+		switch (command)
+		{
+			case vci_rc_collect_deleted:
+				Assert(oldExtentId < numExtents);
+				recovery_xdel = InvalidTransactionId;	/* clear xdel of the source extent */
+				break;
+			case vci_rc_collect_extent:
+				/* the unused extent gets xdel = FrozenTransactionId (2) */
+				recovery_xdel = FrozenTransactionId;
+				break;
+			default:
+				Assert(0);		/* no other command is expected to have recorded an old extent */
+				recovery_xdel = InvalidTransactionId;
+				break;
+		}
+
+		extentInfo = vci_GetMExtent(&s_buffer, info, oldExtentId);	/* from */
+
+		LockBuffer(s_buffer, BUFFER_LOCK_EXCLUSIVE);
+		extentInfo->xdel = recovery_xdel;
+		vci_WriteOneItemPage(info->rel, s_buffer);
+		UnlockReleaseBuffer(s_buffer);
+	}
+
+	if ((newExtentId != VCI_INVALID_EXTENT_ID) && (newExtentId < numExtents))
+	{
+		Assert((command == vci_rc_wos_ros_conv) || (command == vci_rc_collect_deleted));
+		
extentInfo = vci_GetMExtent(&d_buffer, info, newExtentId);	/* to */
+
+		LockBuffer(d_buffer, BUFFER_LOCK_EXCLUSIVE);
+		extentInfo->xgen = InvalidTransactionId;
+		Assert((extentInfo->xdel == InvalidTransactionId) || (extentInfo->xdel == FrozenTransactionId));
+		extentInfo->xdel = FrozenTransactionId;	/* xgen invalid + xdel frozen: not "free", but collectable per vci_ExtentIsCollectable() */
+		extentInfo->flags |= VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID;
+		extentInfo->recovered_colid = colId;	/* working column ID read from the header */
+		vci_WriteOneItemPage(info->rel, d_buffer);
+		UnlockReleaseBuffer(d_buffer);
+	}
+}
+/*
+ * Save num_deleted_rows of every extent entry into num_deleted_rows_old.
+ * The scan uses BUFFER_LOCK_EXCLUSIVE, so the scanner writes each page it
+ * leaves.
+ */
+void
+vci_WriteRecoveryRecordForUpdateDelVec(vci_MainRelHeaderInfo *info)
+{
+	vcis_m_extent_t *extentInfo;
+	vci_meta_item_scanner_t *scan;
+
+	scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_EXCLUSIVE);
+	while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL)
+	{
+		extentInfo->num_deleted_rows_old = extentInfo->num_deleted_rows;
+	}
+	vci_EndMetaItemScan(scan);
+}
+/*
+ * Inverse of vci_WriteRecoveryRecordForUpdateDelVec(): restore
+ * num_deleted_rows of every extent entry from num_deleted_rows_old.
+ */
+void
+vci_RecoveryUpdateDelVec(vci_MainRelHeaderInfo *info)
+{
+	vcis_m_extent_t *extentInfo;
+	vci_meta_item_scanner_t *scan;
+
+	scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_EXCLUSIVE);
+	while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL)
+	{
+		extentInfo->num_deleted_rows = extentInfo->num_deleted_rows_old;
+	}
+	vci_EndMetaItemScan(scan);
+}
+/*
+ * Return a constant, human-readable name for a ROS command; "unknown" for
+ * any value without its own case.
+ */
+const char *
+vci_GetRosCommandName(vci_ros_command_t command)
+{
+	switch (command)
+	{
+		case vci_rc_invalid:
+			return "invalid";
+
+		case vci_rc_vacuum:
+			return "vacuum";
+
+		case vci_rc_query:
+			return "query";
+
+		case vci_rc_drop_index:
+			return "drop index";
+
+		case vci_rc_wos_delete:
+			return "wos delete";
+
+		case vci_rc_wos_insert:
+			return "wos insert";
+
+		case vci_rc_recovery:
+			return "recovery";
+
+		case vci_rc_probe:
+			return "probe";
+
+		case vci_rc_wos_ros_conv_build:
+			return "wos ros conv build";
+
+		case vci_rc_generate_local_ros:
+			return "generate local ros";
+
+		case vci_rc_copy_command:
+			return "copy command";
+
+		case vci_rc_wos_ros_conv:
+			return "wos2ros conversion";
+
+		case vci_rc_update_del_vec:
+			return "update delete vector";
+
+		case vci_rc_collect_deleted:
+			return "collect deleted rows";
+
+		case vci_rc_collect_extent:
+			return "collect extent";
+
+		case vci_rc_update_tid_crid:
+			return "update tid-crid tree";
+
+		default:
+			return "unknown";
+	}
+}
+/*
+ * Read a block and initialise its page with vci_InitPageCore() if the page
+ * is still new.  The buffer is returned pinned and unlocked.
+ */
+static Buffer
+ReadBufferWithPageInitCore(Relation reln, BlockNumber blockNumber, int16 numItem)
+{
+	Buffer		buffer;
+	Page		page;
+
+	Assert((reln->rd_rel->relkind == 'i') || (reln->rd_rel->relkind == 'm'));	/* NOTE(review): presumably index or materialized view relkind -- confirm */
+	buffer = ReadBuffer(reln, blockNumber);
+
+	page = BufferGetPage(buffer);
+	if (PageIsNew(page))		/* first test is made without holding the buffer lock */
+	{
+		LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+		if (PageIsNew(page))	/* tested again now that the exclusive lock is held */
+			vci_InitPageCore(buffer, numItem, true);
+		LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+	}
+
+	return buffer;
+}
+
+/**
+ * @brief Read a buffer containing the requested block of the requested VCI
+ * relation.
+ *
+ * Same as ReadBuffer(), but initializes the page when it is new.
+ *
+ * We must generally use this function instead of ReadBuffer() to access any
+ * kind of VCI relation except Data WOS, Whiteout WOS, and delete vector. But
+ * ReadBuffer() does not need to be replaced when it is called immediately
+ * after vci_PreparePagesIfNecessaryCore().
+ *
+ * @param[in] reln The relation.
+ * @param[in] blockNumber The block number to be read.
+ * @return The buffer, pinned and not locked.
+ */
+Buffer
+vci_ReadBufferWithPageInit(Relation reln, BlockNumber blockNumber)
+{
+	return ReadBufferWithPageInitCore(reln, blockNumber, 1);	/* new pages are initialised with numItem = 1 */
+}
+
+/**
+ * @brief Read a buffer containing the requested block of the requested delete
+ * vector.
+ *
+ * Same as ReadBuffer(), but initialize new page.
+ *
+ * We must generally use this function instead of ReadBuffer(), to access a
+ * delete vector. But we don't need to replace ReadBuffer() immediately after
+ * vci_PreparePagesIfNecessaryCore().
+ *
+ * @param[in] reln The relation.
+ * @param[in] blockNumber The block number to be read.
+ */
+Buffer
+vci_ReadBufferWithPageInitDelVec(Relation reln, BlockNumber blockNumber)
+{
+	return ReadBufferWithPageInitCore(reln, blockNumber, VCI_ITEMS_IN_PAGE_FOR_DELETE);	/* new pages are initialised with numItem = VCI_ITEMS_IN_PAGE_FOR_DELETE */
+}
diff --git a/contrib/vci/storage/vci_ros_command.c b/contrib/vci/storage/vci_ros_command.c
new file mode 100644
index 000000000000..fc706b8b1e57
--- /dev/null
+++ b/contrib/vci/storage/vci_ros_command.c
@@ -0,0 +1,4165 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_ros_command.c
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/vci/storage/vci_ros_command.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include						/* NOTE(review): header name missing in this copy of the patch -- TODO restore */
+#include						/* NOTE(review): header name missing in this copy of the patch -- TODO restore */
+
+#ifndef WIN32
+#include						/* NOTE(review): header name missing in this copy of the patch -- TODO restore */
+#endif
+
+#include "access/heapam.h"
+#include "access/heapam_xlog.h"
+#include "access/relscan.h"
+#include "access/tupdesc.h"
+#include "access/genam.h"
+#include "access/visibilitymap.h"	/* for visibilitymap_set() */
+#include "access/xact.h"
+#include "access/xloginsert.h"
+#include "access/tableam.h"
+#include "catalog/index.h"
+#include "catalog/pg_operator.h"	/* for TIDLessOperator */
+#include "catalog/storage.h"
+#include "commands/vacuum.h"
+#include "storage/freespace.h"
+#include "storage/itemptr.h"
+#include "storage/lmgr.h"
+#include "storage/procarray.h"
+#include "storage/smgr.h"		/* for RelationSetTargetBlock() */
+#include "utils/rel.h"
+#include "utils/syscache.h"
+#include "utils/tuplesort.h"
+
+#include "postgresql_copy.h"
+
+#include "vci.h"
+#include "vci_chunk.h"
+
+#include "vci_columns.h"
+#include "vci_columns_data.h"
+
+#include "vci_fetch.h"
+#include "vci_freelist.h"
+#include "vci_mem.h"
+#include "vci_ros.h"
+#include "vci_ros_command.h"
+#include "vci_tidcrid.h"
+#include "vci_wos.h"
+#include "vci_xact.h"
+
+extern bool HeapTupleSatisfiesWos2Ros(HeapTuple htup, Snapshot snapshot, Buffer buffer);
+extern bool HeapTupleSatisfiesLocalRos(HeapTuple
htup, Snapshot snapshot, Buffer buffer); +bool VCITupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer); + +typedef enum +{ + CEK_CountDeletedRows, + CEK_CountUnusedExtents, +} CEKind; + +typedef enum +{ + WOS_Data, + WOS_Whiteout, +} WosKind; + +typedef struct +{ + ItemPointerData orig_tid; + + ItemPointerData wos_tid; + + bool movable; + + int64 xid64; + +} vci_tid_tid_xid64_t; + +static bool WaitTransactionEndOfLastRosCommand(vci_MainRelHeaderInfo *info); +static void fillTidListFromTidSortState(vci_RosCommandContext *comContext, int numRows); +static int ConvertWos2Ros(vci_RosCommandContext *comContext); +static void FillValuesColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage); +static void FillIsNullColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage); +static void FillIsNullRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage); +static void FillValuesRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, RosChunkStorage *rosChunkStorage); +static void AppendDataToLocalRos(vci_local_ros_t *localRos, RosChunkStorage *storage, vci_MainRelHeaderInfo *info); +static Size ConvertWos2LocalRos(vci_RosCommandContext *comContext); +static void FillOneRosChunkBuffer(vci_RosCommandContext *comContext, int rowId, int numRowsToConvert); +static void ReadOneExtentAndStoreInChunkStorage(vci_RosCommandContext *comContext); +static Size ConvertWhiteOut2LocalDeleteList(vci_RosCommandContext *comContext, int sel); +static bool NeedMainRelHeaderUpdate(vci_ros_command_t command); +static int CmpUint64(const void *pa, const void *pb); +static void FlushTidCridPairListToTreeForBuild(vci_TidCridRelations *relPair, vcis_tidcrid_pair_list_t *appList, BlockNumber blockNumber); +static void UpdateTidCridForBuild(vci_RosCommandContext *comContext); +static void vci_build_callback(Relation rel, ItemPointer tid, Datum *values, bool *isnull, bool 
tupleIsAlive, void *state); +static void FinalizeBuild(vci_RosCommandContext *comContext); +static double GetEstimatedNumRows(Oid relid); +static void RemoveWosEntries(vci_RosCommandContext *comContext, WosKind wos_kind); +static uint64 cleanUpWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType); +static uint64 UpdateDelVec(vci_RosCommandContext *comContext, Size workareaSize, uint64 numRowsAtOnce); +static void writeNumDeleteRowsIntoExntetInfo(vci_MainRelHeaderInfo *info, int32 topExtentId, uint32 numExtents, uint32 *numDeletedRows); +static vci_target_extent_info_t CountExtents(Relation mainRel, uint32 threshold, CEKind kind); +static HeapTuple getTupleFromVector(int offset, TupleDesc tupleDesc, vci_virtual_tuples_t *vecSet); +static void FillOneRosChunkBufferFromExtent(vci_RosCommandContext *comContext, int32 extentId, uint32 *rowIdInExtent); +static bool isCdrTargetExtentValid(vci_RosCommandContext *comContext); +static int32 CollectDeletedRows(vci_RosCommandContext *comContext, Snapshot snapshot); +static uint32 SearchUnusedExtent(vci_MainRelHeaderInfo *info); +static void CollectUnusedExtent(vci_RosCommandContext *comContext); +static void UpdateTidCrid(vci_RosCommandContext *comContext, Size workareaSize); +static void collectBlockNumberToMove(vci_RosCommandContext *comContext, int numPages); +static void freezeMainAndRos(vci_RosCommandContext *comContext); +static void freezeWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType, Snapshot snapshot); +static void truncateRos(vci_RosCommandContext *comContext); +static void truncateWos(vci_RosCommandContext *comContext); +static void constructTidArray(vci_RosCommandContext *comContext, int max_data_wos_entries, int max_whiteout_wos_entries); +static int comparator_orig_tid_xid64(const void *pa, const void *pb); +static bool can_select_candidate_for_wos2ros_conv(vci_tid_tid_xid64_t *data_wos_item, vci_RosCommandContext *comContext, ItemPointer last_whiteout_orig_tid); +static bool 
can_select_candidate_for_update_delvec(vci_tid_tid_xid64_t *whiteout_wos_item, vci_RosCommandContext *comContext); +static void put_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid); +static bool get_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid); +static int readTidListFromWosIntoTidArray(Oid wos_od, WosKind wos_kind, vci_tid_tid_xid64_t *wos_entris, int max_wos_entries, Snapshot snapshot); +static void constructTidSortState(vci_RosCommandContext *comContext); +static void readTidListFromWosIntoTidSortState(Oid wos_oid, WosKind wos_kind, TupleTableSlot *slot, Tuplesortstate *sortstate, Snapshot snapshot, TransactionId wosros_xid); +static bool getValidTidSortState(Tuplesortstate *sortstate, TupleTableSlot *slot, vci_tid_tid_xid64_t *item); +static int32 compareXid64(int64 data_wos_xid64, int64 whiteout_wos_xid64); + +/* + * WOS -> ROS conversion + * We have two situations of WOS -> ROS conversion. + * 1. conversion process to reduce WOS and move data into ROS. + * In this case, all columns registered to the VCI are converted into + * ROS style and stored each relation. The column meta data relations + * are also updated. We normally convert one full extent at a time. + * The precise description is, + * A. take an exclusive lock to the main relation header. + * B. recover ROS if broken. + * C. scan WOS with care of freeze condition and deleted condition + * and collect live TID, up to 256 K rows. + * D. sort TID. + * E. write conversion information into VCI main relation header and + * extent info. + * F. collect target tuples and build ROS data. Here we have chunk + * the data, since the work area might be limited. + * G. Find extent and free spaces to write the data. + * H. Write meta data. + * I. Write extent. + * J. Finalize meta data and VCI main relation. + * K. release the main relation header. 
+ *	  For this purpose, we need VCI main relation, size of workarea.
+ *
+ * 2. local ROS conversion.
+ *	  In this case, given columns are converted into ROS style and stored
+ *	  in memory. All the visible data are converted.
+ *	  The precise description is,
+ *	  A. scan WOS with care of visibility and deleted condition and collect
+ *		 visible TID.
+ *	  B. sort TID.
+ *	  C. take an exclusive lock to the main relation header.
+ *	  D. recover ROS if broken.
+ *	  E. collect target tuples and build local ROS data.
+ *	  F. release the main relation header.
+ *	  For this purpose, we need VCI main relation, size of area to store,
+ *	  necessary column ID list.
+ *
+ */
+
+/* -------------------------------------------------------------- */
+
+#define PERIOD_TO_CHECK_TRANSACTION_END (INT64CONST(1000))	/* 1 ms */
+#define DURATION_TO_CHECK_TRANSACTION_END (100000)	/* 100 s */
+
+/*
+ * Copy from vacuumlazy.c
+ */
+#define VACUUM_TRUNCATE_LOCK_CHECK_INTERVAL 20	/* ms */
+#define VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL 50	/* ms */
+#define VACUUM_TRUNCATE_LOCK_TIMEOUT 5000	/* ms */
+
+/**
+ * @brief This function is designed to detect transaction end after VCI
+ * exclusive write lock is released.
+ *
+ * We expect that normally the ROS command is committed soon after the lock
+ * is released.  If the transaction of the previous ROS command is neither
+ * committed nor aborted yet, wait for its end for the time specified by the
+ * macro DURATION_TO_CHECK_TRANSACTION_END (originally 100 seconds).
+ * When the end is not detected, the function returns false,
+ * otherwise true.
+ *
+ * @param[in] info Pointer to vci_MainRelHeaderInfo whose VCI index is
+ * determined.
+ * @retval true The end of the transaction of the previous ROS command is
+ * detected within the wait time.
+ * @retval false The transaction end is not detected.
+ */
+static bool
+WaitTransactionEndOfLastRosCommand(vci_MainRelHeaderInfo *info)
+{
+	/*
+	 * current ROS version is the transaction ID of last ROS command
+	 */
+	TransactionId curRosVer = vci_GetMainRelVar(info, vcimrv_current_ros_version, 0);
+	int			checkCount;
+
+	/* no ROS command has been recorded: nothing to wait for */
+	if (!TransactionIdIsValid(curRosVer))
+		return true;
+
+	/* the last ROS command belongs to our own transaction */
+	if (TransactionIdIsCurrentTransactionId(curRosVer))
+		return true;
+
+	/*
+	 * Poll until the lock on the transaction can be taken, i.e. until it has
+	 * ended, giving up after DURATION_TO_CHECK_TRANSACTION_END attempts.
+	 */
+	for (checkCount = 0;
+		 (checkCount < DURATION_TO_CHECK_TRANSACTION_END) &&
+		 (!ConditionalXactLockTableWait(curRosVer, false));
+		 ++checkCount)
+	{
+		/*
+		 * The loop may run for a long time, so let a pending cancel or
+		 * terminate request be serviced instead of sleeping through it.
+		 */
+		CHECK_FOR_INTERRUPTS();
+
+		pg_usleep(PERIOD_TO_CHECK_TRANSACTION_END); /* wait 1 ms */
+	}
+
+	return checkCount < DURATION_TO_CHECK_TRANSACTION_END;
+}
+
+/**
+ * @brief This function determines the outcome of the transaction of the
+ * previous ROS command.
+ *
+ * First, it waits for the end of the transaction of the previous ROS command
+ * if necessary.
+ * When it was committed successfully, just update the last ROS version.
+ * Otherwise, it tries to recover the VCI relations.
+ *
+ * @param[in] info Pointer to vci_MainRelHeaderInfo whose VCI index is
+ * determined.
+ *
+ * @note Assuming that this function is called while the main relation is
+ * locked exclusively.
+ */
+void
+vci_RecoverOneVCIIfNecessary(vci_MainRelHeaderInfo *info)
+{
+	TransactionId curRosVer;
+	TransactionId lastRosVer;
+	vci_ros_command_t commandSave;
+
+	Assert(info);
+
+	/* read the command only after the pointer has been checked */
+	commandSave = info->command;
+
+	vci_ChangeCommand(info, vci_rc_recovery);
+
+	/*
+	 * Since the transaction is commited or abort after the lock is released,
+	 * we have to wait for it.
+	 */
+	if (!WaitTransactionEndOfLastRosCommand(info))
+		elog(ERROR, "unterminated ROS command");
+
+	curRosVer = vci_GetMainRelVar(info, vcimrv_current_ros_version, 0);
+	lastRosVer = vci_GetMainRelVar(info, vcimrv_last_ros_version, 0);
+
+	if (!TransactionIdEquals(curRosVer, lastRosVer))	/* equal versions mean there is nothing to settle */
+	{
+		switch (vci_transaction_get_type(curRosVer))
+		{
+			case VCI_XACT_SELF:
+				/* The last ROS version has been already updated */
+				break;
+
+			case VCI_XACT_IN_PROGRESS:
+				elog(PANIC, "internal error. multiple ROS command running");
+				break;
+
+			case VCI_XACT_DID_COMMIT:
+				/* the command committed: promote current values to last/old */
+				vci_UpdateLastRosVersionAndOthers(info);
+				break;
+
+			case VCI_XACT_DID_ABORT:
+			case VCI_XACT_DID_CRASH:
+				{
+					vci_ros_command_t command;
+
+					command = vci_GetMainRelVar(info, vcimrv_ros_command, 0);	/* command recorded by the failed transaction */
+
+					elog(DEBUG1, "crash recovery: previous command=\"%s\"(%d)",
+						 vci_GetRosCommandName(command), command);
+
+					switch (command)
+					{
+						case vci_rc_update_del_vec:
+							vci_RecoveryUpdateDelVec(info);
+							break;
+
+						case vci_rc_wos_ros_conv:
+						case vci_rc_collect_deleted:
+						case vci_rc_collect_extent:
+							vci_RecoveryExtentInfo(info, command);
+							vci_RecoveryFreeSpace(info, command);
+							break;
+
+						case vci_rc_update_tid_crid:
+							vci_RecoveryTidCrid(info);
+							vci_RecoveryFreeSpaceForTidCrid(info);
+							break;
+
+						default:
+							elog(PANIC, "last recorded ros command is fatally broken.");
+							break;
+					}
+
+					vci_RecoveryDone(info);	/* copy the last/old header values back to the current ones */
+				}
+				break;
+
+			case VCI_XACT_INVALID:
+				elog(PANIC, "should not reach here");
+				break;
+		}
+	}
+
+	vci_ChangeCommand(info, commandSave);	/* restore the command that was active on entry */
+}
+/*
+ * Fill comContext->wos2ros_array with up to numRows (orig_tid, wos_tid)
+ * pairs obtained from get_entry_into_tid_list() for WOS_Data, stopping at
+ * the first call that returns false.  The number of pairs stored is written
+ * to wos2ros_array.num and numRowsToConvert.
+ */
+static void
+fillTidListFromTidSortState(vci_RosCommandContext *comContext, int numRows)
+{
+	int			i;
+	int			count = 0;
+
+	Assert(numRows <= VCI_NUM_ROWS_IN_EXTENT);
+
+	for (i = 0; i < numRows; i++)
+	{
+		Assert(count < comContext->wos2ros_array.max);
+
+		if (!get_entry_into_tid_list(comContext, WOS_Data,
+									 &comContext->wos2ros_array.orig_tids[i],
+									 &comContext->wos2ros_array.wos_tids[i]))
+			break;
+
+		count++;
+	}
+
+	
comContext->wos2ros_array.num = count;
+	comContext->numRowsToConvert = count;
+}
+/*
+ * Convert the rows selected in comContext into one ROS extent
+ * (comContext->extentId).
+ *
+ * Returns the number of rows stored (storage.numTotalRows), or 0 without
+ * doing anything when numRowsToConvert is less than 1.
+ */
+static int
+ConvertWos2Ros(vci_RosCommandContext *comContext)
+{
+	int			result = 0;
+
+	if (comContext->numRowsToConvert < 1)
+	{
+		elog(DEBUG2, "stop WOS to ROS conversion numRowsToConvert = %d", comContext->numRowsToConvert);
+		return 0;
+	}
+
+	elog(DEBUG2, "start to convert WOS to ROS");
+
+	/*
+	 * The target extent ID is not obtained here; comContext->extentId is
+	 * used as it is (the call below is disabled).
+	 */
+	/* comContext->extentId = vci_GetFreeExtentId(&(comContext->info)); */
+	elog(DEBUG2,
+		 "WOS -> ROS conversion: index: %s extent ID: " INT64_FORMAT,
+		 RelationGetRelationName(comContext->info.rel),
+		 (int64) comContext->extentId);
+
+	/*
+	 * Set WOS->ROS conversion data and write main relation for recovery.
+	 * Header and extent info. Here, we also put current ROS version to the
+	 * actual current transaction ID.
+	 */
+	vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info),
+												  comContext->extentId,
+												  comContext->xid);
+
+	vci_ResetRosChunkStorage(&(comContext->storage));
+	vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+	/* read data for one extent */
+	ReadOneExtentAndStoreInChunkStorage(comContext);
+
+	/* write one extent into ROS */
+	vci_AddTidCridUpdateList(&(comContext->info),
+							 &(comContext->storage),
+							 comContext->extentId);
+	vci_WriteOneExtent(&(comContext->info),
+					   &(comContext->storage),
+					   comContext->extentId,
+					   comContext->xid,
+					   InvalidTransactionId,
+					   comContext->xid);
+
+	result = comContext->storage.numTotalRows;
+
+	elog(DEBUG2, "converted %d rows into ROS", result);
+
+	return result;
+}
+/*
+ * Fill the value vectors of every column of vTuples from rosChunkStorage,
+ * choosing the fill routine by the column's compression type.
+ */
+static void
+FillValuesColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples,
+										RosChunkStorage *rosChunkStorage)
+{
+	int16		columnId;
+
+	for (columnId = 0; columnId < vTuples->num_columns; ++columnId)
+	{
+		switch (vTuples->column_info[columnId].comp_type)
+		{
+			case vcis_compression_type_fixed_raw:
+				vci_FillFixedWidthColumnarFromRosChunkStorage(vTuples,
+															  columnId, rosChunkStorage);
+				break;
+			case
vcis_compression_type_variable_raw: + vci_FillVariableWidthColumnarFromRosChunkStorage(vTuples, + columnId, rosChunkStorage); + break; + default: + Assert(false); + elog(ERROR, "internal error: unsupported compression type"); + } + } +} + +static void +FillIsNullColumnwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + RosChunkStorage *rosChunkStorage) +{ + const int16 strideR = 64; + int sId; + int baseOffset = 0; + int16 *nullableColumnId = vci_GetNullableColumnIds(vTuples); + + if (vTuples->num_columns < 1) + return; + + Assert(0 < rosChunkStorage->numFilled); + Assert(vTuples->num_columns <= rosChunkStorage->chunk[0]->numColumns); + Assert(vTuples->fetch_context->query_context->num_nullable_columns <= rosChunkStorage->chunk[0]->numNullableColumns); + Assert(rosChunkStorage->numTotalRows <= vTuples->num_rows_in_extent); + + MemSet(vTuples->isnull, 0, vTuples->num_columns * vTuples->num_rows_in_extent); + + for (sId = 0; sId < rosChunkStorage->numFilled; ++sId) + { + RosChunkBuffer *chunk = rosChunkStorage->chunk[sId]; + int rId; + + for (rId = 0; rId < chunk->numFilled; rId += strideR) + { + int pIdMax = Min(rId + strideR, chunk->numFilled); + int bitId; + + for (bitId = 0; bitId < chunk->numNullableColumns; ++bitId) + { + int colId = nullableColumnId[bitId]; + + if (VCI_FIRST_NORMALCOLUMN_ID <= colId) + { + uint8 *dst = (uint8 *) &(vTuples->isnull[(vTuples->num_rows_in_extent * colId) + baseOffset]); + int pId; + + for (pId = rId; pId < pIdMax; ++pId) + dst[pId] = vci_GetBit((uint8 *) &(chunk->nullData[chunk->nullWidthInByte * pId]), bitId); + } + } + } + baseOffset += chunk->numFilled; + } + Assert(rosChunkStorage->numTotalRows == baseOffset); +} + +static void +FillIsNullRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + RosChunkStorage *rosChunkStorage) +{ + abort(); +} + +static void +FillValuesRowwiseFromRosChunkStorage(vci_virtual_tuples_t *vTuples, + RosChunkStorage *rosChunkStorage) +{ + abort(); +} + +static void 
+AppendDataToLocalRos(vci_local_ros_t *localRos,
+					 RosChunkStorage *storage,
+					 vci_MainRelHeaderInfo *info)	/* info is not referenced in the body */
+{
+	MemoryContext oldMemCtx;
+	struct vci_virtual_tuples *vTuples;
+	int32		extentId;
+
+	oldMemCtx = MemoryContextSwitchTo(localRos->memory_context);	/* everything below is allocated in the local ROS context */
+
+	++(localRos->num_local_extents);
+	extentId = -(localRos->num_local_extents);	/* local extents get negative IDs: -1, -2, ... */
+
+	localRos->extent = vci_repalloc(localRos->extent,
+									sizeof(vci_virtual_tuples_t *) *
+									localRos->num_local_extents);
+	vTuples = vci_CSCreateVirtualTuplesWithNumRows(localRos->fetch_context,
+												   storage->numTotalRows);
+	localRos->extent[localRos->num_local_extents - 1] = vTuples;
+
+	/*
+	 * Originally, localRos->size_vector_memory_context has the total size of
+	 * vector sets. The third parameter of vci_CSInitializeVectorSet() is the
+	 * size for one vector set. Normally, we give up when many data are stored
+	 * in ROS. So, we can fix the maximum number of extents.
+	 */
+
+	vTuples->num_rows = storage->numTotalRows;
+	vTuples->extent_id = extentId;
+	vTuples->num_rows_in_extent = storage->numTotalRows;
+	vTuples->row_id_in_extent = 0;
+	vTuples->status = vcirvs_read_whole;
+
+	if (vTuples->crid)
+		vci_FillCridInVirtualTuples(vTuples);
+
+	MemSet(vTuples->skip, 0, sizeof(uint16) * vTuples->num_rows_in_extent);	/* one uint16 skip entry per row, all cleared */
+
+	if (vTuples->tid)
+		vci_FillFixedWidthColumnarFromRosChunkStorage(vTuples, VCI_COLUMN_ID_TID, storage);
+
+	if (vTuples->use_column_store)
+	{
+		FillIsNullColumnwiseFromRosChunkStorage(vTuples, storage);
+		FillValuesColumnwiseFromRosChunkStorage(vTuples, storage);
+	}
+	else
+	{
+		FillIsNullRowwiseFromRosChunkStorage(vTuples, storage);
+		FillValuesRowwiseFromRosChunkStorage(vTuples, storage);
+	}
+
+	MemoryContextSwitchTo(oldMemCtx);
+}
+/*
+ * Convert the selected WOS rows into local ROS extents, one extent per loop
+ * iteration, until comContext->done is set.  Returns the total number of
+ * rows converted, or 0 when numRowsToConvert is less than 1.
+ */
+static Size
+ConvertWos2LocalRos(vci_RosCommandContext *comContext)
+{
+	Size		result = 0;
+
+	if (comContext->numRowsToConvert < 1)
+		return 0;
+
+	elog(DEBUG2, "start to generate local ROS");
+
+	for (comContext->extentId = -1; (!comContext->done);
+		 comContext->extentId -= 1)
+	{
+		
elog(DEBUG3, + "WOS -> local ROS conversion: index: %s extent ID:%d\n", + RelationGetRelationName(comContext->info.rel), + comContext->extentId); + + vci_ResetRosChunkStorage(&(comContext->storage)); + vci_ResetRosChunkBufferCounter(&(comContext->buffer)); + + /* read data for one extent */ + ReadOneExtentAndStoreInChunkStorage(comContext); + + /* write one extent into ROS */ + if (0 < comContext->storage.numTotalRows) + AppendDataToLocalRos(comContext->local_ros, + &(comContext->storage), + &(comContext->info)); + + result += comContext->storage.numTotalRows; + elog(DEBUG2, "converted %llu rows into local ROS", + (unsigned long long) result); + } + + return result; +} + +/* ************************************** + * ** CAUTION: AttrNumber is 1 origin. ** + * ************************************** + */ +/** + * assuming when tIdList != NULL, TID list in tIdList to be read. + * not sequential scan, so scan is NULL. + * when tIdList == NULL, scan != NULL, sequential scan. + * + * @retval true some data remain + * @retval false no data remain + */ +static void +FillOneRosChunkBuffer(vci_RosCommandContext *comContext, + int rowId, + int numRowsToConvert) +{ + int offset; + TupleDesc tupleDesc = RelationGetDescr(comContext->heapRel); + Snapshot snapshot = GetActiveSnapshot(); + + if (comContext->wos2ros_array.max > 0) + { + uint32 sel PG_USED_FOR_ASSERTS_ONLY; + vci_ros_command_t command = comContext->command; + +#ifdef USE_ASSERT_CHECKING + vci_TidCridUpdateListContext *oldListContext = NULL; +#endif + + if ((command == vci_rc_wos_ros_conv) || + (command == vci_rc_collect_deleted)) + { + sel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0); + +#ifdef USE_ASSERT_CHECKING + oldListContext = vci_OpenTidCridUpdateList(&comContext->info, sel); +#endif + } + else if (command == vci_rc_generate_local_ros) + { + sel = comContext->local_ros->fetch_context->query_context->tid_crid_diff_sel; + } + + for (offset = 0; offset < numRowsToConvert; ++offset) + { + 
HeapTupleData tuple; + Buffer buffer; + int actualOffset = rowId + comContext->wos2ros_array.offset + offset; + + if (comContext->wos2ros_array.num <= actualOffset) + { + comContext->done = true; + break; + } + + CHECK_FOR_INTERRUPTS(); + + tuple.t_self = comContext->wos2ros_array.orig_tids[actualOffset]; + + if (!heap_fetch(comContext->heapRel, snapshot, &tuple, &buffer, true)) + { + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)), + errdetail("TID (%d,%d) has been deleted from table \"%s\"", + ItemPointerGetBlockNumber(&tuple.t_self), + ItemPointerGetOffsetNumber(&tuple.t_self), + RelationGetRelationName(comContext->heapRel)), + errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel)))); + } + +#ifdef USE_ASSERT_CHECKING + if (oldListContext) + { + uint64 cridUint = vci_GetCridFromTid(oldListContext, &tuple.t_self, NULL); + + if (cridUint != VCI_INVALID_CRID) + ereport(ERROR, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)), + errdetail("try to insert TID (%d,%d) into ROS twice: extentId=%d, index=%d", + ItemPointerGetBlockNumber(&tuple.t_self), + ItemPointerGetOffsetNumber(&tuple.t_self), + vci_CalcExtentIdFromCrid64(cridUint), + vci_CalcRowIdInExtentFromCrid64(cridUint)), + errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel)))); + } +#endif + + vci_FillOneRowInRosChunkBuffer(&(comContext->buffer), + &(comContext->info), + &tuple.t_self, + &tuple, + comContext->indxColumnIdList, + comContext->heapAttrNumList, + tupleDesc); + + if (comContext->data_wos_del_list) + { + tuplesort_putdatum(comContext->data_wos_del_list, + ItemPointerGetDatum(&comContext->wos2ros_array.wos_tids[actualOffset]), false); + } + + ReleaseBuffer(buffer); + } + +#ifdef USE_ASSERT_CHECKING + if (oldListContext) + vci_CloseTidCridUpdateList(oldListContext); +#endif + } +} + +static 
void
+ReadOneExtentAndStoreInChunkStorage(vci_RosCommandContext *comContext)
+{
+	/*
+	 * Read up to comContext->numRowsToConvert rows, at most
+	 * comContext->numRowsAtOnce per chunk, and register every non-empty chunk
+	 * buffer in comContext->storage.  Finally advance wos2ros_array.offset by
+	 * numRowsToConvert so that the next call continues with the next extent.
+	 */
+	Size		rowId;
+
+	/* collect data for one extent */
+	for (rowId = 0;
+		 rowId < comContext->numRowsToConvert;
+		 rowId += comContext->numRowsAtOnce)
+	{
+		/* the number of rows in one chunk */
+		int			numRowsToConvert = comContext->numRowsToConvert - rowId;
+
+		if (comContext->numRowsAtOnce - comContext->buffer.numFilled < numRowsToConvert)
+			numRowsToConvert = comContext->numRowsAtOnce - comContext->buffer.numFilled;
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* fetch the data from original relation */
+		FillOneRosChunkBuffer(comContext, rowId, numRowsToConvert);
+		if (0 < comContext->buffer.numFilled)
+		{
+			/* copy chunk buffer in a compact manner */
+			vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+			vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+		}
+	}
+
+	comContext->wos2ros_array.offset += comContext->numRowsToConvert;
+}
+
+/**
+ * Translate every TID in comContext->delvec_array into a CRID, using the
+ * TID-CRID update list selected by sel, and append the CRIDs to the local
+ * delete list of comContext->local_ros.
+ *
+ * A TID without a CRID is reported as index corruption.  Running out of room
+ * in the delete list raises an error instead of writing past the end of
+ * crid_list.
+ *
+ * @return the number of entries in the local delete list after the call.
+ */
+static Size
+ConvertWhiteOut2LocalDeleteList(vci_RosCommandContext *comContext,
+								int sel)
+{
+	vci_local_delete_list *list = &(comContext->local_ros->local_delete_list);
+	int			cId;
+	vci_TidCridUpdateListContext *tidCridListContext;
+
+	Assert(list);
+	/* an empty list (length == 0) is valid as long as nothing is appended */
+	Assert(list->num_entry <= list->length);
+
+	tidCridListContext = vci_OpenTidCridUpdateList(&comContext->info, sel);
+
+	for (cId = 0; cId < comContext->delvec_array.num; cId++)
+	{
+		ItemPointerData orig_tid;
+		uint64		crid;
+
+		orig_tid = comContext->delvec_array.orig_tids[cId];
+
+		crid = vci_GetCridFromTid(tidCridListContext, &orig_tid, NULL);
+
+		if (crid == VCI_INVALID_CRID)
+			ereport(ERROR,
+					(errcode(ERRCODE_DATA_CORRUPTED),
+					 errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)),
+					 errdetail("try to delete TID (%u,%u) into local delete list",
+							   ItemPointerGetBlockNumber(&orig_tid),
+							   ItemPointerGetOffsetNumber(&orig_tid)),
+					 errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel))));
+
+		/* never write beyond the allocated crid_list */
+		if (list->num_entry >= list->length)
+			elog(ERROR, "local delete list of vci index \"%s\" is full (%lld entries)",
+				 RelationGetRelationName(comContext->info.rel),
+				 (long long) list->length);
+
+		list->crid_list[list->num_entry] = crid;
+		list->num_entry++;
+	}
+
+	vci_CloseTidCridUpdateList(tidCridListContext);
+
+	return list->num_entry;
+}
+
+/**
+ * Tell whether the given ROS command requires the main relation header to be
+ * written back when the command context is cleaned up.
+ *
+ * An unknown command trips an assertion and raises an error.
+ */
+static bool
+NeedMainRelHeaderUpdate(vci_ros_command_t command)
+{
+	switch (command)
+	{
+		case vci_rc_recovery:
+		case vci_rc_wos_ros_conv:
+		case vci_rc_update_del_vec:
+		case vci_rc_collect_deleted:
+			/* case vci_rc_compaction: */
+		case vci_rc_update_tid_crid:
+		case vci_rc_collect_extent:
+		case vci_rc_copy_command:
+		case vci_rc_wos_ros_conv_build:
+
+			return true;
+
+		case vci_rc_wos_delete:
+		case vci_rc_wos_insert:
+		case vci_rc_probe:
+		case vci_rc_query:
+		case vci_rc_generate_local_ros:
+		case vci_rc_drop_index:
+		case vci_rc_vacuum:
+
+			return false;
+
+		default:
+			Assert(false);
+			elog(ERROR, "internal error: unexpected ROS command");
+	}
+
+	return false;
+}
+
+/**
+ * Release the main relation header held by the command context.
+ */
+void
+vci_ReleaseMainRelInCommandContext(vci_RosCommandContext *comContext)
+{
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(&(comContext->info));
+}
+
+/**
+ * Close the original heap relation if the command context has it open, and
+ * clear the pointer so that a second call is harmless.
+ */
+void
+vci_CloseHeapRelInCommandContext(vci_RosCommandContext *comContext)
+{
+	if (RelationIsValid(comContext->heapRel))
+		table_close(comContext->heapRel, AccessShareLock);
+	comContext->heapRel = NULL;
+}
+
+/**
+ * qsort() comparator ordering uint64 values ascending.
+ */
+static int
+CmpUint64(const void *pa, const void *pb)
+{
+	uint64		a = *(const uint64 *) pa;
+	uint64		b = *(const uint64 *) pb;
+
+	return (a < b) ? -1 : ((b < a) ? 1 : 0);
+}
+
+/**
+ * First initialization step: zero the context, record the command and the
+ * index OID, and initialize and keep the main relation header.
+ */
+void
+vci_InitRosCommandContext0(vci_RosCommandContext *context,
+						   Relation rel, vci_ros_command_t command)
+{
+	Assert(context);
+
+	MemSet(context, 0, sizeof(*context));
+
+	context->command = command;
+	context->indexOid = RelationGetRelid(rel);
+
+	vci_InitMainRelHeaderInfo(&(context->info), rel, command);
+	vci_KeepMainRelHeader(&(context->info));
+}
+
+/**
+ * Second initialization step: transaction ID, TID arrays, column lists and
+ * chunk sizing.
+ *
+ * @param[in,out] comContext       context prepared by vci_InitRosCommandContext0()
+ * @param[in]     workareaSize     memory budget used to size one chunk
+ * @param[in]     numInsertRows    capacity of the WOS->ROS TID array(s)
+ * @param[in]     numDeleteRows    capacity of the delete-vector TID array
+ * @param[in]     readOriginalData when true, open the heap relation with
+ *                                 AccessShareLock and build the column lists
+ */
+void
+vci_InitRosCommandContext1(vci_RosCommandContext *comContext,
+						   Size workareaSize,
+						   int numInsertRows,
+						   int numDeleteRows,
+						   bool readOriginalData)
+{
+	Size		worstCaseTupleSize;
+	int			numColumns;
+
+	Assert(comContext);
+
+	/* query and local-ROS generation run without a transaction ID */
+	comContext->xid = ((vci_rc_query == comContext->command) ||
+					   (vci_rc_generate_local_ros == comContext->command)) ?
+		InvalidTransactionId : GetCurrentTransactionId();
+
+	comContext->heapOid = IndexGetRelation(comContext->info.rel->rd_id, false);
+
+	comContext->local_ros = NULL;
+	comContext->done = false;
+
+	switch (comContext->command)
+	{
+		case vci_rc_generate_local_ros:
+			comContext->wos2ros_array.orig_tids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * numInsertRows);
+			comContext->wos2ros_array.max = numInsertRows;
+			comContext->delvec_array.orig_tids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * numDeleteRows);
+			comContext->delvec_array.max = numDeleteRows;
+			break;
+
+		case vci_rc_wos_ros_conv:
+		case vci_rc_collect_deleted:
+			comContext->wos2ros_array.orig_tids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * numInsertRows);
+			comContext->wos2ros_array.wos_tids = (ItemPointerData *) palloc(sizeof(ItemPointerData) * numInsertRows);
+			comContext->wos2ros_array.max = numInsertRows;
+			break;
+
+		default:
+			break;
+	}
+
+	comContext->numRowsToConvert = Min(Max(numInsertRows, numDeleteRows), VCI_NUM_ROWS_IN_EXTENT);
+
+	/*
+	 * Column sizes
+	 */
+	numColumns = vci_GetMainRelVar(&(comContext->info), vcimrv_num_columns, 0);
+
+	/*
+	 * get column size in worst case and column ID lists for both original
+	 * relation and VCI relation
+	 */
+	comContext->numColumns = numColumns;
+
+	if (readOriginalData)
+	{
+		Size		allocatableSize = Min(workareaSize, MaxAllocSize);
+		int			numRowsAtOnce;
+		int			largestTupleSize;
+
+		comContext->heapAttrNumList = (AttrNumber *) palloc(sizeof(AttrNumber) * numColumns);
+		comContext->indxColumnIdList = (int16 *) palloc(sizeof(int16) * numColumns);
+		comContext->columnSizeList = (int16 *) palloc(sizeof(int16) * numColumns);
+		worstCaseTupleSize = vci_GetColumnIdsAndSizes(
+													  comContext->heapAttrNumList,
+													  comContext->indxColumnIdList,
+													  comContext->columnSizeList,
+													  numColumns,
+													  &(comContext->info),
+													  comContext->heapOid);
+
+		comContext->heapRel = table_open(comContext->heapOid, AccessShareLock);
+
+		/*
+		 * PostgreSQL limits the tuple size by TOAST_TUPLE_TARGET, normally.
+		 * The upper limit of the tuple size is smaller than BLCKSZ. We use
+		 * other area to keep the offset or data size in the chunk buffers or
+		 * ROS. Here, we assume the type of offset is uint32.
+		 */
+		largestTupleSize = worstCaseTupleSize +
+			(comContext->numColumns * sizeof(uint32));
+
+		/*
+		 * The number of rows in one chunk: rounded down to a multiple of
+		 * VCI_COMPACTION_UNIT_ROW, then clamped to the range
+		 * [VCI_COMPACTION_UNIT_ROW, VCI_NUM_ROWS_IN_EXTENT].
+		 */
+		numRowsAtOnce = (int) (allocatableSize * VCI_WOS_ROS_WORKAREA_SAFE_RATIO /
+							   largestTupleSize);
+		numRowsAtOnce = (numRowsAtOnce / VCI_COMPACTION_UNIT_ROW) * VCI_COMPACTION_UNIT_ROW;
+		numRowsAtOnce = Max(numRowsAtOnce, VCI_COMPACTION_UNIT_ROW);
+		numRowsAtOnce = Min(numRowsAtOnce, VCI_NUM_ROWS_IN_EXTENT);
+
+		comContext->numRowsAtOnce = numRowsAtOnce;
+	}
+	else
+	{
+		comContext->heapAttrNumList = NULL;
+		comContext->indxColumnIdList = NULL;
+		comContext->columnSizeList = NULL;
+		comContext->heapRel = NULL;
+		comContext->numRowsAtOnce = VCI_COMPACTION_UNIT_ROW;
+	}
+
+	comContext->scan = NULL;
+
+	switch (comContext->command)
+	{
+		case vci_rc_wos_ros_conv:
+		case vci_rc_collect_deleted:
+		case vci_rc_update_del_vec:
+		case vci_rc_vacuum:
+			comContext->oldestXmin = GetOldestNonRemovableTransactionId(comContext->info.rel);
+			comContext->wos2rosXid = comContext->oldestXmin;
+			break;
+
+		case vci_rc_generate_local_ros:
+		default:
+			comContext->oldestXmin = InvalidTransactionId;
+			comContext->wos2rosXid = InvalidTransactionId;
+			break;
+	}
+}
+
+/**
+ * Third initialization step: create the tuplesort states used to collect WOS
+ * TIDs to delete and, depending on the command, the (orig_tid, wos_tid) pair
+ * lists.
+ *
+ * Every sort gets the same memory limit: a third of workareaSize expressed
+ * in kilobytes, capped at INT_MAX.
+ */
+void
+vci_InitRosCommandContext2(vci_RosCommandContext *comContext, Size workareaSize)
+{
+	bool		make_wos2ros_tid_list = false;
+	bool		make_delvec_tid_list = false;
+	int			sortWorkMem = (int) Min(workareaSize / 1024 / 3, INT_MAX);
+
+	comContext->data_wos_del_list =
+		tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+							  sortWorkMem, NULL, TUPLESORT_NONE);
+	comContext->whiteout_wos_del_list =
+		tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+							  sortWorkMem, NULL, TUPLESORT_NONE);
+
+	switch (comContext->command)
+	{
+		case vci_rc_wos_ros_conv:
+		case vci_rc_collect_deleted:
+			make_wos2ros_tid_list = true;
+			break;
+
+		case vci_rc_update_del_vec:
+			make_delvec_tid_list = true;
+			break;
+
+		default:
+			break;
+	}
+
+	if (make_wos2ros_tid_list || make_delvec_tid_list)
+	{
+		TupleDesc	tupDesc;
+		AttrNumber	sortKeys[] = {1};
+		Oid			sortOperators[] = {TIDLessOperator};
+		Oid			sortCollations[] = {InvalidOid};
+		bool		nullsFirstFlags[] = {false};
+
+		/* two TID columns, sorted on the first one (orig_tid) */
+		tupDesc = CreateTemplateTupleDesc(2);
+
+		TupleDescInitEntry(tupDesc, (AttrNumber) 1, "orig_tid", TIDOID, -1, 0);
+		TupleDescInitEntry(tupDesc, (AttrNumber) 2, "wos_tid", TIDOID, -1, 0);
+
+		comContext->tid_tid_tupdesc = tupDesc;
+		comContext->tid_tid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple);
+
+		if (make_wos2ros_tid_list)
+		{
+			comContext->wos2ros_tid_list =
+				tuplesort_begin_heap(tupDesc, 1,
+									 sortKeys, sortOperators, sortCollations, nullsFirstFlags,
+									 sortWorkMem, NULL, TUPLESORT_NONE);
+		}
+
+		if (make_delvec_tid_list)
+		{
+			comContext->delvec_tid_list =
+				tuplesort_begin_heap(tupDesc, 1,
+									 sortKeys, sortOperators, sortCollations, nullsFirstFlags,
+									 sortWorkMem, NULL, TUPLESORT_NONE);
+		}
+	}
+}
+
+void
+vci_InitRosChunkStroageAndBuffer(vci_RosCommandContext *comContext, bool forAppending) +{ + int numRowsAtOnce; + + Assert(RelationIsValid(comContext->heapRel)); + + numRowsAtOnce = comContext->numRowsAtOnce; + + /* Initialize the buffers for building chunks of ROS data */ + vci_InitOneRosChunkBuffer(&(comContext->buffer), + numRowsAtOnce, + comContext->columnSizeList, + comContext->numColumns, + false, + &(comContext->info)); + + vci_InitRosChunkStorage(&(comContext->storage), numRowsAtOnce, forAppending); +} + +void +vci_CleanRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite) +{ + if (comContext->tid_tid_slot) + { + ExecClearTuple(comContext->tid_tid_slot); + pfree(comContext->tid_tid_slot); + comContext->tid_tid_slot = NULL; + } + + if (comContext->data_wos_del_list) + { + tuplesort_end(comContext->data_wos_del_list); + comContext->data_wos_del_list = NULL; + } + + if (comContext->whiteout_wos_del_list) + { + tuplesort_end(comContext->whiteout_wos_del_list); + comContext->whiteout_wos_del_list = NULL; + } + + if (comContext->wos2ros_tid_list) + { + tuplesort_end(comContext->wos2ros_tid_list); + comContext->wos2ros_tid_list = NULL; + } + + if (comContext->delvec_tid_list) + { + tuplesort_end(comContext->delvec_tid_list); + comContext->delvec_tid_list = NULL; + } + + if (comContext->tid_tid_tupdesc) + { + FreeTupleDesc(comContext->tid_tid_tupdesc); + comContext->tid_tid_tupdesc = NULL; + } + + /* Close original heap relation if it is opened. */ + vci_CloseHeapRelInCommandContext(comContext); + + /* + * Release chunk buffers - WOS ROS Conv. 
+ */ + if (comContext->command == vci_rc_wos_ros_conv) + { + vci_DestroyOneRosChunkBuffer(&(comContext->buffer)); + vci_DestroyRosChunkStorage(&(comContext->storage)); + } + + if (NULL != comContext->heapAttrNumList) + { + /* release local work area */ + pfree(comContext->heapAttrNumList); + pfree(comContext->indxColumnIdList); + pfree(comContext->columnSizeList); + comContext->heapAttrNumList = NULL; + comContext->indxColumnIdList = NULL; + comContext->columnSizeList = NULL; + } + + /* release local work area */ + if (comContext->wos2ros_array.orig_tids) + { + pfree(comContext->wos2ros_array.orig_tids); + comContext->wos2ros_array.orig_tids = NULL; + } + + if (comContext->wos2ros_array.wos_tids) + { + pfree(comContext->wos2ros_array.wos_tids); + comContext->wos2ros_array.wos_tids = NULL; + } + + if (comContext->delvec_array.orig_tids) + { + pfree(comContext->delvec_array.orig_tids); + comContext->delvec_array.orig_tids = NULL; + } + + if (comContext->utility_array.orig_blknos) + { + pfree(comContext->utility_array.orig_blknos); + comContext->utility_array.orig_blknos = NULL; + } + + if (neverWrite) + return; + + /* write header of the main relation */ + if (NeedMainRelHeaderUpdate(comContext->command)) + vci_WriteMainRelVar(&(comContext->info), + vci_wmrv_update); +} + +void +vci_FinRosCommandContext(vci_RosCommandContext *comContext, bool neverWrite) +{ + vci_CleanRosCommandContext(comContext, neverWrite); + + /* release the main relation */ + vci_ReleaseMainRelInCommandContext(comContext); + + comContext->indexOid = InvalidOid; + comContext->command = vci_rc_invalid; +} + +/** + * numRows is from 1 to VCI_NUM_ROWS_IN_EXTENT + * workareaSize should be taken from the configuration parameter + * in postgresql.conf. + * It just convert one extent. 
+ */ +int +vci_ConvertWos2Ros(Relation mainRel, Size workareaSize, int numRows) +{ + vci_RosCommandContext comContext; + MemoryContext memCtxWos2Ros; + MemoryContext oldMemCtx; + int result = -1; + + Assert((0 < numRows) && (numRows <= VCI_NUM_ROWS_IN_EXTENT)); + + vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_wos_ros_conv); + + /* recover ROS if necessary */ + vci_RecoverOneVCIIfNecessary(&(comContext.info)); + + /* prepare local work area */ + memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext, + "WOS->ROS conversion", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros); + + vci_InitRosCommandContext1(&comContext, + workareaSize / 3 * 2, + numRows, 0, + true); + + vci_InitRosCommandContext2(&comContext, workareaSize / 3); + + if (TransactionIdPrecedes(GetCurrentTransactionId(), + (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0))) + goto done; + + GetActiveSnapshot(); + + /* obtain new extent ID */ + comContext.extentIdSrc = VCI_INVALID_EXTENT_ID; + comContext.extentId = vci_GetFreeExtentId(&(comContext.info)); + + /* Write Recovery Information of this command. 
*/ + vci_WriteRecoveryRecordForExtentInfo(&comContext.info, comContext.extentId, comContext.extentIdSrc); + vci_InitRecoveryRecordForFreeSpace(&comContext.info); + + vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid); + + constructTidSortState(&comContext); + + /* call Main routine */ + fillTidListFromTidSortState(&comContext, numRows); + + vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ ); + + result = ConvertWos2Ros(&comContext); + + /* remove WOS entries */ + cleanUpWos(&comContext, vcimrv_data_wos_oid); + cleanUpWos(&comContext, vcimrv_whiteout_wos_oid); + + /* Xmax WOS entry */ + RemoveWosEntries(&comContext, WOS_Data); + RemoveWosEntries(&comContext, WOS_Whiteout); + +done: + /* Finalize ROS */ + vci_FinRosCommandContext(&comContext, false); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(memCtxWos2Ros); + + return result; +} + +static void +FlushTidCridPairListToTreeForBuild(vci_TidCridRelations *relPair, + vcis_tidcrid_pair_list_t *appList, + BlockNumber blockNumber) +{ + if (0 < appList->num) + { + ItemPointerData treeNode; + + vci_GetTidCridSubTree(relPair, blockNumber, &treeNode); + if (!ItemPointerIsValid(&treeNode)) + vci_CreateTidCridSubTree(relPair, blockNumber, &treeNode); + vci_UpdateTidCridSubTree(relPair, &treeNode, appList); + } + appList->num = 0; +} + +static void +UpdateTidCridForBuild(vci_RosCommandContext *comContext) +{ + RosChunkStorage *src = &(comContext->storage); + vci_TidCridRelations relPair; + const LOCKMODE lockmode = ExclusiveLock; + BlockNumber blockNumber = InvalidBlockNumber; + int32 offset = offsetof(vcis_tidcrid_pair_list_t, body); + int chunkId; + int rowIdInExt = 0; + vcis_tidcrid_pair_list_t *appList = palloc(offset + + (sizeof(vcis_tidcrid_pair_item_t) * src->numTotalRows)); + + vci_OpenTidCridRelations(&relPair, &comContext->info, lockmode); + appList->num = 0; + + for (chunkId = 0; chunkId < src->numFilled; ++chunkId) + { + int rowId; + + for (rowId = 0; 
rowId < src->chunk[chunkId]->numFilled; ++rowId) + { + ItemPointer itemPtr = (ItemPointer) &(src->chunk[chunkId]-> + tidData[sizeof(ItemPointerData) * rowId]); + + if (blockNumber != ItemPointerGetBlockNumber(itemPtr)) + { + if (BlockNumberIsValid(blockNumber)) + FlushTidCridPairListToTreeForBuild(&relPair, appList, + blockNumber); + blockNumber = ItemPointerGetBlockNumber(itemPtr); + } + + Assert(appList->num < src->numTotalRows); + appList->body[appList->num].crid = vci_GetCridFromUint64( + vci_CalcCrid64(comContext->extentId, rowIdInExt)); + ItemPointerCopy(itemPtr, &appList->body[appList->num].page_item_id); + (appList->num)++; + + Assert(rowIdInExt < src->numTotalRows); + rowIdInExt++; + } + } + if (BlockNumberIsValid(blockNumber)) + FlushTidCridPairListToTreeForBuild(&relPair, appList, blockNumber); + pfree(appList); + vci_CloseTidCridRelations(&relPair, lockmode); +} + +/* Implementation of callback interface:IndexBuildCallback */ +static void +vci_build_callback(Relation rel, + ItemPointer tid, + Datum *values, + bool *isnull, + bool tupleIsAlive, + void *state) +{ + vci_RosCommandContext *comContext = (vci_RosCommandContext *) state; + + Assert(comContext); + + if (tupleIsAlive) + { + Assert((0 <= comContext->buffer.numFilled) && + (comContext->buffer.numFilled < comContext->numRowsAtOnce)); + + vci_FillOneRowInRosChunkBuffer(&(comContext->buffer), + &(comContext->info), + &IndexHeapTuple->t_self, /* use the original heap + * tuple saved in + * heapam_index_build_range_scan() */ + IndexHeapTuple, /* use the original heap + * tuple saved in + * heapam_index_build_range_scan() */ + comContext->indxColumnIdList, + comContext->heapAttrNumList, + RelationGetDescr(comContext->heapRel)); + + if (comContext->numRowsAtOnce <= comContext->buffer.numFilled) + { + vci_RegisterChunkBuffer(&(comContext->storage), + &(comContext->buffer)); + vci_ResetRosChunkBufferCounter(&(comContext->buffer)); + } + + if (VCI_NUM_ROWS_IN_EXTENT <= + (comContext->storage.numTotalRows + 
comContext->buffer.numFilled)) + { + Assert(TransactionIdIsValid(comContext->xid)); + if (0 < comContext->buffer.numFilled) + { + vci_RegisterChunkBuffer(&(comContext->storage), + &(comContext->buffer)); + vci_ResetRosChunkBufferCounter(&(comContext->buffer)); + } + vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info), + comContext->extentId, + comContext->xid); + UpdateTidCridForBuild(comContext); + vci_WriteOneExtent(&(comContext->info), + &(comContext->storage), + comContext->extentId, + comContext->xid, + InvalidTransactionId, + comContext->xid); + vci_ResetRosChunkStorage(&(comContext->storage)); + comContext->extentId++; + } + } +} + +static void +FinalizeBuild(vci_RosCommandContext *comContext) +{ + if (0 < comContext->buffer.numFilled) + vci_RegisterChunkBuffer(&(comContext->storage), + &(comContext->buffer)); + + if (0 < comContext->storage.numTotalRows) + { + Assert(TransactionIdIsValid(comContext->xid)); + vci_WriteExtentInfoInMainRosForWosRosConvInit(&(comContext->info), + comContext->extentId, + comContext->xid); + UpdateTidCridForBuild(comContext); + vci_WriteOneExtent(&(comContext->info), + &(comContext->storage), + comContext->extentId, + comContext->xid, + InvalidTransactionId, + comContext->xid); + comContext->extentId++; + } +} + +/** + * @brief Obtain number of rows in the relation estimated by ANALYZE or + * VACUUM commands. + * + * @param[in] relid The Oid of the relation. + * @return The estimated number of rows. + */ +static double +GetEstimatedNumRows(Oid relid) +{ + HeapTuple tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + + if (HeapTupleIsValid(tp)) + { + Form_pg_class reltup = (Form_pg_class) GETSTRUCT(tp); + double result = Max(reltup->reltuples, 0); + + ReleaseSysCache(tp); + + return result; + } + else + return 0.0; +} + +/** + * This function is assumed when the VCI index is newly built, and + * it converts all the data in the relation of PostgreSQL into ROS. 
+ */ +double +vci_ConvertWos2RosForBuild(Relation mainRel, + Size workareaSize, + IndexInfo *indexInfo) +{ + vci_RosCommandContext comContext; + MemoryContext memCtxWos2Ros; + MemoryContext oldMemCtx; + double result = 0; + + vci_InitRosCommandContext0(&comContext, mainRel, + vci_rc_wos_ros_conv_build); + + /* prepare local work area */ + memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext, + "WOS->ROS conversion", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros); + + vci_InitRosCommandContext1(&comContext, + workareaSize, + VCI_NUM_ROWS_IN_EXTENT, 0, + true); + + vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ ); + + comContext.extentId = VCI_FIRST_NORMAL_EXTENT_ID; + + /* + * Initialize information for printing progress + */ + comContext.estimatedNumRows = GetEstimatedNumRows( + RelationGetRelid(comContext.heapRel)); + if (comContext.estimatedNumRows < 1) + comContext.estimatedNumRows = 1; + comContext.numConvertedRows = 0; + strcpy(comContext.relName, RelationGetRelationName(mainRel)); + + result = table_index_build_scan(comContext.heapRel, + mainRel, + indexInfo, + true, /* allow syncscan */ + true, + vci_build_callback, + (void *) &comContext, NULL); + indexInfo->ii_BrokenHotChain = true; + FinalizeBuild(&comContext); + + vci_FinRosCommandContext(&comContext, false); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(memCtxWos2Ros); + + return result; +} + +static void +RemoveWosEntries(vci_RosCommandContext *comContext, WosKind wos_kind) +{ + Datum value; + bool isnull; + Relation rel; + Oid wos_oid; + Tuplesortstate *sortstate = NULL; + + switch (wos_kind) + { + case WOS_Data: + wos_oid = vci_GetMainRelVar(&comContext->info, vcimrv_data_wos_oid, 0); + sortstate = comContext->data_wos_del_list; + break; + + case WOS_Whiteout: + wos_oid = vci_GetMainRelVar(&comContext->info, vcimrv_whiteout_wos_oid, 0); + sortstate = comContext->whiteout_wos_del_list; + break; + default: + wos_oid = InvalidOid; 
+ break; + } + + tuplesort_performsort(sortstate); + + rel = relation_open(wos_oid, RowExclusiveLock); + + while (tuplesort_getdatum(sortstate, true, true, &value, &isnull, NULL)) + { + ItemPointer tid; + + tid = DatumGetItemPointer(value); + + simple_heap_delete(rel, tid); + } + + RelationSetTargetBlock(rel, InvalidBlockNumber); + + relation_close(rel, RowExclusiveLock); +} + +static uint64 +cleanUpWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType) +{ + const LOCKMODE lockmode = ShareUpdateExclusiveLock; + vci_MainRelHeaderInfo *info; + BlockNumber nblocks; + BlockNumber blkno; + OffsetNumber offnum; + ItemPointer dead_tuples; + int max_dead_tuples; + int tupindex; + uint64 total_live = 0; + + HeapTupleData tuple; + + Oid oidWosType; + TransactionId oldestXmin; + Relation rel; + + info = &comContext->info; + + oldestXmin = comContext->oldestXmin; + + oidWosType = vci_GetMainRelVar(info, wosType, 0); + + rel = table_open(oidWosType, lockmode); + + max_dead_tuples = MaxHeapTuplesPerPage; + dead_tuples = palloc0(sizeof(ItemPointerData) * max_dead_tuples); + + nblocks = RelationGetNumberOfBlocks(rel); + for (blkno = 0; blkno < nblocks; blkno++) + { + Size freespace; + int num_dead_tuples = 0; + TransactionId snapshotConflictHorizon = InvalidTransactionId; + + Buffer buffer; + Buffer vmbuffer = InvalidBuffer; + Page page; + OffsetNumber maxoff; + + OffsetNumber unused[MaxOffsetNumber]; + int uncnt = 0; + bool is_visible_page = true; + + /* Get a buffer containing the target block. */ + buffer = ReadBuffer(rel, blkno); + page = BufferGetPage(buffer); + + if (!ConditionalLockBufferForCleanup(buffer)) + { + ReleaseBuffer(buffer); + continue; + } + + /* Collect removable dead tuples in the target block. 
*/ + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + /* Unused items require no processing, but we count 'em */ + if (!ItemIdIsUsed(itemid)) + continue; + + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + continue; + + ItemPointerSet(&(tuple.t_self), blkno, offnum); + + /* + * DEAD item pointers are to be vacuumed normally; but we don't + * count them in tups_vacuumed, else we'd be double-counting (at + * least in the common case where heap_page_prune() just freed up + * a non-HOT tuple). + */ + if (ItemIdIsDead(itemid)) + { + dead_tuples[num_dead_tuples++] = tuple.t_self; + continue; + } + + Assert(ItemIdIsNormal(itemid)); + + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = RelationGetRelid(rel); + + switch (HeapTupleSatisfiesVacuum(&tuple, oldestXmin, buffer)) + { + case HEAPTUPLE_DEAD: + dead_tuples[num_dead_tuples++] = tuple.t_self; + HeapTupleHeaderAdvanceConflictHorizon(tuple.t_data, + &snapshotConflictHorizon); + break; + case HEAPTUPLE_LIVE: + ++total_live; + break; + case HEAPTUPLE_RECENTLY_DEAD: + case HEAPTUPLE_INSERT_IN_PROGRESS: + case HEAPTUPLE_DELETE_IN_PROGRESS: + break; + default: + elog(ERROR, "unexpected HeapTupleSatisfiesVacuum result"); + break; + } + } + + if (num_dead_tuples == 0) + { + /* + * Skip repair of a fragmentation, because dead tuple is not + * exist. 
+ */ + UnlockReleaseBuffer(buffer); + continue; + } + + visibilitymap_pin(rel, blkno, &vmbuffer); + + /* + * this routine is copied from lazy_vacuum_heap() & + * lazy_vacuum_page(), + */ + /* and modified */ + + START_CRIT_SECTION(); + + for (tupindex = 0; tupindex < num_dead_tuples; tupindex++) + { + BlockNumber tblk; + OffsetNumber toff; + ItemId itemid; + + HeapTupleHeader htup; + + tblk = ItemPointerGetBlockNumber(&dead_tuples[tupindex]); + if (tblk != blkno) + break; /* past end of tuples for this block */ + toff = ItemPointerGetOffsetNumber(&dead_tuples[tupindex]); + + itemid = PageGetItemId(page, toff); + if (!ItemIdHasStorage(itemid)) + continue; + + htup = (HeapTupleHeader) PageGetItem(page, itemid); + dead_tuples[tupindex] = *(ItemPointer) ((char *) htup + htup->t_hoff); + + ItemIdSetUnused(itemid); + + unused[uncnt++] = toff; + } + + PageRepairFragmentation(page); + + /* Mark buffer dirty before we write WAL. */ + MarkBufferDirty(buffer); + + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId itemid = PageGetItemId(page, offnum); + + if (ItemIdIsUsed(itemid)) + { + is_visible_page = false; + break; + } + } + + if (BufferIsValid(vmbuffer)) + { + if (is_visible_page) + { + PageSetAllVisible(page); + MarkBufferDirty(buffer); + visibilitymap_set(rel, blkno, buffer, InvalidXLogRecPtr, + vmbuffer, InvalidTransactionId, VISIBILITYMAP_ALL_VISIBLE); + } + + ReleaseBuffer(vmbuffer); + } + + /* XLOG stuff */ + if (RelationNeedsWAL(rel)) + { + xl_heap_prune xlrec; + XLogRecPtr recptr; + + xlrec.flags = 0; + + XLogBeginInsert(); + + XLogRegisterBuffer(0, buffer, REGBUF_STANDARD); + + if (uncnt > 0) + XLogRegisterBufData(0, unused, + uncnt * sizeof(OffsetNumber)); + if (RelationIsAccessibleInLogicalDecoding(rel)) + xlrec.flags |= XLHP_IS_CATALOG_REL; + if (TransactionIdIsValid(snapshotConflictHorizon)) + xlrec.flags |= XLHP_HAS_CONFLICT_HORIZON; + + XLogRegisterData(&xlrec, SizeOfHeapPrune); + if 
(TransactionIdIsValid(snapshotConflictHorizon)) + XLogRegisterData(&snapshotConflictHorizon, sizeof(TransactionId)); + + recptr = XLogInsert(RM_HEAP2_ID, XLOG_HEAP2_PRUNE_ON_ACCESS); + + PageSetLSN(page, recptr); + } + + END_CRIT_SECTION(); + + freespace = PageGetHeapFreeSpace(page); + + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + + RecordPageWithFreeSpace(rel, blkno, freespace); + + /* + * in vci_UnregisterTIDFromTIDTree(), TidTree in memory will be + * rebuild when the size was too large to store in memory, and the + * size is reduced to fit to the memory size. At that time, data WOS + * will be scan to obtain TID list. So, vci_UnregisterTIDFromTIDTree() + * can not be called in the critical section above. + */ + } + + pfree(dead_tuples); + table_close(rel, lockmode); + + return total_live; +} + +/** + * generate local ROS. + * This function is assumed to be called in backend process, not parallel + * background worker. Here, vci_CSFetchContext is used unlocalized. 
+ */
+vci_local_ros_t *
+vci_GenerateLocalRos(vci_CSQueryContext queryContext,	/* query context; receives the local ROS and the delete list */
+					 Size workareaSize,	/* work area budget; one quarter is passed to the ROS command context */
+					 int64 numDataWosRows,	/* number of data WOS rows passed to the command context */
+					 int64 numWhiteoutWosRows)	/* number of whiteout WOS rows passed to the command context */
+{
+	vci_RosCommandContext comContext;
+	int			numRowsInExtent;
+	MemoryContext localMemCtx;
+	MemoryContext sharedMemCtx;
+	MemoryContext oldMemCtx;
+	vci_local_ros_t *result;
+	Size		partedWorkareaSize = workareaSize / 4;
+	int64		numLocalDeleteListRows;
+	/*
+	 * Outline:
+	 *	1. Allocate the result, its fetch context and the local delete list in
+	 *	   a new context created under queryContext->shared_memory_context;
+	 *	   that context is recorded in result->memory_context.
+	 *	2. Set up the ROS command context while a scratch context under
+	 *	   TopTransactionContext is current, then switch to the shared context
+	 *	   for the conversion itself.
+	 *	3. Publish the extents and the sorted delete list through
+	 *	   queryContext, finish the command context, delete the scratch
+	 *	   context and return the result.
+	 */
+	numRowsInExtent = vci_GetNumRowsInLocalRosExtent(queryContext->num_columns);
+
+	sharedMemCtx = AllocSetContextCreate(queryContext->shared_memory_context,
+										 "Work for Local ROS generation",
+										 ALLOCSET_DEFAULT_SIZES);
+
+	result = MemoryContextAllocZero(sharedMemCtx, sizeof(vci_local_ros_t));
+	result->num_local_extents = 0;
+	result->extent = NULL;
+	result->memory_context = sharedMemCtx;
+	result->fetch_context = vci_CSCreateFetchContextBase(queryContext,
+														 Min(numRowsInExtent, numDataWosRows),
+														 queryContext->num_columns,
+														 queryContext->attr_num,
+														 true,
+														 true,
+														 true,
+														 false);	/* no compression */
+	/* From here on use the row count the fetch context actually settled on. */
+	numRowsInExtent = result->fetch_context->num_rows_read_at_once;
+
+	Assert(queryContext == result->fetch_context->query_context);
+
+	/*
+	 * Local Delete List: one zeroed slot per data WOS row plus one per
+	 * whiteout WOS row; num_entry starts at 0 and length holds the capacity.
+	 */
+	numLocalDeleteListRows = numDataWosRows + numWhiteoutWosRows;
+
+	result->local_delete_list.crid_list =
+		MemoryContextAllocZero(result->memory_context,
+							   sizeof(*(result->local_delete_list.crid_list)) * numLocalDeleteListRows);
+	result->local_delete_list.num_entry = 0;
+	result->local_delete_list.length = numLocalDeleteListRows;
+
+	Assert(0 == ((uintptr_t) (result->local_delete_list.crid_list) & (MAXIMUM_ALIGNOF - 1)));
+	/* Scratch context for the command context; deleted before returning. */
+	localMemCtx = AllocSetContextCreate(TopTransactionContext,
+										"Work for Local ROS generation",
+										ALLOCSET_DEFAULT_SIZES);
+	oldMemCtx = MemoryContextSwitchTo(localMemCtx);
+
+	vci_InitRosCommandContext0(&comContext, queryContext->info->rel,
+							   vci_rc_generate_local_ros);
+	vci_InitRosCommandContext1(&comContext,
+							   partedWorkareaSize,
+							   numDataWosRows, numWhiteoutWosRows,
+							   true);
+
+	vci_InitRosChunkStroageAndBuffer(&comContext, false /* no append */ );
+
+	comContext.inclusiveXid = queryContext->inclusive_xid;
+	comContext.exclusiveXid = queryContext->exclusive_xid;
+
+	Assert(queryContext->num_data_wos_entries <= VCI_NUM_ROWS_IN_EXTENT * VCI_MAX_NUMBER_UNCONVERTED_ROS);
+	Assert(queryContext->num_whiteout_wos_entries <= VCI_NUM_ROWS_IN_EXTENT * VCI_MAX_NUMBER_UNCONVERTED_ROS);
+	/*
+	 * NOTE(review): the counts used here come from queryContext, not from
+	 * numDataWosRows / numWhiteoutWosRows; presumably the caller passes the
+	 * same values -- confirm.
+	 */
+	constructTidArray(&comContext,
+					  (int) queryContext->num_data_wos_entries,
+					  (int) queryContext->num_whiteout_wos_entries);
+
+	comContext.numRowsToConvert = Min(comContext.numRowsToConvert,
+									  numRowsInExtent);
+	comContext.local_ros = result;
+	queryContext->local_ros = result;
+	/* The conversion runs with the shared context as the current context. */
+	MemoryContextSwitchTo(sharedMemCtx);
+
+	PG_TRY();
+	{
+		ConvertWos2LocalRos(&comContext);
+
+		comContext.local_ros = result;
+
+		ConvertWhiteOut2LocalDeleteList(&comContext,
+										result->fetch_context->query_context->tid_crid_diff_sel);
+		/* Sort the collected CRIDs in place using CmpUint64. */
+		qsort(result->local_delete_list.crid_list,
+			  result->local_delete_list.num_entry,
+			  sizeof(uint64),
+			  CmpUint64);
+		/* Publish the results through the query context. */
+		queryContext->local_ros = result;
+		queryContext->num_local_ros_extents = result->num_local_extents;
+		queryContext->delete_list = comContext.local_ros->local_delete_list.crid_list;
+		queryContext->num_delete = comContext.local_ros->local_delete_list.num_entry;
+	}
+	PG_CATCH();
+	{
+		/*
+		 * Only an out-of-memory error finishes the command context and drops
+		 * the scratch context here; every other error is re-thrown as is.
+		 * The shared context is not deleted on either path.
+		 * NOTE(review): presumably the remaining cleanup is left to
+		 * transaction abort and to the owner of the query context -- confirm.
+		 */
+		if (geterrcode() == ERRCODE_OUT_OF_MEMORY)
+		{
+			vci_FinRosCommandContext(&comContext, true /* never write */ );
+
+			MemoryContextSwitchTo(oldMemCtx);
+			MemoryContextDelete(localMemCtx);
+		}
+
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	vci_FinRosCommandContext(&comContext, false);
+	/* Restore the caller's context and release the scratch context. */
+	MemoryContextSwitchTo(oldMemCtx);
+	MemoryContextDelete(localMemCtx);
+
+	return result;
+}
+
+/**
+ * in vci_DestroyLocalRos(), release the memory context allocated to the
+ * local ros pointed by localRos.
+ * We have not need to pfree() each element.
+ */
+void
+vci_DestroyLocalRos(vci_local_ros_t *localRos)
+{
+	MemoryContext memCtx;
+
+	Assert(localRos);
+	memCtx = localRos->memory_context;	/* fetched first: the delete below is the last use of localRos here */
+	MemoryContextDelete(memCtx);
+}
+/*
+ * vci_CountFreezedInDataWos
+ *
+ * Count the tuples that a scan of the data WOS relation of mainRel returns
+ * under the snapshot obtained from vci_GetSnapshotForWos2Ros().
+ *
+ * mainRel is the VCI main relation; the data WOS OID is read from its header.
+ * workareaSize is not used.
+ * Returns the number of tuples the scan returned.
+ */
+uint32
+vci_CountFreezedInDataWos(Relation mainRel, Size workareaSize)
+{
+	uint32		count = 0;
+	vci_MainRelHeaderInfo infoData = {0};
+	vci_MainRelHeaderInfo *info = &infoData;
+
+	Oid			dataWosOid;
+	Relation	dataWosRel;
+
+	TableScanDesc scan;
+	HeapTuple	tuple;
+	Snapshot	snapshot;
+
+	vci_InitMainRelHeaderInfo(info, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(info);
+
+	dataWosOid = (Oid) vci_GetMainRelVar(info, vcimrv_data_wos_oid, 0);
+	dataWosRel = table_open(dataWosOid, AccessShareLock);
+
+	snapshot = vci_GetSnapshotForWos2Ros();
+
+	scan = table_beginscan(dataWosRel, snapshot, 0, NULL);
+	scan->rs_flags &= ~SO_ALLOW_PAGEMODE;	/* clear the page-mode flag on this scan */
+
+	while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	{
+		count++;
+	}
+	table_endscan(scan);
+	/* NOTE(review): presumably pairs with a push done inside vci_GetSnapshotForWos2Ros() -- confirm */
+	PopActiveSnapshot();
+
+	/* release the data WOS relation */
+	table_close(dataWosRel, AccessShareLock);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(info);
+
+	return count;
+}
+
+/* --------------------------------------------------------------*/
+/* Update Delete Lists */
+/* --------------------------------------------------------------*/
+/*
+ * vci_CountFreezedInWhiteoutWos
+ *
+ * Same as vci_CountFreezedInDataWos(), but scans the whiteout WOS relation
+ * whose OID is read from the main relation header.  workareaSize is not used.
+ */
+uint32
+vci_CountFreezedInWhiteoutWos(Relation mainRel, Size workareaSize)
+{
+	uint32		count = 0;
+	vci_MainRelHeaderInfo infoData = {0};
+	vci_MainRelHeaderInfo *info = &infoData;
+
+	Oid			whiteoutWosOid;
+	Relation	whiteoutWosRel;
+
+	TableScanDesc scan;
+	HeapTuple	tuple;
+	Snapshot	snapshot;
+
+	vci_InitMainRelHeaderInfo(info, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(info);
+
+	whiteoutWosOid = (Oid) vci_GetMainRelVar(info, vcimrv_whiteout_wos_oid, 0);
+	whiteoutWosRel = table_open(whiteoutWosOid, AccessShareLock);
+
+	snapshot = vci_GetSnapshotForWos2Ros();
+
+	scan = table_beginscan(whiteoutWosRel, snapshot, 0, NULL);
+	scan->rs_flags &= ~SO_ALLOW_PAGEMODE;	/* clear the page-mode flag on this scan */
+	while
((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL)
+	{
+		count++;
+	}
+	table_endscan(scan);
+
+	PopActiveSnapshot();
+
+	/* release the whiteout WOS relation */
+	table_close(whiteoutWosRel, AccessShareLock);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(info);
+
+	return count;
+}
+
+/*
+ * UpdateDelVec
+ *
+ * Move up to numRowsAtOnce whiteout WOS entries into the delete vector.
+ *
+ * Phase 1 takes entries with get_entry_into_tid_list(), resolves each
+ * original TID to a CRID through the TID-CRID update list currently selected
+ * in the main relation header, and merges the TIDs into the other update
+ * list (selector 1 ^ old) mapped to VCI_INVALID_CRID.
+ * Phase 2 walks the CRIDs in sorted order, sets the matching bit in the
+ * delete-vector column, and adds the per-extent counts to the extent info of
+ * the main relation.
+ *
+ * comContext		ROS command context
+ * workareaSize		memory budget in bytes; each tuplesort is started with
+ *					Min(workareaSize / 1024 / 2, INT_MAX) as its work memory
+ * numRowsAtOnce	upper bound on the number of entries processed
+ *
+ * Returns the number of entries processed; 0 when num_delvec_tids is 0.
+ * Reports ERRCODE_DATA_CORRUPTED when a TID does not resolve to a valid CRID.
+ */
+static uint64
+UpdateDelVec(vci_RosCommandContext *comContext, Size workareaSize, uint64 numRowsAtOnce)
+{
+	uint32		numExtents;
+	Tuplesortstate *cridList;
+	uint64		result = 0;
+
+	if (comContext->num_delvec_tids == 0)
+		return 0;
+
+	numExtents = vci_GetMainRelVar(&comContext->info, vcimrv_num_extents, 0);
+
+	cridList =
+		tuplesort_begin_datum(INT8OID, Int8LessOperator, InvalidOid, false,
+							  Min(workareaSize / 1024 / 2, INT_MAX), NULL, TUPLESORT_NONE);
+
+	/*
+	 * Phase 1. Convert TID List -> CRID List
+	 */
+	do
+	{
+		vci_TidCridUpdateListContext *oldListContext;
+		Tuplesortstate *addList;
+
+		uint32		oldSel;
+		uint32		newSel;
+
+		oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0);
+		newSel = 1 ^ oldSel;
+
+		oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel);
+
+		addList =
+			tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false,
+								  Min(workareaSize / 1024 / 2, INT_MAX), NULL, TUPLESORT_NONE);
+		while (result < numRowsAtOnce)
+		{
+			ItemPointerData orig_tid;
+			ItemPointerData wos_tid;
+			uint64		cridUint;
+
+			if (!get_entry_into_tid_list(comContext, WOS_Whiteout, &orig_tid, &wos_tid))
+				break;
+
+			/* remember the WOS TID when a whiteout deletion list is attached */
+			if (comContext->whiteout_wos_del_list)
+				tuplesort_putdatum(comContext->whiteout_wos_del_list, ItemPointerGetDatum(&wos_tid), false);
+
+			cridUint = vci_GetCridFromTid(oldListContext, &orig_tid, NULL);
+
+			/* BlockNumber is unsigned, so the TID is printed with %u */
+			if (cridUint == VCI_INVALID_CRID)
+				ereport(ERROR,
+						(errcode(ERRCODE_DATA_CORRUPTED),
+						 errmsg("vci index \"%s\" corrupted", RelationGetRelationName(comContext->info.rel)),
+						 errdetail("try to delete TID (%u,%u) into delete vector twice",
+								   ItemPointerGetBlockNumber(&orig_tid),
+								   ItemPointerGetOffsetNumber(&orig_tid)),
+						 errhint("Use DROP INDEX \"%s\"", RelationGetRelationName(comContext->info.rel))));
+
+			/* list for storage */
+			tuplesort_putdatum(addList, ItemPointerGetDatum(&orig_tid), false);
+
+			/* list for operation */
+			tuplesort_putdatum(cridList, Int64GetDatum((int64) cridUint), false);
+
+			result++;
+		}
+
+		vci_CloseTidCridUpdateList(oldListContext);
+
+		tuplesort_performsort(addList);
+
+		/* Insert TID->CRID(Invalid) List */
+		vci_MergeAndWriteTidCridUpdateList(&comContext->info, newSel, oldSel, addList, vci_GetCridFromUint64(VCI_INVALID_CRID));
+
+		tuplesort_end(addList);
+
+	} while (false);			/* phase1 */
+
+	elog(DEBUG2, "CRID List OK");
+
+	/*
+	 * Phase 2. loop for crid
+	 */
+	do
+	{
+		LOCKMODE	lockmode = RowExclusiveLock;
+		vci_ColumnRelations delvecCol;
+
+		BlockNumber prevBlkno = InvalidBlockNumber;
+		OffsetNumber prevOffset = InvalidOffsetNumber;
+
+		Buffer		buffer = InvalidBuffer;
+		Page		page = NULL;
+
+		bool		readFirstBlock = false;
+		Datum		value;
+		bool		isnull;
+
+		/* deleted-row counters, indexed by (extentId - topExtentId) */
+		uint32		numDeletedRows[VCI_MAX_PAGE_SPACE / sizeof(vcis_m_extent_t)];
+		int32		topExtentId = -1;
+		BlockNumber topBlockNumber = InvalidBlockNumber;
+
+		memset(numDeletedRows, 0, sizeof(numDeletedRows));
+
+		tuplesort_performsort(cridList);
+
+		vci_OpenColumnRelations(&delvecCol, &comContext->info,
+								VCI_COLUMN_ID_DELETE, lockmode);
+
+		while (tuplesort_getdatum(cridList, true, true, &value, &isnull, NULL))
+		{
+			HeapTupleHeader htup;
+			int32		extentId;
+			BlockNumber blkno;
+			OffsetNumber offset;
+			uint32		byte_num;
+			uint32		setBitPos;
+			uint64		crid;
+			BlockNumber extentInfoBlkno;
+			OffsetNumber extentInfoOffset;
+
+			crid = (uint64) DatumGetInt64(value);
+
+			extentId = vci_CalcExtentIdFromCrid64(crid);
+			blkno = vci_CalcBlockNumberFromCrid64ForDelete(crid);
+			offset = vci_CalcOffsetNumberFromCrid64ForDelete(crid);
+			byte_num = vci_CalcByteFromCrid64ForDelete(crid);
+			setBitPos = vci_CalcBitFromCrid64ForDelete(crid);
+
+			/* moving on to another item: write out the one modified so far */
+			if ((blkno != prevBlkno) || (offset != prevOffset))
+			{
+				if (readFirstBlock)
+				{
+					/* write Tuple & WAL */
+					vci_WriteItem(delvecCol.data, buffer, prevOffset);
+				}
+			}
+
+			/* moving on to another block: swap the locked buffer */
+			if (blkno != prevBlkno)
+			{
+				if (readFirstBlock)
+					UnlockReleaseBuffer(buffer);
+
+				buffer = vci_ReadBufferWithPageInitDelVec(delvecCol.data, blkno);
+				LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+				page = BufferGetPage(buffer);
+
+				readFirstBlock = true;
+			}
+
+			/* Calc bits & overwrite */
+			htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, offset));
+			*((char *) htup + htup->t_hoff + byte_num) |= 1 << setBitPos;
+
+			vci_GetExtentInfoPosition(&extentInfoBlkno, &extentInfoOffset, extentId);
+
+			/*
+			 * The extent info of this CRID lives on another main-relation
+			 * block: flush the counters gathered for the previous block (a
+			 * no-op while topExtentId is still -1) and start a new group.
+			 */
+			if (topBlockNumber != extentInfoBlkno)
+			{
+				writeNumDeleteRowsIntoExntetInfo(&comContext->info, topExtentId, numExtents, numDeletedRows);
+
+				memset(numDeletedRows, 0, sizeof(numDeletedRows));
+
+				topExtentId = extentId;
+				topBlockNumber = extentInfoBlkno;
+			}
+
+			numDeletedRows[extentId - topExtentId]++;
+
+			prevBlkno = blkno;
+			prevOffset = offset;
+		}
+
+		/* write remaining Tuple & WAL, and release buffer */
+		if (readFirstBlock)
+		{
+			Assert(BufferIsValid(buffer));
+			vci_WriteItem(delvecCol.data, buffer, prevOffset);
+			UnlockReleaseBuffer(buffer);
+		}
+
+		/* Close Column */
+		vci_CloseColumnRelations(&delvecCol, lockmode);
+
+		/* flush the counters of the last group */
+		if (BlockNumberIsValid(topBlockNumber))
+			writeNumDeleteRowsIntoExntetInfo(&comContext->info, topExtentId, numExtents, numDeletedRows);
+
+	} while (false);			/* phase 2 */
+
+	tuplesort_end(cridList);
+
+	elog(DEBUG2, "update delvec OK");
+
+	return result;
+}
+
+/*
+ * writeNumDeleteRowsIntoExntetInfo
+ *
+ * Add numDeletedRows[extentId - topExtentId] to num_deleted_rows of every
+ * extent from topExtentId up to numExtents whose extent info lies on the same
+ * main-relation block as that of topExtentId.  Does nothing when topExtentId
+ * is negative.
+ */
+static void
+writeNumDeleteRowsIntoExntetInfo(vci_MainRelHeaderInfo *info, int32 topExtentId, uint32 numExtents, uint32 *numDeletedRows)
+{
+	BlockNumber topBlockNumber;
+	OffsetNumber topOffsetNumber;
+	Buffer		buffer;
+	Page		page;
+	int32		extentId;
+
+	if (topExtentId < 0)
+		return;
+
+	vci_GetExtentInfoPosition(&topBlockNumber, &topOffsetNumber, topExtentId);
+
+	buffer = vci_ReadBufferWithPageInit(info->rel, topBlockNumber);
+
+	/* LockBuffer(buffer, BUFFER_LOCK_SHARE); */
+	
LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);	/* exclusive: num_deleted_rows is updated in place below */
+
+	page = BufferGetPage(buffer);
+	/* Walk forward from topExtentId and stop at the first extent on another block. */
+	for (extentId = topExtentId; (uint32) extentId < numExtents; extentId++)
+	{
+		BlockNumber curBlockNumber;
+		OffsetNumber curOffsetNumber;
+		vcis_m_extent_t *extentInfo;
+
+		vci_GetExtentInfoPosition(&curBlockNumber, &curOffsetNumber, extentId);
+
+		if (curBlockNumber != topBlockNumber)
+			break;
+		/* curOffsetNumber is applied as a byte offset from the start of the page */
+		extentInfo = (vcis_m_extent_t *) &(((char *) page)[curOffsetNumber]);
+
+		extentInfo->num_deleted_rows += numDeletedRows[extentId - topExtentId];
+	}
+
+	vci_WriteOneItemPage(info->rel, buffer);
+
+	UnlockReleaseBuffer(buffer);
+}
+/*
+ * vci_UpdateDelVec
+ *
+ * Run the update-delete-vector ROS command on mainRel.
+ *
+ * workareaSize	memory budget; half is given to each of
+ *				vci_InitRosCommandContext1() and vci_InitRosCommandContext2(),
+ *				and half is passed on to UpdateDelVec()
+ * numRows		number of whiteout WOS rows requested; the count handed to
+ *				UpdateDelVec() is capped at VCI_NUM_ROWS_IN_EXTENT
+ *
+ * Returns what UpdateDelVec() returned, converted to int, or -1 when the
+ * current transaction ID precedes the current ROS version stored in the main
+ * relation, in which case nothing is updated.
+ */
+int
+vci_UpdateDelVec(Relation mainRel, Size workareaSize, int numRows)
+{
+	int			result = -1;
+	MemoryContext memCtxWos2Ros;
+	MemoryContext oldMemCtx;
+	vci_RosCommandContext comContext;
+
+	vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_update_del_vec);
+
+	/* recover ROS if necessary */
+	vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+	/* Change Mem Context */
+	memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+										  "Delete Vector Update.",
+										  ALLOCSET_DEFAULT_SIZES);
+	oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+	/* Create TID List from Whiteout WOS */
+	vci_InitRosCommandContext1(&comContext,
+							   workareaSize / 2,
+							   0, numRows,
+							   false);
+
+	vci_InitRosCommandContext2(&comContext, workareaSize / 2);
+	/* Skip the command when this transaction is older than the current ROS version. */
+	if (TransactionIdPrecedes(GetCurrentTransactionId(),
+							  (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+		goto done;
+
+	GetActiveSnapshot();		/* return value unused. NOTE(review): presumably kept only as a check that an active snapshot exists -- confirm */
+
+	/* Write Recovery Information */
+	vci_WriteRecoveryRecordForUpdateDelVec(&comContext.info);
+
+	vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+	constructTidSortState(&comContext);
+
+	/* call Main routine */
+	result = UpdateDelVec(&comContext, workareaSize / 2, Min(numRows, VCI_NUM_ROWS_IN_EXTENT));
+
+	/* Clean up WOS entry */
+	cleanUpWos(&comContext, vcimrv_data_wos_oid);
+	cleanUpWos(&comContext, vcimrv_whiteout_wos_oid);
+
+	/* Xmax WOS entry */
+	RemoveWosEntries(&comContext, WOS_Data);
+	RemoveWosEntries(&comContext, WOS_Whiteout);
+
+done:
+	/* Finalize ROS */
+	vci_FinRosCommandContext(&comContext, false);
+
+	MemoryContextSwitchTo(oldMemCtx);
+	MemoryContextDelete(memCtxWos2Ros);
+
+	return result;
+}
+
+/* -------------------------------------------------------------- */
+/* Correction Deleted Rows */
+/* -------------------------------------------------------------- */
+/*
+ * CountExtents
+ *
+ * Scan the extent entries of the main relation, count those that qualify and
+ * remember the ID of one of them.  Free extents never qualify.
+ *
+ * kind == CEK_CountDeletedRows: an extent qualifies when it is visible for
+ * the oldest non-removable transaction ID, its xdel is not valid and
+ * num_deleted_rows >= threshold; best_extent_id is the qualifying extent with
+ * the largest num_deleted_rows (the later one on ties, because of "<=").
+ * Any other kind: an extent qualifies when vci_ExtentIsCollectable() says so;
+ * best_extent_id is the last such extent and threshold is ignored.
+ *
+ * The result starts out as {0, -1}.
+ */
+static vci_target_extent_info_t
+CountExtents(Relation mainRel, uint32 threshold, CEKind kind)
+{
+	TransactionId wos2rosXid;
+
+	vci_MainRelHeaderInfo infoData = {0};
+	vci_MainRelHeaderInfo *info = &infoData;
+
+	vcis_m_extent_t *extentInfo;
+	vci_meta_item_scanner_t *scan;
+
+	vci_target_extent_info_t result = {0, -1 /* not-found-value */ };
+	uint32		max_deleted_rows = 0;
+
+	wos2rosXid = GetOldestNonRemovableTransactionId(mainRel);
+
+	vci_InitMainRelHeaderInfo(info, mainRel, vci_rc_probe);
+	vci_KeepMainRelHeader(info);
+	vci_GetMainRelVar(info, vcimrv_num_extents, 0);	/* value is discarded. NOTE(review): looks like a leftover -- confirm */
+
+	scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_SHARE);
+	while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL)
+	{
+		if (kind == CEK_CountDeletedRows)
+		{
+			if (vci_ExtentIsFree(extentInfo))
+				continue;
+
+			if (!vci_ExtentIsVisible(extentInfo, wos2rosXid))
+				continue;
+
+			if (TransactionIdIsValid(extentInfo->xdel))
+				continue;
+
+			if (extentInfo->num_deleted_rows >= threshold)
+			{
+				if (max_deleted_rows <= extentInfo->num_deleted_rows)
+				{
+					result.best_extent_id = scan->index;
+					max_deleted_rows = extentInfo->num_deleted_rows;
+				}
+				result.num_fit_extents++;
+			}
+		}
+		else
+		{
+			if (vci_ExtentIsFree(extentInfo))
+				continue;
+
+			if (vci_ExtentIsCollectable(extentInfo, wos2rosXid))
+			{
+				result.best_extent_id = scan->index;
+				result.num_fit_extents++;
+			}
+		}
+	}
+	vci_EndMetaItemScan(scan);
+
+	/* release the main relation */
+	vci_ReleaseMainRelHeader(info);
+
+	return result;
+}
+
+vci_target_extent_info_t
+vci_CountDeletedRowsInROS(Relation mainRel, uint32 threshold)	/* CountExtents() in CEK_CountDeletedRows mode */
+{
+	return CountExtents(mainRel, threshold, CEK_CountDeletedRows);
+}
+/*
+ * getTupleFromVector
+ *
+ * Build a heap tuple from row "offset" of the columnar virtual tuples in
+ * vecSet.  Column cId of vecSet is stored at attribute position
+ * queryContext->column_id[fetchContext->column_link[cId]].  The TID kept in
+ * vecSet for that row is copied into t_self of the new tuple.
+ *
+ * Returns the tuple produced by heap_form_tuple().
+ *
+ * NOTE(review): values[] and isnull[] are MaxAttrNumber-sized stack arrays
+ * and only the slots hit by the mapping are written; presumably the mapping
+ * covers every attribute of tupleDesc -- confirm.
+ */
+static HeapTuple
+getTupleFromVector(int offset,
+				   TupleDesc tupleDesc,
+				   vci_virtual_tuples_t *vecSet)
+{
+	HeapTuple	result;
+	vci_CSFetchContext fetchContext = vecSet->fetch_context;
+	vci_CSQueryContext queryContext = fetchContext->query_context;
+	Datum		values[MaxAttrNumber];
+	bool		isnull[MaxAttrNumber];
+	int			cId;
+
+	Assert((0 <= offset) && (offset < vecSet->num_rows));
+	Assert(tupleDesc->natts == vecSet->num_columns);
+	for (cId = 0; cId < vecSet->num_columns; ++cId)
+	{
+		int			tgtId = queryContext->column_id[fetchContext->column_link[cId]];
+
+		Assert((0 <= tgtId) && (tgtId < queryContext->num_columns));
+		values[tgtId] = vci_CSGetValuesOfVirtualTupleColumnar(vecSet, cId)[offset];
+		isnull[tgtId] = vci_CSGetIsNullOfVirtualTupleColumnar(vecSet, cId)[offset];
+	}
+	result = heap_form_tuple(tupleDesc, values, isnull);
+#ifdef __s390x__
+	result->t_self = vci_CSGetTidInItemPointerFromVirtualTuples(vecSet, offset);
+#else
+	result->t_self = *(vci_CSGetTidInItemPointerFromVirtualTuples(vecSet, offset));
+#endif
+
+	return result;
+}
+/*
+ * FillOneRosChunkBufferFromExtent
+ *
+ * Read rows of extent extentId, starting at *rowIdInExtent, and append them
+ * to comContext->buffer until the buffer holds numRowsAtOnce rows.
+ * *rowIdInExtent is advanced past every row consumed or skipped.
+ *
+ * comContext->done is set when the extent does not exist or is not visible,
+ * when *rowIdInExtent reaches VCI_NUM_ROWS_IN_EXTENT, or when a fetch returns
+ * no rows.  The query and fetch contexts are created and destroyed on every
+ * call, and comContext->info.command is restored on exit.
+ */
+static void
+FillOneRosChunkBufferFromExtent(vci_RosCommandContext *comContext,
+								int32 extentId,
+								uint32 *rowIdInExtent)
+{
+	vci_CSQueryContext queryContext;
+	vci_CSFetchContext fetchContext;
+	vci_CSFetchContext localContext;
+	vci_virtual_tuples_t *vectorSet = NULL;
+
+	TupleDesc	tupleDesc;
+	AttrNumber *tableAttrNumList;
+	AttrNumber *fetchAttrNumList;
+	int			colId;
+	int			numFetchRowsAtOnce = Min(comContext->numRowsAtOnce, VCI_MAX_NUM_ROW_TO_FETCH);
+	vci_ros_command_t saveCommand1;
+
+	saveCommand1 = comContext->info.command;
+
+	/* Get a descriptor of the index relation (the VCI main relation). */
+	/* This is not a descriptor of the table relation. */
+	/* It includes only the target columns of the VCI. */
+	tupleDesc = vci_GetTupleDescr(&comContext->info);
+	Assert(comContext->numColumns == tupleDesc->natts);
+
+	/* Create the pg_attribute::attnum list of the table relation for initialization, */
+	/* and the list of one-based ROS column numbers that is passed to the chunk buffer. */
+	tableAttrNumList = palloc(sizeof(AttrNumber) * comContext->numColumns);
+	fetchAttrNumList = palloc(sizeof(AttrNumber) * comContext->numColumns);
+	for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < comContext->numColumns; ++colId)
+	{
+		tableAttrNumList[colId] = comContext->heapAttrNumList[colId];
+		fetchAttrNumList[colId] = (AttrNumber) (comContext->indxColumnIdList[colId] + 1);
+	}
+
+	/* queryContext */
+	queryContext = vci_CSCreateQueryContext(RelationGetRelid(comContext->info.rel),
+											comContext->numColumns,
+											tableAttrNumList,
+											TopTransactionContext,
+											false,
+											false);
+
+	/* fetchContext */
+	fetchContext = vci_CSCreateFetchContext(queryContext,
+											numFetchRowsAtOnce,
+											comContext->numColumns,
+											tableAttrNumList,
+											true,	/* use ColumnStore */
+											true,	/* return Tid */
+											false); /* NOT return CRID */
+
+	localContext = vci_CSLocalizeFetchContext(fetchContext,
+											  CurrentMemoryContext);
+	/* Give up right away unless the source extent exists and is visible. */
+	{
+		vci_extent_status_t *status = vci_CSCreateCheckExtent(localContext);
+		bool		extent_ok;
+
+		vci_CSCheckExtent(status, localContext, extentId, true);
+
+		elog(DEBUG2, "status: %d, %d, %d, %d", status->size, status->num_rows,
+			 status->existence, status->visible);
+
+		extent_ok = status->existence && status->visible;
+
+		vci_CSDestroyCheckExtent(status);
+
+		if (!extent_ok)
+		{
+			comContext->done = true;
+			goto done;
+		}
+	}
+
+	/* VectorSet */
+	vectorSet = vci_CSCreateVirtualTuples(localContext);
+
+	{
+		while (comContext->buffer.numFilled < comContext->numRowsAtOnce)
+		{
+			/* int numFetchRows; */
+			int			numRead;
+			int			offset;
+
+			if ((*rowIdInExtent) >= VCI_NUM_ROWS_IN_EXTENT)
+			{
+				comContext->done = true;
+				goto done;
+			}
+
+			/*
+			 * if (((*rowIdInExtent) + numFetchRowsAtOnce) <=
+			 * VCI_NUM_ROWS_IN_EXTENT) numFetchRows = numFetchRowsAtOnce; else
+			 * numFetchRows = VCI_NUM_ROWS_IN_EXTENT - (*rowIdInExtent);
+			 */
+
+			/* FIXME: Does it need to use numFetchRows?? */
+			/* let the vci_CSFetchVirtualTuples optimize the number of rows */
+			numRead = vci_CSFetchVirtualTuples(vectorSet,
+											   vci_CalcCrid64(extentId, *rowIdInExtent),
+											   numFetchRowsAtOnce);
+
+			if (numRead < 1)
+			{
+				comContext->done = true;
+				goto done;
+			}
+
+			/* Read fetched data as HeapTuple */
+			for (offset = 0; offset < numRead; ++offset)
+			{
+				HeapTuple	tuple = NULL;
+				uint16		skip = vci_CSGetSkipFromVirtualTuples(vectorSet)[offset];
+				/* A non-zero skip count jumps over that many rows in one step. */
+				if (0 < skip)
+				{
+					(*rowIdInExtent) += skip;
+					offset += skip - 1;
+					continue;
+				}
+
+				tuple = getTupleFromVector(offset, tupleDesc, vectorSet);
+				(*rowIdInExtent) += 1;
+
+				if (tuple != NULL)
+				{
+					/* ... and register to ROS Chunk. */
+					vci_FillOneRowInRosChunkBuffer(&(comContext->buffer),
+												   &(comContext->info),
+												   &tuple->t_self,
+												   tuple,
+												   comContext->indxColumnIdList,
+												   fetchAttrNumList,
+												   tupleDesc);
+					if (comContext->buffer.numFilled == comContext->numRowsAtOnce)
+						break;
+				}
+				else			/* NOTE(review): getTupleFromVector() returns heap_form_tuple()'s result, so this branch looks unreachable -- confirm */
+				{
+					Assert(false);
+					elog(LOG, "internal error: CDR command failed");
+				}
+			}
+		}
+	}
+
+done:
+	if (vectorSet)
+		vci_CSDestroyVirtualTuples(vectorSet);
+	vci_CSDestroyFetchContext(localContext);
+	vci_CSDestroyFetchContext(fetchContext);
+	vci_CSDestroyQueryContext(queryContext);
+
+	pfree(fetchAttrNumList);
+	pfree(tableAttrNumList);
+
+	comContext->info.command = saveCommand1;
+}
+/*
+ * isCdrTargetExtentValid
+ *
+ * Check that comContext->extentIdSrc can be used as the source extent of the
+ * collect-deleted-rows command: it must differ from the destination extent,
+ * lie below the number of extents, be visible for comContext->wos2rosXid and
+ * have no valid xdel.
+ */
+static bool
+isCdrTargetExtentValid(vci_RosCommandContext *comContext)
+{
+	bool		result;
+	uint32		numExtents;
+	vcis_m_extent_t *extentInfo;
+	Buffer		buffer = InvalidBuffer;
+
+	if (comContext->extentId == comContext->extentIdSrc)
+		return false;
+
+	numExtents = vci_GetMainRelVar(&comContext->info, vcimrv_num_extents, 0);
+	if (numExtents <= comContext->extentIdSrc)
+		return false;
+
+	extentInfo = vci_GetMExtent(&buffer, &comContext->info, comContext->extentIdSrc);
+	LockBuffer(buffer, BUFFER_LOCK_SHARE);
+	result =
vci_ExtentIsVisible(extentInfo, comContext->wos2rosXid) && !TransactionIdIsValid(extentInfo->xdel);
+	UnlockReleaseBuffer(buffer);
+
+	return result;
+}
+/*
+ * CollectDeletedRows
+ *
+ * Copy the rows still readable from extent comContext->extentIdSrc into the
+ * new extent comContext->extentId, mark the old extent as deleted by setting
+ * its xdel to comContext->xid, top the new extent up with rows taken from the
+ * WOS, and write it out.
+ *
+ * snapshot is not used in this function.
+ *
+ * Returns the number of rows stored in the new extent.  When there is no row
+ * at all, vcimrv_new_extent_id is reset to VCI_INVALID_EXTENT_ID and 0 is
+ * returned without writing an extent.
+ */
+static int32
+CollectDeletedRows(vci_RosCommandContext *comContext, Snapshot snapshot)
+{
+	uint32		rowIdInExtent;
+
+	vcis_m_extent_t *extentInfo;
+	Buffer		buffer = InvalidBuffer;
+
+	int			numRows;
+
+	Assert(0 == (comContext->numRowsAtOnce % VCI_COMPACTION_UNIT_ROW));
+
+	/*
+	 * Set CDR data and write main relation for recovery. Header and extent
+	 * info. Here, we also put current ROS version to the actual current
+	 * transaction ID.
+	 */
+	vci_WriteExtentInfoInMainRosForWriteExtent(&comContext->info,
+											   comContext->extentId,
+											   comContext->xid,
+											   vci_rc_collect_deleted);
+
+	/* Create ROS Chunk from target Extent */
+	vci_ResetRosChunkStorage(&(comContext->storage));
+	vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+	/* collect data from old extent for new extent */
+	rowIdInExtent = 0;
+	while (!comContext->done)
+	{
+
+		CHECK_FOR_INTERRUPTS();
+
+		/* fetch the data from old extents for one chunk */
+		FillOneRosChunkBufferFromExtent(comContext,
+										comContext->extentIdSrc, &rowIdInExtent);
+
+		if (comContext->buffer.numFilled == comContext->numRowsAtOnce)
+		{
+			/* copy chunk buffer in a compact manner */
+			vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+			vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+			Assert(comContext->storage.numTotalRows <= VCI_NUM_ROWS_IN_EXTENT);
+		}
+		else
+		{
+			Assert(comContext->done);
+
+			/*
+			 * We read and fill data in unit of VCI_COMPACTION_UNIT_ROW. The
+			 * remaining data is read outside this loop to merge data read
+			 * newly from WOS.
+			 */
+		}
+	}
+	comContext->done = false;	/* reset the flag for the WOS phase below */
+
+	elog(DEBUG2, "... collected deleted extent %d -> %d", comContext->extentIdSrc,
+		 comContext->extentId);
+
+	/*
+	 * Now, reading from old extent was completed. Write Current ROS Version
+	 * to VCI main relation as the XDel of old extent.
+	 */
+	extentInfo = vci_GetMExtent(&buffer, &(comContext->info),
+								comContext->extentIdSrc);
+	LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE);
+	extentInfo->xdel = comContext->xid;
+	vci_WriteOneItemPage(comContext->info.rel, buffer);
+	UnlockReleaseBuffer(buffer);
+
+	/* Append data from WOS */
+	/* Room left in the new extent, capped at numRowsToConvert. */
+	numRows = Min((VCI_NUM_ROWS_IN_EXTENT - comContext->storage.numTotalRows
+				   - comContext->buffer.numFilled),
+				  comContext->numRowsToConvert);
+
+	if (numRows > 0)
+	{
+		fillTidListFromTidSortState(comContext, numRows);
+
+		ReadOneExtentAndStoreInChunkStorage(comContext);
+	}
+
+	/* Copy the remaining data to chunk buffer in a compact manner */
+	if (0 < comContext->buffer.numFilled)
+	{
+		vci_RegisterChunkBuffer(&(comContext->storage), &(comContext->buffer));
+		vci_ResetRosChunkBufferCounter(&(comContext->buffer));
+
+		Assert(comContext->storage.numTotalRows <= VCI_NUM_ROWS_IN_EXTENT);
+	}
+
+	/*
+	 * Update TID-CRID List, and Write Ros Chunk into new extent.
+	 */
+	comContext->numRowsToConvert = comContext->storage.numTotalRows;
+
+	if (comContext->numRowsToConvert == 0)
+	{
+
+		vci_SetMainRelVar(&comContext->info, vcimrv_new_extent_id, 0, VCI_INVALID_EXTENT_ID);
+
+		return 0;
+	}
+
+	vci_AddTidCridUpdateList(&(comContext->info),
+							 &(comContext->storage),
+							 comContext->extentId);
+	vci_WriteOneExtent(&(comContext->info),
+					   &(comContext->storage),
+					   comContext->extentId,	/* to */
+					   comContext->xid,
+					   InvalidTransactionId,
+					   comContext->xid);
+
+	return comContext->storage.numTotalRows;
+}
+/*
+ * vci_CollectDeletedRows
+ *
+ * Run the collect-deleted-rows ROS command: rewrite extent extentId of
+ * mainRel into a free extent.  Two thirds of workareaSize are given to
+ * vci_InitRosCommandContext1() and one third to vci_InitRosCommandContext2().
+ *
+ * Returns what CollectDeletedRows() returned, or -1 when the current
+ * transaction ID precedes the stored current ROS version or the source
+ * extent fails isCdrTargetExtentValid().
+ */
+int
+vci_CollectDeletedRows(Relation mainRel, Size workareaSize, int32 extentId)
+{
+	int			result = -1;
+
+	MemoryContext memCtxWos2Ros;
+	MemoryContext oldMemCtx;
+	vci_RosCommandContext comContext;
+	Snapshot	snapshot;
+
+	vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_collect_deleted);
+
+	/* execute recovery of the previous ROS command if necessary */
+	vci_RecoverOneVCIIfNecessary(&(comContext.info));
+
+	/* Change Mem Context */
+	memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext,
+										  "Collect Deleted Rows",
+										  ALLOCSET_DEFAULT_SIZES);
+	oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros);
+
+	/* CommandContext */
+	vci_InitRosCommandContext1(&comContext,
+							   workareaSize / 3 * 2,
+							   VCI_NUM_ROWS_IN_EXTENT, 0,
+							   true);
+
+	vci_InitRosCommandContext2(&comContext, workareaSize / 3);
+	/* Skip the command when this transaction is older than the current ROS version. */
+	if (TransactionIdPrecedes(GetCurrentTransactionId(),
+							  (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0)))
+		goto done;
+
+	snapshot = GetActiveSnapshot();
+
+	/* obtain new extent ID */
+	comContext.extentIdSrc = extentId;
+	comContext.extentId = vci_GetFreeExtentId(&(comContext.info));
+
+	if (!isCdrTargetExtentValid(&comContext))
+		goto done;
+
+	/* Write Recovery Information of this command. */
+	vci_WriteRecoveryRecordForExtentInfo(&comContext.info, comContext.extentId, comContext.extentIdSrc);
+	vci_InitRecoveryRecordForFreeSpace(&comContext.info);
+
+	vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid);
+
+	constructTidSortState(&comContext);
+
+	vci_InitRosChunkStroageAndBuffer(&comContext, true /* append */ );
+
+	/* call Main routine */
+	result = CollectDeletedRows(&comContext, snapshot);
+
+	cleanUpWos(&comContext, vcimrv_data_wos_oid);
+	cleanUpWos(&comContext, vcimrv_whiteout_wos_oid);
+
+	/* Xmax WOS entry */
+	RemoveWosEntries(&comContext, WOS_Data);
+	RemoveWosEntries(&comContext, WOS_Whiteout);
+
+done:
+	/* Finalize ROS */
+	vci_FinRosCommandContext(&comContext, false);
+
+	MemoryContextSwitchTo(oldMemCtx);
+	MemoryContextDelete(memCtxWos2Ros);
+
+	return result;
+}
+
+/* -------------------------------------------------------------- */
+/* Collect Unused Extent */
+/* -------------------------------------------------------------- */
+/* Count the collectable extents of mainRel (CountExtents() with a zero threshold). */
+vci_target_extent_info_t
+vci_CountUnusedExtents(Relation mainRel)
+{
+	return CountExtents(mainRel, 0, CEK_CountUnusedExtents);
+}
+/*
+ * SearchUnusedExtent
+ *
+ * Return the index of the first extent reported collectable by
+ * vci_ExtentIsCollectable() for the oldest non-removable transaction ID, or
+ * VCI_INVALID_EXTENT_ID (converted to the uint32 return type) if none is.
+ */
+static uint32
+SearchUnusedExtent(vci_MainRelHeaderInfo *info)
+{
+	int32
extentIdFirstFound = VCI_INVALID_EXTENT_ID; + TransactionId OldestXmin; + vcis_m_extent_t *extentInfo; + vci_meta_item_scanner_t *scan; + + OldestXmin = GetOldestNonRemovableTransactionId(info->rel); + + /* search deleted extent */ + scan = vci_BeginMetaItemScan(info->rel, BUFFER_LOCK_SHARE); + while ((extentInfo = vci_GetMExtentNext(info, scan)) != NULL) + { + if (vci_ExtentIsCollectable(extentInfo, OldestXmin)) + { + extentIdFirstFound = scan->index; + break; + } + } + vci_EndMetaItemScan(scan); + + return extentIdFirstFound; +} + +static void +CollectUnusedExtent(vci_RosCommandContext *comContext) +{ + int16 colId; + int16 numColumns = vci_GetMainRelVar(&comContext->info, vcimrv_num_columns, 0); + int16 recoveredColId = VCI_INVALID_COLUMN_ID; + vcis_m_extent_t *extentInfo; + Buffer buffer = InvalidBuffer; + + extentInfo = vci_GetMExtent(&buffer, &comContext->info, comContext->extentId); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + if (extentInfo->flags & VCIS_M_EXTENT_FLAG_ENABLE_RECOVERED_COLID) + recoveredColId = extentInfo->recovered_colid; + UnlockReleaseBuffer(buffer); + + for (colId = VCI_COLUMN_ID_NULL; colId < numColumns; ++colId) + { + vci_ColumnRelations relPairData; + vci_ColumnRelations *relPair = &relPairData; + vcis_c_extent_t *extentPointer; + + LOCKMODE lockmode = RowExclusiveLock; + + Buffer bufData; + Buffer bufMeta; + BlockNumber blockNumber; + BlockNumber startBlockNumber; + + Page page; + + vcis_extent_t *extentHead; + + vci_OpenColumnRelations(relPair, &comContext->info, colId, lockmode); + + /* target column-extent pointer */ + extentPointer = vci_GetColumnExtent(&bufMeta, &blockNumber, + relPair->meta, + comContext->extentId); + startBlockNumber = extentPointer->enabled ? 
extentPointer->block_number : InvalidBlockNumber; + ReleaseBuffer(bufMeta); + + if (!BlockNumberIsValid(startBlockNumber)) + { + /* Close Column */ + elog(DEBUG2, "this is invalid extent pointer!!"); + vci_CloseColumnRelations(relPair, lockmode); + continue; + } + + /* get extent Header */ + bufData = vci_ReadBufferWithPageInit(relPair->data, startBlockNumber); + page = BufferGetPage(bufData); + extentHead = vci_GetExtentT(page); + + if (colId == recoveredColId) + goto skip_collect_freelist; + + /* Freelist link node */ + { + bool isFixedLength; + + isFixedLength = true; + if (VCI_FIRST_NORMALCOLUMN_ID <= colId) + { + vcis_m_column_t *colInfo; + + colInfo = vci_GetMColumn(&comContext->info, colId); + if (colInfo->comp_type != vcis_compression_type_fixed_raw) + isFixedLength = false; + } + + if (!isFixedLength) + { + vcis_free_space_t newFS; + BlockNumber newFSBlockNumber; + + vci_MakeFreeSpace(relPair, startBlockNumber, &newFSBlockNumber, &newFS, true); + + /* FIXME */ /* The common dictionary should be collected? 
*/ + vci_WriteRecoveryRecordForFreeSpace(relPair, + colId, VCI_INVALID_DICTIONARY_ID, + newFSBlockNumber, + &newFS); + + ReleaseBuffer(bufData); + vci_AppendFreeSpaceToLinkList(relPair, + newFSBlockNumber, + newFS.prev_pos, + newFS.next_pos, + newFS.size); + } + else + { + LockBuffer(bufData, BUFFER_LOCK_EXCLUSIVE); + extentHead->type = vcis_free_space; + vci_WriteOneItemPage(relPair->data, bufData); + UnlockReleaseBuffer(bufData); + } + } + +skip_collect_freelist: + vci_WriteRawDataExtentInfo(relPair->meta, + comContext->extentId, + InvalidBlockNumber, + 0, + NULL, /* min */ + NULL, /* max */ + false, + false); + + /* Close Column */ + vci_CloseColumnRelations(relPair, lockmode); + } + /* loop for each column */ + + vci_WriteExtentInfo(&comContext->info, + comContext->extentId, + 0, + 0, + 0, + InvalidTransactionId, + InvalidTransactionId); +} + +int +vci_CollectUnusedExtent(Relation mainRel, Size workareaSize) +{ + int result = -1; + + MemoryContext memCtxWos2Ros; + MemoryContext oldMemCtx; + vci_RosCommandContext comContext; + + vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_collect_extent); + + /* excute recovery previous ROS command if necessary */ + vci_RecoverOneVCIIfNecessary(&(comContext.info)); + + /* Change Mem Context */ + memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext, + "Collect Deleted Extent", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros); + + /* CommandContext */ + vci_InitRosCommandContext1(&comContext, + workareaSize, + 0, 0, + false); + + if (TransactionIdPrecedes(GetCurrentTransactionId(), + (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0))) + goto done; + + comContext.extentIdSrc = VCI_INVALID_EXTENT_ID; + comContext.extentId = SearchUnusedExtent(&comContext.info); + + if (comContext.extentId == VCI_INVALID_EXTENT_ID) + goto done; + + /* Write Recovery Infomation of this command. 
*/ + vci_WriteRecoveryRecordForExtentInfo(&comContext.info, VCI_INVALID_EXTENT_ID, comContext.extentId); + vci_InitRecoveryRecordForFreeSpace(&comContext.info); + + vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid); + + /* call Main routine */ + CollectUnusedExtent(&comContext); + + result = comContext.extentId; + +done: + /* Finalize ROS */ + vci_FinRosCommandContext(&comContext, false); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(memCtxWos2Ros); + + return result; +} + +/* -------------------------------------------------------------- */ +/* Update TID-CRID Tree */ +/* -------------------------------------------------------------- */ + +int32 +vci_CountTidCridUpdateListLength(Relation mainRel, Size workarea) +{ + int32 result; + vci_MainRelHeaderInfo infoData = {0}; + vci_MainRelHeaderInfo *info = &infoData; + int32 oldSel; + + vci_InitMainRelHeaderInfo(info, mainRel, vci_rc_probe); + vci_KeepMainRelHeader(info); + + oldSel = vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0); + result = vci_GetTidCridUpdateListLength(info, oldSel); + + /* release the main relation */ + vci_ReleaseMainRelHeader(info); + + return result; +} + +/** + * @param[in] comContext Conv Context + * @param[in] workareaSize + */ +static void +UpdateTidCrid(vci_RosCommandContext *comContext, Size workareaSize) +{ + const LOCKMODE lockmode = RowExclusiveLock; + uint32 toMove; + int i; + + vci_TidCridRelations relPairData; + vci_TidCridRelations *relPair = &relPairData; + + vci_TidCridUpdateListContext *oldListContext = NULL; + BlockNumber prevOldListBlkno = InvalidBlockNumber; + vcis_tidcrid_pair_item_t *array; + + vcis_tidcrid_pair_list_t *moveList; + Tuplesortstate *deleteList; + + uint32 oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0); + uint32 newSel = 1 ^ oldSel; + + oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel); + + moveList = palloc(offsetof(vcis_tidcrid_pair_list_t, body) + 
(sizeof(vcis_tidcrid_pair_item_t) * MaxHeapTuplesPerPage)); + moveList->num = 0; + + deleteList = + tuplesort_begin_datum(TIDOID, TIDLessOperator, InvalidOid, false, + Min(workareaSize / 1024, INT_MAX), NULL, TUPLESORT_NONE); + array = palloc(VCI_TID_CRID_UPDATE_PAGE_SPACE); + + vci_OpenTidCridRelations(relPair, &comContext->info, lockmode); + + i = 0; + + for (toMove = 0; toMove < comContext->utility_array.num; toMove++) + { + ItemPointerData treeNodeData; + ItemPointer treeNode = &treeNodeData; + + BlockNumber blkToMove; + + blkToMove = comContext->utility_array.orig_blknos[toMove]; + + moveList->num = 0; + + for (; i < oldListContext->count; i++) + { + BlockNumber blkno = VCI_TID_CRID_UPDATE_BODY_PAGE_ID + (i / VCI_TID_CRID_UPDATE_PAGE_ITEMS); + vcis_tidcrid_pair_item_t item; + + if (prevOldListBlkno != blkno) + { + vci_ReadOneBlockFromTidCridUpdateList(oldListContext, blkno, array); + prevOldListBlkno = blkno; + } + + item = array[i % VCI_TID_CRID_UPDATE_PAGE_ITEMS]; + + if (ItemPointerGetBlockNumber(&item.page_item_id) != blkToMove) + break; + + Assert(moveList->num < MaxHeapTuplesPerPage); + + moveList->body[moveList->num] = item; + moveList->num++; + + tuplesort_putdatum(deleteList, ItemPointerGetDatum(&item.page_item_id), false); + } + + if (moveList->num == 0) + continue; + + vci_GetTidCridSubTree(relPair, blkToMove, treeNode); + + if (!ItemPointerIsValid(treeNode)) + vci_CreateTidCridSubTree(relPair, blkToMove, treeNode); + + vci_UpdateTidCridSubTree(relPair, treeNode, moveList); + } + + pfree(array); + pfree(moveList); + + vci_CloseTidCridRelations(relPair, lockmode); + + vci_CloseTidCridUpdateList(oldListContext); + + tuplesort_performsort(deleteList); + + vci_MergeAndWriteTidCridUpdateList(&comContext->info, newSel, oldSel, deleteList, vci_GetCridFromUint64(VCI_MOVED_CRID)); + + tuplesort_end(deleteList); +} + +/** + * @param[in,out] comContext Conv Context + * @param[in] numPages + */ +static void +collectBlockNumberToMove(vci_RosCommandContext 
*comContext, int numPages) +{ + uint32 oldSel; + vci_TidCridUpdateListContext *oldListContext; + BlockNumber prevblk = InvalidBlockNumber; + vcis_tidcrid_pair_item_t *array; + BlockNumber blockNumber = VCI_TID_CRID_UPDATE_BODY_PAGE_ID; + uint64 count = 0; + + oldSel = vci_GetMainRelVar(&comContext->info, vcimrv_tid_crid_diff_sel, 0); + oldListContext = vci_OpenTidCridUpdateList(&comContext->info, oldSel); + + comContext->utility_array.num = 0; + + array = palloc(VCI_TID_CRID_UPDATE_PAGE_SPACE); + + while (blockNumber < oldListContext->nblocks) + { + int i; + + vci_ReadOneBlockFromTidCridUpdateList(oldListContext, blockNumber, array); + + for (i = 0; (i < VCI_TID_CRID_UPDATE_PAGE_ITEMS) && (count < oldListContext->count); i++) + { + BlockNumber blkno = ItemPointerGetBlockNumber(&array[i].page_item_id); + + if (prevblk != blkno) + { + comContext->utility_array.orig_blknos[comContext->utility_array.num++] = blkno; + prevblk = blkno; + + if (numPages == comContext->utility_array.num) + goto done; + } + + count++; + } + + blockNumber++; + } + +done: + pfree(array); + + vci_CloseTidCridUpdateList(oldListContext); +} + +int +vci_UpdateTidCrid(Relation mainRel, Size workareaSize, int numPages) +{ + int result = 0; + + MemoryContext memCtxWos2Ros; + MemoryContext oldMemCtx; + vci_RosCommandContext comContext; + + vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_update_tid_crid); + + /* excute recovery previous ROS command if necessary */ + vci_RecoverOneVCIIfNecessary(&(comContext.info)); + + /* Change Mem Context */ + memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext, + "TIDCRID Tree Update", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros); + + /* CommandContext */ + vci_InitRosCommandContext1(&comContext, + workareaSize, + 0, 0, + false); + + if (TransactionIdPrecedes(GetCurrentTransactionId(), + (TransactionId) vci_GetMainRelVar(&comContext.info, vcimrv_current_ros_version, 0))) + goto done; + + 
comContext.utility_array.orig_blknos = (BlockNumber *) palloc(sizeof(BlockNumber) * numPages); + comContext.utility_array.max = numPages; + + collectBlockNumberToMove(&comContext, numPages); + + result = comContext.utility_array.num; + + /* Write Recovery Information of this command. */ + vci_InitRecoveryRecordForTidCrid(&comContext.info); + vci_InitRecoveryRecordForFreeSpace(&comContext.info); + + vci_WriteRecoveryRecordDone(&comContext.info, comContext.command, comContext.xid); + + /* call Main routine */ + UpdateTidCrid(&comContext, workareaSize); + +done: + /* Finalize ROS */ + vci_FinRosCommandContext(&comContext, false); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(memCtxWos2Ros); + + return result; +} + +/* -------------------------------------------------------------- */ +/* Vacuum and Freeze */ +/* -------------------------------------------------------------- */ + +static void +freezeMainAndRos(vci_RosCommandContext *comContext) +{ + vcis_m_extent_t *mExtent; + TransactionId wos2rosXid = comContext->wos2rosXid; + vci_meta_item_scanner_t *scan; + TransactionId lastRosVer; + + lastRosVer = vci_GetMainRelVar(&comContext->info, vcimrv_last_ros_version, 0); + if (TransactionIdIsNormal(lastRosVer) && TransactionIdPrecedes(lastRosVer, wos2rosXid)) + vci_SetMainRelVar(&comContext->info, vcimrv_last_ros_version, 0, FrozenTransactionId); + + scan = vci_BeginMetaItemScan(comContext->info.rel, BUFFER_LOCK_EXCLUSIVE); + while ((mExtent = vci_GetMExtentNext(&comContext->info, scan)) != NULL) + { + if (TransactionIdIsNormal(mExtent->xgen) && + TransactionIdPrecedes(mExtent->xgen, wos2rosXid)) /* mExtent->xgen < + * wos2rosXid */ + mExtent->xgen = FrozenTransactionId; + + if (TransactionIdIsNormal(mExtent->xdel) && + TransactionIdPrecedes(mExtent->xdel, wos2rosXid)) /* mExtent->xdel < + * wos2rosXid */ + mExtent->xdel = FrozenTransactionId; + } + vci_EndMetaItemScan(scan); +} + +/* + * VCITupleSatisfiesVisibility + * True iff heap tuple satisfies a time 
qual. + * + * Notes: + * Assumes heap tuple is valid, and buffer at least share locked. + * + * Copy of OSS HeapTupleSatisfiesVisibulity() for VCI snapshot types + * + */ +bool +VCITupleSatisfiesVisibility(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + switch (snapshot->snapshot_type) + { + case SNAPSHOT_VCI_WOS2ROS: + return HeapTupleSatisfiesWos2Ros(htup, snapshot, buffer); + case SNAPSHOT_VCI_LOCALROS: + return HeapTupleSatisfiesLocalRos(htup, snapshot, buffer); + default: + return HeapTupleSatisfiesVisibility(htup, snapshot, buffer); + } + return false; +} + +static void +freezeWos(vci_RosCommandContext *comContext, vci_MainRelVar wosType, Snapshot snapshot) +{ + LOCKMODE lockmode = ShareUpdateExclusiveLock; + Oid oid; + HeapTupleFreeze *frozen; + Relation rel; + BlockNumber nblocks, + blkno; + + frozen = palloc0(sizeof(HeapTupleFreeze) * MaxHeapTuplesPerPage); + + oid = vci_GetMainRelVar(&comContext->info, wosType, 0); + + rel = table_open(oid, lockmode); + + nblocks = RelationGetNumberOfBlocks(rel); + + for (blkno = 0; blkno < nblocks; blkno++) + { + Buffer buffer; + Page page; + OffsetNumber offnum, + maxoff; + int nfrozen = 0; + + buffer = ReadBuffer(rel, blkno); + + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + + page = BufferGetPage(buffer); + + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + HeapTupleData loctup; + + itemid = PageGetItemId(page, offnum); + + if (ItemIdIsNormal(itemid)) + { + bool valid; + TransactionId xmin; + + loctup.t_tableOid = RelationGetRelid(rel); + loctup.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + loctup.t_len = ItemIdGetLength(itemid); + ItemPointerSet(&loctup.t_self, blkno, offnum); + + valid = VCITupleSatisfiesVisibility(&loctup, snapshot, buffer); + + HeapCheckForSerializableConflictOut(valid, rel, &loctup, buffer, snapshot); + + xmin = HeapTupleHeaderGetXmin(loctup.t_data); + + if (valid && + 
!TransactionIdEquals(xmin, FrozenTransactionId) && + TransactionIdPrecedes(xmin, comContext->oldestXmin)) + { + HeapTupleFreeze *frz = &frozen[nfrozen]; + HeapTupleHeader tuple = loctup.t_data; + + frz->frzflags = 0; + frz->t_infomask2 = tuple->t_infomask2; + frz->t_infomask = tuple->t_infomask | HEAP_XMIN_FROZEN; + frz->xmax = HeapTupleHeaderGetRawXmax(tuple); + frz->offset = offnum; + + nfrozen++; + } + } + } + + if (nfrozen > 0) + { + heap_pre_freeze_checks(buffer, frozen, nfrozen); + START_CRIT_SECTION(); + heap_freeze_prepared_tuples(buffer, frozen, nfrozen); + MarkBufferDirty(buffer); + + /* Now WAL-log freezing if necessary */ + if (RelationNeedsWAL(rel)) + { + /* + * Commit add323d added the vmbuffer/vmflags parameters. + * A quick fix was needed to allow build to proceed. + * + * TODO Confirm if passing InvalidBuffer, 0 is OK here. + */ + log_heap_prune_and_freeze(rel, buffer, + InvalidBuffer, /* vmbuffer */ + 0, /* vmflags */ + comContext->oldestXmin, + false, /* no cleanup lock + * required */ + PRUNE_VACUUM_SCAN, + frozen, nfrozen, + NULL, 0, /* redirected */ + NULL, 0, /* dead */ + NULL, 0); /* unused */ + } + END_CRIT_SECTION(); + } + UnlockReleaseBuffer(buffer); + } + + table_close(rel, lockmode); + + pfree(frozen); +} + +/** + * @param[in] comContext Conv Context + * + * @note + * This is not transaction-safe, because the truncation is done immediately + * and cannot be rolled back later. Caller is responsible for having + * checked permissions etc, and must have obtained AccessExclusiveLock. 
+ */ +static void +truncateRos(vci_RosCommandContext *comContext) +{ + const LOCKMODE lockmode = ShareUpdateExclusiveLock; + int colId; + + vci_meta_item_scanner_t *scan; + vcis_m_extent_t *extentInfo; + int32 lastAvailableExtentId = -1; + + scan = vci_BeginMetaItemScan(comContext->info.rel, BUFFER_LOCK_SHARE); + while ((extentInfo = vci_GetMExtentNext(&comContext->info, scan)) != NULL) + { + if (TransactionIdIsValid(extentInfo->xgen) || + TransactionIdIsValid(extentInfo->xdel)) + lastAvailableExtentId = scan->index; + } + vci_EndMetaItemScan(scan); + + vci_SetMainRelVar(&comContext->info, vcimrv_num_extents, 0, lastAvailableExtentId + 1); + + for (colId = VCI_FIRST_NORMALCOLUMN_ID; colId < comContext->numColumns; ++colId) + { + vcis_m_column_t *colInfo; + + vci_ColumnRelations relPairData; + vci_ColumnRelations *relPair = &relPairData; + + BlockNumber nblocks; + + colInfo = vci_GetMColumn(&comContext->info, colId); + + vci_OpenColumnRelations(relPair, &comContext->info, colId, lockmode); + + nblocks = RelationGetNumberOfBlocks(relPair->data); + + if (colInfo->comp_type != vcis_compression_type_fixed_raw) + { + BlockNumber sentinelBlockNumber; + vcis_column_meta_t *columnMeta; + + elog(DEBUG2, " -- colId %d ,variable column ", colId); + + columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta); + sentinelBlockNumber = columnMeta->free_page_end_id; + ReleaseBuffer(relPair->bufMeta); + + Assert(sentinelBlockNumber + 1 <= nblocks); + + RelationTruncate(relPair->data, sentinelBlockNumber + 1); + elog(DEBUG2, " end"); + } + else + { + int16 columnSize; + int extentHeaderSize; + Size dataSize; + int numExtentPages; + BlockNumber startBlockNumber; + + elog(DEBUG2, " -- colId %d ,variable column ", colId); + + columnSize = vci_GetFixedColumnSize(&comContext->info, colId); + extentHeaderSize = vci_GetExtentFixedLengthRawDataHeaderSize(VCI_NUM_ROWS_IN_EXTENT); + dataSize = (Size) columnSize * VCI_NUM_ROWS_IN_EXTENT; + numExtentPages = vci_GetNumBlocks(dataSize + 
extentHeaderSize); + startBlockNumber = (lastAvailableExtentId + 1) * numExtentPages; + + if (startBlockNumber < nblocks) + RelationTruncate(relPair->data, startBlockNumber); + + elog(DEBUG2, " end"); + + } + + vci_CloseColumnRelations(relPair, lockmode); + } +} + +/** + * @param[in] comContext Conv Context + */ +static void +truncateWos(vci_RosCommandContext *comContext) +{ + LOCKMODE lockmode = ShareUpdateExclusiveLock; + + Oid oid[2] = { + vci_GetMainRelVar(&comContext->info, vcimrv_data_wos_oid, 0), + vci_GetMainRelVar(&comContext->info, vcimrv_whiteout_wos_oid, 0) + }; + + int i; + + for (i = 0; i < 2; i++) + { + Relation rel = table_open(oid[i], lockmode); + int lock_retry = 0; + BlockNumber old_rel_pages; + BlockNumber new_rel_pages; + BlockNumber blkno; + + while (true) + { + if (ConditionalLockRelation(rel, AccessExclusiveLock)) + break; + + /* + * * Check for interrupts while trying to (re-)acquire the + * exclusive * lock. + */ + CHECK_FOR_INTERRUPTS(); + + if (++lock_retry > (VACUUM_TRUNCATE_LOCK_TIMEOUT / + VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL)) + { + table_close(rel, lockmode); + return; + } + + pg_usleep(VACUUM_TRUNCATE_LOCK_WAIT_INTERVAL); + } + + blkno = old_rel_pages = new_rel_pages = RelationGetNumberOfBlocks(rel); + + while (blkno > 0) + { + Buffer buffer; + Page page; + OffsetNumber offnum, + maxoff; + + blkno--; + + buffer = ReadBuffer(rel, blkno); + + LockBuffer(buffer, BUFFER_LOCK_SHARE); + page = BufferGetPage(buffer); + + if (PageIsNew(page) || PageIsEmpty(page)) + { + UnlockReleaseBuffer(buffer); + + new_rel_pages = blkno; + continue; + } + + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; offnum <= maxoff; offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + if (ItemIdIsUsed(itemid)) + { + UnlockReleaseBuffer(buffer); + goto found_use_item; + } + } + + UnlockReleaseBuffer(buffer); + + new_rel_pages = blkno; + } + +found_use_item: + if (new_rel_pages < 
old_rel_pages) + RelationTruncate(rel, new_rel_pages); + + UnlockRelation(rel, AccessExclusiveLock); + + table_close(rel, lockmode); + } +} + +void +vci_VacuumRos(Relation mainRel, IndexVacuumInfo *vacuumInfo) +{ + MemoryContext memCtxWos2Ros; + MemoryContext oldMemCtx; + vci_RosCommandContext comContext; + Snapshot snapshot; + + vci_InitRosCommandContext0(&comContext, mainRel, vci_rc_vacuum); + + /* recover ROS if necessary */ + vci_RecoverOneVCIIfNecessary(&(comContext.info)); + + /* Change Mem Context */ + memCtxWos2Ros = AllocSetContextCreate(TopTransactionContext, + "Vacuum", + ALLOCSET_DEFAULT_SIZES); + oldMemCtx = MemoryContextSwitchTo(memCtxWos2Ros); + + vci_InitRosCommandContext1(&comContext, 0, 0, 0, false); + + snapshot = GetActiveSnapshot(); + + /* remove WOS entries */ + elog(DEBUG2, " -- wos"); + cleanUpWos(&comContext, vcimrv_data_wos_oid); + cleanUpWos(&comContext, vcimrv_whiteout_wos_oid); + freezeWos(&comContext, vcimrv_data_wos_oid, snapshot); + freezeWos(&comContext, vcimrv_whiteout_wos_oid, snapshot); + truncateWos(&comContext); + + elog(DEBUG2, " -- ros"); + freezeMainAndRos(&comContext); + truncateRos(&comContext); + + elog(DEBUG2, " -- end"); + + vci_UpdateXidGeneration(&comContext.info); + + /* Finalize ROS */ + vci_FinRosCommandContext(&comContext, true /* never write */ ); + + MemoryContextSwitchTo(oldMemCtx); + MemoryContextDelete(memCtxWos2Ros); +} + +static void +constructTidArray(vci_RosCommandContext *comContext, int max_data_wos_entries, int max_whiteout_wos_entries) +{ + vci_MainRelHeaderInfo *info; + Snapshot snapshot; + Oid data_wos_oid; + Oid whiteout_wos_oid; + vci_tid_tid_xid64_t *data_wos_entries; + vci_tid_tid_xid64_t *whiteout_wos_entries; + int num_data_wos_entries = 0; + int num_whiteout_wos_entries = 0; + int data_wos_entries_pos = 0; + int whiteout_wos_entries_pos = 0; + + info = &comContext->info; + + data_wos_oid = vci_GetMainRelVar(info, vcimrv_data_wos_oid, 0); + whiteout_wos_oid = vci_GetMainRelVar(info, 
vcimrv_whiteout_wos_oid, 0); + + data_wos_entries = palloc(max_data_wos_entries * sizeof(vci_tid_tid_xid64_t)); + whiteout_wos_entries = palloc(max_whiteout_wos_entries * sizeof(vci_tid_tid_xid64_t)); + + snapshot = vci_GetSnapshotForLocalRos(comContext->inclusiveXid, comContext->exclusiveXid); + + num_data_wos_entries = + readTidListFromWosIntoTidArray(data_wos_oid, WOS_Data, + data_wos_entries, max_data_wos_entries, + snapshot); + + num_whiteout_wos_entries = + readTidListFromWosIntoTidArray(whiteout_wos_oid, WOS_Whiteout, + whiteout_wos_entries, max_whiteout_wos_entries, + snapshot); + + Assert(num_data_wos_entries <= max_data_wos_entries); + Assert(num_whiteout_wos_entries <= max_whiteout_wos_entries); + + qsort(data_wos_entries, num_data_wos_entries, sizeof(vci_tid_tid_xid64_t), comparator_orig_tid_xid64); + qsort(whiteout_wos_entries, num_whiteout_wos_entries, sizeof(vci_tid_tid_xid64_t), comparator_orig_tid_xid64); + + while ((data_wos_entries_pos < num_data_wos_entries) && + (whiteout_wos_entries_pos < num_whiteout_wos_entries)) + { + int32 res; + vci_tid_tid_xid64_t data_wos_item; + vci_tid_tid_xid64_t whiteout_wos_item; + + data_wos_item = data_wos_entries[data_wos_entries_pos]; + whiteout_wos_item = whiteout_wos_entries[whiteout_wos_entries_pos]; + + res = ItemPointerCompare(&data_wos_item.orig_tid, &whiteout_wos_item.orig_tid); + + if (res == 0) + res = compareXid64(data_wos_item.xid64, whiteout_wos_item.xid64); + + if (res < 0) + { + comContext->wos2ros_array.orig_tids[comContext->wos2ros_array.num] = + data_wos_item.orig_tid; + + comContext->wos2ros_array.num++; + data_wos_entries_pos++; + } + else if (res > 0) + { + comContext->delvec_array.orig_tids[comContext->delvec_array.num] = + whiteout_wos_item.orig_tid; + + comContext->delvec_array.num++; + whiteout_wos_entries_pos++; + } + else + { + data_wos_entries_pos++; + whiteout_wos_entries_pos++; + } + } + + while (data_wos_entries_pos < num_data_wos_entries) + { + 
comContext->wos2ros_array.orig_tids[comContext->wos2ros_array.num] = + data_wos_entries[data_wos_entries_pos].orig_tid; + + comContext->wos2ros_array.num++; + data_wos_entries_pos++; + } + + while (whiteout_wos_entries_pos < num_whiteout_wos_entries) + { + comContext->delvec_array.orig_tids[comContext->delvec_array.num] = + whiteout_wos_entries[whiteout_wos_entries_pos].orig_tid; + + comContext->delvec_array.num++; + whiteout_wos_entries_pos++; + } + + PopActiveSnapshot(); + + pfree(data_wos_entries); + pfree(whiteout_wos_entries); +} + +static int +comparator_orig_tid_xid64(const void *pa, const void *pb) +{ + vci_tid_tid_xid64_t *a = (vci_tid_tid_xid64_t *) pa; + vci_tid_tid_xid64_t *b = (vci_tid_tid_xid64_t *) pb; + int res; + + res = ItemPointerCompare(&a->orig_tid, &b->orig_tid); + + if (res == 0) + { + if (a->xid64 == b->xid64) + res = 0; + else if (a->xid64 > b->xid64) + res = 1; + else + res = -1; + } + + return res; +} + +/** + * @param[in,out] comContext Conv Context + * @param[in] snapshot Snapshot + */ +static void +constructTidSortState(vci_RosCommandContext *comContext) +{ + vci_MainRelHeaderInfo *info; + Snapshot snapshot; + Oid data_wos_oid; + Oid whiteout_wos_oid; + MemoryContext workcontext; + MemoryContext oldcontext; + TupleDesc tupDesc; + Tuplesortstate *data_wos_valid_tid_sortstate; + Tuplesortstate *whiteout_wos_valid_tid_sortstate; + AttrNumber sortKeys[2] = {1, 3}; + Oid sortOperators[2] = {TIDLessOperator, Int8LessOperator}; + Oid sortCollations[2] = {InvalidOid, InvalidOid,}; + bool nullsFirstFlags[2] = {false, false}; + TupleTableSlot *data_wos_valid_slot; + TupleTableSlot *whiteout_wos_valid_slot; + vci_tid_tid_xid64_t data_wos_item; + vci_tid_tid_xid64_t whiteout_wos_item; + bool is_terminated_data_wos = false; + bool is_terminated_whiteout_wos = false; + int64 numInsertRows = 0; + int64 numDeleteRows = 0; + ItemPointerData last_whiteout_orig_tid; + + info = &comContext->info; + + data_wos_oid = vci_GetMainRelVar(info, 
vcimrv_data_wos_oid, 0); + whiteout_wos_oid = vci_GetMainRelVar(info, vcimrv_whiteout_wos_oid, 0); + + workcontext = AllocSetContextCreate(CurrentMemoryContext, + "Construct Tid Sort State", + ALLOCSET_DEFAULT_SIZES); + + oldcontext = MemoryContextSwitchTo(workcontext); + + tupDesc = CreateTemplateTupleDesc(4); + + TupleDescInitEntry(tupDesc, (AttrNumber) 1, "orig_tid", TIDOID, -1, 0); + TupleDescInitEntry(tupDesc, (AttrNumber) 2, "wos_tid", TIDOID, -1, 0); + TupleDescInitEntry(tupDesc, (AttrNumber) 3, "xid64", INT8OID, -1, 0); + TupleDescInitEntry(tupDesc, (AttrNumber) 4, "movable", BOOLOID, -1, 0); + + data_wos_valid_tid_sortstate = + tuplesort_begin_heap(tupDesc, 2, + sortKeys, sortOperators, sortCollations, nullsFirstFlags, + VciGuc.maintenance_work_mem / 8 * 3, NULL, + TUPLESORT_NONE); + + whiteout_wos_valid_tid_sortstate = + tuplesort_begin_heap(tupDesc, 2, + sortKeys, sortOperators, sortCollations, nullsFirstFlags, + VciGuc.maintenance_work_mem / 8 * 3, NULL, + TUPLESORT_NONE); + + data_wos_valid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + whiteout_wos_valid_slot = MakeSingleTupleTableSlot(tupDesc, &TTSOpsHeapTuple); + + snapshot = vci_GetSnapshotForWos2Ros(); + + readTidListFromWosIntoTidSortState(data_wos_oid, WOS_Data, + data_wos_valid_slot, + data_wos_valid_tid_sortstate, + snapshot, + comContext->wos2rosXid); + + readTidListFromWosIntoTidSortState(whiteout_wos_oid, WOS_Whiteout, + whiteout_wos_valid_slot, + whiteout_wos_valid_tid_sortstate, + snapshot, + comContext->wos2rosXid); + + tuplesort_performsort(data_wos_valid_tid_sortstate); + tuplesort_performsort(whiteout_wos_valid_tid_sortstate); + + if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item)) + is_terminated_data_wos = true; + + if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item)) + is_terminated_whiteout_wos = true; + + ItemPointerSetInvalid(&last_whiteout_orig_tid); + + while 
(!is_terminated_data_wos && !is_terminated_whiteout_wos) + { + int32 res; + + res = ItemPointerCompare(&data_wos_item.orig_tid, &whiteout_wos_item.orig_tid); + + if (res == 0) + res = compareXid64(data_wos_item.xid64, whiteout_wos_item.xid64); + + if (res < 0) + { + if (can_select_candidate_for_wos2ros_conv(&data_wos_item, comContext, &last_whiteout_orig_tid)) + { + put_entry_into_tid_list(comContext, WOS_Data, &data_wos_item.orig_tid, &data_wos_item.wos_tid); + + numInsertRows++; + } + + if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item)) + is_terminated_data_wos = true; + } + else if (res > 0) + { + last_whiteout_orig_tid = whiteout_wos_item.orig_tid; + + if (can_select_candidate_for_update_delvec(&whiteout_wos_item, comContext)) + { + put_entry_into_tid_list(comContext, WOS_Whiteout, &whiteout_wos_item.orig_tid, &whiteout_wos_item.wos_tid); + + numDeleteRows++; + } + + if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item)) + is_terminated_whiteout_wos = true; + } + else + { + if (data_wos_item.movable && whiteout_wos_item.movable) + { + if (comContext->data_wos_del_list) + tuplesort_putdatum(comContext->data_wos_del_list, + ItemPointerGetDatum(&data_wos_item.wos_tid), false); + + if (comContext->whiteout_wos_del_list) + tuplesort_putdatum(comContext->whiteout_wos_del_list, + ItemPointerGetDatum(&whiteout_wos_item.wos_tid), false); + } + + if (!getValidTidSortState(data_wos_valid_tid_sortstate, data_wos_valid_slot, &data_wos_item)) + is_terminated_data_wos = true; + + if (!getValidTidSortState(whiteout_wos_valid_tid_sortstate, whiteout_wos_valid_slot, &whiteout_wos_item)) + is_terminated_whiteout_wos = true; + } + } + + if (!is_terminated_data_wos && comContext->wos2ros_tid_list) + { + do + { + if (can_select_candidate_for_wos2ros_conv(&data_wos_item, comContext, &last_whiteout_orig_tid)) + { + put_entry_into_tid_list(comContext, WOS_Data, &data_wos_item.orig_tid, 
&data_wos_item.wos_tid); + numInsertRows++; + } + } while (getValidTidSortState(data_wos_valid_tid_sortstate, + data_wos_valid_slot, &data_wos_item)); + } + + if (!is_terminated_whiteout_wos && comContext->delvec_tid_list) + { + do + { + if (can_select_candidate_for_update_delvec(&whiteout_wos_item, comContext)) + { + put_entry_into_tid_list(comContext, WOS_Whiteout, &whiteout_wos_item.orig_tid, &whiteout_wos_item.wos_tid); + + numDeleteRows++; + } + } while (getValidTidSortState(whiteout_wos_valid_tid_sortstate, + whiteout_wos_valid_slot, &whiteout_wos_item)); + } + + tuplesort_end(whiteout_wos_valid_tid_sortstate); + tuplesort_end(data_wos_valid_tid_sortstate); + + FreeTupleDesc(tupDesc); + + PopActiveSnapshot(); + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(workcontext); + + if (comContext->wos2ros_tid_list) + { + tuplesort_performsort(comContext->wos2ros_tid_list); + comContext->num_wos2ros_tids = numInsertRows; + } + + if (comContext->delvec_tid_list) + { + tuplesort_performsort(comContext->delvec_tid_list); + comContext->num_delvec_tids = numDeleteRows; + } +} + +static bool +can_select_candidate_for_wos2ros_conv(vci_tid_tid_xid64_t *data_wos_item, vci_RosCommandContext *comContext, ItemPointer last_whiteout_orig_tid) +{ + if (!data_wos_item->movable) + return false; + + if (!comContext->wos2ros_tid_list) + return false; + + if (!comContext->delvec_tid_list) + if (ItemPointerIsValid(last_whiteout_orig_tid) && + ItemPointerEquals(last_whiteout_orig_tid, &data_wos_item->orig_tid)) + return false; + + return true; +} + +static bool +can_select_candidate_for_update_delvec(vci_tid_tid_xid64_t *whiteout_wos_item, vci_RosCommandContext *comContext) +{ + if (!whiteout_wos_item->movable) + return false; + + if (!comContext->delvec_tid_list) + return false; + + return true; +} + +static void +put_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid) +{ + TupleTableSlot *slot; + Tuplesortstate 
*sortstate; + + slot = comContext->tid_tid_slot; + + ExecClearTuple(slot); + + if (wos_kind == WOS_Data) + sortstate = comContext->wos2ros_tid_list; + else + sortstate = comContext->delvec_tid_list; + + Assert(sortstate != NULL); + + slot->tts_values[0] = ItemPointerGetDatum(orig_tid); + slot->tts_values[1] = ItemPointerGetDatum(wos_tid); + slot->tts_isnull[0] = false; + slot->tts_isnull[1] = false; + + slot->tts_flags |= TTS_FLAG_EMPTY; + + ExecStoreVirtualTuple(slot); + + tuplesort_puttupleslot(sortstate, slot); +} + +static bool +get_entry_into_tid_list(vci_RosCommandContext *comContext, WosKind wos_kind, ItemPointer orig_tid, ItemPointer wos_tid) +{ + bool isnull; + TupleTableSlot *slot; + Tuplesortstate *sortstate; + + slot = MakeSingleTupleTableSlot(comContext->tid_tid_slot->tts_tupleDescriptor, &TTSOpsMinimalTuple); + + if (wos_kind == WOS_Data) + sortstate = comContext->wos2ros_tid_list; + else + sortstate = comContext->delvec_tid_list; + + Assert(sortstate != NULL); + + if (!tuplesort_gettupleslot(sortstate, true, false, slot, NULL)) + { + ExecDropSingleTupleTableSlot(slot); + return false; + } + + slot_getsomeattrs(slot, 2); + + *orig_tid = *DatumGetItemPointer(slot_getattr(slot, 1, &isnull)); + *wos_tid = *DatumGetItemPointer(slot_getattr(slot, 2, &isnull)); + + ExecDropSingleTupleTableSlot(slot); + return true; +} + +static int +readTidListFromWosIntoTidArray(Oid wos_oid, WosKind wos_kind, vci_tid_tid_xid64_t *wos_entris, int max_wos_entries, Snapshot snapshot) +{ + LOCKMODE lockmode = AccessShareLock; + TableScanDesc scan; + HeapTuple tuple; + Relation rel; + TupleDesc tupleDesc; + int num_rows = 0; + + rel = relation_open(wos_oid, lockmode); + + tupleDesc = RelationGetDescr(rel); + + CHECK_FOR_INTERRUPTS(); + + scan = table_beginscan(rel, snapshot, 0, NULL); + + scan->rs_flags &= ~SO_ALLOW_PAGEMODE; + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + bool isnull; + + if (max_wos_entries <= num_rows) + ereport(ERROR, + 
(errcode(ERRCODE_DATA_CORRUPTED), + errmsg("too many WOS rows over estimation"))); + + wos_entris[num_rows].orig_tid = *DatumGetItemPointer(heap_getattr(tuple, 1, tupleDesc, &isnull)); /* original_tid in WOS */ + wos_entris[num_rows].wos_tid = tuple->t_self; + wos_entris[num_rows].xid64 = DatumGetInt64(heap_getattr(tuple, 2, tupleDesc, &isnull)); /* xid64 in WOS */ + + wos_entris[num_rows].movable = true; + + Assert(ItemPointerIsValid(&wos_entris[num_rows].orig_tid)); + + CHECK_FOR_INTERRUPTS(); + + num_rows++; + } + table_endscan(scan); + + table_close(rel, lockmode); + + return num_rows; +} + +static void +readTidListFromWosIntoTidSortState(Oid wos_oid, WosKind wos_kind, + TupleTableSlot *slot, Tuplesortstate *sortstate, + Snapshot snapshot, + TransactionId wos2ros_xid) +{ + LOCKMODE lockmode = AccessShareLock; + TableScanDesc scan; + HeapTuple tuple; + Relation rel; + TupleDesc tupleDesc; + + rel = relation_open(wos_oid, lockmode); + tupleDesc = RelationGetDescr(rel); + + CHECK_FOR_INTERRUPTS(); + + scan = table_beginscan(rel, snapshot, 0, NULL); + scan->rs_flags &= ~SO_ALLOW_PAGEMODE; + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + TransactionId xmin; + bool isnull; + bool movable; + + xmin = HeapTupleHeaderGetXmin(tuple->t_data); + movable = TransactionIdPrecedes(xmin, wos2ros_xid); + ExecClearTuple(slot); + + slot->tts_values[0] = heap_getattr(tuple, 1, tupleDesc, &isnull); /* original_tid in WOS */ + slot->tts_values[1] = ItemPointerGetDatum(&tuple->t_self); + slot->tts_values[2] = heap_getattr(tuple, 2, tupleDesc, &isnull); /* xid64 in WOS */ + slot->tts_values[3] = BoolGetDatum(movable); + + slot->tts_isnull[0] = false; + slot->tts_isnull[1] = false; + slot->tts_isnull[2] = false; + slot->tts_isnull[3] = false; + + slot->tts_flags |= TTS_FLAG_EMPTY; + + ExecStoreVirtualTuple(slot); + + tuplesort_puttupleslot(sortstate, slot); + + CHECK_FOR_INTERRUPTS(); + } + table_endscan(scan); + + relation_close(rel, lockmode); +} + +static 
bool +getValidTidSortState(Tuplesortstate *sortstate, TupleTableSlot *slot, vci_tid_tid_xid64_t *item) +{ + bool isnull; + TupleTableSlot *tempslot; + + tempslot = MakeSingleTupleTableSlot(slot->tts_tupleDescriptor, &TTSOpsMinimalTuple); + + if (!tuplesort_gettupleslot(sortstate, true, false, tempslot, NULL)) + { + ExecDropSingleTupleTableSlot(tempslot); + return false; + } + + slot_getsomeattrs(tempslot, 4); + + item->orig_tid = *DatumGetItemPointer(slot_getattr(tempslot, 1, &isnull)); + item->wos_tid = *DatumGetItemPointer(slot_getattr(tempslot, 2, &isnull)); + item->xid64 = DatumGetInt64(slot_getattr(tempslot, 3, &isnull)); + item->movable = DatumGetBool(slot_getattr(tempslot, 4, &isnull)); + + ExecDropSingleTupleTableSlot(tempslot); + return true; +} + +static int32 +compareXid64(int64 data_wos_xid64, int64 whiteout_wos_xid64) +{ + Assert((data_wos_xid64 > 0) && (whiteout_wos_xid64 > 0)); + + if (data_wos_xid64 == whiteout_wos_xid64) + { + return 0; + } + else if (data_wos_xid64 > whiteout_wos_xid64) + { + return +1; + } + else + { + + return 0; + } +} diff --git a/contrib/vci/storage/vci_ros_daemon.c b/contrib/vci/storage/vci_ros_daemon.c new file mode 100644 index 000000000000..597617031029 --- /dev/null +++ b/contrib/vci/storage/vci_ros_daemon.c @@ -0,0 +1,865 @@ +/*------------------------------------------------------------------------- + * + * vci_ros_daemon.c + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_ros_daemon.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/htup.h" +#include "access/htup_details.h" +#include "access/htup_details.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "c.h" +#include "catalog/index.h" +#include "catalog/pg_database.h" +#include "fmgr.h" +#include "lib/ilist.h" +#include "miscadmin.h" +#include "postmaster/autovacuum.h" +#include 
"postmaster/bgworker.h" +#include "storage/bufpage.h" +#include "storage/ipc.h" +#include "storage/latch.h" +#include "storage/lwlock.h" +#include "storage/proc.h" +#include "storage/procarray.h" /* for TransactionIdIsInProgress() */ +/* #include "storage/shmem.h" */ +#include "utils/guc.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" +#include "utils/syscache.h" +#include "pgstat.h" + +#include "vci.h" +#include "vci_mem.h" +#include "vci_ros.h" +#include "vci_ros_daemon.h" +#include "vci_ros_command.h" + +#include "vci_memory_entry.h" + +/** + * message on worker exit. + */ +typedef struct message_on_worker_exit +{ + int log_min_messages; + int message_level; + char message[1024]; +} message_on_worker_exit_t; + +static message_on_worker_exit_t message_on_worker_exit; + +#define INIT_MESSAGE_ON_WORKER_EXIT() \ +do \ +{ \ + MemSet(&message_on_worker_exit, 0x00, sizeof(message_on_worker_exit)); \ + message_on_worker_exit.log_min_messages = log_min_messages; \ + on_proc_exit(callback_on_exit_worker, Int32GetDatum(0)); \ +} while (0) + +#define SET_MESSAGE_ON_WORKER_EXIT(elevel, ...) 
\ +do \ +{ \ + message_on_worker_exit.message_level = (elevel); \ + snprintf(message_on_worker_exit.message, sizeof(message_on_worker_exit.message), __VA_ARGS__); \ + message_on_worker_exit.log_min_messages = log_min_messages; \ + log_min_messages = PANIC; \ +} while (0) + +#define RESET_MESSAGE_ON_WORKER_EXIT() \ +do \ +{ \ + log_min_messages = message_on_worker_exit.log_min_messages; \ + message_on_worker_exit.message_level = 0; \ + message_on_worker_exit.message[0] = '\0'; \ +} while (0) + +static bool TryToOpenVCIRelations(Oid indexOid, LOCKMODE heapLock, LOCKMODE indexLock, + Relation *heapRel, Relation *indexRel); +static void CheckRosControlWorkerCancel(void); +static void callback_on_exit_worker(int code, Datum arg); + +/* BGW_MAXREN = 64 */ +/* If the ROS control worker name is changed then update the bgw_name check in LockAcquire() too.*/ +static const char VCI_ROS_CONTROL_DAEMON_NAME[BGW_MAXLEN] = "vci:ROS control daemon"; +static const char VCI_ROS_CONTROL_WORKER_NAME_TEMP[BGW_MAXLEN] = "vci:ROS control worker(slot=%d)"; +static const char VCI_ROS_CONTROL_WORKER_TYPE[BGW_MAXLEN] = "vci:ROS control worker"; + +/* flags set by signal handlers */ +static volatile sig_atomic_t gotSighup = false; +static volatile sig_atomic_t gotSigterm = false; + +static vci_workerslot_t *workerslot; + +static char probeMessage[num_vci_rc][1024] = +{ + " data WOS count : %8d / %8d.", + " whiteout WOS count : %8d / %8d.", + " CDR : %8d / %8d (extent %d).", + " CDE : %8d / %8d (extent %d).", + " TIDCRID : %8d / %8d.", +}; + +/* ------------ daemon -------------- */ + +/** + * Register ROS Control daemon function called from _PG_init_ + */ +void +vci_ROS_control_daemon_setup(void) +{ + BackgroundWorker worker; + + /* for internal use */ + if (VciGuc.enable_ros_control_daemon == false) + { + elog(DEBUG1, "vci: no daemon mode"); + return; + } + + memset(&worker, 0, sizeof(worker)); + /* set up common data for all our workers */ + worker.bgw_flags = BGWORKER_SHMEM_ACCESS | + 
BGWORKER_BACKEND_DATABASE_CONNECTION; + /* worker.bgw_start_time = BgWorkerStart_ConsistentState; */ + worker.bgw_start_time = BgWorkerStart_RecoveryFinished; + /* worker.bgw_start_time = BgWorkerStart_PostmasterStart; */ + + worker.bgw_restart_time = VCI_DAEMON_RESTART_TIME; + worker.bgw_notify_pid = 0; + + snprintf(worker.bgw_name, BGW_MAXLEN, VCI_ROS_CONTROL_DAEMON_NAME); + snprintf(worker.bgw_type, BGW_MAXLEN, VCI_ROS_CONTROL_DAEMON_NAME); + strcpy(worker.bgw_library_name, VCI_STRING); + strcpy(worker.bgw_function_name, "vci_ROS_control_daemon_main"); + worker.bgw_main_arg = (Datum) 0; + + RegisterBackgroundWorker(&worker); +} + +/** + * Signal handler for SIGTERM + * + * @description + * Set a flag to let the main loop to terminate, and set our latch to wake it up. + * + * @param[in] SIGNAL_ARGS + */ +static void +vci_ROSControlDaemonSigterm(SIGNAL_ARGS) +{ + gotSigterm = true; + if (MyProc) + SetLatch(&MyProc->procLatch); +} + +/** + * Signal handler for SIGHUP + * + * @description + * Set a flag to tell the main loop to reread the config file, and set + * our latch to wake it up. + * + * @params[in] SIGNAL_ARGS + */ +static void +vci_ROSControlDaemonSighup(SIGNAL_ARGS) +{ + gotSighup = true; + if (MyProc) + SetLatch(&MyProc->procLatch); +} + +/** + * ROS control DAEMON's entory point. + */ +void +vci_ROS_control_daemon_main(Datum main_arg) +{ + /* + * XXX - VCI wants to pretend this worker is like an autovacuum launcher; + * Let's set the MyBackendType to achieve this. + */ + MyBackendType = B_AUTOVAC_LAUNCHER; + + pg_bindtextdomain(TEXTDOMAIN); + + /* StringInfoData buf; */ + elog(DEBUG1, "start initialize %s", MyBgworkerEntry->bgw_name); + + /* Establish signal handlers before unblocking signals. 
*/ + pqsignal(SIGHUP, vci_ROSControlDaemonSighup); + pqsignal(SIGTERM, vci_ROSControlDaemonSigterm); + pqsignal(SIGQUIT, vci_ROSControlDaemonSigterm); + pqsignal(SIGINT, vci_ROSControlDaemonSigterm); + + /* pqsignal(SIGUSR1, vci_ROSNotify); */ + + /* We're now ready to receive signals */ + BackgroundWorkerUnblockSignals(); + + BackgroundWorkerInitializeConnection(NULL, NULL, 0); /* Connect to Shared + * database */ + + /* Connect DB to access common system catalog */ + + workerslot = (vci_workerslot_t *) palloc0(sizeof(vci_workerslot_t) * + VciGuc.control_max_workers); + + /* Main loop */ + while (!gotSigterm) + { + int rc; + int i; + + CHECK_FOR_INTERRUPTS(); + + /* + * Background workers mustn't call usleep() or any direct equivalent: + * instead, they may wait on their process latch, which sleeps as + * necessary, but is awakened if postmaster dies. That way the + * background process goes away immediately in an emergency. + */ + rc = WaitLatch(&MyProc->procLatch, + WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH, + VciGuc.control_naptime * INT64CONST(1000), + PG_WAIT_EXTENSION); + ResetLatch(&MyProc->procLatch); + + /* emergency bailout if postmaster has died */ + if (rc & WL_POSTMASTER_DEATH) + proc_exit(1); /* abnormal end */ + + if (gotSigterm) + goto done; + + LWLockAcquire(VciShmemAddr->io_load_lock, LW_EXCLUSIVE); + + /* Check VCI' database is exists */ + vci_RemoveMemoryEntryOnDroppedDatabase(); + + vci_update_memoryentry_in_devloadinfo(); + + if (gotSigterm) + { + LWLockRelease(VciShmemAddr->io_load_lock); + goto done; + } + + VciShmemAddr->translated_dev_pos = 0; + + elog(DEBUG2, ">>> 1. control_max_workers = %d", VciGuc.control_max_workers); + for (i = 0; i < VciGuc.control_max_workers; i++) + { + elog(DEBUG2, ">>> 1. 
workerslot[%d].pid is %d", i, (int) workerslot[i].pid); + if (workerslot[i].pid != 0) + { + pid_t pid; + BgwHandleStatus status; + + status = GetBackgroundWorkerPid(&workerslot[i].handle, &pid); + switch (status) + { + case BGWH_STOPPED: + workerslot[i].pid = 0; + break; + case BGWH_NOT_YET_STARTED: + case BGWH_POSTMASTER_DIED: + case BGWH_STARTED: + break; + default: + /* LCOV_EXCL_START */ + elog(PANIC, "invalid BgwHandleStatus in vci_ROS_control_daemon_main"); + /* LCOV_EXCL_STOP */ + break; + } + + if (gotSigterm) + { + LWLockRelease(VciShmemAddr->io_load_lock); + goto done; + } + } + } + + LWLockAcquire(VciShmemAddr->memory_entries->lock, LW_SHARED); + + vci_ResetDevloadCurrentPos(); + + if (!fullPageWrites) + goto reload_configuration; + + elog(DEBUG2, ">>> 2. control_max_workers = %d", VciGuc.control_max_workers); + for (i = 0; i < VciGuc.control_max_workers; i++) + { + elog(DEBUG2, ">>> 2. workerslot[%d].pid is %d", i, (int) workerslot[i].pid); + if (workerslot[i].pid == 0) + { + int j; + bool worker_running = false; + + if (!vci_GetWosRosConvertingVCI(&VciShmemAddr->worker_args_array[i])) + break; + + Assert(OidIsValid(VciShmemAddr->worker_args_array[i].dbid)); + Assert(OidIsValid(VciShmemAddr->worker_args_array[i].oid)); + + for (j = 0; j < VciGuc.control_max_workers; j++) + { + if (workerslot[j].pid != 0 && + workerslot[j].dbid == VciShmemAddr->worker_args_array[i].dbid && + workerslot[j].oid == VciShmemAddr->worker_args_array[i].oid) + { + elog(DEBUG1, "a worker is running on VCI (oid=%d, dbid=%d)", + VciShmemAddr->worker_args_array[i].oid, + VciShmemAddr->worker_args_array[i].dbid); + worker_running = true; + break; + } + } + + if (!worker_running) + { + workerslot[i] = vci_LaunchROSControlWorker(&VciShmemAddr->worker_args_array[i], i); + workerslot[i].oid = VciShmemAddr->worker_args_array[i].oid; + workerslot[i].dbid = VciShmemAddr->worker_args_array[i].dbid; + } + } + + } + + /* + * In case of a SIGHUP, just reload the configuration. (?) 
+ */ +reload_configuration: + if (gotSighup) + { + gotSighup = false; + ProcessConfigFile(PGC_SIGHUP); + } + + vci_MoveTranslatedVCI2Tail(); + + LWLockRelease(VciShmemAddr->memory_entries->lock); + + LWLockRelease(VciShmemAddr->io_load_lock); + } + +done: + + /* + * Daemon terminate by exit code=1, restart by postmaster as necessary. + */ + proc_exit(1); +} + +/* ------------ Worker -------------- */ + +vci_workerslot_t +vci_LaunchROSControlWorker(vci_wosros_conv_worker_arg_t *vciinfo, int slot_id) +/* vci_database_priority_t *item, */ +{ + BackgroundWorker worker; + BackgroundWorkerHandle *handle; + pid_t pid; + + vci_workerslot_t result; + + /* Assert(MyDatabaseId == InvalidOid); */ + + worker.bgw_flags = BGWORKER_SHMEM_ACCESS | BGWORKER_BACKEND_DATABASE_CONNECTION; + worker.bgw_start_time = BgWorkerStart_ConsistentState; + worker.bgw_restart_time = BGW_NEVER_RESTART; + + sprintf(worker.bgw_library_name, VCI_STRING); + sprintf(worker.bgw_function_name, "vci_ROS_control_worker_main"); + snprintf(worker.bgw_name, BGW_MAXLEN, VCI_ROS_CONTROL_WORKER_NAME_TEMP, slot_id); + snprintf(worker.bgw_type, BGW_MAXLEN, VCI_ROS_CONTROL_WORKER_TYPE); + +/* + worker.bgw_main_arg = PointerGetDatum(item); +*/ + worker.bgw_main_arg = PointerGetDatum(vciinfo); + worker.bgw_notify_pid = 0; /* don't notify by SIG_USR1 since it calls + * SetLatch and and awakens the parent process + * ROS daemon. That results ROS daemon + * spawning unnecessary multiple ROS control + * workers. */ + + if (!RegisterDynamicBackgroundWorker(&worker, &handle)) + ereport(ERROR, + (errcode(ERRCODE_INSUFFICIENT_RESOURCES), + errmsg("could not register background process"), + errhint("You may need to increase max_worker_processes."))); + + /* Wait for workers to become ready. 
*/ + while (true) + { + BgwHandleStatus status; + + status = GetBackgroundWorkerPid(handle, &pid); + if (gotSigterm) + break; + + switch (status) + { + case BGWH_NOT_YET_STARTED: + continue; + + case BGWH_STARTED: + goto done; + + case BGWH_STOPPED: + pid = 0; + goto done; + + case BGWH_POSTMASTER_DIED: + pid = 0; + goto done; + + default: + /* LCOV_EXCL_START */ + elog(PANIC, "should not reach here"); + /* LCOV_EXCL_STOP */ + goto done; + } + } + +done: + result.pid = pid; + result.handle = *handle; + + pfree(handle); + + return result; +} + +/** + * + */ +static inline bool +vci_GetRosCommandExecFlag(char flag, vci_ros_command_t command_id) +{ + return (flag & (1 << command_id)) != 0; +} + +static inline void +vci_SetRosCommandExecFlag(char *flag, vci_ros_command_t command_id) +{ + *flag |= (1 << command_id); +} + +static int +determine_ExecCommand_and_Extent(const Oid vci_oid, + char *targetExecFlag, + int32 *targetExtentForCdr, + bool force_wosros_conv) +{ + Relation indexRel; + Relation heapRel; + vci_ros_command_t command; + + /* Transaction Start */ + SetCurrentStatementStartTimestamp(); + StartTransactionCommand(); + PushActiveSnapshot(GetTransactionSnapshot()); + + /* Try to open the heap relation & the index relation. */ + if (!TryToOpenVCIRelations(vci_oid, AccessShareLock, AccessShareLock, + &heapRel, &indexRel)) + { + AbortCurrentTransaction(); + return -1; + } + + /* Check request for ros control worker cancel. */ + CheckRosControlWorkerCancel(); + + MemSet(targetExecFlag, 0, sizeof(char)); + MemSet(targetExtentForCdr, 0, sizeof(int32)); + + for (command = 0; command < num_vci_rc; command++) + { + int32 count = 0; + vci_target_extent_info_t extent_info = {0, -1}; + int32 targetExtentId; + + switch (command) + { + case vci_rc_wos_ros_conv: + /* 1. count DataWOS */ + count = vci_CountFreezedInDataWos(indexRel, MaxAllocSize); + break; + + case vci_rc_update_del_vec: + /* 2. 
count WhiteoutWOS */ + count = vci_CountFreezedInWhiteoutWos(indexRel, MaxAllocSize); + break; + + case vci_rc_collect_deleted: + /* 3. count deleted rows in each extent */ + extent_info = vci_CountDeletedRowsInROS(indexRel, (uint32) VciGuc.cdr_threshold); + break; + + case vci_rc_update_tid_crid: + /* 5. count TID->CRID update list */ + count = vci_CountTidCridUpdateListLength(indexRel, MaxAllocSize); + break; + + case vci_rc_collect_extent: + /* 6. count unused extents */ + extent_info = vci_CountUnusedExtents(indexRel); + break; + + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unexpected ROS command"); + /* LCOV_EXCL_STOP */ + break; + } + + switch (command) + { + case vci_rc_wos_ros_conv: + elog(DEBUG2, &probeMessage[vci_rc_wos_ros_conv][0], count, VciGuc.wosros_conv_threshold); + if (force_wosros_conv || count >= VciGuc.wosros_conv_threshold) + vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_wos_ros_conv); + break; + + case vci_rc_update_del_vec: + elog(DEBUG2, &probeMessage[vci_rc_update_del_vec][0], count, VCI_UPDATE_DELVEC_THRESHOLD); + if (force_wosros_conv || count >= VCI_UPDATE_DELVEC_THRESHOLD) + vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_update_del_vec); + break; + + case vci_rc_update_tid_crid: + elog(DEBUG2, &probeMessage[vci_rc_update_tid_crid][0], count, VCI_UPDATE_TIDCRID_THRESHOLD); + if (count >= VCI_UPDATE_TIDCRID_THRESHOLD) + vci_SetRosCommandExecFlag(targetExecFlag, vci_rc_update_tid_crid); + break; + + case vci_rc_collect_extent: + case vci_rc_collect_deleted: + targetExtentId = VCI_INVALID_EXTENT_ID; + if (extent_info.num_fit_extents > 0) + { + targetExtentId = extent_info.best_extent_id; + + if (command == vci_rc_collect_deleted) + *targetExtentForCdr = targetExtentId; + + vci_SetRosCommandExecFlag(targetExecFlag, command); + } + break; + + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unexpected ROS command"); + /* LCOV_EXCL_STOP */ + break; + } + } + + /* unlock VCI main rel */ + index_close(indexRel, AccessShareLock); + + 
table_close(heapRel, AccessShareLock); + + /* Transaction End */ + PopActiveSnapshot(); + CommitTransactionCommand(); + + return 0; +} + +/** + * update ROS + * + * @param[in] targetIndexOid target index oid + * @param[in] targetExecCommandFlag target exec commands + * @param[in] targetExtentId target extent id + * @param[out] num_converted_data_wos number of rows coverted in Data WOS + * @param[out] num_converted_whiteout_wos number of rows converted in Whiteout WOS + */ +static void +vci_executeROScommand(Oid targetIndexOid, char targetExecCommandFlag, int32 targetExtentId, + int *num_converted_data_wos, int *num_converted_whiteout_wos) +{ + vci_ros_command_t command; + + /* + * loop for executing ROS commaand each command is excuted in anoter + * Transaction(); + */ + for (command = 0; command < num_vci_rc; command++) + { + if (vci_GetRosCommandExecFlag(targetExecCommandFlag, command)) + { + Relation mainRel; + Relation heapRel; + Size workAreaSize = VciGuc.maintenance_work_mem * INT64CONST(1024); + + instr_time s_time; + instr_time e_time; + volatile Snapshot snapshot; + + /* Check request for ros control worker cancel. */ + CheckRosControlWorkerCancel(); + + /* transaction start */ + SetCurrentStatementStartTimestamp(); + StartTransactionCommand(); + snapshot = GetTransactionSnapshot(); + PushActiveSnapshot(snapshot); + GetCurrentTransactionId(); + + /** Try to open the heap relation & the index relation, + * and get ShareUpdateExclusiveLock for the index relation. */ + if (!TryToOpenVCIRelations(targetIndexOid, AccessShareLock, ShareUpdateExclusiveLock, + &heapRel, &mainRel)) + { + /* Exit worker process. */ + AbortCurrentTransaction(); + return; + } + + elog(LOG, "starts ROS command \"%s\"", vci_GetRosCommandName(command)); + INSTR_TIME_SET_CURRENT(s_time); + + switch (command) + { + case vci_rc_wos_ros_conv: + /* 1. 
WOS->ROS conversion */ + *num_converted_data_wos = vci_ConvertWos2Ros(mainRel, workAreaSize, VciGuc.wosros_conv_threshold); + break; + + case vci_rc_update_del_vec: + /* 2. update delete vector */ + *num_converted_whiteout_wos = vci_UpdateDelVec(mainRel, workAreaSize, VCI_UPDATE_DELVEC_THRESHOLD); + break; + + case vci_rc_collect_deleted: + /* 3. collect deleted rows */ + vci_CollectDeletedRows(mainRel, workAreaSize, targetExtentId); + break; + + case vci_rc_update_tid_crid: + /* 5. update TID->CRID update list to TID-CRID tree */ + vci_UpdateTidCrid(mainRel, workAreaSize, 10000); + break; + + case vci_rc_collect_extent: + /* 6. collect an unused extent */ + vci_CollectUnusedExtent(mainRel, workAreaSize); + break; + + default: + /* LCOV_EXCL_START */ + elog(ERROR, "unexpected ROS command"); + /* LCOV_EXCL_STOP */ + break; + } + + index_close(mainRel, ShareUpdateExclusiveLock); + table_close(heapRel, AccessShareLock); + + PopActiveSnapshot(); + CommitTransactionCommand(); + + INSTR_TIME_SET_CURRENT(e_time); + INSTR_TIME_SUBTRACT(e_time, s_time); + elog(LOG, "finished ROS command \"%s\" (%.03f ms)", vci_GetRosCommandName(command), + INSTR_TIME_GET_MILLISEC(e_time)); + } + } +} + +/* + * @param[in] dboid id of db to which the worker connects. + * @pramm[in] username user name + */ +static void +BackgroundWorkerInitializeConnectionByOid1(Oid dboid, const char *username) +{ + BackgroundWorker *worker = MyBgworkerEntry; + + /* XXX is this the right errcode? 
*/ + if (!(worker->bgw_flags & BGWORKER_BACKEND_DATABASE_CONNECTION)) + ereport(FATAL, + (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("database connection requirement not indicated during registration"))); + + InitPostgres(NULL, dboid, username, InvalidOid, 0, NULL); + + /* it had better not gotten out of "init" mode yet */ + if (!IsInitProcessingMode()) + ereport(ERROR, + (errmsg("invalid processing mode in background worker"))); + SetProcessingMode(NormalProcessing); +} + +#define RATIO_OF_INCREASE 1.1 + +/** + * @param[in] main_arg id of vci, a WOS->ROS transfomation of which is performed. + */ +void +vci_ROS_control_worker_main(Datum main_arg) +{ + Oid targetIndexOid = InvalidOid; + int32 targetExtentId = 01; + char targetExecCommandFlag = 0x00; + + Oid dboid; + vci_wosros_conv_worker_arg_t *vciinfo; + int ret; + int num_converted_data_wos = INT_MAX; + int num_converted_whiteout_wos = INT_MAX; + + pg_bindtextdomain(TEXTDOMAIN); + + pqsignal(SIGHUP, vci_ROSControlDaemonSighup); + pqsignal(SIGTERM, vci_ROSControlDaemonSigterm); + pqsignal(SIGQUIT, vci_ROSControlDaemonSigterm); + pqsignal(SIGINT, vci_ROSControlDaemonSigterm); + /* pqsignal(SIGUSR1, vci_ROSNotify); */ + + /* Check full_page_writers=off */ + if (!fullPageWrites) + return; + + /* We're now ready to receive signals */ + BackgroundWorkerUnblockSignals(); + + INIT_MESSAGE_ON_WORKER_EXIT(); + + /* + * Checkout the Postmaster was rebooted. if + * (MyBgworkerEntry->bgw_notify_pid == 0) return; + */ + + /* Connect to DB corresponding to dbid */ + + vciinfo = (vci_wosros_conv_worker_arg_t *) DatumGetPointer(main_arg); + targetIndexOid = vciinfo->oid; + dboid = vciinfo->dbid; + + SET_MESSAGE_ON_WORKER_EXIT(DEBUG1, "worker: Failed to connect '%d'.", dboid); + BackgroundWorkerInitializeConnectionByOid1(dboid, NULL); + RESET_MESSAGE_ON_WORKER_EXIT(); + + elog(DEBUG1, "worker: connect to %d is OK. 
do wos->ros conversion on vci %d", dboid, targetIndexOid); + +#if 0 + /** + * TODO -- Put thi call back again if/when Iwata-San's separate bgworker patch is accepted. + * See https://www.postgresql.org/message-id/OS7PR01MB11964335F36BE41021B62EAE8EAE4A%40OS7PR01MB11964.jpnprd01.prod.outlook.com + */ + + /* Accept cancel by admin commands. */ + AcceptBackgroundWorkerCancel(MyDatabaseId, BGWORKER_CANCEL_ADMIN_COMMANDS); +#endif + + ret = determine_ExecCommand_and_Extent(targetIndexOid, &targetExecCommandFlag, + &targetExtentId, vciinfo->force_next_wosros_conv); + + if (ret == 0) + vci_executeROScommand(targetIndexOid, targetExecCommandFlag, targetExtentId, + &num_converted_data_wos, &num_converted_whiteout_wos); + + if (vciinfo->force_next_wosros_conv && + num_converted_data_wos == 0 && + num_converted_whiteout_wos == 0) + { + vci_id_t vciid; + + vciid.oid = targetIndexOid; + vciid.dbid = dboid; + + vci_SetForceNextWosRosConvFlag(&vciid, false); + } + +} + +/** + * Try to open the heap relation & the index relation. + * open the heap relation to detect AccessExclusiveLock of the heap + * relation, before opening the index relation. + */ +static bool +TryToOpenVCIRelations(Oid indexOid, LOCKMODE heapLock, LOCKMODE indexLock, + Relation *heapRel, Relation *indexRel) +{ + Oid heapOid; + + heapOid = IndexGetRelation(indexOid, true); + if (OidIsValid(heapOid)) + { + *heapRel = try_relation_open(heapOid, heapLock); + if (*heapRel != NULL) + { + *indexRel = try_relation_open(indexOid, indexLock); + if (*indexRel != NULL) + { + if (isVciIndexRelation(*indexRel)) + return true; + + relation_close(*indexRel, indexLock); + } + + relation_close(*heapRel, heapLock); + } + } + + elog(DEBUG1, "worker: The relation the OID=%d indicates was deleted.", indexOid); + + return false; +} + +/** + * Check request for ros control worker cancel. 
+ */ +static void +CheckRosControlWorkerCancel(void) +{ +#ifdef WIN32 + if (UNBLOCKED_SIGNAL_QUEUE()) + pgwin32_dispatch_queued_signals(); +#endif /* WIN32 */ + + if (gotSigterm) + { + ereport(DEBUG1, + (errcode(ERRCODE_ADMIN_SHUTDOWN), + errmsg_internal("terminating VCI worker process due to administrator command"))); + /* process terminate. */ + exit(1); + + } +} + +/** + * callback on exit worker fro message. + */ +static void +callback_on_exit_worker(int code, Datum arg) +{ + log_min_messages = message_on_worker_exit.log_min_messages; + + if (message_on_worker_exit.message[0]) + { + elog(message_on_worker_exit.message_level, + "%s", message_on_worker_exit.message); + message_on_worker_exit.message[0] = '\0'; + } + elog(DEBUG1, "worker: ROS control worker exit code=%d.", code); +} diff --git a/contrib/vci/storage/vci_tidcrid.c b/contrib/vci/storage/vci_tidcrid.c new file mode 100644 index 000000000000..d5a1c60889f0 --- /dev/null +++ b/contrib/vci/storage/vci_tidcrid.c @@ -0,0 +1,1778 @@ +/*------------------------------------------------------------------------- + * + * vci_tidcrid.c + * TIDCRID update list and TIDCRID Tree relation handlings + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_tidcrid.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "catalog/storage.h" +#include "utils/tuplesort.h" + +#include "vci.h" +#include "vci_freelist.h" +#include "vci_ros.h" +#include "vci_tidcrid.h" + +/* + * Add TID-CRID tree page to the free list if number of free items exceeds + * VCI_TID_CRID_FREESPACE_THRESHOLD + */ +#define VCI_TID_CRID_FREESPACE_THRESHOLD (4) + +/* + * Dummy column id for the main relation + */ +#define VCI_TID_CRID_COLID_DUMMY ((int16) 1) + +#define VCI_TID_CRID_RECOVERY_CURRENT_VAL (InvalidOffsetNumber) + +static void InitializeTidCridUpdateList(Oid relOid); + +static void 
WriteTidCridUpdateList(vci_MainRelHeaderInfo *info, int sel, bool (*callback) (vcis_tidcrid_pair_item_t *item, void *data), void *data); +static void SampleTidCridUpdateList(Relation rel, uint64 count, vcis_tidcrid_pair_list_t *dest); + +static vcis_tidcrid_meta_t *vci_GetTidCridMeta(vci_TidCridRelations *relPair); +static vcis_tidcrid_pagetag_t *vci_GetTidCridTag(vci_TidCridRelations *relPair, BlockNumber blk); +static void GetTidCridMetaItemPosition(BlockNumber *blockNumber, uint32 *offset, BlockNumber blkNum); +static vcis_tidcrid_meta_item_t *vci_GetTidCridMetaItem(vci_TidCridRelations *relPair, BlockNumber blkNum); +static char *vci_GetTidCridTreeNode(vci_TidCridRelations *relPair, ItemPointer trunkPtr, int64 leafNo, ItemPointer retPtr); + +static void RemoveLeafTidCridTree(vci_TidCridRelations *relPair, ItemPointer trunkPtr, uint32 leafNo); +static void AddNewLeafTidCridTree(vci_TidCridRelations *relPair, ItemPointer trunkPtr, uint32 leafNo); + +static uint64 SearchFromTidCridTree(vci_MainRelHeaderInfo *info, ItemPointer tId); + +static uint64 SearchCridFromTidCridUpdateListContext(vci_TidCridUpdateListContext *context, ItemPointer tId); +static uint64 SearchCridInBlockRange(vci_TidCridUpdateListContext *context, ItemPointer tId, BlockNumber start, BlockNumber end); +static uint64 SearchCridInBlock(vci_TidCridUpdateListContext *context, ItemPointer tId, vcis_tidcrid_pair_item_t *array, int first, int last); + +static OffsetNumber FindFreeItem(vci_TidCridRelations *relPair, BlockNumber freeBlk); + +static void SetFreeSpaceBitmap(vci_TidCridRelations *relPair, BlockNumber blk, OffsetNumber bit); +static void UnsetFreeSpaceBitmap(vci_TidCridRelations *relPair, BlockNumber blk, OffsetNumber bit); + +static void WriteRecoveryRecordForTidCridTrunk(vci_TidCridRelations *relPair, BlockNumber origBlkno, BlockNumber trunkBlkno, OffsetNumber trunkOffset); +static void WriteRecoveryRecordForTidCridLeaf(vci_TidCridRelations *relPair, ItemPointer trunkPtr, uint32 leafNo, 
BlockNumber leafBlkno, OffsetNumber leafOffset); +static void WriteRecoveryRecordForTidCridCommon(vci_TidCridRelations *relPair, vcis_tid_crid_op_type_t operation, BlockNumber targetBlkno, uint32 targetInfo, BlockNumber freeBlkno, OffsetNumber freeOffset); + +/** + * function to cast from Page to (vcis_tidcrid_pair_list_t *). + */ +#define vci_GetTidCridPairListT(page) \ + ((vcis_tidcrid_pair_list_t *) &((page)[VCI_MIN_PAGE_HEADER])) + +#define vci_GetTidCridPairItemT(page) \ + ((vcis_tidcrid_pair_item_t *) &((page)[VCI_MIN_PAGE_HEADER])) + +#define ROUND_UP(value, size) ((((value) + (size) - 1) / (size)) * (size)) + +/* + * Initialize TID-CRID update list and create on the storage + */ +static void +InitializeTidCridUpdateList(Oid relOid) +{ + Relation rel = table_open(relOid, ShareLock); + Buffer buffer; + Page page; + vcis_tidcrid_pair_list_t *pairList; + BlockNumber blockNumber = VCI_TID_CRID_UPDATE_HEADER_PAGE_ID; + + Assert(offsetof(vcis_tidcrid_pair_list_t, body) == VCI_TID_CRID_UPDATE_PAGE_SPACE); + + vci_PreparePagesWithOneItemIfNecessary(rel, blockNumber); + buffer = ReadBuffer(rel, blockNumber); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buffer); + pairList = vci_GetTidCridPairListT(page); + pairList->num = 0; + + vci_WriteOneItemPage(rel, buffer); + UnlockReleaseBuffer(buffer); + table_close(rel, ShareLock); +} + +/* + * Same as above, but the argument is the main relation info + */ +void +vci_InitializeTidCridUpdateLists(vci_MainRelHeaderInfo *info) +{ + Oid oid; + + oid = vci_GetMainRelVar(info, vcimrv_tid_crid_update_oid_0, 0); + InitializeTidCridUpdateList(oid); + oid = vci_GetMainRelVar(info, vcimrv_tid_crid_update_oid_1, 0); + InitializeTidCridUpdateList(oid); +} + +/* + * Initialize TID-CRID tree relation and create on the storage + */ +void +vci_InitializeTidCridTree(vci_MainRelHeaderInfo *info) +{ + LOCKMODE lockmode = ShareLock; + + vci_TidCridRelations relPairData = {0}; + vci_TidCridRelations *relPair = &relPairData; 
+ vcis_tidcrid_meta_t *tidcridMeta; + vcis_tidcrid_pagetag_t *tidcridTag; + + vci_OpenTidCridRelations(relPair, info, lockmode); + + /* --- Meta --- */ + + vci_FormatPageWithOneItem(relPair->meta, + VCI_TID_CRID_DATA_FIRST_PAGE_ID); + + tidcridMeta = vci_GetTidCridMeta(relPair); + LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE); + + tidcridMeta->free_page_begin_id = VCI_TID_CRID_DATA_FIRST_PAGE_ID; + tidcridMeta->free_page_begin_id_old = VCI_TID_CRID_DATA_FIRST_PAGE_ID; + tidcridMeta->free_page_end_id = VCI_TID_CRID_DATA_FIRST_PAGE_ID; + tidcridMeta->free_page_end_id_old = VCI_TID_CRID_DATA_FIRST_PAGE_ID; + tidcridMeta->free_page_prev_id = InvalidBlockNumber; + tidcridMeta->free_page_next_id = InvalidBlockNumber; + tidcridMeta->num_free_pages = 1; + tidcridMeta->num_free_pages_old = 1; + tidcridMeta->num_free_page_blocks = 1; + tidcridMeta->num_free_page_blocks_old = 1; + + tidcridMeta->num = 0; + tidcridMeta->num_old = 0; + tidcridMeta->free_block_number = 1; + tidcridMeta->offset = offsetof(vcis_tidcrid_meta_t, body); + + /* need to set invalid to first item ? 
*/
+
+	vci_WriteOneItemPage(relPair->meta, relPair->bufMeta);
+	UnlockReleaseBuffer(relPair->bufMeta);
+
+	/* --- Data --- */
+
+	vci_FormatPageWithItems(relPair->data,
+							VCI_TID_CRID_DATA_FIRST_PAGE_ID,
+							VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE);
+
+	tidcridTag = vci_GetTidCridTag(relPair, VCI_TID_CRID_DATA_FIRST_PAGE_ID);
+	LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE);
+
+	tidcridTag->size = MaxBlockNumber;
+	tidcridTag->type = vcis_tidcrid_type_pagetag;
+	tidcridTag->prev_pos = InvalidBlockNumber;
+	tidcridTag->next_pos = InvalidBlockNumber;
+
+	tidcridTag->num = 0;
+
+	/* Meta data has already been added, so subtract from the free_size */
+	tidcridTag->free_size = VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE - 1;
+	tidcridTag->bitmap = 0x1;
+
+	vci_WriteItem(relPair->data, relPair->bufData, VCI_TID_CRID_PAGETAG_ITEM_ID);
+	UnlockReleaseBuffer(relPair->bufData);
+
+	vci_CloseTidCridRelations(relPair, lockmode);
+}
+
+/* **************************************
+ * TID CRID Update List Functions
+ * *************************************
+ */
+
+/*
+ * Open TID-CRID Update List
+ *
+ * "sel" selects which of the two update-list relations to open (0 or 1).
+ * The relation is opened with AccessShareLock, the list header (everything
+ * before "body") is copied into the context, and count / nblocks are
+ * derived from the header's num field.
+ *
+ * Returns a palloc'd vci_TidCridUpdateListContext; release it with
+ * vci_CloseTidCridUpdateList().
+ */
+vci_TidCridUpdateListContext *
+vci_OpenTidCridUpdateList(vci_MainRelHeaderInfo *info, int sel)
+{
+	Oid			oid;
+	Buffer		buffer;
+	Page		page;
+	BlockNumber blkno;
+	vcis_tidcrid_pair_list_t *src;
+	vci_TidCridUpdateListContext *context;
+
+	context = palloc0(sizeof(vci_TidCridUpdateListContext));
+
+	Assert((0 <= sel) && (sel < 2));
+	oid = vci_GetMainRelVar(info, vcimrv_tid_crid_update_oid_0, sel);
+
+	context->info = info;
+	context->rel = table_open(oid, AccessShareLock);
+
+	blkno = VCI_TID_CRID_UPDATE_HEADER_PAGE_ID;
+
+	buffer = vci_ReadBufferWithPageInit(context->rel, blkno);
+
+	page = BufferGetPage(buffer);
+	src = vci_GetTidCridPairListT(page);
+
+	/* Copy header parts */
+	MemCpy(&context->header, src, offsetof(vcis_tidcrid_pair_list_t, body));
+
+	/*
+	 * Read num while the buffer is still pinned: src points into the buffer
+	 * page and must not be dereferenced after ReleaseBuffer().
+	 */
+	context->count = src->num;
+
+	ReleaseBuffer(buffer);
+
+	/* Calculate number of blocks in CRID-TID Update List */
+	context->nblocks =
+		VCI_TID_CRID_UPDATE_BODY_PAGE_ID + ROUND_UP(context->count, VCI_TID_CRID_UPDATE_PAGE_ITEMS) / VCI_TID_CRID_UPDATE_PAGE_ITEMS;
+
+	return context;
+}
+
+/*
+ * Close TID-CRID Update List
+ */
+void
+vci_CloseTidCridUpdateList(vci_TidCridUpdateListContext *context)
+{
+	table_close(context->rel, AccessShareLock);
+
+	pfree(context);
+}
+
+/*
+ * Read one block of TID-CRID pairs from TID-CRID update list
+ *
+ * Copies VCI_TID_CRID_UPDATE_PAGE_SPACE bytes that follow the page header of
+ * block "blkno" into "array", which must be at least that large.
+ */
+void
+vci_ReadOneBlockFromTidCridUpdateList(vci_TidCridUpdateListContext *context, BlockNumber blkno, vcis_tidcrid_pair_item_t *array)
+{
+	Buffer		buffer;
+	Page		page;
+
+	buffer = vci_ReadBufferWithPageInit(context->rel, blkno);
+	page = BufferGetPage(buffer);
+	MemCpy(array, &page[VCI_MIN_PAGE_HEADER], VCI_TID_CRID_UPDATE_PAGE_SPACE);
+	ReleaseBuffer(buffer);
+}
+
+/*
+ * Get the length of TID-CRID update list
+ */
+int32
+vci_GetTidCridUpdateListLength(vci_MainRelHeaderInfo *info, int sel)
+{
+	Oid			oid;
+	Relation	rel;
+	Buffer		buffer;
+	Page		page;
+	vcis_tidcrid_pair_list_t *src;
+	int32		length;
+	BlockNumber blockNumber;
+
+	Assert((0 <= sel) && (sel < 2));
+	oid = vci_GetMainRelVar(info, vcimrv_tid_crid_update_oid_0, sel);
+	rel = table_open(oid, AccessShareLock);
+
+	blockNumber = VCI_TID_CRID_UPDATE_HEADER_PAGE_ID;
+	buffer = vci_ReadBufferWithPageInit(rel, blockNumber);
+	page = BufferGetPage(buffer);
+
+	src = vci_GetTidCridPairListT(page);
+	length = src->num;
+	ReleaseBuffer(buffer);
+
+	table_close(rel, AccessShareLock);
+
+	return length;
+}
+
+/*
+ * Serialize TID-CRID update list
+ */
+static void
+WriteTidCridUpdateList(vci_MainRelHeaderInfo *info,
+					   int sel,
+					   bool (*callback) (vcis_tidcrid_pair_item_t *item, void *data),
+					   void *data)
+{
+	Oid			oid;
+	Relation	rel;
+	BlockNumber blockNumber;
+	vcis_tidcrid_pair_item_t *array;
+	Page		page;
+	Buffer		buffer;
+	bool		is_terminated = false;
+	vcis_tidcrid_pair_list_t tidcrid_pair_list = {0};
+	uint64		count = 0;
+
+	array =
palloc(VCI_TID_CRID_UPDATE_PAGE_SPACE); + + Assert((0 <= sel) && (sel < 2)); + oid = vci_GetMainRelVar(info, vcimrv_tid_crid_update_oid_0, sel); + rel = table_open(oid, AccessExclusiveLock); + + RelationTruncate(rel, 0); + + vci_PreparePagesWithOneItemIfNecessary(rel, VCI_TID_CRID_UPDATE_HEADER_PAGE_ID); + + blockNumber = VCI_TID_CRID_UPDATE_BODY_PAGE_ID; + + while (!is_terminated) + { + int i; + int count_in_page = 0; + + for (i = 0; i < VCI_TID_CRID_UPDATE_PAGE_ITEMS; i++) + { + if (!callback(&array[i], data)) + { + is_terminated = true; + break; + } + + count_in_page++; + } + + if (count_in_page > 0) + { + vci_PreparePagesWithOneItemIfNecessary(rel, blockNumber); + buffer = ReadBuffer(rel, blockNumber); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buffer); + MemCpy(&page[VCI_MIN_PAGE_HEADER], array, VCI_TID_CRID_UPDATE_PAGE_SPACE); + vci_WriteOneItemPage(rel, buffer); + UnlockReleaseBuffer(buffer); + + blockNumber++; + count += count_in_page; + } + } + + /* Write the initial block */ + tidcrid_pair_list.num = count; + + if (count > 0) + SampleTidCridUpdateList(rel, count, &tidcrid_pair_list); + + buffer = vci_ReadBufferWithPageInit(rel, VCI_TID_CRID_UPDATE_HEADER_PAGE_ID); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buffer); + MemCpy(&page[VCI_MIN_PAGE_HEADER], &tidcrid_pair_list, offsetof(vcis_tidcrid_pair_list_t, body)); + vci_WriteOneItemPage(rel, buffer); + UnlockReleaseBuffer(buffer); + + table_close(rel, AccessExclusiveLock); + + vci_SetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0, sel); + + pfree(array); +} + +static void +SampleTidCridUpdateList(Relation rel, uint64 count, vcis_tidcrid_pair_list_t *dest) +{ + BlockNumber nblocks; + BlockNumber blkno; + Buffer buffer; + Page page; + + nblocks = VCI_TID_CRID_UPDATE_BODY_PAGE_ID + ROUND_UP(count, VCI_TID_CRID_UPDATE_PAGE_ITEMS) / VCI_TID_CRID_UPDATE_PAGE_ITEMS; + + dest->blocks_per_samp = + ROUND_UP(nblocks - 1 /* Except the header */ , 
VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES) / VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES; + + blkno = VCI_TID_CRID_UPDATE_BODY_PAGE_ID; + + while (blkno < nblocks) + { + buffer = vci_ReadBufferWithPageInit(rel, blkno); + page = BufferGetPage(buffer); + + Assert(dest->num_samples < VCI_TID_CRID_UPDATE_CONTEXT_SAMPLES); + + dest->sample_tids[dest->num_samples++] = vci_GetTidCridPairItemT(page)[0].page_item_id; + + ReleaseBuffer(buffer); + + blkno += dest->blocks_per_samp; + } + + /* Put final entry */ + buffer = vci_ReadBufferWithPageInit(rel, nblocks - 1); + page = BufferGetPage(buffer); + + dest->sample_tids[dest->num_samples++] = vci_GetTidCridPairItemT(page)[(count - 1) % VCI_TID_CRID_UPDATE_PAGE_ITEMS].page_item_id; + + ReleaseBuffer(buffer); + + /* Discard if the final entry is duplicated */ + if (ItemPointerEquals(&dest->sample_tids[dest->num_samples - 1], + &dest->sample_tids[dest->num_samples - 2])) + dest->num_samples--; +} + +/* ************************************** + * TID CRID Tree Functions + * ************************************* + */ + +/* + * Open the meta and data relation for TID-CRID tree relation + * + * Caller must release via vci_CloseTidCridRelations() + */ +void +vci_OpenTidCridRelations(vci_TidCridRelations *rel, + vci_MainRelHeaderInfo *info, + LOCKMODE lockmode) +{ + rel->meta = table_open(vci_GetMainRelVar(info, vcimrv_tid_crid_meta_oid, 0), lockmode); + rel->data = table_open(vci_GetMainRelVar(info, vcimrv_tid_crid_data_oid, 0), lockmode); + + rel->info = info; +} + +/* + * Close TID-CRID tree relation + */ +void +vci_CloseTidCridRelations(vci_TidCridRelations *rel, LOCKMODE lockmode) +{ + if (rel) + { + if (RelationIsValid(rel->data)) + table_close(rel->data, lockmode); + if (RelationIsValid(rel->meta)) + table_close(rel->meta, lockmode); + } +} + +#define vci_GetTidCridMetaT(page) \ + ((vcis_tidcrid_meta_t *)& ((page)[VCI_MIN_PAGE_HEADER])) + +/* + * Read metadata from the relation + */ +static vcis_tidcrid_meta_t * 
+vci_GetTidCridMeta(vci_TidCridRelations *relPair) +{ + Page page; + + relPair->bufMeta = vci_ReadBufferWithPageInit(relPair->meta, VCI_COLUMN_META_HEADER_PAGE_ID); + page = BufferGetPage(relPair->bufMeta); + + return vci_GetTidCridMetaT(page); +} + +/* + * Read the metadata in the initial tuple of pages + */ +static vcis_tidcrid_pagetag_t * +vci_GetTidCridTag(vci_TidCridRelations *relPair, BlockNumber blk) +{ + Page page; + HeapTupleHeader htup; + + relPair->bufData = vci_ReadBufferWithPageInit(relPair->data, blk); + page = BufferGetPage(relPair->bufData); + + htup = (HeapTupleHeader) PageGetItem(page, + PageGetItemId(page, VCI_TID_CRID_PAGETAG_ITEM_ID)); + + return (vcis_tidcrid_pagetag_t *) ((char *) htup + htup->t_hoff); +} + +/* + * Calculate offset (page number and the position in the page) to access the + * flexible array in meta relation + */ +static void +GetTidCridMetaItemPosition(BlockNumber *blockNumber, + uint32 *offset, + BlockNumber blkNum) +{ + const int maxTidCridMetaItemInFirstPage = + (VCI_MAX_PAGE_SPACE - offsetof(vcis_tidcrid_meta_t, body)) / sizeof(vcis_tidcrid_meta_item_t); + const int maxTidCridMetaItem = VCI_MAX_PAGE_SPACE / sizeof(vcis_tidcrid_meta_item_t); + + Assert(blockNumber); + Assert(offset); + + if (blkNum < maxTidCridMetaItemInFirstPage) + { + *blockNumber = 0; + *offset = VCI_MIN_PAGE_HEADER + offsetof(vcis_tidcrid_meta_t, body) + + (blkNum * sizeof(vcis_tidcrid_meta_item_t)); + } + else + { + int32 blkNumRem = blkNum - maxTidCridMetaItemInFirstPage; + + *blockNumber = blkNumRem / maxTidCridMetaItem; + blkNumRem -= *blockNumber * maxTidCridMetaItem; + *blockNumber += 1; + *offset = VCI_MIN_PAGE_HEADER + + (blkNumRem * sizeof(vcis_tidcrid_meta_item_t)); + } +} + +/* + * read an entry from vcis_tidcrid_meta + */ +static vcis_tidcrid_meta_item_t * +vci_GetTidCridMetaItem(vci_TidCridRelations *relPair, BlockNumber blkNum) +{ + BlockNumber blockNumber; + uint32 offset; + Page page; + BlockNumber currentBlocks = 
RelationGetNumberOfBlocks(relPair->meta); + + GetTidCridMetaItemPosition(&blockNumber, &offset, blkNum); + + if (blockNumber >= currentBlocks) + vci_FormatPageWithOneItem(relPair->meta, blockNumber); + else + vci_PreparePagesWithOneItemIfNecessary(relPair->meta, blockNumber); + + relPair->bufMeta = ReadBuffer(relPair->meta, blockNumber); + page = BufferGetPage(relPair->bufMeta); + + return (vcis_tidcrid_meta_item_t *) &(((char *) page)[offset]); +} + +/* + * Returns the pointer to nodes (trunk or leaf) + */ +static char * +vci_GetTidCridTreeNode(vci_TidCridRelations *relPair, ItemPointer trunkPtr, int64 leafNo, + ItemPointer retPtr) +{ + Page page; + HeapTupleHeader htup; + vcis_tidcrid_trunk_t *trunk; + ItemPointerData leafPtrData; + ItemPointer leafPtr = &leafPtrData; + + Assert(ItemPointerIsValid(trunkPtr)); + + relPair->bufData = vci_ReadBufferWithPageInit(relPair->data, ItemPointerGetBlockNumber(trunkPtr)); + page = BufferGetPage(relPair->bufData); + htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, ItemPointerGetOffsetNumber(trunkPtr))); + trunk = (vcis_tidcrid_trunk_t *) ((char *) htup + htup->t_hoff); + + if (leafNo == VCI_TID_CRID_TRUNKNODE) + { + Assert(retPtr == NULL); + + return (char *) trunk; + } + + Assert(leafNo >= 0 && leafNo < VCI_TID_CRID_LEAF_CAPACITY); + leafPtrData = trunk->leaf_item[leafNo]; /* copy */ + + ReleaseBuffer(relPair->bufData); + + if (ItemPointerIsValid(leafPtr)) + { + vcis_tidcrid_pagetag_t *tag PG_USED_FOR_ASSERTS_ONLY; + + tag = vci_GetTidCridTag(relPair, ItemPointerGetBlockNumber(leafPtr)); + + Assert(tag->bitmap & (1U << (ItemPointerGetOffsetNumber(leafPtr) - 1))); + + ReleaseBuffer(relPair->bufData); + + relPair->bufData = vci_ReadBufferWithPageInit(relPair->data, ItemPointerGetBlockNumber(leafPtr)); + page = BufferGetPage(relPair->bufData); + htup = (HeapTupleHeader) PageGetItem(page, + PageGetItemId(page, ItemPointerGetOffsetNumber(leafPtr))); + + if (retPtr) + *retPtr = leafPtrData; + + return (char *) htup + 
htup->t_hoff; + } + + return NULL; +} + +/* + * Removes LeafNode + */ +static void +RemoveLeafTidCridTree(vci_TidCridRelations *relPair, ItemPointer trunkPtr, uint32 leafNo) +{ + vcis_tidcrid_leaf_t *leaf PG_USED_FOR_ASSERTS_ONLY; + vcis_tidcrid_trunk_t *trunk; + + ItemPointerData leafPtrData; + ItemPointer leafPtr = &leafPtrData; + + /* leaf */ + leaf = (vcis_tidcrid_leaf_t *) vci_GetTidCridTreeNode(relPair, trunkPtr, + leafNo, leafPtr); + ReleaseBuffer(relPair->bufData); + Assert(leaf); + + /* Write recovery record */ + WriteRecoveryRecordForTidCridLeaf(relPair, trunkPtr, leafNo, + ItemPointerGetBlockNumber(leafPtr), + ItemPointerGetOffsetNumber(leafPtr)); + + UnsetFreeSpaceBitmap(relPair, + ItemPointerGetBlockNumber(leafPtr), + ItemPointerGetOffsetNumber(leafPtr)); + + /* Remove forom the trunk node */ + trunk = (vcis_tidcrid_trunk_t *) + vci_GetTidCridTreeNode(relPair, trunkPtr, VCI_TID_CRID_TRUNKNODE, NULL); + Assert(trunk->type == vcis_tidcrid_type_trunk); + Assert((trunk->bitmap & (UINT64CONST(1) << leafNo)) != 0); + + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + trunk->bitmap &= ~(UINT64CONST(1) << leafNo); + MemSet(&trunk->leaf_item[leafNo], 0, sizeof(ItemPointerData)); + + vci_WriteItem(relPair->data, relPair->bufData, + ItemPointerGetOffsetNumber(trunkPtr)); + UnlockReleaseBuffer(relPair->bufData); +} + +/* + * Add new leaf node + */ +static void +AddNewLeafTidCridTree(vci_TidCridRelations *relPair, ItemPointer trunkPtr, uint32 leafNo) +{ + int i; + Page page; + HeapTupleHeader htup; + BlockNumber freeBlk; + OffsetNumber newOffset; + vcis_tidcrid_leaf_t *leaf; + vcis_tidcrid_trunk_t *trunk; + vcis_tidcrid_pagetag_t *tag; + + ItemPointerData leafPtrData; + ItemPointer leafPtr = &leafPtrData; + + /* Firstly search from the same page as trunk */ + tag = vci_GetTidCridTag(relPair, ItemPointerGetBlockNumber(trunkPtr)); + Assert(tag->type == vcis_tidcrid_type_pagetag); + newOffset = vci_GetLowestBit(~tag->bitmap) + 1; + 
ReleaseBuffer(relPair->bufData); + + if (newOffset <= VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE) + { + /* Free space is found */ + freeBlk = ItemPointerGetBlockNumber(trunkPtr); + } + else + { + freeBlk = vci_FindFreeSpaceForExtent((vci_ColumnRelations *) relPair, 1); + newOffset = FindFreeItem(relPair, freeBlk); + } + + WriteRecoveryRecordForTidCridLeaf(relPair, trunkPtr, leafNo, freeBlk, VCI_TID_CRID_RECOVERY_CURRENT_VAL); + + ItemPointerSet(leafPtr, freeBlk, newOffset); + + /* Connect to the leaf from the trunk */ + trunk = (vcis_tidcrid_trunk_t *) + vci_GetTidCridTreeNode(relPair, trunkPtr, VCI_TID_CRID_TRUNKNODE, NULL); + Assert(trunk->type == vcis_tidcrid_type_trunk); + Assert((trunk->bitmap & (UINT64CONST(1) << leafNo)) == 0); + + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + trunk->bitmap |= (UINT64CONST(1) << leafNo); + trunk->leaf_item[leafNo] = *leafPtr; + + vci_WriteItem(relPair->data, relPair->bufData, + ItemPointerGetOffsetNumber(trunkPtr)); + UnlockReleaseBuffer(relPair->bufData); + + /* Write a tag to the page */ + SetFreeSpaceBitmap(relPair, freeBlk, newOffset); + + relPair->bufData = vci_ReadBufferWithPageInit(relPair->data, ItemPointerGetBlockNumber(leafPtr)); + page = BufferGetPage(relPair->bufData); + htup = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, ItemPointerGetOffsetNumber(leafPtr))); + + leaf = (vcis_tidcrid_leaf_t *) ((char *) htup + htup->t_hoff); + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + + leaf->type = vcis_tidcrid_type_leaf; + leaf->bitmap = UINT64CONST(0); + + for (i = 0; i < VCI_TID_CRID_LEAF_CAPACITY; i++) + { + leaf->crid[i] = vci_GetCridFromUint64(VCI_INVALID_CRID); + } + + vci_WriteItem(relPair->data, relPair->bufData, newOffset); + UnlockReleaseBuffer(relPair->bufData); +} + +/* + * Returns the item pointer to the subtree related with original TID + */ +void +vci_GetTidCridSubTree(vci_TidCridRelations *relPair, BlockNumber blkOrig, + ItemPointer retPtr) +{ + vcis_tidcrid_meta_item_t *metaItem; + 
vcis_tidcrid_pagetag_t *tag PG_USED_FOR_ASSERTS_ONLY;
+
+	metaItem = vci_GetTidCridMetaItem(relPair, blkOrig);
+	ItemPointerSet(retPtr, metaItem->block_number, metaItem->item_id);
+
+	if (ItemPointerIsValid(retPtr))
+	{
+		tag = vci_GetTidCridTag(relPair, metaItem->block_number);
+
+		Assert((tag->bitmap & (UINT64CONST(1) << (metaItem->item_id - 1))) != 0);
+
+		ReleaseBuffer(relPair->bufData);
+	}
+
+	ReleaseBuffer(relPair->bufMeta);
+}
+
+/*
+ * Create a new trunk in the subtree
+ */
+void
+vci_CreateTidCridSubTree(vci_TidCridRelations *relPair, BlockNumber blkOrig,
+						 ItemPointer retPtr)
+{
+	BlockNumber freeBlk;
+	OffsetNumber newOffset;
+
+	vcis_tidcrid_trunk_t *trunk;
+	vcis_tidcrid_meta_item_t *metaItem;
+
+	Assert(retPtr);
+
+	/* Find the free page from the list */
+	freeBlk = vci_FindFreeSpaceForExtent((vci_ColumnRelations *) relPair, 1);
+
+	/* Find the free item from the free page */
+	newOffset = FindFreeItem(relPair, freeBlk);
+
+	WriteRecoveryRecordForTidCridTrunk(relPair, blkOrig, freeBlk, VCI_TID_CRID_RECOVERY_CURRENT_VAL);
+
+	/* Set ItemPointer to the meta relation item */
+	metaItem = vci_GetTidCridMetaItem(relPair, blkOrig);
+	LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE);
+	metaItem->block_number = freeBlk;
+	metaItem->item_id = newOffset;
+
+	vci_WriteOneItemPage(relPair->meta, relPair->bufMeta);
+	UnlockReleaseBuffer(relPair->bufMeta);
+
+	/* Write a tag in the page */
+	SetFreeSpaceBitmap(relPair, freeBlk, newOffset);
+
+	ItemPointerSet(retPtr, freeBlk, newOffset);
+	trunk = (vcis_tidcrid_trunk_t *)
+		vci_GetTidCridTreeNode(relPair, retPtr, VCI_TID_CRID_TRUNKNODE, NULL);
+	LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE);
+	Assert(trunk);
+
+	trunk->type = vcis_tidcrid_type_trunk;
+	trunk->bitmap = UINT64CONST(0);
+
+	MemSet((trunk->leaf_item), 0, sizeof(trunk->leaf_item));
+
+	vci_WriteItem(relPair->data, relPair->bufData, newOffset);
+	UnlockReleaseBuffer(relPair->bufData);
+}
+
+/*
+ * Apply a list of TID-CRID pairs to the subtree rooted at trunkPtr
+ *
+ * Items of newItems are processed in runs of consecutive entries whose
+ * offset numbers map to the same leaf (same upper bits).  For each run the
+ * leaf is fetched (created first if the trunk has none), the CRIDs are
+ * stored, and the leaf bitmap is updated: the bit is cleared when the stored
+ * CRID is VCI_INVALID_CRID and set otherwise.  A leaf that had entries
+ * before the run and has none afterwards is removed from the trunk.
+ *
+ * NOTE(review): only the offset number of each item is examined; presumably
+ * the caller passes items that all belong to the block owning trunkPtr and
+ * that are sorted by TID -- confirm against callers.
+ */
+void
+vci_UpdateTidCridSubTree(vci_TidCridRelations *relPair, ItemPointer trunkPtr,
+						 vcis_tidcrid_pair_list_t *newItems)
+{
+	for (int i = 0; i < newItems->num; i++)
+	{
+		vcis_tidcrid_leaf_t *leaf;
+		ItemPointerData leafPtrData;
+		ItemPointer leafPtr = &leafPtrData;
+		int			prevBitCount = 0;
+		bool		leafIsEmpty;
+		uint32		offset = ItemPointerGetOffsetNumber(&newItems->body[i].page_item_id) - 1;
+		int8		itemIdUpperBits;
+
+		/* Extract upper bits from item_id */
+		itemIdUpperBits = (offset >> VCI_TID_CRID_LEAF_CAPACITY_BITS) &
+			((1 << VCI_TID_CRID_LEAF_CAPACITY_BITS) - 1);
+
+		Assert(itemIdUpperBits < VCI_TID_CRID_LEAF_CAPACITY);
+
+		leaf = (vcis_tidcrid_leaf_t *) vci_GetTidCridTreeNode(relPair, trunkPtr,
+															  itemIdUpperBits, leafPtr);
+		if (leaf == NULL)
+		{
+			AddNewLeafTidCridTree(relPair, trunkPtr, itemIdUpperBits);
+			leaf = (vcis_tidcrid_leaf_t *) vci_GetTidCridTreeNode(relPair, trunkPtr,
+																  itemIdUpperBits, leafPtr);
+		}
+
+		LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE);
+
+		prevBitCount = vci_GetBitCount(leaf->bitmap);
+
+		for (; i < newItems->num; i++)
+		{
+			uint32		innerOffset = ItemPointerGetOffsetNumber(&newItems->body[i].page_item_id) - 1;
+			int8		innerItemIdUpperBits;
+			int8		itemIdLowerBits;
+
+			/* Extract upper bits from item_id */
+			innerItemIdUpperBits = (innerOffset >> VCI_TID_CRID_LEAF_CAPACITY_BITS) &
+				((1 << VCI_TID_CRID_LEAF_CAPACITY_BITS) - 1);
+
+			if (itemIdUpperBits != innerItemIdUpperBits)
+			{
+				/* Belongs to another leaf; let the outer loop revisit it */
+				i--;
+				break;
+			}
+
+			/* Extract lower bits from item_id */
+			itemIdLowerBits = innerOffset & ((1 << VCI_TID_CRID_LEAF_CAPACITY_BITS) - 1);
+
+			leaf->crid[itemIdLowerBits] = newItems->body[i].crid;
+
+			if (vci_GetUint64FromCrid(leaf->crid[itemIdLowerBits]) == VCI_INVALID_CRID)
+				leaf->bitmap &= ~(UINT64CONST(1) << itemIdLowerBits);
+			else
+				leaf->bitmap |= UINT64CONST(1) << itemIdLowerBits;
+		}
+
+		/*
+		 * Remember whether the leaf became empty while the buffer is still
+		 * pinned and locked: leaf points into the buffer page and must not
+		 * be dereferenced after UnlockReleaseBuffer().
+		 */
+		leafIsEmpty = (leaf->bitmap == 0);
+
+		vci_WriteItem(relPair->data, relPair->bufData,
+					  ItemPointerGetOffsetNumber(leafPtr));
+		UnlockReleaseBuffer(relPair->bufData);
+
+		if (prevBitCount != 0 && leafIsEmpty)
+			RemoveLeafTidCridTree(relPair, trunkPtr, itemIdUpperBits);
+	}
+}
+
+/*
+ * Convert TID->CRID from TID-CRID tree
+ *
+ * Returns the CRID corresponding to the given tid, otherwise VCI_INVALID_CRID
+ */
+static uint64
+SearchFromTidCridTree(vci_MainRelHeaderInfo *info, ItemPointer tId)
+{
+	const LOCKMODE lockmode = AccessShareLock;
+
+	uint64		retVal = VCI_INVALID_CRID;
+	ItemPointerData trunkNodeData;
+	ItemPointer trunkNode = &trunkNodeData;
+
+	vcis_tidcrid_leaf_t *leaf;
+
+	BlockNumber blk = ItemPointerGetBlockNumber(tId);
+	uint32		offset = ItemPointerGetOffsetNumber(tId) - 1;
+	int8		itemIdLowerBits;
+	int8		itemIdUpperBits;
+	vci_TidCridRelations relPairData;
+	vci_TidCridRelations *relPair = &relPairData;
+
+	/* Separate item id into upper/lower parts */
+	itemIdLowerBits = offset & ((1 << VCI_TID_CRID_LEAF_CAPACITY_BITS) - 1);
+	itemIdUpperBits = (offset >> VCI_TID_CRID_LEAF_CAPACITY_BITS) &
+		((1 << VCI_TID_CRID_LEAF_CAPACITY_BITS) - 1);
+
+	vci_OpenTidCridRelations(relPair, info, lockmode);
+	vci_GetTidCridSubTree(relPair, blk, trunkNode);
+
+	if (ItemPointerIsValid(trunkNode))
+	{
+		leaf = (vcis_tidcrid_leaf_t *) vci_GetTidCridTreeNode(relPair, trunkNode, itemIdUpperBits, NULL);
+		if (leaf)
+		{
+			retVal = vci_GetUint64FromCrid(leaf->crid[itemIdLowerBits]);
+			ReleaseBuffer(relPair->bufData);
+		}
+	}
+
+	vci_CloseTidCridRelations(relPair, lockmode);
+
+	return retVal;
+}
+
+/*
+ * Convert TID to CRID
+ *
+ * Firstly checks the TID-CRID update list, then searches the TID-CRID tree
+ *
+ * @param[in] context context for the TID-CRID update list
+ * @param[in] tId target tid
+ * @param[out] fromTree true if the CRID is found from the tree
+ *
+ * Returns the found CRID, otherwise VCI_INVALID_CRID
+ */
+uint64
+vci_GetCridFromTid(vci_TidCridUpdateListContext *context, ItemPointer tId, bool *fromTree)
+{
+	bool		viaTree = false;
+	uint64		result = VCI_MOVED_CRID;
+
+	if (context->count > 0)
+		result = SearchCridFromTidCridUpdateListContext(context, tId);
+
+	if (result == VCI_MOVED_CRID)
+	{
+		result =
SearchFromTidCridTree(context->info, tId); + viaTree = true; + } + + if (fromTree) + *fromTree = viaTree; + + return result; +} + +/* + * Search tid from TID-CRID update list + */ +static uint64 +SearchCridFromTidCridUpdateListContext(vci_TidCridUpdateListContext *context, ItemPointer tId) +{ + int ret; + int min, + max, + pivot; + BlockNumber blk_start, + blk_end; + + /* Compare with the first sample */ + ret = ItemPointerCompare(tId, &context->header.sample_tids[0]); + if (ret < 0) /* tId < context->samp_tids[0] */ + return VCI_MOVED_CRID; + + /* Compare with the last sample */ + ret = ItemPointerCompare(&context->header.sample_tids[context->header.num_samples - 1], tId); + if (ret < 0) /* context->samp_tids[context->num_samples - + * 1] < tId */ + return VCI_MOVED_CRID; + + min = 0; + max = context->header.num_samples - 1; + + while (max - min > 1) + { + pivot = (min + max) / 2; + + ret = ItemPointerCompare(tId, &context->header.sample_tids[pivot]); + + if (ret < 0) /* tId < pivot */ + max = pivot; + else if (0 < ret) /* pivot < tId */ + min = pivot; + else + min = max = pivot; + } + + blk_start = VCI_TID_CRID_UPDATE_BODY_PAGE_ID + min * context->header.blocks_per_samp; + blk_end = VCI_TID_CRID_UPDATE_BODY_PAGE_ID + max * context->header.blocks_per_samp + context->header.blocks_per_samp - 1; + + if (context->nblocks <= blk_start) + blk_start = context->nblocks - 1; + + if (context->nblocks <= blk_end) + blk_end = context->nblocks - 1; + + return SearchCridInBlockRange(context, tId, blk_start, blk_end); +} + +static uint64 +SearchCridInBlockRange(vci_TidCridUpdateListContext *context, + ItemPointer tId, + BlockNumber start, BlockNumber end /* inclusive */ ) +{ + bool found = false; + uint64 ret = VCI_MOVED_CRID; + + do + { + BlockNumber pivot; + int first, + last; + Buffer buffer; + Page page; + vcis_tidcrid_pair_item_t *array; + bool less_lower_bound; + bool more_upper_bound; + + pivot = (start + end) / 2; + + if (pivot < context->nblocks - 1) + { + first = 0; + 
last = VCI_TID_CRID_UPDATE_PAGE_ITEMS - 1; + } + else + { + first = 0; + last = (context->count - 1) % VCI_TID_CRID_UPDATE_PAGE_ITEMS; + } + + buffer = vci_ReadBufferWithPageInit(context->rel, pivot); + page = BufferGetPage(buffer); + + array = vci_GetTidCridPairItemT(page); + + less_lower_bound = (ItemPointerCompare(tId, &array[first].page_item_id) < 0); + more_upper_bound = (ItemPointerCompare(&array[last].page_item_id, tId) < 0); + + if ((start == end) && (less_lower_bound || more_upper_bound)) + { + found = true; + ret = VCI_MOVED_CRID; + } + else if (less_lower_bound) + { + end = pivot; + } + else if (more_upper_bound) + { + start = pivot + 1; + } + else + { + found = true; + ret = SearchCridInBlock(context, tId, array, first, last); + } + + ReleaseBuffer(buffer); + } while (!found); + + return ret; +} + +/* + * Search CRID from the one block in TID-CRID update list + */ +static uint64 +SearchCridInBlock(vci_TidCridUpdateListContext *context, + ItemPointer tId, + vcis_tidcrid_pair_item_t *array, + int first, int last /* inclusive */ ) +{ + int pivot; + + while (last - first > 1) + { + int ret; + + pivot = (first + last) / 2; + + ret = ItemPointerCompare(&array[pivot].page_item_id, tId); + + if (ret < 0) /* array[pivot].page_item_id < tId */ + first = pivot; + else if (ret > 0) /* array[pivot].page_item_id > tId */ + last = pivot; + else + return vci_GetUint64FromCrid(array[pivot].crid); + } + + if (ItemPointerEquals(&array[first].page_item_id, tId)) + return vci_GetUint64FromCrid(array[first].crid); + else if (ItemPointerEquals(&array[last].page_item_id, tId)) + return vci_GetUint64FromCrid(array[last].crid); + else + return VCI_MOVED_CRID; +} + +/* + * Find free item from pages in data relation of TID-CRID free + * + * Returns offset to the free item + */ +static OffsetNumber +FindFreeItem(vci_TidCridRelations *relPair, BlockNumber freeBlk) +{ + vcis_tidcrid_pagetag_t *tag; + OffsetNumber newOffset; + + tag = vci_GetTidCridTag(relPair, freeBlk); + 
Assert(tag->type == vcis_tidcrid_type_pagetag); + + /* Initialize if not done yet */ + if ((tag->bitmap & 1) == 0) + { + tag->num = 0; + tag->free_size = VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE - 1; + tag->bitmap = 0x1; + } + + newOffset = vci_GetLowestBit(~tag->bitmap) + 1; /* LSB = 0 */ + + Assert((newOffset >= 1) && (newOffset <= VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE)); + ReleaseBuffer(relPair->bufData); + + return newOffset; +} + +/* + * Set a bit to the page tag + */ +static void +SetFreeSpaceBitmap(vci_TidCridRelations *relPair, BlockNumber blk, OffsetNumber offset) +{ + vcis_tidcrid_pagetag_t *tag = vci_GetTidCridTag(relPair, blk); + uint32 bit = offset - 1; /* one-origin -> zero-origin */ + uint32 nextBitmap; + + Assert((offset >= 1) && (offset <= VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE)); + Assert((tag->bitmap & (uint32) (1U << bit)) == 0); + + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + nextBitmap = tag->bitmap | (uint32) (1U << bit); + + /* + * Remove from the free space list if the number of free items is less + * than threshold + */ + if (VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE - vci_GetBitCount(nextBitmap) == + VCI_TID_CRID_FREESPACE_THRESHOLD) + { + vcis_free_space_t *FS; + + /* Release once to pass relPair to vci_RemoveFreeSpaceFromLinkLis */ + UnlockReleaseBuffer(relPair->bufData); + + FS = vci_GetFreeSpace((vci_RelationPair *) relPair, blk); + vci_WriteRecoveryRecordForFreeSpace(relPair, + VCI_TID_CRID_COLID_DUMMY, + VCI_INVALID_DICTIONARY_ID, + blk, + FS); + ReleaseBuffer(relPair->bufData); + + vci_RemoveFreeSpaceFromLinkList((vci_ColumnRelations *) relPair, blk, 1); + + /* Adjust size and positions */ + tag = vci_GetTidCridTag(relPair, blk); + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + + tag->size = 1; + tag->prev_pos = blk; + tag->next_pos = blk; + } + + tag->bitmap = nextBitmap; + vci_WriteItem(relPair->data, relPair->bufData, VCI_TID_CRID_PAGETAG_ITEM_ID); + UnlockReleaseBuffer(relPair->bufData); + +} + +/* + * Unset a bit to the 
page tag
+ *
+ * Clears the bit for "offset" (one-origin) in the page tag of data block
+ * "blk".  When the number of set bits before clearing equals
+ * VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE - VCI_TID_CRID_FREESPACE_THRESHOLD, the
+ * block is turned into a free-space entry, a recovery record is written for
+ * it, and it is appended to the free-space link list.
+ */
+static void
+UnsetFreeSpaceBitmap(vci_TidCridRelations *relPair, BlockNumber blk, OffsetNumber offset)
+{
+	vcis_tidcrid_pagetag_t *tag = vci_GetTidCridTag(relPair, blk);
+	int			bit = offset - 1;	/* one-origin -> zero-origin */
+	uint32		newBitmap;
+
+	Assert((offset >= 1) && (offset <= VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE));
+	Assert((tag->bitmap & (uint32) (1U << bit)) != 0);
+
+	LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE);
+	tag->bitmap &= ~(uint32) (1U << bit);
+
+	/*
+	 * Keep a private copy of the updated bitmap: tag points into the buffer
+	 * page and must not be dereferenced after UnlockReleaseBuffer().
+	 */
+	newBitmap = tag->bitmap;
+
+	vci_WriteItem(relPair->data, relPair->bufData, VCI_TID_CRID_PAGETAG_ITEM_ID);
+	UnlockReleaseBuffer(relPair->bufData);
+
+	/* "+ 1" restores the bit just cleared, i.e. the count before this call */
+	if (VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE - (vci_GetBitCount(newBitmap) + 1) ==
+		VCI_TID_CRID_FREESPACE_THRESHOLD)
+	{
+		vcis_free_space_t newFS;
+		BlockNumber newFSBlockNumber;
+
+		vci_MakeFreeSpace((vci_ColumnRelations *) relPair, blk, &newFSBlockNumber, &newFS, false);
+		Assert(newFSBlockNumber == blk);
+
+		vci_WriteRecoveryRecordForFreeSpace(relPair,
+											VCI_TID_CRID_COLID_DUMMY,
+											VCI_INVALID_DICTIONARY_ID,
+											newFSBlockNumber,
+											&newFS);
+
+		vci_AppendFreeSpaceToLinkList((vci_ColumnRelations *) relPair,
+									  newFSBlockNumber,
+									  newFS.prev_pos,
+									  newFS.next_pos,
+									  newFS.size);
+	}
+}
+
+/*
+ * Write a recovery record while creating trunk node in the subtree
+ */
+static void
+WriteRecoveryRecordForTidCridTrunk(vci_TidCridRelations *relPair, BlockNumber origBlkno, BlockNumber trunkBlkno, OffsetNumber trunkOffset)
+{
+	WriteRecoveryRecordForTidCridCommon(relPair, vcis_tid_crid_op_trunk, origBlkno, 0, trunkBlkno, trunkOffset);
+}
+
+/*
+ * Write a recovery record while adding or removing a leaf node
+ *
+ * leafOffset == VCI_TID_CRID_RECOVERY_CURRENT_VAL records a leaf addition;
+ * any other value records the removal of the leaf at that offset.
+ */
+static void
+WriteRecoveryRecordForTidCridLeaf(vci_TidCridRelations *relPair, ItemPointer trunkPtr, uint32 leafNo, BlockNumber leafBlkno, OffsetNumber leafOffset)
+{
+	vcis_tid_crid_op_type_t operation;
+	OffsetNumber trunkOffset;
+	uint32		targetInfo;
+
+	if (leafOffset == VCI_TID_CRID_RECOVERY_CURRENT_VAL)
+		operation = vcis_tid_crid_op_leaf_add;
+	else
+		operation =
vcis_tid_crid_op_leaf_remove; + + trunkOffset = ItemPointerGetOffsetNumber(trunkPtr); + Assert((trunkOffset <= 0xFFFF) && (leafNo <= 0xFFFF)); + targetInfo = (trunkOffset & 0xFFFF) | ((leafNo & 0xFFFF) << 16); + + WriteRecoveryRecordForTidCridCommon(relPair, operation, ItemPointerGetBlockNumber(trunkPtr), targetInfo, leafBlkno, leafOffset); +} + +/* + * Write a recovery record while updating TID-CRID tree + */ +static void +WriteRecoveryRecordForTidCridCommon(vci_TidCridRelations *relPair, vcis_tid_crid_op_type_t operation, BlockNumber targetBlkno, uint32 targetInfo, BlockNumber freeBlkno, OffsetNumber freeOffset) +{ + vcis_tidcrid_pagetag_t *tag; + uint32 tag_bitmap; + + /* + * 1. Obtains the bitmap to write the meta relation + */ + tag = vci_GetTidCridTag(relPair, freeBlkno); + Assert(tag->type == vcis_tidcrid_type_pagetag); + LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE); + + if (freeOffset == VCI_TID_CRID_RECOVERY_CURRENT_VAL) + tag_bitmap = tag->bitmap; + else + tag_bitmap = tag->bitmap & ~(UINT64CONST(1) << (freeOffset - 1)); + + UnlockReleaseBuffer(relPair->bufData); + + /* 2. 
Write information to the meta relation */
+	Assert(relPair->info);
+
+	vci_SetMainRelVar(relPair->info, vcimrv_tid_crid_operation, 0, operation);
+	vci_SetMainRelVar(relPair->info, vcimrv_tid_crid_target_blocknumber, 0, targetBlkno);
+	vci_SetMainRelVar(relPair->info, vcimrv_tid_crid_target_info, 0, targetInfo);
+	vci_SetMainRelVar(relPair->info, vcimrv_tid_crid_free_blocknumber, 0, freeBlkno);
+	vci_SetMainRelVar(relPair->info, vcimrv_tid_crid_tag_bitmap, 0, tag_bitmap);
+	vci_SetMainRelVar(relPair->info, vcimrv_working_column_id, 0, VCI_INVALID_COLUMN_ID);
+	vci_WriteMainRelVar(relPair->info, vci_wmrv_update);
+}
+
+/*
+ * Initialize recovery record for the TID-CRID
+ */
+void
+vci_InitRecoveryRecordForTidCrid(vci_MainRelHeaderInfo *info)
+{
+	vci_SetMainRelVar(info, vcimrv_tid_crid_operation, 0, vcis_tid_crid_op_none);
+
+	vci_SetMainRelVar(info, vcimrv_working_column_id, 0, VCI_INVALID_COLUMN_ID);
+}
+
+/*
+ * Recover the TID-CRID tree from the last recovery record
+ *
+ * Reads the recovery record stored in the main relation.  If an operation
+ * was recorded, the page tag bitmap of the recorded free block is restored
+ * to the saved value, and then the recorded link is cleared: for a trunk
+ * operation the meta item of the target block is invalidated; for a leaf
+ * add/remove the leaf's bit and item pointer are cleared in the trunk.
+ *
+ * @param[in] info main relation
+ */
+void
+vci_RecoveryTidCrid(vci_MainRelHeaderInfo *info)
+{
+	LOCKMODE	lockmode = RowExclusiveLock;
+
+	vci_TidCridRelations relPairData = {0};
+	vci_TidCridRelations *relPair = &relPairData;
+
+	vcis_tid_crid_op_type_t operation;
+	BlockNumber targetBlkno;
+	uint32		targetInfo;
+	BlockNumber freeBlkno;
+	uint32		tag_bitmap;
+
+	operation = vci_GetMainRelVar(info, vcimrv_tid_crid_operation, 0);
+	targetBlkno = vci_GetMainRelVar(info, vcimrv_tid_crid_target_blocknumber, 0);
+	targetInfo = vci_GetMainRelVar(info, vcimrv_tid_crid_target_info, 0);
+	freeBlkno = vci_GetMainRelVar(info, vcimrv_tid_crid_free_blocknumber, 0);
+	tag_bitmap = vci_GetMainRelVar(info, vcimrv_tid_crid_tag_bitmap, 0);
+
+	/* Nothing was in progress; the relations have not been opened yet */
+	if (operation == vcis_tid_crid_op_none)
+		return;
+
+	Assert(BlockNumberIsValid(freeBlkno));
+	vci_OpenTidCridRelations(relPair, info, lockmode);
+
+	/* Restore the page tag bitmap saved in the recovery record */
+	{
+		vcis_tidcrid_pagetag_t *tag;
+
+		tag = vci_GetTidCridTag(relPair, freeBlkno);
+		Assert(tag->type == vcis_tidcrid_type_pagetag);
+		LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE);
+		tag->bitmap = tag_bitmap;
+		vci_WriteItem(relPair->data, relPair->bufData, VCI_TID_CRID_PAGETAG_ITEM_ID);
+		UnlockReleaseBuffer(relPair->bufData);
+	}
+
+	/*
+	 * The relations must stay open here: both branches below read and write
+	 * pages through relPair->meta / relPair->data.
+	 */
+	switch (operation)
+	{
+		case vcis_tid_crid_op_trunk:
+			{
+				vcis_tidcrid_meta_item_t *metaItem;
+
+				metaItem = vci_GetTidCridMetaItem(relPair, targetBlkno);
+				LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE);
+				metaItem->block_number = InvalidBlockNumber;
+				metaItem->item_id = InvalidOffsetNumber;
+
+				vci_WriteOneItemPage(relPair->meta, relPair->bufMeta);
+				UnlockReleaseBuffer(relPair->bufMeta);
+			}
+			break;
+
+		case vcis_tid_crid_op_leaf_add:
+		case vcis_tid_crid_op_leaf_remove:
+			{
+				vcis_tidcrid_trunk_t *trunk;
+				ItemPointerData trunkItem;
+				uint32		leafNo;
+
+				/*
+				 * For the leaf operations, targetBlkno represents the block
+				 * number of the trunk, and the lower 16 bits of targetInfo
+				 * are the offset of the trunk.
+				 */
+				ItemPointerSet(&trunkItem, targetBlkno, (targetInfo & 0xFFFF));
+
+				/*
+				 * The upper 16 bits of targetInfo represent the leafNo in
+				 * the trunk.
+				 */
+				leafNo = targetInfo >> 16;
+
+				trunk = (vcis_tidcrid_trunk_t *)
+					vci_GetTidCridTreeNode(relPair, &trunkItem, VCI_TID_CRID_TRUNKNODE, NULL);
+
+				LockBuffer(relPair->bufData, BUFFER_LOCK_EXCLUSIVE);
+				trunk->bitmap &= ~(UINT64CONST(1) << leafNo);
+				MemSet(&trunk->leaf_item[leafNo], 0, sizeof(ItemPointerData));
+
+				vci_WriteItem(relPair->data, relPair->bufData,
+							  ItemPointerGetOffsetNumber(&trunkItem));
+				UnlockReleaseBuffer(relPair->bufData);
+			}
+			break;
+
+		default:
+			break;
+	}
+
+	vci_CloseTidCridRelations(relPair, lockmode);
+}
+
+/*
+ * Recover the free list for TID-CRID tree relation
+ */
+void
+vci_RecoveryFreeSpaceForTidCrid(vci_MainRelHeaderInfo *info)
+{
+	LOCKMODE	lockmode = RowExclusiveLock;
+
+	int16		colId;
+	vci_ColumnRelations relPairData = {0};
+	vci_ColumnRelations *relPair = &relPairData;
+	vcis_column_meta_t *columnMeta;
+
+	BlockNumber startBlockNumber;
+	BlockNumber prevFreeBlockNumber;
+	BlockNumber nextFreeBlockNumber;
+	uint32		oldSize;
+
+	vci_OpenTidCridRelations(relPair, info, lockmode);
+
+	/* get last working column */
+	colId = vci_GetMainRelVar(info, vcimrv_working_column_id, 0);
+
+	if (colId != VCI_INVALID_COLUMN_ID)
+	{
+		/* get column rel set */
+		columnMeta = vci_GetColumnMeta(&relPair->bufMeta, relPair->meta);
+		LockBuffer(relPair->bufMeta, BUFFER_LOCK_EXCLUSIVE);
+
+		/* restore from old fields */
+		columnMeta->num_extents = columnMeta->num_extents_old;
+		columnMeta->num_free_pages = columnMeta->num_free_pages_old;
+		columnMeta->num_free_page_blocks = columnMeta->num_free_page_blocks_old;
+
+		/* read freelink list recovery information */
+		startBlockNumber = columnMeta->new_data_head;
+		prevFreeBlockNumber = columnMeta->free_page_prev_id;
+		nextFreeBlockNumber = columnMeta->free_page_next_id;
+		oldSize = columnMeta->free_page_old_size;
+
+		vci_WriteColumnMetaDataHeader(relPair->meta, relPair->bufMeta);
+		UnlockReleaseBuffer(relPair->bufMeta);
+
+		/* Recover the free link list */
+
+		vci_AppendFreeSpaceToLinkList(relPair, startBlockNumber, prevFreeBlockNumber,
nextFreeBlockNumber, oldSize); + } + else + { + /* + * Connect to the free list if the previous crash was done before leaf + * was removed from the trunk. + */ + vcis_tid_crid_op_type_t operation; + BlockNumber freeBlkno; + uint32 tag_bitmap; + vcis_free_space_t newFS; + BlockNumber newFSBlockNumber; + + operation = vci_GetMainRelVar(info, vcimrv_tid_crid_operation, 0); + freeBlkno = vci_GetMainRelVar(info, vcimrv_tid_crid_free_blocknumber, 0); + tag_bitmap = vci_GetMainRelVar(info, vcimrv_tid_crid_tag_bitmap, 0); + + switch (operation) + { + case vcis_tid_crid_op_none: + case vcis_tid_crid_op_trunk: + case vcis_tid_crid_op_leaf_add: + break; + + case vcis_tid_crid_op_leaf_remove: + if (VCI_ITEMS_IN_PAGE_FOR_TID_CRID_TREE - (vci_GetBitCount(tag_bitmap) + 1) == + VCI_TID_CRID_FREESPACE_THRESHOLD) + { + vci_MakeFreeSpace((vci_ColumnRelations *) relPair, freeBlkno, &newFSBlockNumber, &newFS, false); + Assert(newFSBlockNumber == freeBlkno); + + vci_WriteRecoveryRecordForFreeSpace(relPair, + VCI_TID_CRID_COLID_DUMMY, + VCI_INVALID_DICTIONARY_ID, + newFSBlockNumber, + &newFS); + + vci_AppendFreeSpaceToLinkList((vci_ColumnRelations *) relPair, + newFSBlockNumber, + newFS.prev_pos, + newFS.next_pos, + newFS.size); + } + break; + + default: + break; + } + } + + vci_CloseTidCridRelations(relPair, lockmode); +} + +static int +CmpTidCridPairbyTID(const void *pa, const void *pb) +{ + vcis_tidcrid_pair_item_t *a = (vcis_tidcrid_pair_item_t *) pa; + vcis_tidcrid_pair_item_t *b = (vcis_tidcrid_pair_item_t *) pb; + + uint64 a_tid = vci_GetTid64FromItemPointer(&a->page_item_id); + uint64 b_tid = vci_GetTid64FromItemPointer(&b->page_item_id); + + return (a_tid < b_tid) ? -1 : ((b_tid < a_tid) ? 
1 : 0); +} + +static vcis_tidcrid_pair_item_t * +CreateTidCridUpdateListFromRosChunkStorage(RosChunkStorage *src, + int32 extentId) +{ + vcis_tidcrid_pair_item_t *dst; + int chunkId; + int ptr = 0; + uint64 crid = vci_CalcCrid64(extentId, 0); + vcis_tidcrid_pair_item_t temp; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + dst = palloc(sizeof(vcis_tidcrid_pair_item_t) * src->numTotalRows); + for (chunkId = 0; chunkId < src->numFilled; ++chunkId) + { + uint32 lId; + RosChunkBuffer *chunk = src->chunk[chunkId]; + + for (lId = 0; lId < chunk->numFilled; ++lId) + { + temp.page_item_id = *(ItemPointerData *) + &(chunk->tidData[lId * sizeof(ItemPointerData)]); + temp.crid = vci_GetCridFromUint64(crid); + dst[ptr++] = temp; + ++crid; + } + } + + qsort(dst, ptr, sizeof(vcis_tidcrid_pair_item_t), CmpTidCridPairbyTID); + + return dst; +} + +/* + * Callback structure passed to MergeTidCridUpdateListCallback + */ +typedef struct +{ + /* + * oldList: base list for the merge + */ + + /* + * Context for TID-CRID Update List + */ + vci_TidCridUpdateListContext *oldListContext; + + /* + * Current position in old list + */ + uint64 oldListContextIndex; + + /* + * Record one block from the oldListContext + */ + vcis_tidcrid_pair_item_t oldListInBlock[VCI_TID_CRID_UPDATE_PAGE_ITEMS]; + + /* + * Position of reading block + */ + BlockNumber prevOldListContextBlkno; + + /* + * addList1: add different entries to oldList (exclusively used with + * addList2) + */ + + /* + * Pair TID-CRID list + */ + vcis_tidcrid_pair_item_t *addList1; + + /* + * Maximum entries in addList1 + */ + int32 numAddList1; + + /* + * Current position in addList1 + */ + int32 addList1Index; + + /* + * addList2: add different entries to oldList (exclusively used with + * addList1) + */ + Tuplesortstate *addList2; + ItemPointerData addList2CurrentTid; + vcis_Crid addList2Crid; + bool addList2Terminated; + +} vci_MergeTidCridUpdateListContext; + +/* + * Callback function passed to WriteTidCridUpdateList() + * + * 
Merge oldList and {addList1, addList2} and outputs with TID ordering. + */ +static bool +MergeTidCridUpdateListCallback(vcis_tidcrid_pair_item_t *item, void *data) +{ + vci_MergeTidCridUpdateListContext *mergeContext = (vci_MergeTidCridUpdateListContext *) data; + bool old_entry_valid; + bool add_entry_valid; + vcis_tidcrid_pair_item_t old_item, + add_item; + +retry: + old_entry_valid = false; + add_entry_valid = false; + + if (mergeContext->addList1) + { + /* addList1 */ + if (mergeContext->addList1Index < mergeContext->numAddList1) + { + add_item = mergeContext->addList1[mergeContext->addList1Index]; + add_entry_valid = true; + } + } + else + { + /* addList2 */ + if (!mergeContext->addList2Terminated) + { + if (!ItemPointerIsValid(&mergeContext->addList2CurrentTid)) + { + Datum value; + bool isnull; + + if (tuplesort_getdatum(mergeContext->addList2, true, false, &value, &isnull, NULL)) /* copy=false: nothing palloc'd per TID */ + { + mergeContext->addList2CurrentTid = *DatumGetItemPointer(value); /* copied by value before the next fetch */ + } + else + { + mergeContext->addList2Terminated = true; + goto get_old_list; + } + } + + add_item.page_item_id = mergeContext->addList2CurrentTid; + add_item.crid = mergeContext->addList2Crid; + + add_entry_valid = true; + } + } + +get_old_list: + if (mergeContext->oldListContextIndex < mergeContext->oldListContext->count) + { + BlockNumber blkno; + + blkno = VCI_TID_CRID_UPDATE_BODY_PAGE_ID + mergeContext->oldListContextIndex / VCI_TID_CRID_UPDATE_PAGE_ITEMS; + + if (blkno != mergeContext->prevOldListContextBlkno) + { + vci_ReadOneBlockFromTidCridUpdateList(mergeContext->oldListContext, blkno, mergeContext->oldListInBlock); + mergeContext->prevOldListContextBlkno = blkno; + } + + old_item = mergeContext->oldListInBlock[mergeContext->oldListContextIndex % VCI_TID_CRID_UPDATE_PAGE_ITEMS]; + + old_entry_valid = true; + } + + if (old_entry_valid && add_entry_valid) + { + int32 res = ItemPointerCompare(&old_item.page_item_id, &add_item.page_item_id); + + if (res == 0) + { + /* + * Same TID on both sides: the added entry 
supersedes the old one + */ + mergeContext->oldListContextIndex++; + mergeContext->addList1Index++; + ItemPointerSetInvalid(&mergeContext->addList2CurrentTid); + + if (vci_GetUint64FromCrid(add_item.crid) == VCI_MOVED_CRID) + goto retry; + + *item = add_item; + } + else if (res < 0) + { + mergeContext->oldListContextIndex++; + + *item = old_item; + } + else + { + mergeContext->addList1Index++; + ItemPointerSetInvalid(&mergeContext->addList2CurrentTid); + + Assert(vci_GetUint64FromCrid(add_item.crid) != VCI_MOVED_CRID); + + *item = add_item; + } + + return true; + } + else if (old_entry_valid) + { + mergeContext->oldListContextIndex++; + + *item = old_item; + + return true; + } + else if (add_entry_valid) + { + mergeContext->addList1Index++; + ItemPointerSetInvalid(&mergeContext->addList2CurrentTid); + + Assert(vci_GetUint64FromCrid(add_item.crid) != VCI_MOVED_CRID); + + *item = add_item; + + return true; + } + else + { + return false; + } +} + +/* + * Add TID-CRID pair into the TID-CRID Update List + * + * @param[in] info main relation + * @param[in] src extent to be added + * @param[in] extentId extent id to be added + */ +void +vci_AddTidCridUpdateList(vci_MainRelHeaderInfo *info, + RosChunkStorage *src, + int32 extentId) +{ + uint32 oldSel = vci_GetMainRelVar(info, vcimrv_tid_crid_diff_sel, 0); + uint32 newSel = 1 ^ oldSel; + vci_MergeTidCridUpdateListContext mergeContext = {0}; + + Assert(VCI_FIRST_NORMAL_EXTENT_ID <= extentId); + mergeContext.oldListContext = vci_OpenTidCridUpdateList(info, oldSel); + + mergeContext.addList1 = CreateTidCridUpdateListFromRosChunkStorage(src, extentId); + mergeContext.numAddList1 = src->numTotalRows; + + mergeContext.prevOldListContextBlkno = InvalidBlockNumber; + + WriteTidCridUpdateList(info, newSel, MergeTidCridUpdateListCallback, &mergeContext); + + pfree(mergeContext.addList1); + vci_CloseTidCridUpdateList(mergeContext.oldListContext); +} + +void +vci_MergeAndWriteTidCridUpdateList(vci_MainRelHeaderInfo *info, + int newSel, int 
oldSel, + Tuplesortstate *newList, vcis_Crid crid) +{ + vci_MergeTidCridUpdateListContext mergeContext = {0}; + + mergeContext.oldListContext = vci_OpenTidCridUpdateList(info, oldSel); + + mergeContext.addList2 = newList; + ItemPointerSetInvalid(&mergeContext.addList2CurrentTid); + mergeContext.addList2Crid = crid; + + mergeContext.prevOldListContextBlkno = InvalidBlockNumber; + + WriteTidCridUpdateList(info, newSel, MergeTidCridUpdateListCallback, &mergeContext); + + vci_CloseTidCridUpdateList(mergeContext.oldListContext); +} diff --git a/contrib/vci/storage/vci_wos.c b/contrib/vci/storage/vci_wos.c new file mode 100644 index 000000000000..3e3f34c2fe2c --- /dev/null +++ b/contrib/vci/storage/vci_wos.c @@ -0,0 +1,265 @@ +/*------------------------------------------------------------------------- + * + * vci_wos.c + * Manipulate WOS + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_wos.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include + +#include "access/heapam_xlog.h" +#include "access/relscan.h" +#include "access/visibilitymap.h" +#include "access/xact.h" +#include "c.h" +#include "miscadmin.h" +#include "storage/ipc.h" +#include "storage/procarray.h" +#include "utils/memutils.h" +#include "utils/rel.h" +#include "utils/snapmgr.h" + +#include "vci.h" + +#include "vci_mem.h" +#include "vci_ros.h" +#include "vci_wos.h" +#include "vci_xact.h" + +bool HeapTupleSatisfiesWos2Ros(HeapTuple htup, Snapshot snapshot, Buffer buffer); +bool HeapTupleSatisfiesLocalRos(HeapTuple htup, Snapshot snapshot, Buffer buffer); +static bool IsXmaxHasCommitted(HeapTuple htup); + +/* Cache used by IsXmaxHasCommitted */ +static struct +{ + TransactionId xid; + bool committed; +} cachedTransactionInfo; + +/* + * vci_GetSnapshotForWos2Ros + * + * Creates a snapshot which is used for WOS->ROS and WOS->Delete vector + * conversions. 
+ * + * WOS entries are created when CRUD commands are executed, and the visibility + * check in WOS is done with the normal snapshot. + * + * ROS control commands can removes WOS entries, and the result can be seen by + * everyone as soon as the command is done. + * + * Caller must call PopActiveSnapshot() afterward. + */ +Snapshot +vci_GetSnapshotForWos2Ros(void) +{ + Snapshot snapshot; + + snapshot = vci_GetCurrentSnapshot(); + + snapshot->snapshot_type = SNAPSHOT_VCI_WOS2ROS; + + /* Clean up the cache */ + cachedTransactionInfo.xid = InvalidTransactionId; + + return snapshot; +} + +bool +HeapTupleSatisfiesWos2Ros(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + + SnapshotType temp_snapshot_type; + + temp_snapshot_type = snapshot->snapshot_type; + snapshot->snapshot_type = SNAPSHOT_MVCC; + + if (HeapTupleSatisfiesVisibility(htup, snapshot, buffer)) + { + snapshot->snapshot_type = temp_snapshot_type; + + if (IsXmaxHasCommitted(htup)) + return false; + + return true; + } + + snapshot->snapshot_type = temp_snapshot_type; + return false; +} + +static TransactionId exclusiveTransactionId; + +/* + * vci_GetSnapshotForLocalRos + * + * Creates a snapshot which is used for local ROS conversion + * + * @param[in] inclusive_xid Visible xid regardless of the MVCC snapshot + * @param[in] exclusive_xid Invisible xid regardless of the MVCC snapshot + * + * Mostly same as vci_GetSnapshotForWos2Ros(), but sometimes results by ROS + * control commands cannot be seen by MVCC. Because the transactions creating + * local ROS and ROS control commands are sometimes overlapped. 
+ */ +Snapshot +vci_GetSnapshotForLocalRos(TransactionId inclusive_xid, TransactionId exclusive_xid) +{ + Snapshot snapshot; + + snapshot = vci_GetCurrentSnapshot(); + + snapshot->snapshot_type = SNAPSHOT_VCI_LOCALROS; + + /* Removes transaction inclusive_xid from MVCC control */ + if (TransactionIdIsValid(inclusive_xid)) + { + int i; + + for (i = 0; i < snapshot->xcnt; i++) + { + if (TransactionIdEquals(snapshot->xip[i], inclusive_xid)) + { + i++; + for (; i < snapshot->xcnt; i++) + snapshot->xip[i - 1] = snapshot->xip[i]; + snapshot->xcnt--; + break; + } + } + } + + exclusiveTransactionId = exclusive_xid; + + /* Clean up the cache */ + cachedTransactionInfo.xid = InvalidTransactionId; + + return snapshot; +} + +bool +HeapTupleSatisfiesLocalRos(HeapTuple htup, Snapshot snapshot, Buffer buffer) +{ + SnapshotType temp_snapshot_type; + + /* Store away the VCI specific type and check for MVCC visibility */ + temp_snapshot_type = snapshot->snapshot_type; + snapshot->snapshot_type = SNAPSHOT_MVCC; + + if (HeapTupleSatisfiesVisibility(htup, snapshot, buffer)) + { + snapshot->snapshot_type = temp_snapshot_type; + if (IsXmaxHasCommitted(htup)) + { + TransactionId xmax; + + xmax = HeapTupleHeaderGetRawXmax(htup->t_data); + + if (TransactionIdEquals(xmax, exclusiveTransactionId)) + return true; + + return false; + } + + return true; + } + + snapshot->snapshot_type = temp_snapshot_type; + return false; +} + +/* + * Checks whether the htup has been removed + */ +static bool +IsXmaxHasCommitted(HeapTuple htup) +{ + TransactionId xmax; + bool result = false; + + if (htup->t_data->t_infomask & HEAP_XMAX_COMMITTED) + return true; + + xmax = HeapTupleHeaderGetRawXmax(htup->t_data); + + if (!TransactionIdIsValid(xmax)) + return false; + + if (htup->t_data->t_infomask & HEAP_XMAX_INVALID) + return false; + + if (TransactionIdEquals(xmax, cachedTransactionInfo.xid)) + return cachedTransactionInfo.committed; + + switch (vci_transaction_get_type(xmax)) + { + case VCI_XACT_SELF: + case 
VCI_XACT_DID_COMMIT: + result = true; + break; + + default: + break; + } + + cachedTransactionInfo.xid = xmax; + cachedTransactionInfo.committed = result; + + return result; +} + +/** + * @brief Estimates the number of items in all pages of a heap relation from + * the length of the first scanned tuple and the number of pages, assuming + * that all entries have the same size and that there are no HOT chains. + * + * @param[in] oid Oid of relation. + * @return estimated number of items; 0 if oid is invalid or no tuple is found. + */ +uint64 +vci_EstimateNumEntriesInHeapRelation(Oid oid) +{ + if (OidIsValid(oid)) + { + Relation rel; + TableScanDesc scan; + HeapTuple tuple; + uint64 result = 0; + + rel = table_open(oid, AccessShareLock); + scan = table_beginscan(rel, GetActiveSnapshot(), 0, NULL); + scan->rs_flags &= ~SO_ALLOW_PAGEMODE; + tuple = heap_getnext(scan, ForwardScanDirection); + + if (NULL != tuple) + { + BlockNumber relallvisible; + uint64 numEntriesPerPage = (BLCKSZ - SizeOfPageHeaderData) / + (tuple->t_len + sizeof(ItemIdData)); + + /* + * Estimate: (total pages - pages that visibilitymap_count() + * reports as all-visible) * maximum entries per page. + * NOTE(review): old comment called these "free" pages - confirm. 
+ */ + visibilitymap_count(rel, &relallvisible, NULL); + result = (RelationGetNumberOfBlocks(rel) - relallvisible) * numEntriesPerPage; + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + return result; + } + + return 0; +} diff --git a/contrib/vci/storage/vci_xact.c b/contrib/vci/storage/vci_xact.c new file mode 100644 index 000000000000..fe46f29bd186 --- /dev/null +++ b/contrib/vci/storage/vci_xact.c @@ -0,0 +1,146 @@ +/*------------------------------------------------------------------------- + * + * vci_xact.c + * Transaction control + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/storage/vci_xact.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/transam.h" +#include "access/xact.h" +#include "storage/lmgr.h" +#include "storage/procarray.h" +#include "miscadmin.h" + +#include "vci.h" +#include "vci_ros.h" +#include "vci_xact.h" + +/* + * Returns the status of the transaction + */ +enum vci_xact_status_kind +vci_transaction_get_type(TransactionId xid) +{ + /* + * XXX please preserve the ordering. 
+ * + * TransactionIdDidCommit() and TransactionIdDidAbort() can be used when + * TransactionIdIsInProgress() is false + */ + if (!TransactionIdIsValid(xid)) + return VCI_XACT_INVALID; + else if (xid == FrozenTransactionId) + return VCI_XACT_DID_COMMIT; + else if (TransactionIdIsCurrentTransactionId(xid)) + return VCI_XACT_SELF; + else if (TransactionIdIsInProgress(xid)) + return VCI_XACT_IN_PROGRESS; + else if (TransactionIdDidCommit(xid)) + return VCI_XACT_DID_COMMIT; + else if (TransactionIdDidAbort(xid)) + return VCI_XACT_DID_ABORT; + else + return VCI_XACT_DID_CRASH; +} + +/*==========================================================================*/ +/* Extended transaction ID generations */ +/*==========================================================================*/ + +/* + * WOS extends transaction IDs to 64 bits, called "xid64". The lower 32 bits + * are the normal xid, and the upper 32 bits hold the "generation" of the + * transaction. + * + * The generation is advanced once the xid has moved into a different quarter + * of the 32-bit xid space. Ideally this would be done only when the whole + * space has been consumed, but the current approach is simpler. + * + * vcimrv_xid_generation of the main relation stores the generation, and + * vcimrv_xid_gen_update_xid stores the 32-bit xid at which the generation was + * last advanced, i.e. the xid of the CREATE INDEX or VACUUM that set it. + * + * During VACUUM, the upper 2 bits of vcimrv_xid_gen_update_xid and of the + * current xid are compared, and the generation is advanced if they are + * different. + * + * When the index is created, the generation is 1. If a transaction older than + * the one doing CREATE INDEX refers to the index (can happen if the isolation + * level is READ COMMITTED), its generation would be 0. + */ + +/* The length of shift used for advancing generations */ +static const int xid_shift_bits = 30; + +/* + * Returns extended xid based on the given one. 
+ * + * @param[in] target_xid 32-bit xid + * @param[in] info information of the main relation + * @return 64-bit xid + */ +int64 +vci_GenerateXid64(TransactionId target_xid, vci_MainRelHeaderInfo *info) +{ + uint64 xid_gen; + TransactionId base_xid; + uint32 base_xid_upper_bits; + uint32 target_xid_upper_bits; + int32 diff; + + xid_gen = (uint64) vci_GetMainRelVar(info, vcimrv_xid_generation, 0); + base_xid = vci_GetMainRelVar(info, vcimrv_xid_gen_update_xid, 0); + + base_xid_upper_bits = ((uint32) base_xid) >> xid_shift_bits; + target_xid_upper_bits = ((uint32) target_xid) >> xid_shift_bits; + + diff = (target_xid_upper_bits - base_xid_upper_bits) << xid_shift_bits; /* 2-bit delta moved into the top bits of an int32 */ + + return (int64) (((xid_gen + (diff >> xid_shift_bits)) << 32) | (uint64) target_xid); /* shift back = signed delta (assumes arithmetic >>); generation above, xid below */ +} + +/* + * Advances the stored generation when the upper 2 bits of the current xid + * differ from those of vcimrv_xid_gen_update_xid, then stores the current xid. + * This can be called only from VACUUM, and won't be rolled back. + */ +void +vci_UpdateXidGeneration(vci_MainRelHeaderInfo *info) +{ + TransactionId cur_xid; + uint32 xid_gen; + uint32 base_xid; + uint32 cur_xid_upper_bits; + uint32 base_xid_upper_bits; + int32 diff; + + cur_xid = GetCurrentTransactionId(); + + xid_gen = (uint32) vci_GetMainRelVar(info, vcimrv_xid_generation, 0); + base_xid = vci_GetMainRelVar(info, vcimrv_xid_gen_update_xid, 0); + + base_xid_upper_bits = ((uint32) base_xid) >> xid_shift_bits; + cur_xid_upper_bits = ((uint32) cur_xid) >> xid_shift_bits; + + diff = (cur_xid_upper_bits - base_xid_upper_bits) << xid_shift_bits; /* same encoding as in vci_GenerateXid64 */ + + if (diff != 0) + { + LockRelation(info->rel, AccessExclusiveLock); /* held only while the two variables are rewritten */ + + vci_SetMainRelVar(info, vcimrv_xid_generation, 0, xid_gen + (diff >> xid_shift_bits)); + vci_SetMainRelVar(info, vcimrv_xid_gen_update_xid, 0, cur_xid); + + vci_WriteMainRelVar(info, vci_wmrv_update); + + UnlockRelation(info->rel, AccessExclusiveLock); + } +} diff --git a/contrib/vci/utils/Makefile b/contrib/vci/utils/Makefile new file mode 100644 index 000000000000..25b49d2fba89 --- /dev/null +++ b/contrib/vci/utils/Makefile 
@@ -0,0 +1,20 @@ +# contrib/vci/utils/Makefile + +SUBOBJS = \ + vci_symbols.o + +EXTRA_CLEAN = SUBSYS.o $(SUBOBJS) + +PG_CPPFLAGS = -I $(top_srcdir)/contrib/vci/include + +ifdef USE_PGXS +PGXS := $(shell pg_config --pgxs) +include $(PGXS) +else +subdir = contrib/vci/utils +top_builddir = ../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +override CFLAGS += $(CFLAGS_SL) diff --git a/contrib/vci/utils/meson.build b/contrib/vci/utils/meson.build new file mode 100644 index 000000000000..0a765df7bc1e --- /dev/null +++ b/contrib/vci/utils/meson.build @@ -0,0 +1,10 @@ +# Copyright (c) 2025, PostgreSQL Global Development Group + +vci_utils_sources = files( + 'vci_symbols.c', +) + +install_headers( + 'nodes.t', + install_dir: dir_include_extension / 'vci_utils', +) diff --git a/contrib/vci/utils/nodes.t b/contrib/vci/utils/nodes.t new file mode 100644 index 000000000000..82bbcc78673f --- /dev/null +++ b/contrib/vci/utils/nodes.t @@ -0,0 +1,448 @@ + +Item(IndexInfo ) +Item(ExprContext ) +Item(ProjectionInfo ) +Item(JunkFilter ) +Item(OnConflictSetState ) +Item(ResultRelInfo ) +Item(EState ) +Item(TupleTableSlot ) + +Item(Result ) +Item(ProjectSet ) +Item(ModifyTable ) +Item(Append ) +Item(MergeAppend ) +Item(RecursiveUnion ) +Item(BitmapAnd ) +Item(BitmapOr ) +Item(SeqScan ) +Item(SampleScan ) +Item(IndexScan ) +Item(IndexOnlyScan ) +Item(BitmapIndexScan ) +Item(BitmapHeapScan ) +Item(TidScan ) +Item(TidRangeScan ) +Item(SubqueryScan ) +Item(FunctionScan ) +Item(ValuesScan ) +Item(TableFuncScan ) +Item(CteScan ) +Item(NamedTuplestoreScan ) +Item(WorkTableScan ) +Item(ForeignScan ) +Item(CustomScan ) +Item(CustomPlanMarkPos ) +Item(NestLoop ) +Item(MergeJoin ) +Item(HashJoin ) +Item(Material ) +Item(Memoize ) +Item(Sort ) +Item(IncrementalSort ) +Item(Group ) +Item(Agg ) +Item(WindowAgg ) +Item(Unique ) +Item(Gather ) +Item(GatherMerge ) +Item(Hash ) +Item(SetOp ) +Item(LockRows ) +Item(Limit ) +Item(NestLoopParam 
) +Item(PlanRowMark ) +Item(PartitionPruneInfo ) +Item(PartitionedRelPruneInfo) +Item(PartitionPruneStepOp ) +Item(PartitionPruneStepCombine) +Item(PlanInvalItem ) + +Item(ResultState ) +Item(ProjectSetState ) +Item(ModifyTableState ) +Item(AppendState ) +Item(MergeAppendState ) +Item(RecursiveUnionState ) +Item(BitmapAndState ) +Item(BitmapOrState ) +Item(ScanState ) +Item(SeqScanState ) +Item(SampleScanState ) +Item(IndexScanState ) +Item(IndexOnlyScanState ) +Item(BitmapIndexScanState ) +Item(BitmapHeapScanState ) +Item(TidScanState ) +Item(TidRangeScanState ) +Item(SubqueryScanState ) +Item(FunctionScanState ) +Item(TableFuncScanState ) +Item(ValuesScanState ) +Item(CteScanState ) +Item(NamedTuplestoreScanState) +Item(WorkTableScanState ) +Item(ForeignScanState ) +Item(CustomScanState ) +Item(JoinState ) +Item(NestLoopState ) +Item(MergeJoinState ) +Item(HashJoinState ) +Item(MaterialState ) +Item(MemoizeState ) +Item(SortState ) +Item(IncrementalSortState ) +Item(GroupState ) +Item(AggState ) +Item(WindowAggState ) +Item(UniqueState ) +Item(GatherState ) +Item(GatherMergeState ) +Item(HashState ) +Item(SetOpState ) +Item(LockRowsState ) +Item(LimitState ) + +Item(Alias ) +Item(RangeVar ) +Item(TableFunc ) +Item(Var ) +Item(Const ) +Item(Param ) +Item(Aggref ) +Item(GroupingFunc ) +Item(WindowFunc ) +Item(SubscriptingRef ) +Item(FuncExpr ) +Item(NamedArgExpr ) +Item(OpExpr ) +Item(DistinctExpr ) +Item(NullIfExpr ) +Item(ScalarArrayOpExpr ) +Item(BoolExpr ) +Item(SubLink ) +Item(SubPlan ) +Item(AlternativeSubPlan ) +Item(FieldSelect ) +Item(FieldStore ) +Item(RelabelType ) +Item(CoerceViaIO ) +Item(ArrayCoerceExpr ) +Item(ConvertRowtypeExpr ) +Item(CollateExpr ) +Item(CaseExpr ) +Item(CaseWhen ) +Item(CaseTestExpr ) +Item(ArrayExpr ) +Item(RowExpr ) +Item(RowCompareExpr ) +Item(CoalesceExpr ) +Item(MinMaxExpr ) +Item(SQLValueFunction ) +Item(XmlExpr ) +Item(NullTest ) +Item(BooleanTest ) +Item(CoerceToDomain ) +Item(CoerceToDomainValue ) +Item(SetToDefault ) 
+Item(CurrentOfExpr ) +Item(NextValueExpr ) +Item(InferenceElem ) +Item(TargetEntry ) +Item(RangeTblRef ) +Item(JoinExpr ) +Item(FromExpr ) +Item(OnConflictExpr ) +Item(IntoClause ) + +Item(ExprState ) +Item(WindowFuncExprState ) +Item(SetExprState ) +Item(SubPlanState ) +Item(DomainConstraintState ) + +Item(PlannerInfo ) +Item(PlannerGlobal ) +Item(RelOptInfo ) +Item(IndexOptInfo ) +Item(ForeignKeyOptInfo ) +Item(ParamPathInfo ) + +Item(Path ) +Item(IndexPath ) +Item(BitmapHeapPath ) +Item(BitmapAndPath ) +Item(BitmapOrPath ) +Item(TidPath ) +Item(TidRangePath ) +Item(SubqueryScanPath ) +Item(ForeignPath ) +Item(CustomPath ) +Item(NestPath ) +Item(MergePath ) +Item(HashPath ) +Item(AppendPath ) +Item(MergeAppendPath ) +Item(GroupResultPath ) +Item(MaterialPath ) +Item(MemoizePath ) +Item(UniquePath ) +Item(GatherPath ) +Item(GatherMergePath ) +Item(ProjectionPath ) +Item(ProjectSetPath ) +Item(SortPath ) +Item(IncrementalSortPath ) +Item(GroupPath ) +Item(AggPath ) +Item(GroupingSetsPath ) +Item(MinMaxAggPath ) +Item(WindowAggPath ) +Item(SetOpPath ) +Item(RecursiveUnionPath ) +Item(LockRowsPath ) +Item(ModifyTablePath ) +Item(LimitPath ) + +Item(EquivalenceClass ) +Item(EquivalenceMember ) +Item(PathKey ) +Item(PathTarget ) +Item(RestrictInfo ) +Item(IndexClause ) +Item(PlaceHolderVar ) +Item(SpecialJoinInfo ) +Item(AppendRelInfo ) +Item(RowIdentityVarInfo ) +Item(PlaceHolderInfo ) +Item(MinMaxAggInfo ) +Item(PlannerParamItem ) +Item(RollupData ) +Item(GroupingSetData ) +Item(StatisticExtInfo ) + +Item(AllocSetContext ) +Item(SlabContext ) +Item(GenerationContext ) + +Item(Integer ) +Item(Float ) +Item(String ) +Item(BitString ) + +Item(List ) +Item(IntList ) +Item(OidList ) + +Item(ExtensibleNode ) + +Item(RawStmt ) +Item(Query ) +Item(PlannedStmt ) +Item(InsertStmt ) +Item(DeleteStmt ) +Item(UpdateStmt ) +Item(SelectStmt ) +Item(ReturnStmt ) +Item(PLAssignStmt ) +Item(AlterTableStmt ) +Item(AlterTableCmd ) +Item(AlterDomainStmt ) +Item(SetOperationStmt ) 
+Item(GrantStmt ) +Item(GrantRoleStmt ) +Item(AlterDefaultPrivilegesStmt) +Item(ClosePortalStmt ) +Item(ClusterStmt ) +Item(CopyStmt ) +Item(CreateStmt ) +Item(DefineStmt ) +Item(DropStmt ) +Item(TruncateStmt ) +Item(CommentStmt ) +Item(FetchStmt ) +Item(IndexStmt ) +Item(CreateFunctionStmt ) +Item(AlterFunctionStmt ) +Item(DoStmt ) +Item(RenameStmt ) +Item(RuleStmt ) +Item(NotifyStmt ) +Item(ListenStmt ) +Item(UnlistenStmt ) +Item(TransactionStmt ) +Item(ViewStmt ) +Item(LoadStmt ) +Item(CreateDomainStmt ) +Item(CreatedbStmt ) +Item(DropdbStmt ) +Item(VacuumStmt ) +Item(ExplainStmt ) +Item(CreateTableAsStmt ) +Item(CreateSeqStmt ) +Item(AlterSeqStmt ) +Item(VariableSetStmt ) +Item(VariableShowStmt ) +Item(DiscardStmt ) +Item(CreateTrigStmt ) +Item(CreatePLangStmt ) +Item(CreateRoleStmt ) +Item(AlterRoleStmt ) +Item(DropRoleStmt ) +Item(LockStmt ) +Item(ConstraintsSetStmt ) +Item(ReindexStmt ) +Item(CheckPointStmt ) +Item(CreateSchemaStmt ) +Item(AlterDatabaseStmt ) +Item(AlterDatabaseSetStmt ) +Item(AlterRoleSetStmt ) +Item(CreateConversionStmt ) +Item(CreateCastStmt ) +Item(CreateOpClassStmt ) +Item(CreateOpFamilyStmt ) +Item(AlterOpFamilyStmt ) +Item(PrepareStmt ) +Item(ExecuteStmt ) +Item(DeallocateStmt ) +Item(DeclareCursorStmt ) +Item(CreateTableSpaceStmt ) +Item(DropTableSpaceStmt ) +Item(AlterObjectDependsStmt ) +Item(AlterObjectSchemaStmt ) +Item(AlterOwnerStmt ) +Item(AlterOperatorStmt ) +Item(AlterTypeStmt ) +Item(DropOwnedStmt ) +Item(ReassignOwnedStmt ) +Item(CompositeTypeStmt ) +Item(CreateEnumStmt ) +Item(CreateRangeStmt ) +Item(AlterEnumStmt ) +Item(AlterTSDictionaryStmt ) +Item(AlterTSConfigurationStmt) +Item(CreateFdwStmt ) +Item(AlterFdwStmt ) +Item(CreateForeignServerStmt) +Item(AlterForeignServerStmt ) +Item(CreateUserMappingStmt ) +Item(AlterUserMappingStmt ) +Item(DropUserMappingStmt ) +Item(AlterTableSpaceOptionsStmt) +#if PG_VERSION_NUM >= 90400 +Item(AlterTableMoveAllStmt ) +#endif +Item(SecLabelStmt ) +Item(CreateForeignTableStmt ) 
+Item(ImportForeignSchemaStmt) +Item(CreateExtensionStmt ) +Item(AlterExtensionStmt ) +Item(AlterExtensionContentsStmt) +#if PG_VERSION_NUM >= 90300 +Item(CreateEventTrigStmt ) +Item(AlterEventTrigStmt ) +Item(RefreshMatViewStmt ) +#endif +#if PG_VERSION_NUM >= 90400 +Item(ReplicaIdentityStmt ) +Item(AlterSystemStmt ) +#endif +Item(CreatePolicyStmt ) +Item(AlterPolicyStmt ) +Item(CreateTransformStmt ) +Item(CreateAmStmt ) +Item(CreatePublicationStmt ) +Item(AlterPublicationStmt ) +Item(CreateSubscriptionStmt ) +Item(AlterSubscriptionStmt ) +Item(DropSubscriptionStmt ) +Item(CreateStatsStmt ) +Item(AlterCollationStmt ) +Item(CallStmt ) +Item(AlterStatsStmt ) + +Item(A_Expr ) +Item(ColumnRef ) +Item(ParamRef ) +Item(A_Const ) +Item(FuncCall ) +Item(A_Star ) +Item(A_Indices ) +Item(A_Indirection ) +Item(A_ArrayExpr ) +Item(ResTarget ) +Item(MultiAssignRef ) +Item(TypeCast ) +Item(CollateClause ) +Item(SortBy ) +Item(WindowDef ) +Item(RangeSubselect ) +Item(RangeFunction ) +Item(RangeTableSample ) +Item(RangeTableFunc ) +Item(RangeTableFuncCol ) +Item(TypeName ) +Item(ColumnDef ) +Item(IndexElem ) +Item(StatsElem ) +Item(Constraint ) +Item(DefElem ) +Item(RangeTblEntry ) +#if PG_VERSION_NUM >= 90400 +Item(RangeTblFunction ) +#endif +Item(TableSampleClause ) +#if PG_VERSION_NUM >= 90400 +Item(WithCheckOption ) +#endif +Item(SortGroupClause ) +Item(GroupingSet ) +Item(WindowClause ) +Item(ObjectWithArgs ) +Item(AccessPriv ) +Item(CreateOpClassItem ) +Item(TableLikeClause ) +Item(FunctionParameter ) +Item(LockingClause ) +Item(RowMarkClause ) +Item(XmlSerialize ) +Item(WithClause ) +Item(InferClause ) +Item(OnConflictClause ) +Item(CTESearchClause ) +Item(CTECycleClause ) +Item(CommonTableExpr ) +Item(RoleSpec ) +Item(TriggerTransition ) +Item(PartitionElem ) +Item(PartitionSpec ) +Item(PartitionBoundSpec ) +Item(PartitionRangeDatum ) +Item(PartitionCmd ) +Item(VacuumRelation ) + +Item(IdentifySystemCmd ) +Item(BaseBackupCmd ) +Item(CreateReplicationSlotCmd) 
+Item(DropReplicationSlotCmd ) +Item(StartReplicationCmd ) +Item(TimeLineHistoryCmd ) + +Item(TriggerData ) +Item(EventTriggerData ) +Item(ReturnSetInfo ) +Item(WindowObjectData ) +Item(TIDBitmap ) +Item(InlineCodeBlock ) +Item(FdwRoutine ) +Item(IndexAmRoutine ) +Item(TableAmRoutine ) +Item(TsmRoutine ) +Item(ForeignKeyCacheInfo ) +Item(CallContext ) +Item(SupportRequestSimplify ) +Item(SupportRequestSelectivity) +Item(SupportRequestCost ) +Item(SupportRequestRows ) +Item(SupportRequestIndexCondition) diff --git a/contrib/vci/utils/vci_symbols.c b/contrib/vci/utils/vci_symbols.c new file mode 100644 index 000000000000..e2c676329480 --- /dev/null +++ b/contrib/vci/utils/vci_symbols.c @@ -0,0 +1,48 @@ +/*------------------------------------------------------------------------- + * + * vci_symbols.c + * Converts a string from a node tag + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/utils/vci_symbols.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "nodes/nodes.h" + +#include "vci.h" +#include "vci_utils.h" + +#define Item(X) {T_ ## X, # X}, + +typedef struct +{ + NodeTag type; + const char *name; +} node_info_t; + +static const node_info_t node_info_table[] = { +#include "nodes.t" +}; + +#undef Item + +/* + * Returns a literal from a node + * + * XXX This is used for debugging or error reporting purposes. Performance is + * ignored for now, the linear search is used. 
+ */ +const char * +VciGetNodeName(NodeTag type) +{ + for (int i = 0; i < lengthof(node_info_table); i++) + if (node_info_table[i].type == type) + return node_info_table[i].name; + + return "Unknown"; +} diff --git a/contrib/vci/vci--1.0.sql b/contrib/vci/vci--1.0.sql new file mode 100644 index 000000000000..4ba6c2d416ee --- /dev/null +++ b/contrib/vci/vci--1.0.sql @@ -0,0 +1,76 @@ +CREATE FUNCTION vci_handler(internal) +RETURNS index_am_handler +AS 'MODULE_PATHNAME' +LANGUAGE C VOLATILE STRICT; + +CREATE ACCESS METHOD vci TYPE index HANDLER vci_handler; + +CREATE OPERATOR CLASS bool_ops DEFAULT FOR TYPE bool USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS bytea_ops DEFAULT FOR TYPE bytea USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS char_ops DEFAULT FOR TYPE "char" USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS name_ops DEFAULT FOR TYPE name USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS int8_ops DEFAULT FOR TYPE int8 USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS int2_ops DEFAULT FOR TYPE int2 USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS int4_ops DEFAULT FOR TYPE int4 USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS text_ops DEFAULT FOR TYPE text USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS float4_ops DEFAULT FOR TYPE float4 USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS float8_ops DEFAULT FOR TYPE float8 USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS money_ops DEFAULT FOR TYPE money USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS bpchar_ops DEFAULT FOR TYPE bpchar USING vci AS OPERATOR 1 =; +-- CREATE OPERATOR CLASS varchar_ops DEFAULT FOR TYPE varchar USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS date_ops DEFAULT FOR TYPE date USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS time_ops DEFAULT FOR TYPE time USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS timestamp_ops DEFAULT FOR TYPE timestamp USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS timestamptz_ops DEFAULT FOR TYPE timestamptz USING vci AS OPERATOR 1 =; +CREATE 
OPERATOR CLASS interval_ops DEFAULT FOR TYPE interval USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS timetz_ops DEFAULT FOR TYPE timetz USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS bit_ops DEFAULT FOR TYPE bit USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS varbit_ops DEFAULT FOR TYPE varbit USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS numeric_ops DEFAULT FOR TYPE numeric USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS uuid_ops DEFAULT FOR TYPE uuid USING vci AS OPERATOR 1 =; +CREATE OPERATOR CLASS tid_ops DEFAULT FOR TYPE tid USING vci AS OPERATOR 1 =; +-- CREATE OPERATOR CLASS nvarchar_ops DEFAULT FOR TYPE nvarchar USING vci AS OPERATOR 1 =; + +CREATE FUNCTION vci_check_supported_functions() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C STABLE STRICT; + +CREATE FUNCTION vci_check_supported_types() +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE C STABLE STRICT; + +CREATE FUNCTION vci_index_size(IN vci_index_name text, OUT size int8) +AS 'MODULE_PATHNAME' +LANGUAGE C VOLATILE STRICT; + +CREATE FUNCTION vci_enable() RETURNS void AS $$ + BEGIN + SET vci.enable = on; + END +$$ LANGUAGE plpgsql; + +CREATE FUNCTION vci_disable() RETURNS void AS $$ + BEGIN + SET vci.enable = off; + END +$$ LANGUAGE plpgsql; + +CREATE FUNCTION vci_runs_in_query() +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C STABLE STRICT; + +CREATE FUNCTION vci_runs_in_plan() +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C VOLATILE STRICT; + +CREATE FUNCTION vci_always_return_true() +RETURNS bool +AS 'MODULE_PATHNAME' +LANGUAGE C VOLATILE STRICT; + +SELECT vci_check_supported_functions(); +SELECT vci_check_supported_types(); diff --git a/contrib/vci/vci.conf b/contrib/vci/vci.conf new file mode 100644 index 000000000000..70f597c1ce36 --- /dev/null +++ b/contrib/vci/vci.conf @@ -0,0 +1,8 @@ +shared_preload_libraries = 'vci' +max_worker_processes = 20 +vci.table_rows_threshold = 0 +vci.cost_threshold = 0 +vci.enable_ros_control_daemon = true +vci.control_naptime = 60s +log_min_messages 
= debug2 +autovacuum = off diff --git a/contrib/vci/vci.control b/contrib/vci/vci.control new file mode 100644 index 000000000000..0863f8dc503a --- /dev/null +++ b/contrib/vci/vci.control @@ -0,0 +1,5 @@ +# vci extension +comment = 'vertical clustered index' +default_version = '1.0' +module_pathname = '$libdir/vci' +relocatable = false diff --git a/contrib/vci/vci_main.c b/contrib/vci/vci_main.c new file mode 100644 index 000000000000..564e4a2be1e9 --- /dev/null +++ b/contrib/vci/vci_main.c @@ -0,0 +1,183 @@ +/*------------------------------------------------------------------------- + * + * vci_main.c + * VCI main file + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/vci_main.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam_xlog.h" +#include "access/transam.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/dependency.h" +#include "catalog/index.h" +#include "commands/tablecmds.h" +#include "common/file_utils.h" +#include "executor/nodeModifyTable.h" +#include "fmgr.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "nodes/pg_list.h" +#include "storage/ipc.h" +#include "storage/smgr.h" +#include "utils/guc.h" +#include "utils/rel.h" +#include "utils/relcache.h" +#include "utils/varlena.h" + +#include "vci.h" +#include "vci_mem.h" +#include "vci_ros.h" +#include "vci_ros_daemon.h" + +static void vci_xact_callback(XactEvent event, void *arg); +static void vci_subxact_callback(SubXactEvent event, SubTransactionId mySubid, SubTransactionId parentSubid, void *arg); + +PG_MODULE_MAGIC_EXT( + .name = "vci", + .version = PG_VERSION +); + +/* saved hook value in case of unload */ +/** + * Commands which re-index VCI. 
+ */
+vci_RebuildCommand vci_rebuild_command = vcirc_invalid;
+
+ProcessUtility_hook_type process_utility_prev = NULL;
+
+static shmem_request_hook_type prev_shmem_request_hook = NULL;
+static void vci_shmem_request(void);
+
+/**
+ * _PG_init: module entry point; defines the GUCs and installs the VCI hooks.
+ * Raises ERROR unless loaded through shared_preload_libraries.
+ */
+void
+_PG_init(void)
+{
+	pg_bindtextdomain(TEXTDOMAIN);
+
+	if (!process_shared_preload_libraries_in_progress)
+	{
+		ereport(ERROR,
+				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
+				 errmsg("\"%s\" must be registered in shared_preload_libraries", VCI_STRING)));
+		return;					/* LCOV_EXCL_LINE */
+	}
+
+	vci_read_guc_variables();
+
+	if (!IsPostmasterEnvironment)
+	{
+		VciGuc.enable = false;
+		VciGuc.enable_ros_control_daemon = false;
+	}
+
+	vci_setup_shmem();
+
+	vci_setup_executor_hook();
+
+	/* register the ProcessUtility hook, remembering the previous one */
+	process_utility_prev = ProcessUtility_hook;
+	ProcessUtility_hook = vci_process_utility;
+
+	/* register the VCI functions in the custom hooks */
+	add_index_delete_hook = vci_add_index_delete;
+	add_should_index_insert_hook = vci_add_should_index_insert;
+	add_drop_relation_hook = vci_add_drop_relation;
+	add_reindex_index_hook = vci_add_reindex_index;
+	add_skip_vci_index_hook = vci_add_skip_vci_index;
+	add_alter_tablespace_hook = vci_add_alter_tablespace;
+	add_alter_table_change_owner_hook = vci_alter_table_change_owner;
+	add_alter_table_change_schema_hook = vci_alter_table_change_schema;
+	add_snapshot_satisfies_hook = VCITupleSatisfiesVisibility;
+	add_skip_vacuum_hook = vci_isVciAdditionalRelation;
+
+	/* In single-user mode, skip the setup for parallel execution. */
+	if (IsPostmasterEnvironment)
+	{
+		if (!IsUnderPostmaster)
+		{
+#ifdef WIN32
+			struct stat st;
+			const char *dir_name = "base/" PG_TEMP_FILES_DIR;
+
+			if (stat(dir_name, &st) == 0)
+			{
+				if (!S_ISDIR(st.st_mode))
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("\"%s\" is not directory", dir_name)));
+			}
+			else
+			{
+				if (errno == ENOENT)
+				{
+					if (mkdir(dir_name, S_IRWXU) < 0)
+						ereport(ERROR,
+								(errcode_for_file_access(),
+								 errmsg("could not create directory \"%s\": %m",
+										dir_name)));
+				}
+				else
+					ereport(ERROR,
+							(errcode_for_file_access(),
+							 errmsg("could not stat directory \"%s\": %m",
+									dir_name)));
+			}
+#endif
+
+			/* Register ROS Control Daemon */
+			vci_ROS_control_daemon_setup();
+		}
+	}
+	else
+		vci_shmem_startup_routine();	/* NOTE(review): LWLock tranches are requested only by the hook set below; confirm this early call is safe */
+
+	RegisterXactCallback(vci_xact_callback, NULL);
+	RegisterSubXactCallback(vci_subxact_callback, NULL);
+
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = vci_shmem_request;
+
+}
+
+static void
+vci_shmem_request(void)
+{
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+
+	/* Register LWLocks used by VCI */
+	RequestNamedLWLockTranche("VciStandbyExec", 1);
+	RequestNamedLWLockTranche("VciIOLoad", 1);
+	RequestNamedLWLockTranche("VciMemoryEntries", 1);
+	RequestNamedLWLockTranche("VciQueryContext", 1);
+	RequestNamedLWLockTranche("VciMntpoint2dev", 1);
+}
+
+/*
+ * Callback function for COMMIT/ABORT/PREPARE operations.
+ */
+static void
+vci_xact_callback(XactEvent event, void *arg)
+{
+	vci_xact_change_handler(event);
+}
+
+/*
+ * Callback function for subxact operations.
+ */
+static void
+vci_subxact_callback(SubXactEvent event, SubTransactionId mySubid, SubTransactionId parentSubid, void *arg)
+{
+	vci_subxact_change_handler(event, mySubid);
+}
diff --git a/contrib/vci/vci_read_guc.c b/contrib/vci/vci_read_guc.c
new file mode 100644
index 000000000000..6b95108824dc
--- /dev/null
+++ b/contrib/vci/vci_read_guc.c
@@ -0,0 +1,422 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_read_guc.c
+ *	  GUC parameter settings
+ *
+ * Portions Copyright (c) 2025, PostgreSQL Global Development Group
+ *
+ * IDENTIFICATION
+ *	  contrib/vci/vci_read_guc.c
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <limits.h>
+
+#include "miscadmin.h"
+#include "postmaster/postmaster.h"
+#include "storage/procnumber.h"
+#include "utils/guc.h"
+#include "utils/guc_tables.h"
+#include "utils/palloc.h"
+
+#include "vci.h"
+
+#include "vci_executor.h"
+#include "vci_mem.h"
+
+/* GUC parameter holder */
+VciGucStruct VciGuc;
+
+static void check_max_worker_processes(void);
+
+static struct config_bool VciConfigureNamesBool[] =
+{
+	/* for internal use */
+	{
+		{
+			"vci.enable",
+			PGC_USERSET, RESOURCES_MEM,
+			"Enables VCI.",
+			NULL,
+		},
+		&VciGuc.enable,
+		true,
+		NULL, NULL, NULL,
+	},
+
+	{
+		{
+			"vci.log_query",
+			PGC_USERSET, RESOURCES_MEM,
+			"Logs information when a query fails to be executed by VCI.",
+			NULL,
+		},
+		&VciGuc.log_query,
+		false,
+		NULL, NULL, NULL,
+	},
+
+	{
+		{
+			"vci.enable_seqscan",
+			PGC_USERSET, DEVELOPER_OPTIONS,
+			"Enables VCI planner to replace sequential-scan plans.",
+			NULL,
+		},
+		&VciGuc.enable_seqscan,
+		true,
+		NULL, NULL, NULL,
+	},
+
+	{
+		{
+			"vci.enable_indexscan",
+			PGC_USERSET, DEVELOPER_OPTIONS,
+			"Enables VCI planner to replace index-scan plans.",
+			NULL,
+		},
+		&VciGuc.enable_indexscan,
+		true,
+		NULL, NULL, NULL,
+	},
+
+	{
+		{
+			"vci.enable_bitmapheapscan",
+			PGC_USERSET, DEVELOPER_OPTIONS,
+			"Enables VCI planner to 
replace bitmap-scan plans.", + NULL, + }, + &VciGuc.enable_bitmapheapscan, + true, + NULL, NULL, NULL, + }, + + { + { + "vci.enable_sort", + PGC_USERSET, DEVELOPER_OPTIONS, + "Enables VCI planner to replace sort plans.", + NULL, + }, + &VciGuc.enable_sort, + true, + NULL, NULL, NULL, + }, + + { + { + "vci.enable_hashagg", + PGC_USERSET, DEVELOPER_OPTIONS, + "Enables VCI planner to replace hashed aggregation plans.", + NULL, + }, + &VciGuc.enable_hashagg, + true, + NULL, NULL, NULL, + }, + + { + { + "vci.enable_sortagg", + PGC_USERSET, DEVELOPER_OPTIONS, + "Enables VCI planner to replace sorted aggregation plans.", + NULL, + }, + &VciGuc.enable_sortagg, + true, + NULL, NULL, NULL, + }, + + { + { + "vci.enable_plainagg", + PGC_USERSET, DEVELOPER_OPTIONS, + "Enables VCI planner to replace plain aggregation plans.", + NULL, + }, + &VciGuc.enable_plainagg, + true, + NULL, NULL, NULL, + }, + + { + { + "vci.enable_hashjoin", + PGC_USERSET, DEVELOPER_OPTIONS, + "Enables VCI planner to replace hash join plans.", + NULL, + }, + &VciGuc.enable_hashjoin, + false, + NULL, NULL, NULL, + }, + + { + { + "vci.enable_nestloop", + PGC_USERSET, DEVELOPER_OPTIONS, + "Enables VCI planner to replace nested-loop plans.", + NULL, + }, + &VciGuc.enable_nestloop, + false, + NULL, NULL, NULL, + }, + + { + { + "vci.enable_ros_control_daemon", + PGC_POSTMASTER, RESOURCES_MEM, + "Enables the VCI ROS Control Daemon.", + NULL, + }, + &VciGuc.enable_ros_control_daemon, + false, + NULL, NULL, NULL, + }, + +}; + +static struct config_int VciConfigureNamesInt[] = +{ + { + { + "vci.cost_threshold", + PGC_USERSET, RESOURCES_MEM, + "Sets the threshold CPU load beyond which the VCI control worker is stopped.", + NULL, + }, + &VciGuc.cost_threshold, + 18000, 0, INT_MAX, + NULL, NULL, NULL, + }, + + { + { + "vci.maintenance_work_mem", + PGC_SIGHUP, RESOURCES_MEM, + "Sets the maximum memory to be used by each control worker for VCI control operations.", + NULL, + GUC_UNIT_KB, + }, + 
&VciGuc.maintenance_work_mem, + 256 * 1024, 1024, MAX_KILOBYTES, + NULL, NULL, NULL, + }, + + /* **************************************** */ + /* ROS Control Daemon/Worker configurations */ + /* **************************************** */ + + /* Daemon setup */ + { + { + "vci.control_max_workers", + PGC_POSTMASTER, RESOURCES_IO, + "Sets the maximum number of simultaneously running VCI control worker processes.", + NULL, + }, + &VciGuc.control_max_workers, + 8, 1, MAX_BACKENDS, + NULL, NULL, NULL, + }, + + { + { + "vci.control_naptime", + PGC_SIGHUP, RESOURCES_IO, + "Time to sleep between VCI control worker runs.", + NULL, + GUC_UNIT_S, + }, + &VciGuc.control_naptime, + 1, 1, INT_MAX / 1000, + NULL, NULL, NULL, + }, + + /* Worker : ROS control command thresholds */ + + { + { + "vci.wosros_conv_threshold", + PGC_SIGHUP, RESOURCES_MEM, + "Sets the threshold of Data WOS rows to execute WOS->ROS conversion.", + NULL, + }, + &VciGuc.wosros_conv_threshold, + 256 * 1024, 1, INT_MAX, + NULL, NULL, NULL, + }, + + { + { + "vci.cdr_threshold", + PGC_SIGHUP, RESOURCES_MEM, + "Sets the threshold of deleted rows in ROS to execute collect-deleted-rows command.", + NULL, + }, + &VciGuc.cdr_threshold, + 128 * 1024, 1, INT_MAX, + NULL, NULL, NULL, + }, + + /******************************************/ + /* Custom Plan Execution */ + /******************************************/ + + { + { + "vci.max_local_ros", + PGC_USERSET, RESOURCES_MEM, + "Sets the maximum local ROS memory.", + NULL, + GUC_UNIT_KB, + }, + &VciGuc.max_local_ros_size, + 64 * 1024, 64 * 1024, INT_MAX, + NULL, NULL, NULL, + }, + + { + { + "vci.table_rows_threshold", + PGC_USERSET, DEVELOPER_OPTIONS, + "Sets the threshold of table rows to execute VCI Scan.", + NULL, + }, + &VciGuc.table_rows_threshold, + VCI_MAX_FETCHING_ROWS, 0, INT_MAX, + NULL, NULL, NULL, + }, + +}; + +static const struct config_enum_entry table_scan_policy_options[] = { + + {"column store only", VCI_TABLE_SCAN_POLICY_COLUMN_ONLY, false}, + {"column 
only", VCI_TABLE_SCAN_POLICY_COLUMN_ONLY, true}, + {"none", VCI_TABLE_SCAN_POLICY_NONE, false}, + {NULL, 0, false} +}; + +static struct config_enum VciConfigureNamesEnum[] = +{ + { + { + "vci.table_scan_policy", + PGC_USERSET, DEVELOPER_OPTIONS, + "Sets the policy that a scan node reads from the column store table(VCI index) or the row store table(original).", + NULL, + }, + &VciGuc.table_scan_policy, + VCI_TABLE_SCAN_POLICY_COLUMN_ONLY, + table_scan_policy_options, + NULL, NULL, NULL + } +}; + +/* + * Set GUC parameters + */ +void +vci_read_guc_variables(void) +{ + int i; + + /* + * TODO: Raise warnings or set parameters to default, when the specified + * value is out-of-range. + */ + for (i = 0; i < (int) lengthof(VciConfigureNamesBool); i++) + { + struct config_bool *conf = &VciConfigureNamesBool[i]; + + if (IsPostmasterEnvironment) + DefineCustomBoolVariable(conf->gen.name, + conf->gen.short_desc, + conf->gen.long_desc, + conf->variable, + conf->boot_val, + conf->gen.context, + conf->gen.flags, + conf->check_hook, + conf->assign_hook, + conf->show_hook); + else + *(conf->variable) = conf->boot_val; + + } + + for (i = 0; i < (int) lengthof(VciConfigureNamesInt); i++) + { + struct config_int *conf = &VciConfigureNamesInt[i]; + + if (IsPostmasterEnvironment) + DefineCustomIntVariable(conf->gen.name, + conf->gen.short_desc, + conf->gen.long_desc, + conf->variable, + conf->boot_val, + conf->min, + conf->max, + conf->gen.context, + conf->gen.flags, + conf->check_hook, + conf->assign_hook, + conf->show_hook); + else + *(conf->variable) = conf->boot_val; + } + + /* FIXME: Add initial value to pass Assert() */ + VciGuc.table_scan_policy = VCI_TABLE_SCAN_POLICY_COLUMN_ONLY; + + for (i = 0; i < (int) lengthof(VciConfigureNamesEnum); i++) + { + struct config_enum *conf = &VciConfigureNamesEnum[i]; + + if (IsPostmasterEnvironment) + DefineCustomEnumVariable(conf->gen.name, + conf->gen.short_desc, + conf->gen.long_desc, + conf->variable, + conf->boot_val, + conf->options, + 
conf->gen.context, + conf->gen.flags, + conf->check_hook, + conf->assign_hook, + conf->show_hook); + else + *(conf->variable) = conf->boot_val; + } + + VciGuc.have_loaded_postgresql_conf = true; + + check_max_worker_processes(); +} + +/* + * Check for max_worker_processes + */ +static void +check_max_worker_processes(void) +{ + int num_needed_workers; + + num_needed_workers = 1 + VciGuc.control_max_workers; /* ros control daemon & + * workers */ + num_needed_workers += 1; /* parallel control daemon */ + + if (num_needed_workers > MAX_BACKENDS) + num_needed_workers = MAX_BACKENDS; + + if (max_worker_processes < num_needed_workers) + ereport(ERROR, + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg(VCI_STRING " needs to set at least %d to \"max_worker_processes\"", + num_needed_workers))); +} diff --git a/contrib/vci/vci_shmem.c b/contrib/vci/vci_shmem.c new file mode 100644 index 000000000000..53834466a521 --- /dev/null +++ b/contrib/vci/vci_shmem.c @@ -0,0 +1,206 @@ +/*------------------------------------------------------------------------- + * + * vci_shmem.c + * Managing shared memory + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/vci_shmem.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "storage/ipc.h" + +#include "vci.h" +#include "vci_mem.h" +#include "vci_ros_daemon.h" + +/* + * Pointer to fixed-position shared memory area + */ +VciShmemStruct *VciShmemAddr; + +/* Saved hook value */ +static shmem_startup_hook_type shmem_startup_prev = NULL; + +static shmem_request_hook_type prev_shmem_request_hook = NULL; +static void vci_shmem_request(void); + +/* + * Setup shmem_startup_hook + */ +void +vci_setup_shmem(void) +{ + Assert(VciGuc.have_loaded_postgresql_conf); + + prev_shmem_request_hook = shmem_request_hook; + shmem_request_hook = vci_shmem_request; + + shmem_startup_prev = 
shmem_startup_hook;
+	shmem_startup_hook = vci_shmem_startup_routine;
+}
+
+/*
+ * Request shared memory for every area vci_shmem_startup_routine() allocates
+ */
+static void
+vci_shmem_request(void)
+{
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+
+	RequestAddinShmemSpace(sizeof(VciShmemStruct));
+	RequestAddinShmemSpace(vci_GetSizeOfMemoryEntries());
+
+	/* Worker argument slots and per-device memory entry queues */
+	RequestAddinShmemSpace(sizeof(vci_wosros_conv_worker_arg_t) * VciGuc.control_max_workers);
+	RequestAddinShmemSpace(sizeof(vci_memory_entry_list_t) * (VciGuc.max_devices + 1));
+
+	/*
+	 * The + 1 is for wos->ros conversion of vci assigned to unmonitored
+	 * devices. vci_devload_t is allocated per device. It monitors and stores
+	 * the IO load, holds a set of vci on the device, and is used to determine
+	 * which vci to perform wos->ros conversion. Since only a fixed number of
+	 * vci_devload_t type values are prepared, if more devices are added and
+	 * exceed the number, they fall outside the management scope for wos->ros
+	 * conversion. To ensure that no vci is left without wos->ros conversion,
+	 * a vci_devload_t type value is prepared to store the set of vci on
+	 * devices outside the management scope, and it is handled like other
+	 * vci_devload_t values to convert vci. This is the area allocated by + 1.
+	 * In order to be treated in the program similarly to the device being
+	 * monitored, the value used to determine whether to convert vci on that
+	 * device should be set appropriately as a device for conversion. Such
+	 * devices are processed as if they are collectively one device, so the
+	 * conversion frequency becomes lower.
+	 */
+	RequestAddinShmemSpace(sizeof(vci_devload_t) * (VciGuc.max_devices + 1));
+}
+
+/**
+ * Initialize the single devload entry (unmonitored device) and its queue
+ */
+static void
+vci_Initialize_devload_info(void)
+{
+	vci_devload_t *dl_not_monitored;
+	vci_memory_entry_list_t *list;
+
+	LWLockAcquire(VciShmemAddr->io_load_lock, LW_EXCLUSIVE);
+
+	dlist_init(&(VciShmemAddr->free_memory_entry_queue_list));
+
+	/* OSS has no loop for monitored devices: just init for [0] */
+	Assert(VciShmemAddr->max_devices == 0);
+	list = &(VciShmemAddr->memory_entry_queue_array[0]);
+	dlist_init(&(list->head));
+	dlist_push_tail(&(VciShmemAddr->free_memory_entry_queue_list), &(list->link));
+
+	/* Setup for unmonitored device */
+	dl_not_monitored = &(VciShmemAddr->devload_array[0]);
+	strcpy(dl_not_monitored->devname, VCI_PSEUDO_UNMONITORED_DEVICE);
+	list = dlist_container(vci_memory_entry_list_t, link, dlist_pop_head_node(&(VciShmemAddr->free_memory_entry_queue_list)));
+	dl_not_monitored->memory_entry_queue = list;
+
+	/* OSS has just 1 devload_info */
+	VciShmemAddr->num_devload_info = 1;
+
+	LWLockRelease(VciShmemAddr->io_load_lock);
+}
+
+/*
+ * Initialize VCI shared memory; installed as shmem_startup_hook
+ */
+void
+vci_shmem_startup_routine(void)
+{
+	bool found;
+
+	VciGuc.max_devices = 0;
+
+	if (IsPostmasterEnvironment)
+	{
+		if (shmem_startup_prev)
+			shmem_startup_prev();
+
+		VciShmemAddr =
+			(VciShmemStruct *) ShmemInitStruct("vci: shared memory", sizeof(VciShmemStruct), &found);
+		Assert(VciShmemAddr != NULL);
+
+#ifdef WIN32
+		if (IsUnderPostmaster)
+		{
+			/** Later process is only necessary in Postmaster,
+			 * so child process processing ends here
+			 */
+			return;
+		}
+#endif
+
+		/*
+		 * Prepare one vci_wosros_conv_worker_arg_t per worker. This area
+		 * is used to pass parameters from the ROS daemon to the worker
+		 * that actually does the conversion.
+		 * Note: the minimum value of control_max_workers is 1, so the
+		 * allocation size is never 0.
+		 */
+		VciShmemAddr->worker_args_array =
+			ShmemInitStruct("vci: arguments for workers ", sizeof(vci_wosros_conv_worker_arg_t) * VciGuc.control_max_workers, &found);
+		Assert(VciShmemAddr->worker_args_array != NULL);
+
+		VciShmemAddr->memory_entries =
+			ShmemInitStruct("vci: memory entries", vci_GetSizeOfMemoryEntries(), &found);
+		Assert(VciShmemAddr->memory_entries != NULL);
+
+		/*
+		 * + 1 for non-monitored devices: ramfs and the ones that cannot be
+		 * observed because of space limitation
+		 */
+		VciShmemAddr->devload_array =
+			ShmemInitStruct("vci: io load watch", sizeof(vci_devload_t) * (VciGuc.max_devices + 1), &found);
+		Assert(VciShmemAddr->devload_array != NULL);
+		VciShmemAddr->memory_entry_queue_array =
+			ShmemInitStruct("vci: memory entry queue", sizeof(vci_memory_entry_list_t) * (VciGuc.max_devices + 1), &found);
+		Assert(VciShmemAddr->memory_entry_queue_array != NULL);
+	}
+	else
+	{
+		VciShmemAddr = calloc(1, sizeof(VciShmemStruct));
+		if (VciShmemAddr == NULL)
+			ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
+		VciShmemAddr->worker_args_array = calloc(VciGuc.control_max_workers, sizeof(vci_wosros_conv_worker_arg_t));
+		VciShmemAddr->memory_entries = calloc(1, vci_GetSizeOfMemoryEntries());
+		VciShmemAddr->devload_array = calloc(VciGuc.max_devices + 1, sizeof(vci_devload_t));
+		VciShmemAddr->memory_entry_queue_array = calloc(VciGuc.max_devices + 1, sizeof(vci_memory_entry_list_t));
+		if (!VciShmemAddr->worker_args_array || !VciShmemAddr->memory_entries ||
+			!VciShmemAddr->devload_array || !VciShmemAddr->memory_entry_queue_array)
+			ereport(FATAL, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory")));
+	}
+
+	/*
+	 * Standby server execution control
+	 */
+	VciShmemAddr->standby_exec_loc = &(GetNamedLWLockTranche("VciStandbyExec"))->lock;
+
+	/*
+	 * Set the number of monitorable devices and initialize lock for IO load
+	 * monitoring
+	 */
+	VciShmemAddr->max_devices = VciGuc.max_devices;
+	
VciShmemAddr->io_load_lock = &(GetNamedLWLockTranche("VciIOLoad"))->lock;
+
+	/* Additional LWLocks Initialization */
+	VciShmemAddr->vci_memory_entries_lock = &(GetNamedLWLockTranche("VciMemoryEntries"))->lock;
+	VciShmemAddr->vci_query_context_lock = &(GetNamedLWLockTranche("VciQueryContext"))->lock;
+	VciShmemAddr->vci_mnt_point2dev_lock = &(GetNamedLWLockTranche("VciMntpoint2dev"))->lock;
+
+	/* initialize the lists of vci_devload_t */
+	vci_Initialize_devload_info();
+
+	/* Initialize vci-memory-entries */
+	vci_InitMemoryEntries();
+}
diff --git a/contrib/vci/vci_supported_funcs.c b/contrib/vci/vci_supported_funcs.c
new file mode 100644
index 000000000000..65c61d02d113
--- /dev/null
+++ b/contrib/vci/vci_supported_funcs.c
@@ -0,0 +1,855 @@
+/*-------------------------------------------------------------------------
+ *
+ * vci_supported_funcs.c
+ *	  Functions supported by VCI that can be called through FuncExpr
+ *
+ * vci_supported_func_table[] is created with the following SQL and then examined individually.
+ *
+ *   SELECT oid, proname FROM pg_proc WHERE prokind = 'f' AND NOT proretset
+ *     AND NOT EXISTS (SELECT funcoid FROM sys_func_table WHERE pg_proc.oid = sys_func_table.funcoid)
+ *     AND (SELECT bool_and(i IN (SELECT typeoid FROM safe_types)) FROM unnest(array_prepend(prorettype, proargtypes)) AS t(i))
+ *     AND oid < 16384 ORDER BY oid;
+ *
+ * - prokind = 'f' is to include only normal functions (e.g. exclude aggregate functions and window functions).
+ * - NOT proretset is to exclude set-returning functions (SRFs)
+ * - NOT EXISTS (SELECT ...) is to exclude system related functions
+ * - (SELECT bool_and( ...) is to exclude the appearance of unauthorized types in return values and arguments.
+ * - oid < 16384 is to exclude user-defined functions
+ *
+ * For sys_func_table and safe_types, refer to vci_supported_funcs.sql.
+ * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/vci_supported_funcs.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup.h" +#include "access/htup_details.h" +#include "access/transam.h" +#include "c.h" +#include "catalog/pg_namespace.h" +#include "catalog/pg_proc.h" /* for ProcedureRelationId, Form_pg_proc */ +#include "catalog/pg_type.h" +#include "fmgr.h" +#include "utils/elog.h" +#include "utils/memutils.h" +#include "utils/relcache.h" +#include "utils/syscache.h" + +#include "vci.h" +#include "vci_mem.h" +#include "vci_supported_oid.h" + +/** + * Smallest OID among functions supported by VCI + */ +#define VCI_SUPPORTED_FUNC_MIN (77) + +/** + * Biggest OID among functions supported by VCI + */ +#define VCI_SUPPORTED_FUNC_MAX (6204) + +/** + * Data to record special user defined functions + */ +vci_special_udf_info_t vci_special_udf_info; + +/** + * Array of information about functions supported by VCI + * Note when modifying vci_supported_func_table array: + * 1.OIDs are in ascending order. + * 2.Don't forget to change the macro value of VCI_SUPPORTED_FUNC_MIN/VCI_SUPPORTED_FUNC_MAX. 
+ */ +static const struct +{ + Oid oid; + const char *name; + bool is_support; +} vci_supported_func_table[] = { + {77, "int4", true}, /* immutable, internal(12) {char,int4} */ + {78, "char", true}, /* immutable, internal(12) {char,int4} */ + {89, "version", false}, /* stable, internal(12) {text} */ + {228, "dround", true}, /* immutable, internal(12) {float8,float8} */ + {229, "dtrunc", true}, /* immutable, internal(12) {float8,float8} */ + {233, "dexp", true}, /* immutable, internal(12) {float8,float8} */ + {234, "dlog1", true}, /* immutable, internal(12) {float8,float8} */ + {235, "float8", true}, /* immutable, internal(12) {int2,float8} */ + {236, "float4", true}, /* immutable, internal(12) {int2,float4} */ + {237, "int2", true}, /* immutable, internal(12) {int2,float8} */ + {238, "int2", true}, /* immutable, internal(12) {int2,float4} */ + {274, "timeofday", true}, /* volatile, internal(12) {text} */ + {311, "float8", true}, /* immutable, internal(12) {float4,float8} */ + {312, "float4", true}, /* immutable, internal(12) {float4,float8} */ + {313, "int4", true}, /* immutable, internal(12) {int2,int4} */ + {314, "int2", true}, /* immutable, internal(12) {int2,int4} */ + {316, "float8", true}, /* immutable, internal(12) {int4,float8} */ + {317, "int4", true}, /* immutable, internal(12) {int4,float8} */ + {318, "float4", true}, /* immutable, internal(12) {int4,float4} */ + {319, "int4", true}, /* immutable, internal(12) {int4,float4} */ + {320, "width_bucket", true}, /* immutable, internal(12) + * {int4,int4,float8,float8,float8} */ + {376, "string_to_array", false}, /* immutable, internal(12) + * {text,text,text,_text} */ + {384, "array_to_string", false}, /* stable, internal(12) + * {text,text,text,anyarray} */ + {394, "string_to_array", false}, /* immutable, internal(12) + * {text,text,_text} */ + {395, "array_to_string", false}, /* stable, internal(12) + * {text,text,anyarray} */ + {401, "text", true}, /* immutable, internal(12) {text,bpchar} */ + {406, 
"text", true}, /* immutable, internal(12) {name,text} */ + {407, "name", true}, /* immutable, internal(12) {name,text} */ + {408, "bpchar", true}, /* immutable, internal(12) {name,bpchar} */ + {409, "name", true}, /* immutable, internal(12) {name,bpchar} */ + {480, "int4", true}, /* immutable, internal(12) {int8,int4} */ + {481, "int8", true}, /* immutable, internal(12) {int8,int4} */ + {482, "float8", true}, /* immutable, internal(12) {int8,float8} */ + {483, "int8", true}, /* immutable, internal(12) {int8,float8} */ + {652, "float4", true}, /* immutable, internal(12) {int8,float4} */ + {653, "int8", true}, /* immutable, internal(12) {int8,float4} */ + {668, "bpchar", true}, /* immutable, internal(12) + * {bool,int4,bpchar,bpchar} */ + {710, "getpgusername", false}, /* stable, internal(12) {name} */ + {714, "int2", true}, /* immutable, internal(12) {int8,int2} */ + {720, "octet_length", true}, /* immutable, internal(12) {bytea,int4} */ + {721, "get_byte", true}, /* immutable, internal(12) {bytea,int4,int4} */ + {722, "set_byte", true}, /* immutable, internal(12) + * {bytea,bytea,int4,int4} */ + {723, "get_bit", true}, /* immutable, internal(12) {bytea,int8,int4} */ + {724, "set_bit", true}, /* immutable, internal(12) + * {bytea,bytea,int8,int4} */ + {745, "current_user", false}, /* stable, internal(12) {name} */ + {746, "session_user", false}, /* stable, internal(12) {name} */ + {747, "array_dims", false}, /* immutable, internal(12) {text,anyarray} */ + {748, "array_ndims", false}, /* immutable, internal(12) {int4,anyarray} */ + {749, "overlay", true}, /* immutable, internal(12) + * {bytea,bytea,bytea,int4,int4} */ + {752, "overlay", true}, /* immutable, internal(12) + * {bytea,bytea,bytea,int4} */ + {754, "int8", true}, /* immutable, internal(12) {int8,int2} */ + {766, "int4inc", true}, /* immutable, internal(12) {int4,int4} */ + {810, "pg_client_encoding", true}, /* stable, internal(12) {name} */ + {817, "current_query", false}, /* volatile, internal(12) {text} 
*/ + {849, "position", true}, /* immutable, internal(12) {int4,text,text} */ + {860, "bpchar", true}, /* immutable, internal(12) {char,bpchar} */ + {861, "current_database", false}, /* stable, internal(12) {name} */ + {868, "strpos", true}, /* immutable, internal(12) {int4,text,text} */ + {870, "lower", true}, /* immutable, internal(12) {text,text} */ + {871, "upper", true}, /* immutable, internal(12) {text,text} */ + {872, "initcap", true}, /* immutable, internal(12) {text,text} */ + {873, "lpad", true}, /* immutable, internal(12) + * {int4,text,text,text} */ + {874, "rpad", true}, /* immutable, internal(12) + * {int4,text,text,text} */ + {875, "ltrim", true}, /* immutable, internal(12) {text,text,text} */ + {876, "rtrim", true}, /* immutable, internal(12) {text,text,text} */ + {877, "substr", true}, /* immutable, internal(12) + * {int4,int4,text,text} */ + {878, "translate", true}, /* immutable, internal(12) + * {text,text,text,text} */ + {879, "lpad", true}, /* immutable, sql(14) {int4,text,text} */ + {880, "rpad", true}, /* immutable, sql(14) {int4,text,text} */ + {881, "ltrim", true}, /* immutable, internal(12) {text,text} */ + {882, "rtrim", true}, /* immutable, internal(12) {text,text} */ + {883, "substr", true}, /* immutable, internal(12) {int4,text,text} */ + {884, "btrim", true}, /* immutable, internal(12) {text,text,text} */ + {885, "btrim", true}, /* immutable, internal(12) {text,text} */ + {935, "cash_words", true}, /* immutable, internal(12) {text,money} */ + {936, "substring", true}, /* immutable, internal(12) + * {int4,int4,text,text} */ + {937, "substring", true}, /* immutable, internal(12) {int4,text,text} */ + {940, "mod", true}, /* immutable, internal(12) {int2,int2,int2} */ + {941, "mod", true}, /* immutable, internal(12) {int4,int4,int4} */ + {944, "char", true}, /* immutable, internal(12) {char,text} */ + {946, "text", true}, /* immutable, internal(12) {char,text} */ + {947, "mod", true}, /* immutable, internal(12) {int8,int8,int8} */ + 
{1026, "timezone", true}, /* immutable, internal(12) + * {timestamp,timestamptz,interval} */ + {1039, "getdatabaseencoding", false}, /* stable, internal(12) {name} */ + {1158, "to_timestamp", true}, /* immutable, sql(14) {float8,timestamptz} */ + {1159, "timezone", true}, /* immutable, internal(12) + * {text,timestamp,timestamptz} */ + {1171, "date_part", true}, /* stable, internal(12) + * {text,float8,timestamptz} */ + {1172, "date_part", true}, /* immutable, internal(12) + * {text,float8,interval} */ + {1174, "timestamptz", true}, /* stable, internal(12) + * {date,timestamptz} */ + {1175, "justify_hours", true}, /* immutable, internal(12) + * {interval,interval} */ + {1176, "timestamptz", true}, /* stable, sql(14) + * {date,time,timestamptz} */ + {1178, "date", true}, /* stable, internal(12) {date,timestamptz} */ + {1193, "array_fill", false}, /* immutable, internal(12) + * {_int4,anyarray,anyelement} */ + {1199, "age", true}, /* immutable, internal(12) + * {timestamptz,timestamptz,interval} */ + {1200, "interval", true}, /* immutable, internal(12) + * {int4,interval,interval} */ + {1217, "date_trunc", true}, /* stable, internal(12) + * {text,timestamptz,timestamptz} */ + {1218, "date_trunc", true}, /* immutable, internal(12) + * {text,interval,interval} */ + {1257, "textlen", true}, /* immutable, internal(12) {int4,text} */ + {1264, "pg_char_to_encoding", true}, /* stable, internal(12) + * {name,int4} */ + {1269, "pg_column_size", false}, /* stable, internal(12) {int4,any} */ + {1271, "overlaps", true}, /* immutable, internal(12) + * {bool,timetz,timetz,timetz,timetz} */ + {1273, "date_part", true}, /* immutable, internal(12) + * {text,float8,timetz} */ + {1282, "quote_ident", true}, /* immutable, internal(12) {text,text} */ + {1283, "quote_literal", true}, /* immutable, internal(12) {text,text} */ + {1285, "quote_literal", true}, /* stable, sql(14) {text,anyelement} */ + {1286, "array_fill", false}, /* immutable, internal(12) + * 
{_int4,_int4,anyarray,anyelement} */ + {1289, "quote_nullable", true}, /* immutable, internal(12) {text,text} */ + {1290, "quote_nullable", true}, /* stable, sql(14) {text,anyelement} */ + {1295, "justify_days", true}, /* immutable, internal(12) + * {interval,interval} */ + {1299, "now", true}, /* stable, internal(12) {timestamptz} */ + {1304, "overlaps", true}, /* immutable, internal(12) + * {bool,timestamptz,timestamptz,timestamptz,timestamptz} */ + {1305, "overlaps", true}, /* stable, sql(14) + * {bool,timestamptz,timestamptz,interval,interval} */ + {1306, "overlaps", true}, /* stable, sql(14) + * {bool,timestamptz,timestamptz,timestamptz,interval} */ + {1307, "overlaps", true}, /* stable, sql(14) + * {bool,timestamptz,timestamptz,timestamptz,interval} */ + {1308, "overlaps", true}, /* immutable, internal(12) + * {bool,time,time,time,time} */ + {1309, "overlaps", true}, /* immutable, sql(14) + * {bool,time,time,interval,interval} */ + {1310, "overlaps", true}, /* immutable, sql(14) + * {bool,time,time,time,interval} */ + {1311, "overlaps", true}, /* immutable, sql(14) + * {bool,time,time,time,interval} */ + {1316, "time", true}, /* immutable, internal(12) {time,timestamp} */ + {1317, "length", true}, /* immutable, internal(12) {int4,text} */ + {1318, "length", true}, /* immutable, internal(12) {int4,bpchar} */ + {1339, "dlog10", true}, /* immutable, internal(12) {float8,float8} */ + {1340, "log", true}, /* immutable, internal(12) {float8,float8} */ + {1341, "ln", true}, /* immutable, internal(12) {float8,float8} */ + {1342, "round", true}, /* immutable, internal(12) {float8,float8} */ + {1343, "trunc", true}, /* immutable, internal(12) {float8,float8} */ + {1344, "sqrt", true}, /* immutable, internal(12) {float8,float8} */ + {1345, "cbrt", true}, /* immutable, internal(12) {float8,float8} */ + {1346, "pow", true}, /* immutable, internal(12) + * {float8,float8,float8} */ + {1347, "exp", true}, /* immutable, internal(12) {float8,float8} */ + {1359, "timestamptz", 
true}, /* immutable, internal(12) + * {date,timestamptz,timetz} */ + {1367, "character_length", true}, /* immutable, internal(12) + * {int4,bpchar} */ + {1368, "power", true}, /* immutable, internal(12) + * {float8,float8,float8} */ + {1369, "character_length", true}, /* immutable, internal(12) {int4,text} */ + {1370, "interval", true}, /* immutable, internal(12) {time,interval} */ + {1372, "char_length", true}, /* immutable, internal(12) {int4,bpchar} */ + {1373, "isfinite", true}, /* immutable, internal(12) {bool,date} */ + {1374, "octet_length", true}, /* immutable, internal(12) {int4,text} */ + {1375, "octet_length", true}, /* immutable, internal(12) {int4,bpchar} */ + {1376, "factorial", true}, /* immutable, internal(12) {int8,numeric} */ + {1381, "char_length", true}, /* immutable, internal(12) {int4,text} */ + {1384, "date_part", true}, /* immutable, sql(14) {text,float8,date} */ + {1385, "date_part", true}, /* immutable, internal(12) {text,float8,time} */ + {1386, "age", true}, /* stable, sql(14) {timestamptz,interval} */ + {1388, "timetz", true}, /* stable, internal(12) + * {timestamptz,timetz} */ + {1389, "isfinite", true}, /* immutable, internal(12) {bool,timestamptz} */ + {1390, "isfinite", true}, /* immutable, internal(12) {bool,interval} */ + {1394, "abs", true}, /* immutable, internal(12) {float4,float4} */ + {1395, "abs", true}, /* immutable, internal(12) {float8,float8} */ + {1396, "abs", true}, /* immutable, internal(12) {int8,int8} */ + {1397, "abs", true}, /* immutable, internal(12) {int4,int4} */ + {1398, "abs", true}, /* immutable, internal(12) {int2,int2} */ + {1402, "current_schema", false}, /* stable, internal(12) {name} */ + {1403, "current_schemas", false}, /* stable, internal(12) + * {bool,_name} */ + {1404, "overlay", true}, /* immutable, internal(12) + * {int4,int4,text,text,text} */ + {1405, "overlay", true}, /* immutable, internal(12) + * {int4,text,text,text} */ + {1419, "time", true}, /* immutable, internal(12) {time,interval} */ + 
{1569, "like", true}, /* immutable, internal(12) {bool,text,text} */ + {1570, "notlike", true}, /* immutable, internal(12) {bool,text,text} */ + {1571, "like", true}, /* immutable, internal(12) {bool,name,text} */ + {1572, "notlike", true}, /* immutable, internal(12) {bool,name,text} */ + {1597, "pg_encoding_to_char", true}, /* stable, internal(12) + * {name,int4} */ + {1598, "random", false}, /* volatile, internal(12) {float8} */ + {1599, "setseed", false}, /* volatile, internal(12) {float8,void} */ + {1600, "asin", true}, /* immutable, internal(12) {float8,float8} */ + {1601, "acos", true}, /* immutable, internal(12) {float8,float8} */ + {1602, "atan", true}, /* immutable, internal(12) {float8,float8} */ + {1603, "atan2", true}, /* immutable, internal(12) + * {float8,float8,float8} */ + {1604, "sin", true}, /* immutable, internal(12) {float8,float8} */ + {1605, "cos", true}, /* immutable, internal(12) {float8,float8} */ + {1606, "tan", true}, /* immutable, internal(12) {float8,float8} */ + {1607, "cot", true}, /* immutable, internal(12) {float8,float8} */ + {1608, "degrees", true}, /* immutable, internal(12) {float8,float8} */ + {1609, "radians", true}, /* immutable, internal(12) {float8,float8} */ + {1610, "pi", true}, /* immutable, internal(12) {float8} */ + {1620, "ascii", true}, /* immutable, internal(12) {int4,text} */ + {1621, "chr", true}, /* immutable, internal(12) {int4,text} */ + {1622, "repeat", true}, /* immutable, internal(12) {int4,text,text} */ + {1623, "similar_escape", true}, /* immutable, internal(12) + * {text,text,text} */ + {1637, "like_escape", true}, /* immutable, internal(12) + * {text,text,text} */ + {1640, "pg_get_viewdef", false}, /* stable, internal(12) {text,text} */ + {1665, "pg_get_serial_sequence", false}, /* stable, internal(12) + * {text,text,text} */ + {1680, "substring", true}, /* immutable, internal(12) {int4,int4,bit,bit} */ + {1681, "length", true}, /* immutable, internal(12) {int4,bit} */ + {1682, "octet_length", true}, /* 
immutable, internal(12) {int4,bit} */ + {1683, "bit", true}, /* immutable, internal(12) {int4,int4,bit} */ + {1684, "int4", true}, /* immutable, internal(12) {int4,bit} */ + {1685, "bit", true}, /* immutable, internal(12) {bool,int4,bit,bit} */ + {1698, "position", true}, /* immutable, internal(12) {int4,bit,bit} */ + {1699, "substring", true}, /* immutable, internal(12) {int4,bit,bit} */ + {1703, "numeric", true}, /* immutable, internal(12) + * {int4,numeric,numeric} */ + {1705, "abs", true}, /* immutable, internal(12) {numeric,numeric} */ + {1706, "sign", true}, /* immutable, internal(12) {numeric,numeric} */ + {1707, "round", true}, /* immutable, internal(12) + * {int4,numeric,numeric} */ + {1708, "round", true}, /* immutable, sql(14) {numeric,numeric} */ + {1709, "trunc", true}, /* immutable, internal(12) + * {int4,numeric,numeric} */ + {1710, "trunc", true}, /* immutable, sql(14) {numeric,numeric} */ + {1711, "ceil", true}, /* immutable, internal(12) {numeric,numeric} */ + {1712, "floor", true}, /* immutable, internal(12) {numeric,numeric} */ + {1713, "length", true}, /* stable, internal(12) {bytea,name,int4} */ + {1714, "convert_from", true}, /* stable, internal(12) + * {bytea,name,text} */ + {1717, "convert_to", true}, /* stable, internal(12) {bytea,name,text} */ + {1728, "mod", true}, /* immutable, internal(12) + * {numeric,numeric,numeric} */ + {1730, "sqrt", true}, /* immutable, internal(12) {numeric,numeric} */ + {1731, "numeric_sqrt", true}, /* immutable, internal(12) + * {numeric,numeric} */ + {1732, "exp", true}, /* immutable, internal(12) {numeric,numeric} */ + {1733, "numeric_exp", true}, /* immutable, internal(12) + * {numeric,numeric} */ + {1734, "ln", true}, /* immutable, internal(12) {numeric,numeric} */ + {1735, "numeric_ln", true}, /* immutable, internal(12) {numeric,numeric} */ + {1736, "log", true}, /* immutable, internal(12) + * {numeric,numeric,numeric} */ + {1737, "numeric_log", true}, /* immutable, internal(12) + * 
{numeric,numeric,numeric} */ + {1738, "pow", true}, /* immutable, internal(12) + * {numeric,numeric,numeric} */ + {1740, "numeric", true}, /* immutable, internal(12) {int4,numeric} */ + {1741, "log", true}, /* immutable, sql(14) {numeric,numeric} */ + {1742, "numeric", true}, /* immutable, internal(12) {float4,numeric} */ + {1743, "numeric", true}, /* immutable, internal(12) {float8,numeric} */ + {1744, "int4", true}, /* immutable, internal(12) {int4,numeric} */ + {1745, "float4", true}, /* immutable, internal(12) {float4,numeric} */ + {1746, "float8", true}, /* immutable, internal(12) {float8,numeric} */ + {1764, "numeric_inc", true}, /* immutable, internal(12) + * {numeric,numeric} */ + {1768, "to_char", true}, /* stable, internal(12) + * {text,text,interval} */ + {1770, "to_char", true}, /* stable, internal(12) + * {text,text,timestamptz} */ + {1772, "to_char", true}, /* stable, internal(12) {text,text,numeric} */ + {1773, "to_char", true}, /* stable, internal(12) {int4,text,text} */ + {1774, "to_char", true}, /* stable, internal(12) {int8,text,text} */ + {1775, "to_char", true}, /* stable, internal(12) {text,text,float4} */ + {1776, "to_char", true}, /* stable, internal(12) {text,text,float8} */ + {1777, "to_number", true}, /* stable, internal(12) {text,text,numeric} */ + {1778, "to_timestamp", true}, /* stable, internal(12) + * {text,text,timestamptz} */ + {1779, "int8", true}, /* immutable, internal(12) {int8,numeric} */ + {1780, "to_date", true}, /* stable, internal(12) {text,text,date} */ + {1781, "numeric", true}, /* immutable, internal(12) {int8,numeric} */ + {1782, "numeric", true}, /* immutable, internal(12) {int2,numeric} */ + {1783, "int2", true}, /* immutable, internal(12) {int2,numeric} */ + {1810, "bit_length", true}, /* immutable, sql(14) {bytea,int4} */ + {1811, "bit_length", true}, /* immutable, sql(14) {int4,text} */ + {1812, "bit_length", true}, /* immutable, sql(14) {int4,bit} */ + {1813, "convert", true}, /* stable, internal(12) + * 
{bytea,bytea,name,name} */ + {1842, "int8_sum", true}, /* immutable, internal(12) + * {int8,numeric,numeric} */ + {1845, "to_ascii", true}, /* immutable, internal(12) {text,text} */ + {1846, "to_ascii", true}, /* immutable, internal(12) {int4,text,text} */ + {1847, "to_ascii", true}, /* immutable, internal(12) {name,text,text} */ + {1946, "encode", true}, /* immutable, internal(12) {bytea,text,text} */ + {1947, "decode", true}, /* immutable, internal(12) {bytea,text,text} */ + {1961, "timestamp", true}, /* immutable, internal(12) + * {int4,timestamp,timestamp} */ + {1967, "timestamptz", true}, /* immutable, internal(12) + * {int4,timestamptz,timestamptz} */ + {1968, "time", true}, /* immutable, internal(12) {int4,time,time} */ + {1969, "timetz", true}, /* immutable, internal(12) + * {int4,timetz,timetz} */ + {1973, "div", true}, /* immutable, internal(12) + * {numeric,numeric,numeric} */ + {1980, "numeric_div_trunc", true}, /* immutable, internal(12) + * {numeric,numeric,numeric} */ + {1986, "similar_to_escape", true}, /* immutable, internal(12) + * {text,text,text} */ + {1987, "similar_to_escape", true}, /* immutable, internal(12) {text,text} */ + {2007, "like", true}, /* immutable, internal(12) {bool,bytea,bytea} */ + {2008, "notlike", true}, /* immutable, internal(12) {bool,bytea,bytea} */ + {2009, "like_escape", true}, /* immutable, internal(12) + * {bytea,bytea,bytea} */ + {2010, "length", true}, /* immutable, internal(12) {bytea,int4} */ + {2012, "substring", true}, /* immutable, internal(12) + * {bytea,bytea,int4,int4} */ + {2013, "substring", true}, /* immutable, internal(12) {bytea,bytea,int4} */ + {2014, "position", true}, /* immutable, internal(12) {bytea,bytea,int4} */ + {2015, "btrim", true}, /* immutable, internal(12) {bytea,bytea,bytea} */ + {2019, "time", true}, /* stable, internal(12) {time,timestamptz} */ + {2020, "date_trunc", true}, /* immutable, internal(12) + * {text,timestamp,timestamp} */ + {2021, "date_part", true}, /* immutable, 
internal(12) + * {text,float8,timestamp} */ + {2024, "timestamp", true}, /* immutable, internal(12) {date,timestamp} */ + {2025, "timestamp", true}, /* immutable, internal(12) + * {date,time,timestamp} */ + {2026, "pg_backend_pid", false}, /* stable, internal(12) {int4} */ + {2027, "timestamp", true}, /* stable, internal(12) + * {timestamp,timestamptz} */ + {2028, "timestamptz", true}, /* stable, internal(12) + * {timestamp,timestamptz} */ + {2029, "date", true}, /* immutable, internal(12) {date,timestamp} */ + {2034, "pg_conf_load_time", false}, /* stable, internal(12) + * {timestamptz} */ + {2037, "timezone", true}, /* volatile, internal(12) + * {text,timetz,timetz} */ + {2038, "timezone", true}, /* immutable, internal(12) + * {interval,timetz,timetz} */ + {2041, "overlaps", true}, /* immutable, internal(12) + * {bool,timestamp,timestamp,timestamp,timestamp} */ + {2042, "overlaps", true}, /* immutable, sql(14) + * {bool,timestamp,timestamp,interval,interval} */ + {2043, "overlaps", true}, /* immutable, sql(14) + * {bool,timestamp,timestamp,timestamp,interval} */ + {2044, "overlaps", true}, /* immutable, sql(14) + * {bool,timestamp,timestamp,timestamp,interval} */ + {2046, "time", true}, /* immutable, internal(12) {time,timetz} */ + {2047, "timetz", true}, /* stable, internal(12) {time,timetz} */ + {2048, "isfinite", true}, /* immutable, internal(12) {bool,timestamp} */ + {2049, "to_char", true}, /* stable, internal(12) + * {text,text,timestamp} */ + {2058, "age", true}, /* immutable, internal(12) + * {timestamp,timestamp,interval} */ + {2059, "age", true}, /* stable, sql(14) {timestamp,interval} */ + {2069, "timezone", true}, /* immutable, internal(12) + * {text,timestamp,timestamptz} */ + {2070, "timezone", true}, /* immutable, internal(12) + * {timestamp,timestamptz,interval} */ + {2073, "substring", true}, /* immutable, internal(12) {text,text,text} */ + {2074, "substring", true}, /* immutable, sql(14) {text,text,text,text} */ + {2075, "bit", true}, /* 
immutable, internal(12) {int8,int4,bit} */ + {2076, "int8", true}, /* immutable, internal(12) {int8,bit} */ + {2077, "current_setting", false}, /* stable, internal(12) {text,text} */ + {2078, "set_config", false}, /* volatile, internal(12) + * {bool,text,text,text} */ + {2085, "substr", true}, /* immutable, internal(12) + * {bytea,bytea,int4,int4} */ + {2086, "substr", true}, /* immutable, internal(12) {bytea,bytea,int4} */ + {2087, "replace", true}, /* immutable, internal(12) + * {text,text,text,text} */ + {2088, "split_part", true}, /* immutable, internal(12) + * {int4,text,text,text} */ + {2089, "to_hex", true}, /* immutable, internal(12) {int4,text} */ + {2090, "to_hex", true}, /* immutable, internal(12) {int8,text} */ + {2091, "array_lower", true}, /* immutable, internal(12) + * {int4,int4,anyarray} */ + {2092, "array_upper", true}, /* immutable, internal(12) + * {int4,int4,anyarray} */ + {2167, "ceiling", true}, /* immutable, internal(12) {numeric,numeric} */ + {2169, "power", true}, /* immutable, internal(12) + * {numeric,numeric,numeric} */ + {2170, "width_bucket", true}, /* immutable, internal(12) + * {int4,int4,numeric,numeric,numeric} */ + {2176, "array_length", false}, /* immutable, internal(12) + * {int4,int4,anyarray} */ + {2284, "regexp_replace", true}, /* immutable, internal(12) + * {text,text,text,text} */ + {2285, "regexp_replace", true}, /* immutable, internal(12) + * {text,text,text,text,text} */ + {2288, "pg_size_pretty", false}, /* volatile, internal(12) {int8,text} */ + {2308, "ceil", true}, /* immutable, internal(12) {float8,float8} */ + {2309, "floor", true}, /* immutable, internal(12) {float8,float8} */ + {2310, "sign", true}, /* immutable, internal(12) {float8,float8} */ + {2311, "md5", true}, /* immutable, internal(12) {text,text} */ + {2319, "pg_encoding_max_length", false}, /* immutable, internal(12) + * {int4,int4} */ + {2320, "ceiling", true}, /* immutable, internal(12) {float8,float8} */ + {2321, "md5", true}, /* immutable, 
internal(12) {bytea,text} */ + {2557, "bool", true}, /* immutable, internal(12) {bool,int4} */ + {2558, "int4", true}, /* immutable, internal(12) {bool,int4} */ + {2559, "lastval", false}, /* volatile, internal(12) {int8} */ + {2560, "pg_postmaster_start_time", false}, /* stable, internal(12) + * {timestamptz} */ + {2621, "pg_reload_conf", false}, /* volatile, internal(12) {bool} */ + {2622, "pg_rotate_logfile", false}, /* volatile, internal(12) {bool} */ + {2623, "pg_stat_file", false}, /* volatile, internal(12) {text,record} */ + {2624, "pg_read_file", false}, /* volatile, internal(12) + * {int8,int8,text,text} */ + {2626, "pg_sleep", false}, /* volatile, internal(12) {float8,void} */ + {2647, "transaction_timestamp", false}, /* stable, internal(12) + * {timestamptz} */ + {2648, "statement_timestamp", false}, /* stable, internal(12) + * {timestamptz} */ + {2649, "clock_timestamp", true}, /* volatile, internal(12) + * {timestamptz} */ + {2705, "pg_has_role", false}, /* stable, internal(12) + * {bool,name,name,text} */ + {2709, "pg_has_role", false}, /* stable, internal(12) + * {bool,name,text} */ + {2711, "justify_interval", true}, /* immutable, internal(12) + * {interval,interval} */ + {2767, "regexp_split_to_array", false}, /* immutable, internal(12) + * {text,text,_text} */ + {2768, "regexp_split_to_array", false}, /* immutable, internal(12) + * {text,text,text,_text} */ + {2971, "text", true}, /* immutable, internal(12) {bool,text} */ + {3030, "overlay", true}, /* immutable, internal(12) + * {int4,int4,bit,bit,bit} */ + {3031, "overlay", true}, /* immutable, internal(12) {int4,bit,bit,bit} */ + {3032, "get_bit", true}, /* immutable, internal(12) {int4,int4,bit} */ + {3033, "set_bit", true}, /* immutable, internal(12) {int4,int4,bit,bit} */ + {3036, "pg_notify", false}, /* volatile, internal(12) {text,text,void} */ + {3051, "xml_is_well_formed", false}, /* stable, internal(12) + * {bool,text} */ + {3052, "xml_is_well_formed_document", false}, /* immutable, 
internal(12) + * {bool,text} */ + {3053, "xml_is_well_formed_content", false}, /* immutable, internal(12) + * {bool,text} */ + {3058, "concat", true}, /* stable, internal(12) {text,any} */ + {3059, "concat_ws", true}, /* stable, internal(12) {text,text,any} */ + {3060, "left", true}, /* immutable, internal(12) {int4,text,text} */ + {3061, "right", true}, /* immutable, internal(12) {int4,text,text} */ + {3062, "reverse", true}, /* immutable, internal(12) {text,text} */ + {3162, "pg_collation_for", false}, /* stable, internal(12) {text,any} */ + {3166, "pg_size_pretty", false}, /* volatile, internal(12) + * {text,numeric} */ + {3167, "array_remove", false}, /* immutable, internal(12) + * {anyarray,anyarray,anyelement} */ + {3168, "array_replace", false}, /* immutable, internal(12) + * {anyarray,anyarray,anyelement,anyelement} */ + {3179, "cardinality", false}, /* immutable, internal(12) {int4,anyarray} */ + {3461, "make_timestamp", true}, /* immutable, internal(12) + * {int4,int4,int4,int4,int4,float8,timestamp} */ + {3462, "make_timestamptz", true}, /* stable, internal(12) + * {int4,int4,int4,int4,int4,float8,timestamptz} */ + {3463, "make_timestamptz", true}, /* stable, internal(12) + * {int4,int4,int4,int4,int4,text,float8,timestamptz} */ + {3464, "make_interval", true}, /* immutable, internal(12) + * {int4,int4,int4,int4,int4,int4,float8,interval} */ + {3528, "enum_first", false}, /* stable, internal(12) + * {anyenum,anyenum} */ + {3529, "enum_last", false}, /* stable, internal(12) {anyenum,anyenum} */ + {3530, "enum_range", false}, /* stable, internal(12) + * {anyarray,anyenum,anyenum} */ + {3531, "enum_range", false}, /* stable, internal(12) + * {anyarray,anyenum} */ + {3533, "enum_send", false}, /* stable, internal(12) {bytea,anyenum} */ + {3539, "format", false}, /* stable, internal(12) {text,text,any} */ + {3540, "format", true}, /* stable, internal(12) {text,text} */ + {3811, "money", true}, /* stable, internal(12) {int4,money} */ + {3812, "money", true}, 
/* stable, internal(12) {int8,money} */ + {3823, "numeric", true}, /* stable, internal(12) {money,numeric} */ + {3824, "money", true}, /* stable, internal(12) {money,numeric} */ + {3846, "make_date", true}, /* immutable, internal(12) + * {int4,int4,int4,date} */ + {3847, "make_time", true}, /* immutable, internal(12) + * {int4,int4,float8,time} */ + {3935, "pg_sleep_for", false}, /* volatile, sql(14) {interval,void} */ + {3936, "pg_sleep_until", false}, /* volatile, sql(14) + * {timestamptz,void} */ + {4350, "normalize", true}, /* immutable, internal(12) {text,text,text} */ + {4351, "is_normalized", true}, /* immutable, internal(12) + * {bool,text,text} */ + {5044, "gcd", true}, /* immutable, internal(12) {int4,int4,int4} */ + {5045, "gcd", true}, /* immutable, internal(12) {int8,int8,int8} */ + {5046, "lcm", true}, /* immutable, internal(12) {int4,int4,int4} */ + {5047, "lcm", true}, /* immutable, internal(12) {int8,int8,int8} */ + {5048, "gcd", true}, /* immutable, internal(12) + * {numeric,numeric,numeric} */ + {5049, "lcm", true}, /* immutable, internal(12) + * {numeric,numeric,numeric} */ + {6162, "bit_count", true}, /* immutable, internal(12) {int8,bit} */ + {6163, "bit_count", true}, /* immutable, internal(12) {bytea,int8} */ + {6177, "date_bin", true}, /* immutable, internal(12) + * {timestamp,timestamp,timestamp,interval} */ + {6178, "date_bin", true}, /* immutable, internal(12) + * {timestamptz,timestamptz,timestamptz,interval} */ + {6195, "ltrim", true}, /* immutable, internal(12) {bytea,bytea,bytea} */ + {6196, "rtrim", true}, /* immutable, internal(12) {bytea,bytea,bytea} */ + {6198, "unistr", true}, /* immutable, internal(12) {text,text} */ + {6199, "extract", true}, /* immutable, internal(12) {text,date,numeric} */ + {6200, "extract", true}, /* immutable, internal(12) {text,time,numeric} */ + {6201, "extract", true}, /* immutable, internal(12) + * {text,timetz,numeric} */ + {6202, "extract", true}, /* immutable, internal(12) + * 
{text,timestamp,numeric} */ + {6203, "extract", true}, /* stable, internal(12) + * {text,timestamptz,numeric} */ + {6204, "extract", true}, /* immutable, internal(12) + * {text,interval,numeric} */ +}; + +/** Maximum number of arguments a specially treated user-defined function may take; sizes argtypes[] in vci_applicable_udf_template */ +#define VCI_MAX_APPLICABLE_UDF_NARGS (2) + +/** + * Template describing one specially treated user-defined function: its name, namespace, argument count, return type and argument types + */ +typedef struct +{ + const char *name; /* Function name (compared with pg_proc.proname) */ + Oid namespace; /* OID of the namespace the function must belong to */ + int16 nargs; /* Number of arguments */ + Oid rettype; /* OID of the function's return type */ + /** Argument type OIDs; only the first nargs elements are meaningful */ + Oid argtypes[VCI_MAX_APPLICABLE_UDF_NARGS]; +} vci_applicable_udf_template; + +/* + * Fixed positions within applicable_udf_table[] of the two entries whose OIDs are additionally remembered in dedicated fields at registration time + */ +#define APPLICABLE_UDF_TABLE_VCI_RUNS_IN_PLAN_INDEX (0) +#define APPLICABLE_UDF_TABLE_VCI_ALWAYS_RETURN_TRUE (1) + +/** + * Template table for specially treated user defined function + * + * However the top 2 functions are treated specially and have fixed array positions. 
+ */ +static vci_applicable_udf_template applicable_udf_table[] = { + + {"vci_runs_in_plan", PG_PUBLIC_NAMESPACE, 0, BOOLOID, {0, 0}}, /* must remain at index APPLICABLE_UDF_TABLE_VCI_RUNS_IN_PLAN_INDEX */ + {"vci_always_return_true", PG_PUBLIC_NAMESPACE, 0, BOOLOID, {0, 0}}, /* must remain at index APPLICABLE_UDF_TABLE_VCI_ALWAYS_RETURN_TRUE */ + + {"vci_runs_in_query", PG_PUBLIC_NAMESPACE, 0, BOOLOID, {0, 0}}, + {"hamming_distance", PG_PUBLIC_NAMESPACE, 2, INT4OID, {BITOID, BITOID}} +}; + +static bool is_supported_udf(Oid oid); + +/** + * Determine whether the function identified by oid is one that VCI can support. OIDs at or above FirstNormalObjectId are delegated to is_supported_udf(); all others are looked up in vci_supported_func_table. + * + * @param[in] oid OID (pg_proc.oid) of the function to check + * @return for OIDs below FirstNormalObjectId, the is_support flag of the matching table entry, or false when no entry matches; otherwise the result of is_supported_udf() + */ +bool +vci_is_supported_function(Oid oid) +{ + int min, + max, + pivot; + + if (FirstNormalObjectId <= oid) + return is_supported_udf(oid); + + if ((oid < VCI_SUPPORTED_FUNC_MIN) || (VCI_SUPPORTED_FUNC_MAX < oid)) + return false; + + /* binary search; requires vci_supported_func_table to be sorted by ascending oid */ + + min = 0; + max = lengthof(vci_supported_func_table); /* exclusive */ + + while (max - min > 1) + { + Oid comp; + + pivot = (min + max) / 2; + + comp = vci_supported_func_table[pivot].oid; + + if (comp == oid) + return vci_supported_func_table[pivot].is_support; + else if (oid < comp) + max = pivot; + else /* comp < oid */ + min = pivot; + } + + if (max - min == 1) /* one candidate remains at index min; the loop may not have compared it yet */ + if (oid == vci_supported_func_table[min].oid) + return vci_supported_func_table[min].is_support; + + return false; +} + +/** + * Determine whether the user-defined function identified by oid has been recorded in vci_special_udf_info.applicable_udfs[] (linear search over the recorded entries) + * + * @param[in] oid OID (pg_proc.oid) of the function to check + * @return true if oid is among the recorded applicable UDFs, false otherwise + */ +static bool +is_supported_udf(Oid oid) +{ + int i; + bool result; + + result = false; + + for (i = 0; i < vci_special_udf_info.num_applicable_udfs; i++) + { + if (oid == vci_special_udf_info.applicable_udfs[i]) + { + result = true; + break; + } + } + + return result; +} + +/** + * Register user defined function for special handling + * + * @param[in] snapshot Current snapshot + * + * This function is called 
every time before attempting to rewrite the VCI plan, + * but the actual registration process is only called once within the PostgreSQL instance. + */ +void +vci_register_applicable_udf(Snapshot snapshot) +{ + bool already_registerd; + MemoryContext tmpcontext, + oldcontext; + Relation rel; + TableScanDesc scan; + HeapTuple tuple; + + already_registerd = (vci_special_udf_info.num_applicable_udfs > 0); /* a non-zero count means an earlier call already recorded matching UDFs */ + + if (already_registerd) + return; + + /* + * To use fmgr_info, a temporary memory context is needed, but since + * CurrentMemoryContext is SMC here, create child memory context from + * MessageContext. NOTE(review): no fmgr_info call appears in this function; confirm the temporary context is still required. + */ + tmpcontext = AllocSetContextCreate(MessageContext, + "Register Applicable UDF", + ALLOCSET_DEFAULT_SIZES); + + oldcontext = MemoryContextSwitchTo(tmpcontext); + + rel = table_open(ProcedureRelationId, AccessShareLock); + scan = table_beginscan(rel, snapshot, 0, NULL); /* scan of pg_proc with no scan keys, using the caller's snapshot */ + + while ((tuple = heap_getnext(scan, ForwardScanDirection)) != NULL) + { + Oid funcoid; + Form_pg_proc procform; + int i; + + funcoid = ((Form_pg_proc) GETSTRUCT(tuple))->oid; + + /* + * UDF always takes an OID greater than or equal to + * FirstNormalObjectId, so rows below that bound are skipped + */ + if (funcoid < FirstNormalObjectId) + continue; + + procform = (Form_pg_proc) GETSTRUCT(tuple); + + /* + * Check if tuple matches an entry in the template table: namespace, argument count, return type and name first, then each argument type + */ + for (i = 0; i < lengthof(applicable_udf_table); i++) + { + vci_applicable_udf_template *entry = &applicable_udf_table[i]; + int j; + + if ((procform->pronamespace != entry->namespace) || + (procform->pronargs != entry->nargs) || + (procform->prorettype != entry->rettype) || + (strcmp(NameStr(procform->proname), entry->name) != 0)) + goto next; /* signature mismatch: try the next template entry */ + + for (j = 0; j < Min(entry->nargs, VCI_MAX_APPLICABLE_UDF_NARGS); j++) + if (procform->proargtypes.values[j] != entry->argtypes[j]) + goto next; /* argument type mismatch: try the next template entry */ + + vci_special_udf_info.applicable_udfs[vci_special_udf_info.num_applicable_udfs++] + = funcoid; /* record the matching function's OID */ + + if (i == APPLICABLE_UDF_TABLE_VCI_RUNS_IN_PLAN_INDEX) + 
vci_special_udf_info.vci_runs_in_plan_funcoid = funcoid; + else if (i == APPLICABLE_UDF_TABLE_VCI_ALWAYS_RETURN_TRUE) + vci_special_udf_info.vci_always_return_true_funcoid = funcoid; + + break; /* stop at the first matching template entry */ + + next: + ; /* empty statement so the label has something to attach to */ + } + } + + table_endscan(scan); + table_close(rel, AccessShareLock); + + MemoryContextSwitchTo(oldcontext); + MemoryContextDelete(tmpcontext); +} + +/*==========================================================================*/ +/* Implementation of PG function to check supported functions at CREATE EXTENSION */ +/*==========================================================================*/ + +PG_FUNCTION_INFO_V1(vci_check_supported_functions); /* checks that every entry of vci_supported_func_table flagged is_support exists in pg_proc under the listed name; raises ERROR otherwise */ + +Datum +vci_check_supported_functions(PG_FUNCTION_ARGS) +{ + Relation rel; + int i; + + rel = table_open(ProcedureRelationId, AccessShareLock); + + for (i = 0; i < lengthof(vci_supported_func_table); i++) + { + HeapTuple tuple; + Form_pg_proc procform; + + if (!vci_supported_func_table[i].is_support) + continue; /* entries not flagged as supported are not verified */ + + tuple = SearchSysCache1(PROCOID, ObjectIdGetDatum(vci_supported_func_table[i].oid)); + if (!HeapTupleIsValid(tuple)) + goto error; /* no pg_proc row with this OID */ + + procform = (Form_pg_proc) GETSTRUCT(tuple); + + if (strcmp(vci_supported_func_table[i].name, NameStr(procform->proname)) != 0) + goto error; /* the OID exists but its proname differs from the table entry */ + + ReleaseSysCache(tuple); + } + + table_close(rel, AccessShareLock); + + PG_RETURN_VOID(); + +error: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("extension \"%s\" cannot be installed under this version of PostgreSQL", VCI_STRING))); + + PG_RETURN_VOID(); /* presumably unreachable, since ereport(ERROR) does not return to its caller */ +} diff --git a/contrib/vci/vci_supported_funcs.sql b/contrib/vci/vci_supported_funcs.sql new file mode 100644 index 000000000000..cecebe4e60c2 --- /dev/null +++ b/contrib/vci/vci_supported_funcs.sql @@ -0,0 +1,114 @@ +-- sys_func_table A table that registers the OID of system-related functions from multiple system catalogs + +CREATE TEMPORARY TABLE test (funcoid oid); +INSERT INTO test (funcoid) SELECT unnest(ARRAY[aggfnoid, aggtransfn, aggfinalfn, 
aggmtransfn, aggminvtransfn, aggmfinalfn]) FROM pg_aggregate; +INSERT INTO test (funcoid) SELECT amhandler FROM pg_am; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[amproc]) FROM pg_amproc; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[castfunc]) FROM pg_cast; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[conproc]) FROM pg_conversion; +INSERT INTO test (funcoid) SELECT evtfoid FROM pg_event_trigger; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[fdwhandler, fdwvalidator]) FROM pg_foreign_data_wrapper; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[lanplcallfoid, laninline, lanvalidator]) FROM pg_language; +INSERT INTO test (funcoid) SELECT tgfoid FROM pg_trigger; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[oprcode, oprrest, oprjoin]) FROM pg_operator; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[rngcanonical, rngsubdiff]) FROM pg_range; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[prsstart, prstoken, prsend, prsheadline, prslextype]) FROM pg_ts_parser; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[tmplinit, tmpllexize]) FROM pg_ts_template; +INSERT INTO test (funcoid) SELECT unnest(ARRAY[typinput, typoutput, typreceive, typsend, typmodin, typmodout, typanalyze]) FROM pg_type; + +DROP TABLE IF EXISTS sys_func_table; +CREATE TABLE sys_func_table (funcoid oid UNIQUE); +INSERT INTO sys_func_table SELECT distinct funcoid FROM test WHERE funcoid > 0 ORDER BY funcoid; + +DROP TABLE IF EXISTS safe_types; +CREATE TABLE safe_types (typeoid oid UNIQUE); +INSERT INTO safe_types (typeoid) VALUES + (16), -- bool + (17), -- bytea + (18), -- char + (19), -- name + (20), -- int8 + (21), -- int2 + (23), -- int4 +-- (24) -- regproc + (25), -- text +-- (26) -- oid +-- (27) -- tid +-- (28) -- xid +-- (30) -- oidvector +-- (71) -- pg_type +-- (75) -- pg_attribute +-- (114) -- json [not supported] +-- (142) -- xml [not supported] +-- (143) -- _xml [not supported] +-- (194) -- pg_node_tree +-- (210) -- smgr +-- (600), -- point [not supported] +-- (601), -- lseg [not 
supported] +-- (602), -- path [not supported] +-- (603), -- box [not supported] +-- (604), -- polygon [not supported] +-- (628), -- line [not supported] + (700), -- float4 + (701), -- float8 +-- (718), -- circle [not supported] + (790), -- money +-- (829), -- macaddr [not supported] +-- (869), -- inet [not supported] +-- (650), -- cidr [not supported] + (1003), -- _name + (1005), -- _int2 + (1007), -- _int4 + (1009), -- _text + (1021), -- _float4 +-- (1033) -- aclitem +-- (1034) -- _aclitem + (1042), -- bpchar + (1082), -- date + (1083), -- time + (1114), -- timestamp + (1184), -- timestamptz + (1186), -- interval + (1266), -- timetz + (1560), -- bit + (1700), -- numeric +-- (1790), -- refcursor +-- (2202), -- regprocedure +-- (2203), -- regoper +-- (2204), -- regoperator +-- (2205), -- regclass +-- (2206), -- regtype +-- (3220), -- pg_lsn +-- (3614), -- tsvector [not supported] +-- (3615), -- tsquery [not supported] +-- (3734), -- regconfig +-- (3769), -- regdictionary +-- (3802), -- jsonb [not supported] +-- (2970), -- txid_snapshot +-- (3904), -- int4range [not supported] +-- (3906), -- numrange [not supported] +-- (3908), -- tsrange [not supported] +-- (3910), -- tstzrange [not supported] +-- (3912), -- daterange [not supported] +-- (3926), -- int8range [not supported] + (2249), -- record +-- (2275) -- cstring + (2276), -- any + (2277), -- anyarray + (2278), -- void +-- (2279) -- trigger +-- (2281) -- internal +-- (2282) -- opaque + (2283), -- anyelement + (3500); -- anyenum +-- (3831) -- anyrange + +DROP FUNCTION IF EXISTS print_typename; +CREATE FUNCTION print_typename(IN oids _oid) RETURNS _name AS $$ + SELECT array_agg(pg_type.typname) FROM unnest(oids) AS t(i), pg_type WHERE i = pg_type.oid; +$$ LANGUAGE SQL; + +SELECT oid, proname, provolatile, prolang, print_typename(array_prepend(prorettype, proargtypes)) FROM pg_proc WHERE prokind = 'f' AND NOT proretset + AND NOT EXISTS (SELECT funcoid FROM sys_func_table WHERE pg_proc.oid = sys_func_table.funcoid) + 
AND (SELECT bool_and(i IN (SELECT typeoid FROM safe_types)) FROM unnest(array_prepend(prorettype, proargtypes)) AS t(i)) + AND oid < 16384 ORDER BY oid; diff --git a/contrib/vci/vci_supported_types.c b/contrib/vci/vci_supported_types.c new file mode 100644 index 000000000000..d557bc3cbf3e --- /dev/null +++ b/contrib/vci/vci_supported_types.c @@ -0,0 +1,245 @@ +/*------------------------------------------------------------------------- + * + * vci_supported_types.c + * Types supported by VCI + * + * vci_supported_type_table[] is created with following SQL and then examined individually. + * + * SELECT oid, typname FROM pg_type WHERE typnamespace = 11 AND typrelid = 0 AND typelem = 0 ORDER BY oid; + * + * - 'typnamespace = 11' is to exclude types not related to table structure + * - 'typelem = 0' is to exclude array type + * - 'typrelid = 0' is to exclude composite type + * + * Portions Copyright (c) 2025, PostgreSQL Global Development Group + * + * IDENTIFICATION + * contrib/vci/vci_supported_types.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "access/htup.h" +#include "access/htup_details.h" +#include "c.h" +#include "catalog/pg_type.h" /* for TypeRelationId, Form_pg_type */ +#include "fmgr.h" +#include "utils/elog.h" +#include "utils/relcache.h" +#include "utils/syscache.h" + +#include "vci.h" +#include "vci_supported_oid.h" + +/** + * Smallest OID among types supported by VCI + */ +#define VCI_SUPPORTED_TYPE_MIN (16) + +/** + * Biggest OID among types supported by VCI + */ +#define VCI_SUPPORTED_TYPE_MAX (2950) + +/** + * Array of information about types supported by VCI + */ +static const struct +{ + Oid oid; + const char *name; + bool is_support; +} vci_supported_type_table[] = { + {16, "bool", true}, /* BOOLOID */ + {17, "bytea", true}, /* BYTEAOID */ + {18, "char", true}, /* CHAROID */ + {20, "int8", true}, /* INT8OID */ + {21, "int2", true}, /* INT2OID */ 
+ {23, "int4", true}, /* INT4OID */ + {24, "regproc", false}, /* REGPROCOID */ + {25, "text", true}, /* TEXTOID */ + {26, "oid", false}, /* OIDOID */ + {27, "tid", false}, /* TIDOID */ + {28, "xid", false}, /* XIDOID */ + {29, "cid", false}, /* CIDOID */ + {32, "pg_ddl_command", false}, /* PG_DDL_COMMANDOID */ + {114, "json", false}, /* JSONOID */ + {142, "xml", false}, /* XMLOID */ + {194, "pg_node_tree", false}, /* PG_NODE_TREEOID */ + {269, "table_am_handler", false}, /* TABLE_AM_HANDLEROID */ + {325, "index_am_handler", false}, /* INDEX_AM_HANDLEROID */ + {602, "path", false}, /* PATHOID */ + {604, "polygon", false}, /* POLYGONOID */ + {650, "cidr", false}, /* CIDROID */ + {700, "float4", true}, /* FLOAT4OID */ + {701, "float8", true}, /* FLOAT8OID */ + {705, "unknown", false}, /* UNKNOWNOID */ + {718, "circle", false}, /* CIRCLEOID */ + {774, "macaddr8", false}, /* MACADDR8OID */ + {790, "money", true}, /* MONEYOID */ + {829, "macaddr", false}, /* MACADDROID */ + {869, "inet", false}, /* INETOID */ + {1033, "aclitem", false}, /* ACLITEMOID */ + {1042, "bpchar", true}, /* BPCHAROID */ + {1043, "varchar", true}, /* VARCHAROID */ + {1082, "date", true}, /* DATEOID */ + {1083, "time", true}, /* TIMEOID */ + {1114, "timestamp", true}, /* TIMESTAMPOID */ + {1184, "timestamptz", true}, /* TIMESTAMPTZOID */ + {1186, "interval", true}, /* INTERVALOID */ + {1266, "timetz", true}, /* TIMETZOID */ + {1560, "bit", true}, /* BITOID */ + {1562, "varbit", true}, /* VARBITOID */ + {1700, "numeric", true}, /* NUMERICOID */ + {1790, "refcursor", false}, /* REFCURSOROID */ + {2202, "regprocedure", false}, /* REGPROCEDUREOID */ + {2203, "regoper", false}, /* REGOPEROID */ + {2204, "regoperator", false}, /* REGOPERATOROID */ + {2205, "regclass", false}, /* REGCLASSOID */ + {2206, "regtype", false}, /* REGTYPEOID */ + {2249, "record", false}, /* RECORDOID */ + {2275, "cstring", false}, /* CSTRINGOID */ + {2276, "any", false}, /* ANYOID */ + {2277, "anyarray", false}, /* ANYARRAYOID 
*/ + {2278, "void", false}, /* VOIDOID */ + {2279, "trigger", false}, /* TRIGGEROID */ + {2280, "language_handler", false}, /* LANGUAGE_HANDLEROID */ + {2281, "internal", false}, /* INTERNALOID */ + {2283, "anyelement", false}, /* ANYELEMENTOID */ + {2776, "anynonarray", false}, /* ANYNONARRAYOID */ + {2950, "uuid", true}, /* UUIDOID */ + {2970, "txid_snapshot", false}, + {3115, "fdw_handler", false}, /* FDW_HANDLEROID */ + {3220, "pg_lsn", false}, /* PG_LSNOID */ + {3310, "tsm_handler", false}, /* TSM_HANDLEROID */ + {3361, "pg_ndistinct", false}, /* PG_NDISTINCTOID */ + {3402, "pg_dependencies", false}, /* PG_DEPENDENCIESOID */ + {3500, "anyenum", false}, /* ANYENUMOID */ + {3614, "tsvector", false}, /* TSVECTOROID */ + {3615, "tsquery", false}, /* TSQUERYOID */ + {3642, "gtsvector", false}, /* GTSVECTOROID */ + {3734, "regconfig", false}, /* REGCONFIGOID */ + {3769, "regdictionary", false}, /* REGDICTIONARYOID */ + {3802, "jsonb", false}, /* JSONBOID */ + {3831, "anyrange", false}, /* ANYRANGEOID */ + {3838, "event_trigger", false}, /* EVENT_TRIGGEROID */ + {3904, "int4range", false}, /* INT4RANGEOID */ + {3906, "numrange", false}, + {3908, "tsrange", false}, + {3910, "tstzrange", false}, + {3912, "daterange", false}, + {3926, "int8range", false}, + {4072, "jsonpath", false}, /* JSONPATHOID */ + {4089, "regnamespace", false}, /* REGNAMESPACEOID */ + {4096, "regrole", false}, /* REGROLEOID */ + {4191, "regcollation", false}, /* REGCOLLATIONOID */ + {4451, "int4multirange", false}, + {4532, "nummultirange", false}, + {4533, "tsmultirange", false}, + {4534, "tstzmultirange", false}, + {4535, "datemultirange", false}, + {4536, "int8multirange", false}, + {4537, "anymultirange", false}, + {4538, "anycompatiblemultirange", false}, + {4600, "pg_brin_bloom_summary", false}, /* PG_BRIN_BLOOM_SUMMARYOID */ + {4601, "pg_brin_minmax_multi_summary", false}, /* PG_BRIN_MINMAX_MULTI_SUMMARYOID */ + {5017, "pg_mcv_list", false}, /* PG_MCV_LISTOID */ + {5038, "pg_snapshot", 
false}, /* PG_SNAPSHOTOID */ + {5069, "xid8", false}, /* XID8OID */ + {5077, "anycompatible", false}, /* ANYCOMPATIBLEOID */ + {5078, "anycompatiblearray", false}, /* ANYCOMPATIBLEARRAYOID */ + {5079, "anycompatiblenonarray", false}, /* ANYCOMPATIBLENONARRAYOID */ + {5080, "anycompatiblerange", false}, /* ANYCOMPATIBLERANGEOID */ +}; + +/** + * Determine whether the type with the given OID is supported by VCI + * + * @param[in] oid OID (pg_type.oid) of the type to look up in vci_supported_type_table + * @return the is_support flag of the matching table entry; false if the OID is out of range or not listed + */ +bool +vci_is_supported_type(Oid oid) +{ + int min, + max, + pivot; + + if ((oid < VCI_SUPPORTED_TYPE_MIN) || (VCI_SUPPORTED_TYPE_MAX < oid)) + return false; /* outside the OID range of supported types: skip the search */ + + /* binary search; relies on vci_supported_type_table being sorted by ascending oid */ + + min = 0; + max = lengthof(vci_supported_type_table); /* exclusive */ + + while (max - min > 1) + { + Oid comp; + + pivot = (min + max) / 2; + + comp = vci_supported_type_table[pivot].oid; + + if (comp == oid) + return vci_supported_type_table[pivot].is_support; + else if (oid < comp) + max = pivot; + else /* comp < oid */ + min = pivot; + } + + if (max - min == 1) /* a single candidate remains, at index min */ + if (oid == vci_supported_type_table[min].oid) + return vci_supported_type_table[min].is_support; + + return false; +} + +/*==========================================================================*/ +/* Implementation of PG function to check supported types at CREATE EXTENSION: every vci_supported_type_table entry flagged is_support must have a pg_type row with that OID whose typname equals the recorded name, otherwise ERRCODE_FEATURE_NOT_SUPPORTED is raised */ +/*==========================================================================*/ + +PG_FUNCTION_INFO_V1(vci_check_supported_types); + +Datum +vci_check_supported_types(PG_FUNCTION_ARGS) +{ + Relation rel; + int i; + + rel = table_open(TypeRelationId, AccessShareLock); /* rel is never read below; NOTE(review): presumably opened only to hold AccessShareLock on pg_type during the check - confirm */ + + for (i = 0; i < lengthof(vci_supported_type_table); i++) + { + HeapTuple tuple; + Form_pg_type typeform; + + if (!vci_supported_type_table[i].is_support) + continue; /* entries VCI does not claim to support are not verified */ + + tuple = SearchSysCache1(TYPEOID, ObjectIdGetDatum(vci_supported_type_table[i].oid)); + if (!HeapTupleIsValid(tuple)) + goto error; /* no pg_type row with the recorded OID */ + + typeform = (Form_pg_type) 
GETSTRUCT(tuple); + + if (strcmp(vci_supported_type_table[i].name, NameStr(typeform->typname)) != 0) + goto error; + + ReleaseSysCache(tuple); + } + + table_close(rel, AccessShareLock); + + PG_RETURN_VOID(); + +error: + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("extension \"%s\" cannot be installed under this version of PostgreSQL", VCI_STRING))); + + PG_RETURN_VOID(); +} diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 43fe3bcd593e..07212641828c 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -4353,3 +4353,136 @@ yyscan_t z_stream z_streamp zic_t +ZZZ_VCI_TYPEDEFS_FOLLOW_ZZZ +vci_applicable_udf_template +vci_CmpInfo +CopyCommandInfo +message_on_worker_exit_t +CEKind +WosKind +vci_tid_tid_xid64_t +vci_MergeTidCridUpdateListContext +vcis_free_space_t +vci_id_t +vci_memory_entry_t +vci_memory_entries_t +vci_inner_plan_type_t +vci_plan_compat_t +vci_plan_attr_t +vci_param_exec_type_t +vci_param_exec_attr_t +vci_subplan_type_t +vci_subplan_attr_t +vci_rewrite_plan_context_t +vci_memory_entry_list_t +vci_devload_t +vci_ros_command_t +vci_offset_in_extent_t +vci_MainRelVar +vci_MainRelHeaderInfo +vcis_m_column_t +vcis_m_extent_t +vci_wmrv_t +vcis_attribute_type_t +vcis_compression_type_t +vcis_extent_type_t +vcis_tidcrid_item_type_t +vcis_dict_type_t +vcis_tid_crid_op_type_t +vci_local_delete_list +vci_local_ros_t +vci_RelationPair +vci_DictInfo +vci_meta_item_scanner_t +vci_wosros_conv_worker_arg_t +vci_workerslot_t +vci_tid_array_t +vci_blk_array_t +vci_RosCommandContext +vci_target_extent_info_t +vci_special_udf_info_t +Int8TransTypeData +VciTableScanPolicy +VciScanMode +VciFetchPos +vci_RebuildCommand +VciAggStatePerAggData +VciAggStatePerGroupData +RosChunkBuffer +RosChunkStorage +vcis_c_extent_t +vcis_c_common_dict_t +vcis_column_meta_t +vcis_extent_t +vci_ColumnRelations +vci_fetch_placeholder_t +vci_index_placeholder_t +vci_plan_info_t +vci_query_context_t 
+vci_vp_item_id +VciVPExecOp_func +VciVPNode +VciVPContext +VciProjectionInfoSlot +VciProjectionInfo +VciPlan +VciPlanState +VciScan +VciScanState +VciSort +VciSortState +VciAgg +VciAggStatePerAgg +VciAggStatePerGroup +VciAggState +VciAdvanceAggref_Func +VciGather +VciGatherState +VciVarState +VciParamState +vci_initexpr_t +vci_topmost_plan_cb_t +vci_mutator_t +VciCopyDatumFunc +vci_CSQueryContextData +vci_CSQueryContext +vci_seq_scan_buffer_t +vci_CSFetchContextData +vci_CSFetchContext +vci_minmax_t +vci_extent_status_t +vci_read_vector_status_t +vci_virtual_tuples_column_info_t +vci_virtual_tuples_t +VciGucStruct +VciShmemStruct +NumericDigit +NumericVar +vcis_Crid +vcis_tidcrid_meta_item_t +vcis_tidcrid_meta_t +vcis_tidcrid_pagetag_t +vcis_tidcrid_leaf_t +vcis_tidcrid_trunk_t +vcis_tidcrid_pair_item_t +vcis_tidcrid_pair_list_t +vci_TidCridUpdateListContext +vci_TidCridRelations +AggrefTransInfo +int8_t +int16_t +int32_t +uint8_t +uint16_t +uint32_t +int64_t +uint64_t +vci_search_vci_scan_context_t +FuncExprinfo +VciScalarArrayOpExprHashEntry +VciScalarArrayOpExprHashTable +vci_table_info_t +vci_gather_used_attrs_t +vci_renumber_attrs_t +father_gather_plans +node_info_t From c7c1c19d451dbdc739a011b1bdcce6431017a4d1 Mon Sep 17 00:00:00 2001 From: Peter Smith Date: Tue, 14 Oct 2025 16:34:51 +1100 Subject: [PATCH 3/3] VCI module - documentation --- doc/src/sgml/contrib.sgml | 1 + doc/src/sgml/filelist.sgml | 1 + doc/src/sgml/vci.sgml | 150 +++++++++++++++++++++++++++++++++++++ 3 files changed, 152 insertions(+) create mode 100644 doc/src/sgml/vci.sgml diff --git a/doc/src/sgml/contrib.sgml b/doc/src/sgml/contrib.sgml index 24b706b29adc..ca0db7fdc7b7 100644 --- a/doc/src/sgml/contrib.sgml +++ b/doc/src/sgml/contrib.sgml @@ -176,6 +176,7 @@ CREATE EXTENSION extension_name; &tsm-system-time; &unaccent; &uuid-ossp; + &vci; &xml2; diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index ac66fcbdb572..a73bdf779463 100644 --- a/doc/src/sgml/filelist.sgml 
+++ b/doc/src/sgml/filelist.sgml @@ -172,6 +172,7 @@ + diff --git a/doc/src/sgml/vci.sgml b/doc/src/sgml/vci.sgml new file mode 100644 index 000000000000..e4986b026b03 --- /dev/null +++ b/doc/src/sgml/vci.sgml @@ -0,0 +1,150 @@ + + + + vci — Vertical Clustered Index + + + vci + + + + vci provides a columnar store that is implemented using + PostgreSQL index access methods (see ). + All data can be stored in memory. + + + + This module must be loaded by adding vci to both + and + in postgresql.conf, as it requires additional shared memory. + A server restart is required to add or remove the module. + + + + Functions + + + + + vci_runs_in_query() returns bool + + vci_runs_in_query + + + + + + + Returns true if a VCI index and custom scan are used + in the current query execution. This function is typically used to verify + whether a query is processed using VCI. For example: + + +SELECT vci_runs_in_query() AS vci_runs_in_query, key, count(*) FROM test_table GROUP BY key; + + + + + + + + + + Parameters + + + + + vci.cost_threshold (integer) + + vci.cost_threshold configuration parameter + + + + + Specifies the CPU cost threshold beyond which the VCI control worker will + stop running. + + + + + + + vci.log_query (boolean) + + vci.log_query configuration parameter + + + + + Logs a message when a query cannot be executed using VCI. + + + + + + + vci.maintenance_work_mem (integer) + + vci.maintenance_work_mem configuration parameter + + + + + Specifies the maximum amount of memory that can be used by each VCI control + worker during maintenance operations. + + + + + + + TBD. There are many more parameters which are not yet documented. + + + + + Examples + + TBD. Add examples here. + + + + + Limitations + + + + + Currently, only a limited set of data types is supported for indexing with VCI. + + + + + This extension does not support the ALTER EXTENSION UPDATE command. + + + + + The command ALTER INDEX cannot be used for VCI indexes. 
+ + + + + VCI indexes cannot be used with the CLUSTER command. + + + + + + + + + Authors + + Aya Iwata iwata.aya@fujitsu.com, + Fujitsu Limited, Kanagawa, Japan + + + +