-
Notifications
You must be signed in to change notification settings - Fork 4.1k
opt: computed columns not derived when pushed through lookup join #63735
Description
note: the name on this might be very off, I was just guessing
Statement diagnostics of relevant query: stmt-bundle-650217270527393794.zip
In multi-region TPC-C, we have a schema that looks like the following:
CREATE TABLE public.order_line (
ol_o_id INT8 NOT NULL,
ol_d_id INT8 NOT NULL,
ol_w_id INT8 NOT NULL,
ol_number INT8 NOT NULL,
ol_i_id INT8 NOT NULL,
ol_supply_w_id INT8 NULL,
ol_delivery_d TIMESTAMP NULL,
ol_quantity INT8 NULL,
ol_amount DECIMAL(6,2) NULL,
ol_dist_info CHAR(24) NULL,
crdb_region public.crdb_internal_region NOT VISIBLE NOT NULL AS (CASE WHEN ol_w_id BETWEEN 0:::INT8 AND 1665:::INT8 THEN 'us-east1':::public.crdb_internal_region WHEN ol_w_id BETWEEN 1666:::INT8 AND 3332:::INT8 THEN 'us-west1':::public.crdb_internal_region WHEN ol_w_id BETWEEN 3333:::INT8 AND 4999:::INT8 THEN 'europe-west2':::public.crdb_internal_region END) STORED,
CONSTRAINT "primary" PRIMARY KEY (ol_w_id ASC, ol_d_id ASC, ol_o_id DESC, ol_number ASC),
CONSTRAINT fk_ol_w_id_ref_order FOREIGN KEY (ol_w_id, ol_d_id, ol_o_id) REFERENCES public."order"(o_w_id, o_d_id, o_id) NOT VALID,
CONSTRAINT fk_ol_supply_w_id_ref_stock FOREIGN KEY (ol_supply_w_id, ol_i_id) REFERENCES public.stock(s_w_id, s_i_id) NOT VALID,
FAMILY "primary" (ol_o_id, ol_d_id, ol_w_id, ol_number, ol_i_id, ol_supply_w_id, ol_delivery_d, ol_quantity, ol_amount, ol_dist_info, crdb_region)
) LOCALITY REGIONAL BY ROW;
CREATE TABLE public.stock (
s_i_id INT8 NOT NULL,
s_w_id INT8 NOT NULL,
s_quantity INT8 NULL,
s_dist_01 CHAR(24) NULL,
s_dist_02 CHAR(24) NULL,
s_dist_03 CHAR(24) NULL,
s_dist_04 CHAR(24) NULL,
s_dist_05 CHAR(24) NULL,
s_dist_06 CHAR(24) NULL,
s_dist_07 CHAR(24) NULL,
s_dist_08 CHAR(24) NULL,
s_dist_09 CHAR(24) NULL,
s_dist_10 CHAR(24) NULL,
s_ytd INT8 NULL,
s_order_cnt INT8 NULL,
s_remote_cnt INT8 NULL,
s_data VARCHAR(50) NULL,
crdb_region public.crdb_internal_region NOT VISIBLE NOT NULL AS (CASE WHEN s_w_id BETWEEN 0:::INT8 AND 1665:::INT8 THEN 'us-east1':::public.crdb_internal_region WHEN s_w_id BETWEEN 1666:::INT8 AND 3332:::INT8 THEN 'us-west1':::public.crdb_internal_region WHEN s_w_id BETWEEN 3333:::INT8 AND 4999:::INT8 THEN 'europe-west2':::public.crdb_internal_region END) STORED,
CONSTRAINT "primary" PRIMARY KEY (s_w_id ASC, s_i_id ASC),
CONSTRAINT fk_s_w_id_ref_warehouse FOREIGN KEY (s_w_id) REFERENCES public.warehouse(w_id) NOT VALID,
CONSTRAINT fk_s_i_id_ref_item FOREIGN KEY (s_i_id) REFERENCES public.item(i_id) NOT VALID,
FAMILY "primary" (s_i_id, s_w_id, s_quantity, s_dist_01, s_dist_02, s_dist_03, s_dist_04, s_dist_05, s_dist_06, s_dist_07, s_dist_08, s_dist_09, s_dist_10, s_ytd, s_order_cnt, s_remote_cnt, s_data, crdb_region)
) LOCALITY REGIONAL BY ROW;Notice that each table has a crdb_region column which is computed from its warehouse column (ol_w_id for order_line, s_w_id for stock).
One of the "Stock Level" transaction statements runs a join against these two tables. The statement constraints the warehouse sufficiently so that we should be able to understand that we can search locally, but we don't, so the query is executed with a remote scan that bumps it from about 10ms to about 100ms:
root@localhost:26257/tpcc> EXPLAIN SELECT
count(*)
FROM
(
SELECT
DISTINCT s_i_id
FROM
order_line JOIN stock ON s_i_id = ol_i_id AND s_w_id = ol_w_id
WHERE
ol_w_id = 1 AND ol_d_id = 6 AND ol_o_id BETWEEN (3023 - 20) AND (3023 - 1) AND s_quantity < 14
);
info
---------------------------------------------------------------------------------------------------
distribution: full
vectorized: true
• group (scalar)
│
└── • distinct
│ distinct on: s_i_id
│
└── • lookup join
│ table: stock@primary
│ equality: (lookup_join_const_col_@31, ol_w_id, ol_i_id) = (crdb_region,s_w_id,s_i_id)
│ equality cols are key
│ pred: (s_w_id = 1) AND (s_quantity < 14)
│
└── • cross join
│
├── • values
│ size: 1 column, 3 rows
│
└── • scan
missing stats
table: order_line@primary
spans: [/'us-east1'/1/6/3022 - /'us-east1'/1/6/3003]
(23 rows)
Time: 2ms total (execution 2ms / network 0ms)
root@localhost:26257/tpcc> SELECT
count(*)
FROM
(
SELECT
DISTINCT s_i_id
FROM
order_line JOIN stock ON s_i_id = ol_i_id AND s_w_id = ol_w_id
WHERE
ol_w_id = 1 AND ol_d_id = 6 AND ol_o_id BETWEEN (3023 - 20) AND (3023 - 1) AND s_quantity < 14
);
count
---------
9
(1 row)
Time: 94ms total (execution 93ms / network 0ms)If we manually push the s_w_id through the join, things still don't improve:
root@localhost:26257/tpcc> EXPLAIN SELECT
count(*)
FROM
(
SELECT
DISTINCT s_i_id
FROM
order_line JOIN stock ON s_i_id = ol_i_id
WHERE
ol_w_id = 1 AND s_w_id = 1 AND ol_d_id = 6 AND ol_o_id BETWEEN (3023 - 20) AND (3023 - 1) AND s_quantity < 14
);
info
---------------------------------------------------------------------------------------------------------------------
distribution: full
vectorized: true
• group (scalar)
│
└── • distinct
│ distinct on: s_i_id
│
└── • lookup join
│ table: stock@primary
│ equality: (lookup_join_const_col_@31, lookup_join_const_col_@15, ol_i_id) = (crdb_region,s_w_id,s_i_id)
│ equality cols are key
│ pred: s_quantity < 14
│
└── • render
│
└── • cross join
│
├── • values
│ size: 1 column, 3 rows
│
└── • scan
missing stats
table: order_line@primary
spans: [/'us-east1'/1/6/3022 - /'us-east1'/1/6/3003]
(25 rows)
Time: 3ms total (execution 2ms / network 0ms)
root@localhost:26257/tpcc> SELECT
count(*)
FROM
(
SELECT
DISTINCT s_i_id
FROM
order_line JOIN stock ON s_i_id = ol_i_id
WHERE
ol_w_id = 1 AND s_w_id = 1 AND ol_d_id = 6 AND ol_o_id BETWEEN (3023 - 20) AND (3023 - 1) AND s_quantity < 14
);
count
---------
9
(1 row)
Time: 97ms total (execution 97ms / network 0ms)However, if we also manually derive the stock table's crdb_region column, things remain local to the gateway region. We can see this in both a change in query plan and a large speedup:
root@localhost:26257/tpcc> EXPLAIN SELECT
count(*)
FROM
(
SELECT
DISTINCT s_i_id
FROM
order_line JOIN stock ON s_i_id = ol_i_id
WHERE
ol_w_id = 1 AND s_w_id = 1 AND stock.crdb_region = 'us-east1' AND ol_d_id = 6 AND ol_o_id BETWEEN (3023 - 20) AND (3023 - 1) AND s_quantity < 14
);
info
---------------------------------------------------------------------------------------------------------------------
distribution: full
vectorized: true
• group (scalar)
│
└── • distinct
│ distinct on: s_i_id
│
└── • lookup join
│ table: stock@primary
│ equality: (lookup_join_const_col_@31, lookup_join_const_col_@15, ol_i_id) = (crdb_region,s_w_id,s_i_id)
│ equality cols are key
│ pred: s_quantity < 14
│
└── • render
│
└── • scan
missing stats
table: order_line@primary
spans: [/'us-east1'/1/6/3022 - /'us-east1'/1/6/3003]
(20 rows)
Time: 3ms total (execution 3ms / network 0ms)
root@localhost:26257/tpcc> SELECT
count(*)
FROM
(
SELECT
DISTINCT s_i_id
FROM
order_line JOIN stock ON s_i_id = ol_i_id
WHERE
ol_w_id = 1 AND s_w_id = 1 AND stock.crdb_region = 'us-east1' AND ol_d_id = 6 AND ol_o_id BETWEEN (3023 - 20) AND (3023 - 1) AND s_quantity < 14
);
count
---------
9
(1 row)
Time: 9ms total (execution 9ms / network 0ms)cc. @rytaft for triage.
Metadata
Metadata
Assignees
Labels
Type
Projects
Status