用 TPCH 练习性能优化 Q19
一、 SQL及分析
查询SQL语句如下:
select
sum(l_extendedprice * (1 - l_discount)) as revenue
from
lineitem,part
where
(
p_partkey = l_partkey
and p_brand = 'Brand#32'
and p_container in ('SM CASE', 'SM BOX', 'SM PACK', 'SM PKG')
and l_quantity >= 7 and l_quantity <= 7 + 10
and p_size between 1 and 5
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#23'
and p_container in ('MED BAG', 'MED BOX', 'MED PKG', 'MED PACK')
and l_quantity >= 18 and l_quantity <= 18 + 10
and p_size between 1 and 10
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
)
or
(
p_partkey = l_partkey
and p_brand = 'Brand#45'
and p_container in ('LG CASE', 'LG BOX', 'LG PACK', 'LG PKG')
and l_quantity >= 22 and l_quantity <= 22 + 10
and p_size between 1 and 15
and l_shipmode in ('AIR', 'AIR REG')
and l_shipinstruct = 'DELIVER IN PERSON'
);
这是两表连接的过滤查询汇总,结构上比较简单,但是过滤条件比较复杂。
二、 SPL实现
条件表达式中有不少项只与较小的表part相关,而且这些项的计算复杂度还较高(有in运算),如果先针对part表把这些项计算出来,则计算量将比针对两表连接结果集再计算要少得多,因为连接结果集是和lineitem表规模相同,要大得多。
A |
|
1 |
=now() |
2 |
>brand1="Brand#32" |
3 |
>brand2="Brand#23" |
4 |
>brand3="Brand#45" |
5 |
>quantity1=7 |
6 |
>quantity2=18 |
7 |
>quantity3=22 |
8 |
=["SM CASE", "SM BOX", "SM PACK", "SM PKG"] |
9 |
=["MED BAG", "MED BOX", "MED PKG", "MED PACK"] |
10 |
=["LG CASE", "LG BOX", "LG PACK", "LG PKG"] |
11 |
=["AIR","AIR REG"] |
12 |
=file("part.ctx").open().cursor@m(P_PARTKEY,P_BRAND,P_SIZE,P_CONTAINER,0:FLAG;P_SIZE>=1) |
13 |
=A12.run(FLAG=if(P_BRAND==brand1 && A8.contain(P_CONTAINER) && P_SIZE<=5:1, P_BRAND==brand2 && A9.contain(P_CONTAINER) && P_SIZE<=10:2, P_BRAND==brand3 && A10.contain(P_CONTAINER) && P_SIZE<=15:3; 0)).select(FLAG>0).fetch@o().keys@im(P_PARTKEY) |
14 |
=file("lineitem.ctx").open().cursor@m(L_PARTKEY,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT;L_PARTKEY:A13,L_SHIPINSTRUCT=="DELIVER IN PERSON" && A11.contain(L_SHIPMODE) && L_QUANTITY>= quantity1 && L_QUANTITY<=quantity3+10) |
15 |
=A14.groups(;sum(if(case(L_PARTKEY.FLAG,1:L_QUANTITY<=quantity1+10,2:L_QUANTITY>=quantity2 && L_QUANTITY<=quantity2+10;L_QUANTITY>=quantity3),L_EXTENDEDPRICE*(1-L_DISCOUNT),0))) |
16 |
=interval@ms(A1,now()) |
A12在part表中新产生一个FLAG字段,用于将part相关的复杂条件项先计算出来。当记录满足SQL中的or条件1、2、3时分别赋值为1、2、3,都不满足则赋值0,然后用FLAG值是否大于0来过滤。在A15中再根据FLAG的值来选择用哪个quantity值对L_QUANTITY进行过滤。FLAG相关的复杂计算式只需要计算part表的行数次。
三、 进一步优化
1. 优化方法
本例中要使用Q1中介绍的字符串整数化方法,part中的P_BRAND和P_CONTAINER已分别在Q16、Q17中转换过了,lineitem中的L_SHIPMODE已经在Q12中转换过了,本例中还需要转换L_SHIPINSTRUCT。还要使用Q2中介绍的维表主键序号化方法,part中的P_PARTKEY、lineitem中的L_PARTKEY都已在之前的例子中转换过了。
2. 数据转换代码
2.1 part转换
复制part_17.ctx重命名为part_19.ctx。
2.2 lineitem转换
A |
|
1 |
=file("lineitem.ctx").open().cursor(L_SHIPINSTRUCT) |
2 |
=A1.id(L_SHIPINSTRUCT).sort() |
3 |
=file("l_shipinstruct.txt").export(A2) |
4 |
=file("lineitem_18.ctx").open().cursor() |
5 |
=A4.run(L_SHIPINSTRUCT=A2.pos@b(L_SHIPINSTRUCT)) |
6 |
=file("lineitem_19.ctx").create@p(#L_ORDERKEY,#L_LINENUMBER,L_PARTKEY, L_SUPPKEY, L_QUANTITY, L_EXTENDEDPRICE,L_DISCOUNT, L_TAX, L_RETURNFLAG, L_LINESTATUS, L_SHIPDATE, L_COMMITDATE, L_RECEIPTDATE,L_SHIPINSTRUCT, L_SHIPMODE, L_COMMENT) |
7 |
>A6.append(A5) |
3. 数据转换后的计算代码
维表和字符串字段取值列表需要预加载,加载代码如下:
A |
|
1 |
>env(p_brand,file("p_brand.txt").import@si()) |
2 |
>env(p_container,file("p_container.txt").import@si()) |
3 |
>env(l_shipinstruct,file("l_shipinstruct.txt").import@si()) |
4 |
>env(l_shipmode,file("l_shipmode.txt").import@si()) |
5 |
>env(part, file("part_19.ctx").open().import()) |
在查询计算之前,需要先运行预加载代码,将小维表读入内存。
计算代码:
A |
|
1 |
=now() |
2 |
>brand1=p_brand.pos@b("Brand#32") |
3 |
>brand2=p_brand.pos@b("Brand#23") |
4 |
>brand3=p_brand.pos@b("Brand#45") |
5 |
>quantity1=7 |
6 |
>quantity2=18 |
7 |
>quantity3=22 |
8 |
=p_container.(["SM CASE", "SM BOX", "SM PACK", "SM PKG"].contain(~)) |
9 |
=p_container.(["MED BAG", "MED BOX", "MED PKG", "MED PACK"].contain(~)) |
10 |
=p_container.(["LG CASE", "LG BOX", "LG PACK", "LG PKG"].contain(~)) |
11 |
=l_shipmode.(["AIR","AIR REG"].contain(~)) |
12 |
=l_shipinstruct.pos@b("DELIVER IN PERSON") |
13 |
=part.@m(if(P_SIZE<1:null, P_BRAND==brand1 && A8(P_CONTAINER) && P_SIZE<=5:1, P_BRAND==brand2 && A9(P_CONTAINER) && P_SIZE<=10:2, P_BRAND==brand3 && A10(P_CONTAINER) && P_SIZE<=15:3; null)) |
14 |
=file("lineitem_19.ctx").open().cursor@m(L_PARTKEY,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT;L_PARTKEY:A13:#,L_SHIPINSTRUCT==A12 && A11(L_SHIPMODE) && L_QUANTITY>= quantity1 && L_QUANTITY<=quantity3+10) |
15 |
=A14.groups(;sum(if(case(L_PARTKEY,1:L_QUANTITY<=quantity1+10,2:L_QUANTITY>=quantity2 && L_QUANTITY<=quantity2+10,3:L_QUANTITY>=quantity3),L_EXTENDEDPRICE*(1-L_DISCOUNT),0))) |
16 |
=interval@ms(A1,now()) |
四、 使用企业版列式计算
1. 原始数据
A |
|
1 |
=now() |
2 |
>brand1="Brand#32" |
3 |
>brand2="Brand#23" |
4 |
>brand3="Brand#45" |
5 |
>quantity1=7 |
6 |
>quantity2=18 |
7 |
>quantity3=22 |
8 |
=["SM CASE", "SM BOX", "SM PACK", "SM PKG"].i() |
9 |
=["MED BAG", "MED BOX", "MED PKG", "MED PACK"].i() |
10 |
=["LG CASE", "LG BOX", "LG PACK", "LG PKG"].i() |
11 |
=["AIR","AIR REG"].i() |
12 |
=file("part.ctx").open().cursor@mv(P_PARTKEY,P_BRAND,P_SIZE,P_CONTAINER;P_SIZE>=1) |
13 |
=A12.derive@o(if(P_BRAND==brand1 && A8.contain(P_CONTAINER) && P_SIZE<=5:1, P_BRAND==brand2 && A9.contain(P_CONTAINER) && P_SIZE<=10:2, P_BRAND==brand3 && A10.contain(P_CONTAINER) && P_SIZE<=15:3; 0):FLAG).select@v(FLAG>0).fetch@o().keys@im(P_PARTKEY) |
14 |
=file("lineitem.ctx").open().cursor@mv(L_PARTKEY,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT;L_PARTKEY:A13,L_QUANTITY>= quantity1 && L_QUANTITY<=quantity3+10 && L_SHIPINSTRUCT=="DELIVER IN PERSON" && A11.contain(L_SHIPMODE)) |
15 |
=A14.groups(;sum(if(case(L_PARTKEY.FLAG,1:L_QUANTITY<=quantity1+10,2:L_QUANTITY>=quantity2 && L_QUANTITY<=quantity2+10;L_QUANTITY>=quantity3),L_EXTENDEDPRICE*(1-L_DISCOUNT),0))) |
16 |
=interval@ms(A1,now()) |
2. 优化后数据
维表和字符串字段取值列表需要预加载,加载代码如下:
A |
|
1 |
>env(p_brand,file("p_brand.txt").import@si()) |
2 |
>env(p_container,file("p_container.txt").import@si()) |
3 |
>env(l_shipinstruct,file("l_shipinstruct.txt").import@si()) |
4 |
>env(l_shipmode,file("l_shipmode.txt").import@si()) |
5 |
>env(part, file("part_19.ctx").open().import@v()) |
在查询计算之前,需要先运行预加载代码,将小维表读入内存。
计算代码:
A |
|
1 |
=now() |
2 |
>brand1=p_brand.pos@b("Brand#32") |
3 |
>brand2=p_brand.pos@b("Brand#23") |
4 |
>brand3=p_brand.pos@b("Brand#45") |
5 |
>quantity1=7 |
6 |
>quantity2=18 |
7 |
>quantity3=22 |
8 |
=p_container.(["SM CASE", "SM BOX", "SM PACK", "SM PKG"].contain(~)) |
9 |
=p_container.(["MED BAG", "MED BOX", "MED PKG", "MED PACK"].contain(~)) |
10 |
=p_container.(["LG CASE", "LG BOX", "LG PACK", "LG PKG"].contain(~)) |
11 |
=l_shipmode.(["AIR","AIR REG"].contain(~)) |
12 |
=l_shipinstruct.pos@b("DELIVER IN PERSON") |
13 |
=part.@m(if(P_SIZE<1:null, P_BRAND==brand1 && A8(P_CONTAINER) && P_SIZE<=5:1, P_BRAND==brand2 && A9(P_CONTAINER) && P_SIZE<=10:2, P_BRAND==brand3 && A10(P_CONTAINER) && P_SIZE<=15:3; null)) |
14 |
=file("lineitem_19.ctx").open().cursor@mv(L_PARTKEY,L_QUANTITY,L_EXTENDEDPRICE,L_DISCOUNT;L_PARTKEY:A13:#,L_SHIPINSTRUCT==A12 && A11(L_SHIPMODE) && L_QUANTITY>= quantity1 && L_QUANTITY<=quantity3+10) |
15 |
=A14.groups(;sum(if(case(L_PARTKEY,1:L_QUANTITY<=quantity1+10,2:L_QUANTITY>=quantity2 && L_QUANTITY<=quantity2+10,3:L_QUANTITY>=quantity3),L_EXTENDEDPRICE*(1-L_DISCOUNT),0))) |
16 |
=interval@ms(A1,now()) |
五、 测试结果
单位:秒
常规 |
列式 |
|
优化前 |
8.2 |
6.2 |
优化后 |
7.0 |
4.8 |
英文版