用 TPCH 练习性能优化 Q8
一、 SQL及分析
查询SQL语句如下:
select
o_year,
sum(case
when nation = 'CHINA' then volume
else 0
end) / sum(volume) as mkt_share
from
(
select
extract(year from o_orderdate) as o_year,
l_extendedprice * (1 - l_discount) as volume,
n2.n_name as nation
from
part,
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2,
region
where
p_partkey = l_partkey
and s_suppkey = l_suppkey
and l_orderkey = o_orderkey
and o_custkey = c_custkey
and c_nationkey = n1.n_nationkey
and n1.n_regionkey = r_regionkey
and r_name = 'ASIA'
and s_nationkey = n2.n_nationkey
and o_orderdate between date '1995-01-01' and date '1996-12-31'
and p_type = 'STANDARD POLISHED BRASS'
) all_nations
group by
o_year
order by
o_year;
这就是多表关联过滤后的分组聚合运算,其中包括主子表orders和lineitem关联。
二、 SPL实现
实现原理与Q5类似。
A |
|
1 |
=now() |
2 |
1995-01-01 |
3 |
1996-12-31 |
4 |
>nation="CHINA" |
5 |
>name="ASIA" |
6 |
>type="STANDARD POLISHED BRASS" |
7 |
=file("region.btx").import@b().select@1(R_NAME==name).R_REGIONKEY |
8 |
=file("nation.btx").import@b() |
9 |
=A8.select@1(N_NAME==nation).N_NATIONKEY |
10 |
=A8.select(N_REGIONKEY==A7).derive@o().keys(N_NATIONKEY) |
11 |
=file("supplier.ctx").open().cursor@m(S_SUPPKEY;S_NATIONKEY==A9).fetch().keys@im(S_SUPPKEY) |
12 |
=file("part.ctx").open().cursor@m(P_PARTKEY;P_TYPE==type).fetch().keys@im(P_PARTKEY) |
13 |
=file("customer.ctx").open().cursor@m(C_CUSTKEY;A10.find(C_NATIONKEY)).fetch().keys@im(C_CUSTKEY) |
14 |
=file("orders.ctx").open().cursor@m(O_ORDERKEY,O_ORDERDATE;O_ORDERDATE>=A2 && O_ORDERDATE <=A3 && A13.find(O_CUSTKEY)) |
15 |
=file("lineitem.ctx").open().news(A14,L_SUPPKEY,L_EXTENDEDPRICE,L_DISCOUNT,O_ORDERDATE;A12.find(L_PARTKEY)) |
16 |
=A15.run(L_EXTENDEDPRICE*=(1-L_DISCOUNT)) |
17 |
=A16.groups(year(O_ORDERDATE):o_year;sum(if(A11.find(L_SUPPKEY),L_EXTENDEDPRICE,0)):s1,sum(L_EXTENDEDPRICE):s2) |
18 |
=A17.new(o_year,s1/s2:mkt_share) |
19 |
=interval@ms(A1,now()) |
三、 进一步优化
1. 优化方法
本例中要使用Q1中介绍的日期整数化优化方法,orders中的O_ORDERDATE已在Q3中转换过了。要使用Q1中介绍的字符串整数化方法,part中的P_TYPE已在Q2中转换过了。还要使用Q2中介绍的维表主键序号化方法,region中的R_REGIONKEY、nation中的N_NATIONKEY和N_REGIONKEY、part中的P_PARTKEY、supplier中的S_SUPPKEY和S_NATIONKEY、customer中的C_CUSTKEY和C_NATIONKEY、orders中的O_CUSTKEY、lineitem中的L_SUPPKEY都已在之前的例子中转换过了,本例还要对lineitem中的L_PARTKEY进行序号化转换。
2. 数据转换代码
2.1 region、nation、part、supplier、customer、orders转换
复制region_5.btx、nation_7.btx、part_2.ctx、supplier_7.ctx、customer_7.ctx、orders_7.ctx分别重命名为region_8.btx、nation_8.btx、part_8.ctx、supplier_8.ctx、customer_8.ctx、orders_8.ctx。
2.2 lineitem转换
A |
|
1 |
=file("part.ctx").open().import(P_PARTKEY).keys@im(P_PARTKEY) |
2 |
=file("lineitem_7.ctx").open().cursor() |
3 |
=A2.run(L_PARTKEY=A1.pfind(L_PARTKEY)) |
4 |
=file("lineitem_8.ctx").create@p(#L_ORDERKEY,#L_LINENUMBER,L_PARTKEY, L_SUPPKEY, L_QUANTITY, L_EXTENDEDPRICE,L_DISCOUNT, L_TAX, L_RETURNFLAG, L_LINESTATUS, L_SHIPDATE, L_COMMITDATE, L_RECEIPTDATE,L_SHIPINSTRUCT, L_SHIPMODE, L_COMMENT) |
5 |
>A4.append(A3) |
3. 数据转换后的计算代码
维表和字符串字段取值列表需要预加载,加载代码如下:
A |
|
1 |
>env(p_type,file("p_type.txt").import@si()) |
2 |
>env(region, file("region_8.btx").import@b()) |
3 |
>env(nation, file("nation_8.btx").import@b()) |
4 |
>env(supplier, file("supplier_8.ctx").open().import()) |
5 |
>env(customer, file("customer_8.ctx").open().import()) |
6 |
>env(part, file("part_8.ctx").open().import()) |
在查询计算之前,需要先运行预加载代码,将小维表读入内存。
计算代码:
A |
|
1 |
=now() |
2 |
=days@o(date("1995-01-01")) |
3 |
=days@o(date("1996-12-31")) |
4 |
>nationName="CHINA" |
5 |
>name="ASIA" |
6 |
>type=p_type.pos@b("STANDARD POLISHED BRASS") |
7 |
=region.pselect(R_NAME==name) |
8 |
=nation.pselect(N_NAME==nationName) |
9 |
=nation.(N_REGIONKEY==A7) |
10 |
=supplier.@m(S_NATIONKEY==A8) |
11 |
=part.@m(P_TYPE==type) |
12 |
=customer.@m(A9(C_NATIONKEY)) |
13 |
=file("orders_8.ctx").open().cursor@m(O_ORDERKEY,O_ORDERDATE;O_ORDERDATE>=A2 && O_ORDERDATE <=A3 && A12(O_CUSTKEY)) |
14 |
=file("lineitem_8.ctx").open().news(A13,L_SUPPKEY,L_EXTENDEDPRICE,L_DISCOUNT,O_ORDERDATE;A11(L_PARTKEY)) |
15 |
=A14.run(L_EXTENDEDPRICE*=(1-L_DISCOUNT)) |
16 |
=A15.groups(year(O_ORDERDATE):o_year;sum(if(A10(L_SUPPKEY),L_EXTENDEDPRICE,0)):s1,sum(L_EXTENDEDPRICE):s2) |
17 |
=A16.new(o_year,s1/s2:mkt_share) |
18 |
=interval@ms(A1,now()) |
四、 使用企业版列式计算
1. 原始数据
A |
|
1 |
=now() |
2 |
1995-01-01 |
3 |
1996-12-31 |
4 |
>nation="CHINA" |
5 |
>name="ASIA" |
6 |
>type="STANDARD POLISHED BRASS" |
7 |
=file("region.btx").import@b().select@1(R_NAME==name).R_REGIONKEY |
8 |
=file("nation.btx").import@b() |
9 |
=A8.select@1(N_NAME==nation).N_NATIONKEY |
10 |
=A8.select(N_REGIONKEY==A7).derive@o().keys(N_NATIONKEY) |
11 |
=file("supplier.ctx").open().cursor@mv(S_SUPPKEY;S_NATIONKEY==A9).fetch().keys@im(S_SUPPKEY) |
12 |
=file("part.ctx").open().cursor@mv(P_PARTKEY;P_TYPE==type).fetch().keys@im(P_PARTKEY) |
13 |
=file("customer.ctx").open().cursor@mv(C_CUSTKEY;A10.find(C_NATIONKEY)).fetch().keys@im(C_CUSTKEY) |
14 |
=file("orders.ctx").open().cursor@mv(O_ORDERKEY,O_ORDERDATE;O_ORDERDATE>=A2 && O_ORDERDATE <=A3 && A13.find(O_CUSTKEY)) |
15 |
=file("lineitem.ctx").open().cursor@v(L_ORDERKEY,L_SUPPKEY,L_EXTENDEDPRICE,L_DISCOUNT;A12.find(L_PARTKEY);A14) |
16 |
=A15.derive@o(L_EXTENDEDPRICE*(1-L_DISCOUNT):dp) |
17 |
=A16.joinx@im(L_ORDERKEY,A14:O_ORDERKEY,O_ORDERDATE) |
18 |
=A16.groups(year(O_ORDERDATE):o_year;sum(if(A11.find(L_SUPPKEY),dp,0)):s1,sum(dp):s2) |
19 |
=A18.new(o_year,s1/s2:mkt_share) |
20 |
=interval@ms(A1,now()) |
这里orders与lineitem主子关联时没有使用news,而是同步游标后用joinx@im进行有序归并连接。当然继续使用news也是可以的,实测中用joinx要略快一点。
A16中用derive@o新增一列dp,避免在A18中会2次计算dp。
2. 优化后数据
维表和字符串字段取值列表需要预加载,加载代码如下:
A |
|
1 |
>env(p_type,file("p_type.txt").import@si()) |
2 |
>env(region, file("region_8.btx").import@bv()) |
3 |
>env(nation, file("nation_8.btx").import@bv()) |
4 |
>env(supplier, file("supplier_8.ctx").open().import@v()) |
5 |
>env(customer, file("customer_8.ctx").open().import@v()) |
6 |
>env(part, file("part_8.ctx").open().import@v()) |
在查询计算之前,需要先运行预加载代码,将小维表读入内存。
计算代码:
A |
|
1 |
=now() |
2 |
=days@o(date("1995-01-01")) |
3 |
=days@o(date("1996-12-31")) |
4 |
>nationName="CHINA" |
5 |
>name="ASIA" |
6 |
>type=p_type.pos@b("STANDARD POLISHED BRASS") |
7 |
=region.pselect(R_NAME==name) |
8 |
=nation.pselect(N_NAME==nationName) |
9 |
=nation.(N_REGIONKEY==A7) |
10 |
=supplier.(S_NATIONKEY==A8) |
11 |
=part.(P_TYPE==type) |
12 |
=customer.(A9(C_NATIONKEY)) |
13 |
=file("orders_8.ctx").open().cursor@mv(O_ORDERKEY,O_ORDERDATE;O_ORDERDATE>=A2 && O_ORDERDATE<=A3 && A12(O_CUSTKEY)) |
14 |
=file("lineitem_8.ctx").open().cursor@mv(L_ORDERKEY,L_SUPPKEY,L_EXTENDEDPRICE,L_DISCOUNT;A11(L_PARTKEY);A13) |
15 |
=A14.joinx@im(L_ORDERKEY,A13:O_ORDERKEY,O_ORDERDATE) |
16 |
=A15.derive@o(L_EXTENDEDPRICE*(1-L_DISCOUNT):dp) |
17 |
=A16.groups(year(O_ORDERDATE):o_year;sum(if(A10(L_SUPPKEY),dp,0)):s1,sum(dp):s2) |
18 |
=A17.new(o_year,s1/s2:mkt_share) |
19 |
=interval@ms(A1,now()) |
五、 测试结果
单位:秒
常规 |
列式 |
|
优化前 |
20.5 |
8.0 |
优化后 |
13.3 |
5.1 |
英文版