用 TPCH 练习性能优化 Q7
一、 SQL及分析
查询SQL语句如下:
select
supp_nation,
cust_nation,
l_year,
sum(volume) as revenue
from
(
select
n1.n_name as supp_nation,
n2.n_name as cust_nation,
extract(year from l_shipdate) as l_year,
l_extendedprice * (1 - l_discount) as volume
from
supplier,
lineitem,
orders,
customer,
nation n1,
nation n2
where
s_suppkey = l_suppkey
and o_orderkey = l_orderkey
and c_custkey = o_custkey
and s_nationkey = n1.n_nationkey
and c_nationkey = n2.n_nationkey
and (
(n1.n_name = 'CHINA' and n2.n_name = 'RUSSIA')
or (n1.n_name = 'RUSSIA' and n2.n_name = 'CHINA')
)
and l_shipdate between date '1995-01-01' and date '1996-12-31'
) shipping
group by
supp_nation,
cust_nation,
l_year
order by
supp_nation,
cust_nation,
l_year;
这就是多表关联过滤后的分组聚合运算,其中包括主子表orders和lineitem关联。
二、 SPL实现
实现原理与Q5类似。
A |
|
1 |
=now() |
2 |
1995-01-01 |
3 |
1996-12-31 |
4 |
>name1="CHINA" |
5 |
>name2="RUSSIA" |
6 |
=file("nation.btx").import@b().keys@i(N_NATIONKEY) |
7 |
>n1=A6.select@1(N_NAME==name1).N_NATIONKEY |
8 |
>n2=A6.select@1(N_NAME==name2).N_NATIONKEY |
9 |
=file("supplier.ctx").open().cursor@m(S_SUPPKEY,S_NATIONKEY;S_NATIONKEY==n1 || S_NATIONKEY==n2).fetch().keys@im(S_SUPPKEY) |
10 |
=file("customer.ctx").open().cursor@m(C_CUSTKEY,C_NATIONKEY;C_NATIONKEY==n1 || C_NATIONKEY==n2).fetch().keys@im(C_CUSTKEY) |
11 |
=file("orders.ctx").open().cursor@m(O_ORDERKEY,O_CUSTKEY;O_CUSTKEY:A10) |
12 |
=file("lineitem.ctx").open().news(A11,L_SUPPKEY,L_EXTENDEDPRICE,L_DISCOUNT,L_SHIPDATE,O_CUSTKEY;L_SHIPDATE>=A2 && L_SHIPDATE <=A3,L_SUPPKEY:A9) |
13 |
=A12.select(O_CUSTKEY.C_NATIONKEY!=L_SUPPKEY.S_NATIONKEY) |
14 |
=A13.groups(L_SUPPKEY.S_NATIONKEY:supp_nation,O_CUSTKEY.C_NATIONKEY:cust_nation,year(L_SHIPDATE):l_year; sum(L_EXTENDEDPRICE*(1-L_DISCOUNT)): volume) |
15 |
=A14.run(supp_nation=A6.find(supp_nation).N_NAME,cust_nation=A6.find(cust_nation).N_NAME) |
16 |
=interval@ms(A1,now()) |
注意nation表数据A6在A7和A8分别使用了一次,用于外键匹配过滤,这和SQL的别名写法不同。
三、 进一步优化
1. 优化方法
本例中要使用Q1中介绍的日期整数化优化方法,lineitem中的L_SHIPDATE已在Q1中转换过了。还要使用Q2中介绍的维表主键序号化方法,nation中的N_NATIONKEY和N_REGIONKEY、supplier中的S_SUPPKEY和S_NATIONKEY、customer中的C_CUSTKEY和C_NATIONKEY、orders中的O_CUSTKEY、lineitem中的L_SUPPKEY都已在之前的例子中转化过了。
2. 数据转换代码
复制nation_5.btx、supplier_5.ctx、customer_5.ctx、orders_5.ctx、lineitem_5.ctx分别重命名为nation_7.btx、supplier_7.ctx、customer_7.ctx、orders_7.ctx、lineitem_7.ctx。
3. 数据转换后的计算代码
维表和字符串字段取值列表需要预加载,加载代码如下:
A |
|
1 |
>env(nation, file("nation_7.btx").import@b()) |
2 |
>env(customer, file("customer_7.ctx").open().import()) |
3 |
>env(supplier, file("supplier_7.ctx").open().import()) |
在查询计算之前,需要先运行预加载代码,将小维表读入内存。
计算代码:
A |
|
1 |
=now() |
2 |
=days@o(date("1995-01-01")) |
3 |
=days@o(date("1996-12-31")) |
4 |
[CHINA,RUSSIA] |
5 |
=nation.(A4.pos(N_NAME)) |
6 |
=supplier.@m(if(A5(S_NATIONKEY),S_NATIONKEY,null)) |
7 |
=customer.@m(if(A5(C_NATIONKEY),C_NATIONKEY,null)) |
8 |
=file("orders_7.ctx").open().cursor@m(O_ORDERKEY,O_CUSTKEY;O_CUSTKEY:A7:#) |
9 |
=file("lineitem_7.ctx").open().news(A8,L_SUPPKEY,L_EXTENDEDPRICE,L_DISCOUNT,L_SHIPDATE,O_CUSTKEY;L_SHIPDATE>=A2 && L_SHIPDATE <=A3,L_SUPPKEY:A6:#) |
10 |
=A9.select(O_CUSTKEY!=L_SUPPKEY).run(O_CUSTKEY=A5(O_CUSTKEY),L_SUPPKEY=A5(L_SUPPKEY)) |
11 |
=A10.groups@u(year(L_SHIPDATE)*9+L_SUPPKEY*3+O_CUSTKEY:gk; sum(L_EXTENDEDPRICE*(1-L_DISCOUNT)): volume) |
12 |
=A11.new(A4(gk%9\3):supp_nation,A4(gk%3):cust_nation,gk\9:l_year,volume).sort(supp_nation,cust_nation,l_year) |
13 |
=interval@ms(A1,now()) |
A10中把O_CUSTKEY和L_SUPPKEY替换成A5对位序列中的值,即1或2。A11中使用Q1中介绍的分组键技巧,把按3个字段分组转变成按一个字段分组。
A11中日期整数化以后仍然可以用year函数计算出它所属年份。
其它技巧类同Q5。
四、 使用企业版列式计算
1. 原始数据
A |
|
1 |
=now() |
2 |
1995-01-01 |
3 |
1996-12-31 |
4 |
>name1="CHINA" |
5 |
>name2="RUSSIA" |
6 |
=file("nation.btx").import@b().keys@i(N_NATIONKEY) |
7 |
>n1=A6.select@1(N_NAME==name1).N_NATIONKEY |
8 |
>n2=A6.select@1(N_NAME==name2).N_NATIONKEY |
9 |
=file("supplier.ctx").open().cursor@mv(S_SUPPKEY,S_NATIONKEY;S_NATIONKEY==n1 || S_NATIONKEY==n2).fetch().keys@im(S_SUPPKEY) |
10 |
=file("customer.ctx").open().cursor@mv(C_CUSTKEY,C_NATIONKEY;C_NATIONKEY==n1 || C_NATIONKEY==n2).fetch().keys@im(C_CUSTKEY) |
11 |
=file("orders.ctx").open().cursor@mv(O_ORDERKEY,O_CUSTKEY;O_CUSTKEY:A10) |
12 |
=file("lineitem.ctx").open().news(A11,L_SUPPKEY,L_EXTENDEDPRICE,L_DISCOUNT,L_SHIPDATE,O_CUSTKEY;L_SHIPDATE>=A2 && L_SHIPDATE<=A3,L_SUPPKEY:A9) |
13 |
=A12.select@v(O_CUSTKEY.C_NATIONKEY!=L_SUPPKEY.S_NATIONKEY) |
14 |
=A13.groups(L_SUPPKEY.S_NATIONKEY:supp_nation,O_CUSTKEY.C_NATIONKEY:cust_nation,year(L_SHIPDATE):l_year; sum(L_EXTENDEDPRICE*(1-L_DISCOUNT)): volume) |
15 |
=A14.new(A6.find(supp_nation).N_NAME:supp_nation,A6.find(cust_nation).N_NAME:cust_nation,l_year,volume) |
16 |
=interval@ms(A1,now()) |
2. 优化后数据
维表和字符串字段取值列表需要预加载,加载代码如下:
A |
|
1 |
>env(nation, file("nation_7.btx").import@bv()) |
2 |
>env(customer, file("customer_7.ctx").open().import@v()) |
3 |
>env(supplier, file("supplier_7.ctx").open().import@v()) |
在查询计算之前,需要先运行预加载代码,将小维表读入内存。
计算代码:
A |
|
1 |
=now() |
2 |
=days@o(date("1995-01-01")) |
3 |
=days@o(date("1996-12-31")) |
4 |
[CHINA,RUSSIA] |
5 |
=nation.(A4.pos(N_NAME)) |
6 |
=supplier.(A5(S_NATIONKEY)) |
7 |
=customer.(A5(C_NATIONKEY)) |
8 |
=file("orders_7.ctx").open().cursor@mv(O_ORDERKEY,O_CUSTKEY;A7(O_CUSTKEY)) |
9 |
=file("lineitem_7.ctx").open().news(A8,L_SUPPKEY,L_EXTENDEDPRICE,L_DISCOUNT,L_SHIPDATE,O_CUSTKEY;L_SHIPDATE>=A2 && L_SHIPDATE<=A3 && A6(L_SUPPKEY)) |
10 |
=A9.derive@o(A5(supplier(L_SUPPKEY).S_NATIONKEY):s,A5(customer(O_CUSTKEY).C_NATIONKEY):c).select@v(s!=c) |
11 |
=A10.groups@u(year(L_SHIPDATE)*9+c*3+s:gk; sum(L_EXTENDEDPRICE*(1-L_DISCOUNT)):volume) |
12 |
=A11.new(A4(gk%3):supp_nation,A4(gk%9\3):cust_nation,gk\9:l_year,volume).sort(supp_nation,cust_nation,l_year) |
13 |
=interval@ms(A1,now()) |
五、 测试结果
单位:秒
常规 |
列式 |
|
优化前 |
16.0 |
10.1 |
优化后 |
11.0 |
6.6 |
英文版