面試|HiveSQL常用的一些小技巧
SORT_ARRAY
函數(shù)聲明如下。
ARRAY?sort_array(ARRAY)
用途:對(duì)給定數(shù)組中的數(shù)據(jù)排序。
參數(shù)說明:ARRAY
返回值:ARRAY類型。
示例如下。
--建表
CREATE?TABLE?sort_array
(
????c1?ARRAY<STRING>
????,c2?ARRAY<INT>
)
;
--裝載數(shù)據(jù)
INSERT?OVERWRITE?TABLE?sort_array
SELECT??array('d','c','b','a')??AS?c1
????????,array(4,3,2,1)?????????AS?c2
;
--查詢
SELECT??sort_array(c1)
????????,sort_array(c2)
FROM????sort_array
;
--結(jié)果
["a","b","c","d"]?[1,2,3,4]
分析函數(shù)
基本語法
analytic_function_name([argument_list])
OVER?(
[PARTITION?BY?partition_expression,…]
[ORDER?BY?sort_expression,?…?[ASC|DESC]])
analytic_function_name: 函數(shù)名稱 — 比如 RANK(), SUM(), FIRST()等等 partition_expression: 分區(qū)列 sort_expression: 排序列
案例
數(shù)據(jù)準(zhǔn)備
CREATE?TABLE?`orders_new`?(
????`order_num`?String?COMMENT?'訂單號(hào)',
????`order_amount`?DECIMAL?(?12,?2?)?COMMENT?'訂單金額',
????`advance_amount`?DECIMAL?(?12,?2?)?COMMENT?'預(yù)付款',
????`order_date`?string?COMMENT?'訂單日期',
????`cust_code`?string?COMMENT?'客戶',
????`agent_code`?string?COMMENT?'代理商'?
);
INSERT?INTO?orders?VALUES('200100',?'1000.00',?'600.00',?'2020-08-01',?'C00013',?'A003');
INSERT?INTO?orders?VALUES('200110',?'3000.00',?'500.00',?'2020-04-15',?'C00019',?'A010');
INSERT?INTO?orders?VALUES('200107',?'4500.00',?'900.00',?'2020-08-30',?'C00007',?'A010');
INSERT?INTO?orders?VALUES('200112',?'2000.00',?'400.00',?'2020-05-30',?'C00016',?'A007');?
INSERT?INTO?orders?VALUES('200113',?'4000.00',?'600.00',?'2020-06-10',?'C00022',?'A002');
INSERT?INTO?orders?VALUES('200102',?'2000.00',?'300.00',?'2020-05-25',?'C00012',?'A012');
INSERT?INTO?orders?VALUES('200114',?'3500.00',?'2000.00',?'2020-08-15',?'C00002','A008');
INSERT?INTO?orders?VALUES('200122',?'2500.00',?'400.00',?'2020-09-16',?'C00003',?'A004');
INSERT?INTO?orders?VALUES('200118',?'500.00',?'100.00',?'2020-07-20',?'C00023',?'A006');
INSERT?INTO?orders?VALUES('200119',?'4000.00',?'700.00',?'2020-09-16',?'C00007',?'A010');
INSERT?INTO?orders?VALUES('200121',?'1500.00',?'600.00',?'2020-09-23',?'C00008',?'A004');
INSERT?INTO?orders?VALUES('200130',?'2500.00',?'400.00',?'2020-07-30',?'C00025',?'A011');
INSERT?INTO?orders?VALUES('200134',?'4200.00',?'1800.00',?'2020-09-25',?'C00004','A005');
INSERT?INTO?orders?VALUES('200108',?'4000.00',?'600.00',?'2020-02-15',?'C00008',?'A004');
INSERT?INTO?orders?VALUES('200103',?'1500.00',?'700.00',?'2020-05-15',?'C00021',?'A005');
INSERT?INTO?orders?VALUES('200105',?'2500.00',?'500.00',?'2020-07-18',?'C00025',?'A011');
INSERT?INTO?orders?VALUES('200109',?'3500.00',?'800.00',?'2020-07-30',?'C00011',?'A010');
INSERT?INTO?orders?VALUES('200101',?'3000.00',?'1000.00',?'2020-07-15',?'C00001','A008');
INSERT?INTO?orders?VALUES('200111',?'1000.00',?'300.00',?'2020-07-10',?'C00020',?'A008');
INSERT?INTO?orders?VALUES('200104',?'1500.00',?'500.00',?'2020-03-13',?'C00006',?'A004');
INSERT?INTO?orders?VALUES('200106',?'2500.00',?'700.00',?'2020-04-20',?'C00005',?'A002');
INSERT?INTO?orders?VALUES('200125',?'2000.00',?'600.00',?'2020-10-01',?'C00018',?'A005');
INSERT?INTO?orders?VALUES('200117',?'800.00',?'200.00',?'2020-10-20',?'C00014',?'A001');
INSERT?INTO?orders?VALUES('200123',?'500.00',?'100.00',?'2020-09-16',?'C00022',?'A002');
INSERT?INTO?orders?VALUES('200120',?'500.00',?'100.00',?'2020-07-20',?'C00009',?'A002');
INSERT?INTO?orders?VALUES('200116',?'500.00',?'100.00',?'2020-07-13',?'C00010',?'A009');
INSERT?INTO?orders?VALUES('200124',?'500.00',?'100.00',?'2020-06-20',?'C00017',?'A007');?
INSERT?INTO?orders?VALUES('200126',?'500.00',?'100.00',?'2020-06-24',?'C00022',?'A002');
INSERT?INTO?orders?VALUES('200129',?'2500.00',?'500.00',?'2020-07-20',?'C00024',?'A006');
INSERT?INTO?orders?VALUES('200127',?'2500.00',?'400.00',?'2020-07-20',?'C00015',?'A003');
INSERT?INTO?orders?VALUES('200128',?'3500.00',?'1500.00',?'2020-07-20',?'C00009','A002');
INSERT?INTO?orders?VALUES('200135',?'2000.00',?'800.00',?'2020-09-16',?'C00007',?'A010');
INSERT?INTO?orders?VALUES('200131',?'900.00',?'150.00',?'2020-08-26',?'C00012',?'A012');
INSERT?INTO?orders?VALUES('200133',?'1200.00',?'400.00',?'2020-06-29',?'C00009',?'A002');
排序累加
SELECT
????agent_code,
????order_date,
????order_amount,
????SUM(?order_amount?)?OVER?(?PARTITION?BY?agent_code?ORDER?BY?order_date?desc?rows?BETWEEN?unbounded?preceding?AND?current?row??)?total_rev?
FROM
orders_new
WHERE
order_date?>=?'2020-07-01'?
AND?order_date?<=?'2020-09-30';
結(jié)果
A002?2020-09-16?500.00?500.00
A002?2020-07-20?3500.00?4000.00
A002?2020-07-20?500.00?4500.00
A003?2020-08-01?1000.00?1000.00
A003?2020-07-20?2500.00?3500.00
A004?2020-09-23?1500.00?1500.00
A004?2020-09-16?2500.00?4000.00
A005?2020-09-25?4200.00?4200.00
A006?2020-07-20?2500.00?2500.00
A006?2020-07-20?500.00?3000.00
A008?2020-08-15?3500.00?3500.00
A008?2020-07-15?3000.00?6500.00
A008?2020-07-10?1000.00?7500.00
A009?2020-07-13?500.00?500.00
A010?2020-09-16?2000.00?2000.00
A010?2020-09-16?4000.00?6000.00
A010?2020-08-30?4500.00?10500.00
A010?2020-07-30?3500.00?14000.00
A011?2020-07-30?2500.00?2500.00
A011?2020-07-18?2500.00?5000.00
A012?2020-08-26?900.00?900.00
AVG() 和SUM()
需求描述:
第三季度每個(gè)代理商的移動(dòng)平均收入和總收入
SELECT
????agent_code,
????order_date,
????AVG(?order_amount?)?OVER?(?PARTITION?BY?agent_code?ORDER?BY?order_date)??avg_rev,
????SUM(?order_amount?)?OVER?(?PARTITION?BY?agent_code?ORDER?BY?order_date?)?total_rev?
FROM
orders?
WHERE
order_date?>=?'2020-07-01'?
AND?order_date?<=?'2020-09-30';
結(jié)果輸出
A002????2020-07-20??????2000????4000
A002????2020-07-20??????2000????4000
A002????2020-09-16??????1500????4500
A003????2020-07-20??????2500????2500
A003????2020-08-01??????1750????3500
A004????2020-09-16??????2500????2500
A004????2020-09-23??????2000????4000
A005????2020-09-25??????4200????4200
A006????2020-07-20??????1500????3000
A006????2020-07-20??????1500????3000
A008????2020-07-10??????1000????1000
A008????2020-07-15??????2000????4000
A008????2020-08-15??????2500????7500
A009????2020-07-13??????500?????500
A010????2020-07-30??????3500????3500
A010????2020-08-30??????4000????8000
A010????2020-09-16??????3500????14000
A010????2020-09-16??????3500????14000
A011????2020-07-18??????2500????2500
A011????2020-07-30??????2500????5000
A012????2020-08-26??????900?????900
FIRST_VALUE()和 LAST_VALUE()
first_value: 取分組內(nèi)排序后,截止到當(dāng)前行,第一個(gè)值 last_value: 取分組內(nèi)排序后,截止到當(dāng)前行,最后一個(gè)值
需求描述
客戶首次購買后多少天才進(jìn)行下一次購買
SELECT
????cust_code,
????order_date,
????datediff(order_date,FIRST_VALUE?(?order_date?)?OVER?(?PARTITION?BY?cust_code?ORDER?BY?order_date?))?next_order_gap?
FROM
orders?
order?by?cust_code,next_order_gap
結(jié)果輸出
C00001??2020-07-15??????0
C00002??2020-08-15??????0
C00003??2020-09-16??????0
C00004??2020-09-25??????0
C00005??2020-04-20??????0
C00006??2020-03-13??????0
C00007??2020-08-30??????0
C00007??2020-09-16??????17
C00007??2020-09-16??????17
C00008??2020-02-15??????0
C00008??2020-09-23??????221
C00009??2020-06-29??????0
C00009??2020-07-20??????21
C00009??2020-07-20??????21
C00010??2020-07-13??????0
C00011??2020-07-30??????0
C00012??2020-05-25??????0
C00012??2020-08-26??????93
C00013??2020-08-01??????0
C00014??2020-10-20??????0
C00015??2020-07-20??????0
C00016??2020-05-30??????0
C00017??2020-06-20??????0
C00018??2020-10-01??????0
C00019??2020-04-15??????0
C00020??2020-07-10??????0
C00021??2020-05-15??????0
C00022??2020-06-10??????0
C00022??2020-06-24??????14
C00022??2020-09-16??????98
C00023??2020-07-20??????0
C00024??2020-07-20??????0
C00025??2020-07-18??????0
C00025??2020-07-30??????12
LEAD() 和 LAG()
lead(value_expr[,offset[,default]]):用于統(tǒng)計(jì)窗口內(nèi)往下第n行值。第一個(gè)參數(shù)為列名,第二個(gè)參數(shù)為往下第n行(可選,默認(rèn)為1),第三個(gè)參數(shù)為默認(rèn)值(當(dāng)往下第n行為NULL時(shí)候,取默認(rèn)值,如不指定,則為NULL lag(value_expr[,offset[,default]]): 與lead相反,用于統(tǒng)計(jì)窗口內(nèi)往上第n行值。第一個(gè)參數(shù)為列名,第二個(gè)參數(shù)為往上第n行(可選,默認(rèn)為1),第三個(gè)參數(shù)為默認(rèn)值(當(dāng)往上第n行為NULL時(shí)候,取默認(rèn)值,如不指定,則為NULL)
需求描述
代理商最近一次出售的最高訂單金額是多少?
SELECT
?agent_code,
?order_amount,
?LAG?(?order_amount,?1?)?OVER?(?PARTITION?BY?agent_code?ORDER?BY?order_amount?DESC?)?last_highest_amount?
FROM
?orders?
ORDER?BY
?agent_code,
?order_amount?DESC;
結(jié)果輸出
A001????800?????NULL
A002????4000????NULL
A002????3500????4000
A002????2500????3500
A002????1200????2500
A002????500?????1200
A002????500?????500
A002????500?????500
A003????2500????NULL
A003????1000????2500
A004????4000????NULL
A004????2500????4000
A004????1500????2500
A004????1500????1500
A005????4200????NULL
A005????2000????4200
A005????1500????2000
A006????2500????NULL
A006????500?????2500
A007????2000????NULL
A007????500?????2000
A008????3500????NULL
A008????3000????3500
A008????1000????3000
A009????500?????NULL
A010????4500????NULL
A010????4000????4500
A010????3500????4000
A010????3000????3500
A010????2000????3000
A011????2500????NULL
A011????2500????2500
A012????2000????NULL
A012????900?????2000
RANK() 和DENSE_RANK()
**rank:**對(duì)組中的數(shù)據(jù)進(jìn)行排名,如果名次相同,則排名也相同,但是下一個(gè)名次的排名序號(hào)會(huì)出現(xiàn)不連續(xù)。比如查找具體條件的topN行。RANK()?排序?yàn)?(1,2,2,4)
**dense_rank:**dense_rank函數(shù)的功能與rank函數(shù)類似,dense_rank函數(shù)在生成序號(hào)時(shí)是連續(xù)的,而rank函數(shù)生成的序號(hào)有可能不連續(xù)。當(dāng)出現(xiàn)名次相同時(shí),則排名序號(hào)也相同。而下一個(gè)排名的序號(hào)與上一個(gè)排名序號(hào)是連續(xù)的。
DENSE_RANK()?排序?yàn)?(1,2,2,3)
需求描述
每月第二高的訂單金額是多少?
SELECT
?order_num,
?order_date,
?order_amount,
?order_month?
FROM
?(
SELECT
?order_num,
?order_date,
?order_amount,
?DATE_FORMAT(?order_date,?'YYYY-MM'?)?AS?order_month,
?DENSE_RANK?(?)?OVER?(?PARTITION?BY?DATE_FORMAT(?order_date,?'YYYY-MM'?)?ORDER?BY?order_amount?DESC?)?order_rank?
FROM
?orders?
?)?t?
WHERE
?order_rank?=?2?
ORDER?BY
?order_date;
結(jié)果輸出
200106??2020-04-20??????2500????2020-04
200103??2020-05-15??????1500????2020-05
200133??2020-06-29??????1200????2020-06
200101??2020-07-15??????3000????2020-07
200114??2020-08-15??????3500????2020-08
200119??2020-09-16??????4000????2020-09
200117??2020-10-20??????800?????2020-10
REGEXP_EXTRACT
命令格式
string?regexp_extract(string?,?string? [,?bigint? ]) 命令說明
將字符串source按照pattern的規(guī)則拆分為組,返回第occurrence個(gè)組的字符串。
參數(shù)說明
source:必填。STRING類型,待拆分的字符串。 pattern:必填。STRING類型常量或正則表達(dá)式。待匹配的模型。 occurrence:可選。BIGINT類型常量,必須大于等于0。 返回值說明
返回STRING類型。返回規(guī)則如下:
如果pattern為空串或pattern中沒有分組,返回報(bào)錯(cuò)。 occurrence非BIGINT類型或小于0時(shí),返回報(bào)錯(cuò)。不指定時(shí)默認(rèn)為1,表示返回第一個(gè)組。如果occurrence等于0,則返回滿足整個(gè)pattern的子串。 source、pattern或occurrence值為NULL時(shí),返回NULL。 示例
select?regexp_extract('foothebar',?'(foo)(.*?)(bar)',?0);?????--返回foothebar
select?regexp_extract('foothebar',?'(foo)(.*?)(bar)',?1);?????--返回foo
select?regexp_extract('foothebar',?'(foo)(.*?)(bar)',?2);?????--返回the
select?regexp_extract('foothebar',?'(foo)(.*?)(bar)',?3);?????--返回bar
多行數(shù)據(jù)合并為一行數(shù)據(jù)
WM_CONCAT
命令格式
string?wm_concat(string?,?string? ) 命令說明
用指定的separator做分隔符,連接colname中的值。
參數(shù)說明
separator:必填。STRING類型常量,分隔符。 colname:必填。STRING類型。如果輸入為BIGINT、DOUBLE或DATETIME類型,會(huì)隱式轉(zhuǎn)換為STRING類型后參與運(yùn)算。 返回值說明
返回STRING類型。返回規(guī)則如下:
separator非STRING類型常量時(shí),返回報(bào)錯(cuò)。 colname非STRING、BIGINT、DOUBLE或DATETIME類型時(shí),返回報(bào)錯(cuò)。 colname值為NULL時(shí),該行不會(huì)參與計(jì)算。 示例
--建表
CREATE?TABLE?stu?
(
????class?STRING
????,gender?STRING
????,name?STRING
)
;
--裝載數(shù)據(jù)
INSERT?INTO?TABLE?stu?SELECT??'1','M','lilei';
INSERT?INTO?TABLE?stu?SELECT??'1','F','hanmeimei';
INSERT?INTO?TABLE?stu?SELECT??'1','M','jim';
INSERT?INTO?TABLE?stu?SELECT??'1','M','hanmeimei';
INSERT?INTO?TABLE?stu?SELECT??'2','F','tom';
INSERT?INTO?TABLE?stu?SELECT??'2','M','peter';
--查詢
SELECT?class,?wm_concat(distinct?',',?name)?FROM?stu?GROUP?BY?class;
KEYVALUE
命令格式
keyvalue(string?,[string? ,string? ,]?string? )
keyvalue(string?,string? )? 命令說明
將字符串str按照split1分成Key-Value對(duì),并按split2將Key-Value對(duì)分開,返回key所對(duì)應(yīng)的Value。
參數(shù)說明
“?
即默認(rèn)的分隔符是**;,KV之間的分割是:**
key:必填。STRING類型。將字符串按照split1和split2拆分后,返回key值對(duì)應(yīng)的Value。 str:必填。STRING類型。待拆分的字符串。 split1、split2:可選。STRING類型。用于作為分隔符的字符串,按照指定的兩個(gè)分隔符拆分源字符串。如果表達(dá)式中沒有指定這兩項(xiàng),默認(rèn)split1為 ";",split2為":"。當(dāng)某個(gè)被split1拆分后的字符串中有多個(gè)split2時(shí),返回結(jié)果未定義。返回值說明
返回STRING類型。返回規(guī)則如下:
split1或split2值為NULL時(shí),返回NULL。 str或key值為NULL或沒有匹配的key時(shí),返回NULL。 如果有多個(gè)Key-Value匹配,返回第一個(gè)匹配上的key對(duì)應(yīng)的Value。
select?keyvalue('0:1\;1:2',?1);??--返回2
select?keyvalue('spm=123.qwe,cpn=101,act=890',',','=','spm')??----返回123.qwe
優(yōu)化相關(guān)
distribute by+sort by V.S order by
order by將結(jié)果按某字段全局排序,這會(huì)導(dǎo)致所有map端數(shù)據(jù)都進(jìn)入一個(gè)reducer中,在數(shù)據(jù)量大時(shí)可能會(huì)長(zhǎng)時(shí)間計(jì)算不完 distribute by用于控制map端數(shù)據(jù)分配到reducer的key,sort by會(huì)視情況啟動(dòng)多個(gè)reducer進(jìn)行排序,并且保證每個(gè)reducer內(nèi)局部有序
group by V.S count(distinct)
當(dāng)數(shù)據(jù)量級(jí)很大,用group by ,可以啟動(dòng)多個(gè)job 數(shù)據(jù)集很小或者key的傾斜比較明顯時(shí),用count(distinct),少量的reduce就可以處理
map join
Hive會(huì)將build table和probe table在map端直接完成join過程,消滅了reduce,效率很高 set hive.auto.convent.join=true;? /*+MAPJOIN(t1,t3,t4)*/
