hive常用sql结构

2024-10-12

一、存储方式

1.单分区全量数据存储

-- Template: unpartitioned external Parquet table holding the full data set.
-- $hive_db and $hive_tb are placeholders for the database and table name.
CREATE EXTERNAL TABLE IF NOT EXISTS $hive_db.$hive_tb (
    -- column definitions go here
)
COMMENT ''
STORED AS PARQUET
;

2.多分区全量数据存储1用于有变化数据表

-- Template: external Parquet table partitioned by one string column whose
-- name is given by the $order_month placeholder.
CREATE EXTERNAL TABLE IF NOT EXISTS $hive_db.$hive_tb (
    -- column definitions go here
)
COMMENT ''
PARTITIONED BY (
    $order_month string
)
STORED AS PARQUET
;

3.多分区全量数据存储2用于无变化流水日志表

-- Template: external Parquet table partitioned by one string column whose
-- name is given by the $dt placeholder.
CREATE EXTERNAL TABLE IF NOT EXISTS $hive_db.$hive_tb (
    -- column definitions go here
)
COMMENT ''
PARTITIONED BY (
    $dt string
)
STORED AS PARQUET
;

4.带分区拉链

-- Template: external Parquet zipper (history) table.
-- start_dt is a regular column; end_dt is the partition column.
CREATE EXTERNAL TABLE IF NOT EXISTS $hive_db.$hive_tb (
    -- business columns go here, each followed by a comma
    start_dt string COMMENT '开始日期'
)
COMMENT ''
PARTITIONED BY (end_dt string COMMENT '结束日期')
STORED AS PARQUET
;

5.不带分区拉链

-- Template: external Parquet zipper (history) table without partitions.
-- start_dt and end_dt are both stored as regular string columns.
CREATE EXTERNAL TABLE IF NOT EXISTS $hive_db.$hive_tb (
    -- business columns go here, each followed by a comma
    start_dt string COMMENT '开始日期',
    end_dt   string COMMENT '结束日期'
)
COMMENT ''
STORED AS PARQUET
;

6.带分区快照

-- Template: external Parquet snapshot table partitioned by the string
-- column dt.
CREATE EXTERNAL TABLE IF NOT EXISTS $hive_db.$hive_tb (
    -- column definitions go here
)
COMMENT ''
PARTITIONED BY (
    dt string COMMENT '数据日期'
)
STORED AS PARQUET
;

二、加工方式

1.增量还原1（全量无分区）

-- Incremental restore into an unpartitioned full table.
-- Branch 1: rows of the source table for dt = '$v_date'.
-- Branch 2: rows already in the target that have no match on X in that
--           increment (left join + IS NULL anti-join).
-- X stands for the column list / join key of the concrete table.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1
SELECT
    X
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'

UNION ALL

SELECT
    t1.X  -- the original select list was empty; take the target's own columns
FROM $hive_db1.$hive_tb1 t1
LEFT JOIN $hive_db2.$hive_tb2 t2
    ON t1.X = t2.X
   AND t2.dt = '$v_date'
WHERE t2.X IS NULL

2.增量追加1（全量无分区）

-- Incremental append into an unpartitioned full table.
-- Branch 1: rows of the source table for dt = '$v_date'.
-- Branch 2: rows already in the target whose $dt column differs from
--           '$v_date'.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1
SELECT
    X
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'

UNION ALL

SELECT
    X
FROM $hive_db1.$hive_tb1
WHERE $dt <> '$v_date'

3.全量覆盖（全量无分区）

-- Full overwrite: replace the target table with the source rows for
-- dt = '$v_date'.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1
SELECT
    X
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'

4.多分区增量还原

（1）
-- List each order_month value present in the source rows for
-- dt = '$v_date', one row per value.
SELECT DISTINCT
    order_month
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'

（2）for i in order_month 

-- Incremental restore of one order_month partition ($i).
-- Branch 1: source rows for dt = '$v_date' that belong to month $i.
-- Branch 2: rows already in partition $i of the target that have no match
--           on X in that increment (left join + IS NULL anti-join).
-- $i is quoted like '$v_date': left bare, a value such as 2024-10 would be
-- evaluated as an arithmetic expression instead of a string.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1 PARTITION (order_month)
SELECT
    X
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'
  AND order_month = '$i'

UNION ALL

SELECT
    t1.X  -- qualified: X exists in both t1 and t2
FROM $hive_db1.$hive_tb1 t1
LEFT JOIN $hive_db2.$hive_tb2 t2
    ON t1.X = t2.X
   AND t2.dt = '$v_date'
   AND t2.order_month = '$i'
WHERE t1.order_month = '$i'
  AND t2.X IS NULL

5.增量追加1（全量多分区）

（1）
-- List each partition value present in the source rows for dt = '$v_date'.
-- NOTE(review): the statement that follows writes partition(order_dt) and
-- filters on order_dt, so this lookup uses order_dt rather than
-- order_month — confirm the real partition column.
SELECT
    order_dt
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'
GROUP BY order_dt

（2）
-- Incremental append into one order_dt partition ($1).
-- Branch 1: source rows for dt = '$v_date' that belong to partition $1.
-- Branch 2: rows already stored in partition $1 of the target.
-- Both branches are limited to the same partition: with the dynamic
-- PARTITION (order_dt), an unfiltered first branch would also overwrite
-- every other partition it touches with the new rows only.
-- NOTE(review): $1 is presumably the per-partition value taken from the
-- partition list query — confirm how the caller supplies it.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1 PARTITION (order_dt)
SELECT
    X
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'
  AND order_dt = '$1'

UNION ALL

SELECT
    X
FROM $hive_db1.$hive_tb1 t1
WHERE order_dt = '$1'

6.增量拉链（带分区）

-- Incremental zipper load into a table partitioned by end_dt.
-- Inner union: today's source rows (opened with start_dt = '$v_date') plus
-- the currently open target rows (end_dt = '2999-12-31').
-- Per key X the newest start_dt gets sn = 1 and stays open; every older
-- version is closed with end_dt = '$v_date'.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1 PARTITION (end_dt)
SELECT
    X,
    start_dt,
    IF(sn = 1, '2999-12-31', '$v_date') end_dt
FROM (
    SELECT
        X,
        start_dt,  -- must be projected here; the outer query selects it
        -- DESC so that the latest version, not the oldest, is ranked first
        ROW_NUMBER() OVER (PARTITION BY X ORDER BY start_dt DESC) sn
    FROM (
        SELECT X, '$v_date' start_dt, '2999-12-31' end_dt
        FROM $hive_db2.$hive_tb2
        WHERE dt = '$v_date'

        UNION ALL

        SELECT X, start_dt, end_dt
        FROM $hive_db1.$hive_tb1 t1
        WHERE end_dt = '2999-12-31'
    ) t1
) tt1

7.增量拉链（不带分区）

-- Incremental zipper load into an unpartitioned table.
-- Part 1: today's source rows (opened with start_dt = '$v_date') plus the
--         currently open target rows (end_dt = '2999-12-31'). Per key X the
--         newest start_dt gets sn = 1 and stays open; every older version
--         is closed with end_dt = '$v_date'.
-- Part 2: target rows with end_dt <= '$s_date' are copied through unchanged.
-- NOTE(review): $s_date is not defined anywhere in this file; it must cover
-- every already-closed row or those rows are dropped by the overwrite —
-- confirm its value.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1
SELECT
    X,
    start_dt,
    IF(sn = 1, '2999-12-31', '$v_date') end_dt
FROM (
    SELECT
        X,
        start_dt,  -- must be projected here; the outer query selects it
        -- DESC so that the latest version, not the oldest, is ranked first
        ROW_NUMBER() OVER (PARTITION BY X ORDER BY start_dt DESC) sn
    FROM (
        SELECT X, '$v_date' start_dt, '2999-12-31' end_dt
        FROM $hive_db2.$hive_tb2
        WHERE dt = '$v_date'

        UNION ALL

        SELECT X, start_dt, end_dt
        FROM $hive_db1.$hive_tb1 t1
        WHERE end_dt = '2999-12-31'
    ) t1
) tt1

UNION ALL

SELECT X, start_dt, end_dt
FROM $hive_db1.$hive_tb1
WHERE end_dt <= '$s_date'

8.增量还原2（全量快照带分区）

-- Incremental restore into a dt-partitioned snapshot table.
-- Branch 1: source rows for dt = '$v_date'.
-- Branch 2: rows of the previous day's snapshot (dt = date_sub('$v_date', 1))
--           that have no match on X in that increment.
-- NOTE(review): with the dynamic PARTITION (dt), carried-over rows land in
-- whatever dt value the select list emits; make sure branch 2 emits
-- '$v_date' rather than t1.dt — confirm against the real column list.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1 PARTITION (dt)
SELECT
    X
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'

UNION ALL

SELECT
    t1.X  -- qualified: X exists in both t1 and t2
FROM $hive_db1.$hive_tb1 t1
LEFT JOIN $hive_db2.$hive_tb2 t2
    ON t1.X = t2.X
   AND t2.dt = '$v_date'
WHERE t1.dt = date_sub('$v_date', 1)
  AND t2.X IS NULL

9.全量追加（全量快照带分区）

-- Snapshot load: write the source rows for dt = '$v_date' into the target
-- through a dynamic dt partition.
INSERT OVERWRITE TABLE $hive_db1.$hive_tb1 PARTITION (dt)
SELECT
    X
FROM $hive_db2.$hive_tb2
WHERE dt = '$v_date'

存储类型

1.当前全量不带分区（数据规模不大1000万以下）

2.当前全量带分区（数据规模较大1000万以上，3亿条以下）

3.拉链不带分区（数据规模小需保留历史状态变更1万条以下）

4.拉链带分区（数据规模不大需保留历史状态变更1万条以上1000万以下）

5.快照（数据规模不大数据变化较快经常使用历史状态数据）

3亿条以上数据单独分析

加工类型

1.增量还原

2.增量拉链

3.全量覆盖

4.增量追加

1.业务表只有insert场景

不做特殊处理，直接清洗追加

2.业务表存在update操作场景

根据当前总数据以及每月增量数据数量进行灵活判断

参考方案：

（1）总数据量1000万（含）以下并且月增量100万以下，单分区直接处理成最新全量数据

（2）总数据量大于1000万小于1个亿并且月增规模小于1000万按业务发生时间按月分区，如无业务发生时间，再具体分析

处理加工按照单日数据涉及的分区数据按分区进行处理

（3）总数据量大于1个亿月增量数据规模大于1000万按业务发生时间按月分区，如无业务发生时间，再具体分析，按业务流水编号分桶

处理加工按照单日数据涉及的分区数据按分区进行处理