Hive是大数据体系下ETL、数据预处理、数仓等领域比较重要的组件,应用广泛。博主空闲时间研究一下Hive的数据类型。
1. Hive数据类型组成
关于Hive的数据类型,官方文档展示的比较全面,关于数据类型的详细说明Hive数据类型,这里总结如下
| 大类 | 类型 | 具体类型 |
| – | – | - |
| 基本类型| 数值型 |tinyint,smallint,int,bigint,float,double,decimal,numeric |
| 基本类型 | 字符型 | string,varchar,char |
| 基本类型 | 日期型 | timestamp,date,interval |
| 基本类型 | 其他 | boolean,binary |
| 复杂类型 | 数组 | array |
| 复杂类型 | 映射 | map |
| 复杂类型 | 结构 | struct |
| 复杂类型 | 联合 | uniontype |
2. 数据导入举例
这里主要研究具体建表、产生测试数据、导入数据等内容。
2.1. 建表语句
create table test_hive_meta(
name string,
age int,
score float,
insert_time string,-- 刚开始定义的是date类型,后面修改为string或timestamp
students struct,
infos map,
scores array
) comment "测试数据表"
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
COLLECTION ITEMS TERMINATED BY '-'
MAP KEYS TERMINATED BY ':'
STORED as TEXTFILE ;
2.2. 产生测试数据
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;
public class TestMain {
private static final String[] names = new String[] { "Laird", "莱尔德", "Lambert", "兰伯特", "Lamont", "拉蒙特", "Lance",
"兰斯", "Lang", "兰格", "Lange", "兰格", "Langston", "兰斯顿", "Lanny", "兰尼", "Larkin", "拉金", "Larry", "拉里",
"Clementina", "克莱门蒂娜", "Clementine", "克莱门廷", "Clemmie", "克莱米", "Cleo", "克利奥", "Cleopatra", "克利奥帕特拉",
"Colette", "科莱特", "Colleen", "科琳", "Conchita", "康奇塔", "Connie", "康妮,康尼", "Constance", "康斯坦斯", "Alvina",
"阿尔文娜", "Alvira", "阿尔薇拉", "Amabel", "阿玛贝尔", "Amanda", "阿曼达", "Amber", "安伯", "Amelia", "阿米莉亚", "Amity",
"阿米蒂", "Amor", "埃默", "Amy", "艾米", "Ana", "安娜", "Ware", "韦尔", "Warner", "沃纳", "Warren", "沃伦", "Washburn",
"沃什伯恩", "Washington", "华盛顿", "Watkins", "沃特金斯", "Watt", "瓦特", "Watts", "沃茨", "Wayne", "韦恩", "Webb", "韦布",
"Lina", "莉娜", "Linda", "琳达", "Lindy", "琳迪", "Linn", "林", "Linsey", "林赛", "Lisa", "莉萨", "Lisbeth", "莉斯贝思",
"Lise", "莉萨", "Lisette", "莉塞特", "Liz", "莉兹" };
private static Random random = new Random();
private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-mm-dd HH:MM:ss");
private static int size = names.length;
private static char[] a = { ',' };// fields termination
private static char[] b = { '-' };// collection item termination
private static char[] c = { ':' };// map key termination
public static void main(String[] args) {
try {
File file = new File("./data.txt");
if (!file.exists()) {
file.createNewFile();
}
FileWriter fw = new FileWriter(file.getAbsoluteFile());
BufferedWriter bw = new BufferedWriter(fw);
for (int i = 0; i < 100; i++) {
bw.write(gen1Line());
}
bw.close();
System.out.println("Done");
} catch (IOException e) {
e.printStackTrace();
}
}
private static String gen1Line() {
StringBuffer line = new StringBuffer();
line.append(names[random.nextInt(size)]);
line.append(a);
line.append(random.nextInt(100));
line.append(a);
line.append(random.nextDouble() * 10);
line.append(a);
line.append(sdf.format(new Date()));
line.append(a);
line.append(names[random.nextInt(size)]);
line.append(b);
line.append(random.nextInt(100));
line.append(a);
for (int i = 0; i < 10; i++) {
line.append(random.nextInt(100));
line.append(c);
line.append(names[random.nextInt(size)]);
line.append(b);
}
line.append(random.nextInt(100));
line.append(c);
line.append(names[random.nextInt(size)]);
line.append(a);
for (int i = 0; i < 10; i++) {
line.append(random.nextDouble() * 10);
line.append(b);
}
line.append(random.nextDouble() * 10);
line.append("\n");
return line.toString();
}
}
2.3. 导入数据测试
-- 导入数据
load data local inpath "/home/data.txt" into table test_hive_meta;
-- 为了便于查看导入数据结果,打开列显示
set hive.cli.print.header=true;
set hive.cli.print.row.to.vertical=true;
set hive.cli.print.row.to.vertical.num=1;
2.4. 查看导入结果
-- 查询
select * from test_hive_meta limit 1 ;
-- 结果
name age score insert_time students infos scores
Cleopatra 11 0.28206065 2018-14-11 17:08:15 {"sname":"阿玛贝尔","sage":15} {41:"Linda",82:"康斯坦斯",94:"艾米",81:"Washington",23:"兰尼",93:"Lise",36:"沃纳",70:"Lise",39:"克利奥帕特拉",35:"Lambert",67:"Colleen"} [1.8265022,6.058134,7.794176,4.096524,8.195735,5.866253,0.75852406,6.835354,2.7134678,8.078223,6.275408]
3. 采坑
- 关于date数据类型
发现使用date数据类型在导入的时候存在问题,刚开始使用date类型,导入类型为long,结果显示为null;后面导入数据修改为”yyyy-MM-dd HH:mm:ss”之后,结果仍为null。后面修改为string类型,导入数据为格式化的日期类型,或者修改为timestamp类型,导入类型为long。
4. 参考文章
本博客所有文章除特别声明外,均采用 CC BY-SA 4.0 协议 ,转载请注明出处!