Hive是大数据体系下ETL、数据预处理、数仓等领域比较重要的组件,应用广泛。博主空闲时间研究一下Hive的数据类型。

1. Hive数据类型组成

  关于Hive的数据类型,官方文档展示的比较全面,关于数据类型的详细说明Hive数据类型,这里总结如下
| 大类 | 类型 | 具体类型 |
| – | – | - |
| 基本类型| 数值型 |tinyint,smallint,int,bigint,float,double,decimal,numeric |
| 基本类型 | 字符型 | string,varchar,char |
| 基本类型 | 日期型 | timestamp,date,interval |
| 基本类型 | 其他 | boolean,binary |
| 复杂类型 | 数组 | array |
| 复杂类型 | 映射 | map |
| 复杂类型 | 结构 | struct |
| 复杂类型 | 联合 | uniontype |

2. 数据导入举例

  这里主要研究具体建表、产生测试数据、导入数据等内容。

2.1. 建表语句

create table test_hive_meta(
    name string,
    age int,
    score float,
    insert_time string,-- 刚开始定义的是date类型,后面修改为string或timestamp
    students struct,
    infos map,
    scores array
)  comment "测试数据表"
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'  
COLLECTION ITEMS TERMINATED BY '-'
MAP KEYS TERMINATED BY ':'
STORED as TEXTFILE ;

2.2. 产生测试数据

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Random;

public class TestMain {

	private static final String[] names = new String[] { "Laird", "莱尔德", "Lambert", "兰伯特", "Lamont", "拉蒙特", "Lance",
			"兰斯", "Lang", "兰格", "Lange", "兰格", "Langston", "兰斯顿", "Lanny", "兰尼", "Larkin", "拉金", "Larry", "拉里",
			"Clementina", "克莱门蒂娜", "Clementine", "克莱门廷", "Clemmie", "克莱米", "Cleo", "克利奥", "Cleopatra", "克利奥帕特拉",
			"Colette", "科莱特", "Colleen", "科琳", "Conchita", "康奇塔", "Connie", "康妮,康尼", "Constance", "康斯坦斯", "Alvina",
			"阿尔文娜", "Alvira", "阿尔薇拉", "Amabel", "阿玛贝尔", "Amanda", "阿曼达", "Amber", "安伯", "Amelia", "阿米莉亚", "Amity",
			"阿米蒂", "Amor", "埃默", "Amy", "艾米", "Ana", "安娜", "Ware", "韦尔", "Warner", "沃纳", "Warren", "沃伦", "Washburn",
			"沃什伯恩", "Washington", "华盛顿", "Watkins", "沃特金斯", "Watt", "瓦特", "Watts", "沃茨", "Wayne", "韦恩", "Webb", "韦布",
			"Lina", "莉娜", "Linda", "琳达", "Lindy", "琳迪", "Linn", "林", "Linsey", "林赛", "Lisa", "莉萨", "Lisbeth", "莉斯贝思",
			"Lise", "莉萨", "Lisette", "莉塞特", "Liz", "莉兹" };
	private static Random random = new Random();
	private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-mm-dd HH:MM:ss");
	private static int size = names.length;
	private static char[] a = { ',' };// fields termination
	private static char[] b = { '-' };// collection item termination
	private static char[] c = { ':' };// map key termination

	public static void main(String[] args) {

		try {
			File file = new File("./data.txt");
			if (!file.exists()) {
				file.createNewFile();
			}
			FileWriter fw = new FileWriter(file.getAbsoluteFile());
			BufferedWriter bw = new BufferedWriter(fw);
			for (int i = 0; i < 100; i++) {
				bw.write(gen1Line());
			}
			bw.close();
			System.out.println("Done");
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	private static String gen1Line() {

		StringBuffer line = new StringBuffer();
		line.append(names[random.nextInt(size)]);
		line.append(a);
		line.append(random.nextInt(100));
		line.append(a);
		line.append(random.nextDouble() * 10);
		line.append(a);
		line.append(sdf.format(new Date()));
		line.append(a);

		line.append(names[random.nextInt(size)]);
		line.append(b);
		line.append(random.nextInt(100));
		line.append(a);

		for (int i = 0; i < 10; i++) {
			line.append(random.nextInt(100));
			line.append(c);
			line.append(names[random.nextInt(size)]);
			line.append(b);
		}
		line.append(random.nextInt(100));
		line.append(c);
		line.append(names[random.nextInt(size)]);
		line.append(a);

		for (int i = 0; i < 10; i++) {
			line.append(random.nextDouble() * 10);
			line.append(b);
		}
		line.append(random.nextDouble() * 10);
		line.append("\n");

		return line.toString();
	}
}

2.3. 导入数据测试

-- 导入数据
load data local inpath "/home/data.txt" into table test_hive_meta;

-- 为了便于查看导入数据结果,打开列显示
set hive.cli.print.header=true;
set hive.cli.print.row.to.vertical=true;
set hive.cli.print.row.to.vertical.num=1;

2.4. 查看导入结果

-- 查询
select * from test_hive_meta limit 1 ;

-- 结果
name	age	score	insert_time	students	infos	scores
Cleopatra	11	0.28206065	2018-14-11 17:08:15	{"sname":"阿玛贝尔","sage":15}	{41:"Linda",82:"康斯坦斯",94:"艾米",81:"Washington",23:"兰尼",93:"Lise",36:"沃纳",70:"Lise",39:"克利奥帕特拉",35:"Lambert",67:"Colleen"}	[1.8265022,6.058134,7.794176,4.096524,8.195735,5.866253,0.75852406,6.835354,2.7134678,8.078223,6.275408]

3. 采坑

  1. 关于date数据类型
    发现使用date数据类型在导入的时候存在问题,刚开始使用date类型,导入类型为long,结果显示为null;后面导入数据修改为”yyyy-MM-dd HH:mm:ss”之后,结果仍为null。后面修改为string类型,导入数据为格式化的日期类型,或者修改为timestamp类型,导入类型为long。

4. 参考文章

  1. Apache Hive Document
  2. hive collection data type
  3. Hive中导入时间格式的数据显示为null