Hive maps structured data files onto database tables and provides SQL-like query capabilities, automatically compiling SQL queries into distributed jobs that run on MapReduce, Tez, or Spark.
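The engine is a per-session choice; a minimal sketch, assuming Tez has been provisioned on the cluster (page_view_logs is the log table defined in the case study below):

-- Pick the execution engine for this session (mr, tez, or spark)
SET hive.execution.engine=tez;

-- EXPLAIN shows the distributed plan the query compiles into
EXPLAIN
SELECT COUNT(*) FROM page_view_logs WHERE dt = '2024-01-01';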
Core features:
- SQL interface: data is queried through HiveQL, whose syntax is close to standard SQL.
- Large-scale processing: designed for efficient processing of PB-scale data sets.
- Data warehousing: built for offline batch processing and enterprise data warehouses.
- Extensibility: supports user-defined functions, including UDFs (scalar), UDAFs (aggregate), and UDTFs (table-generating); see the registration sketch below.
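As a sketch of the extension mechanism, registering and calling a custom scalar UDF looks like the following, assuming a hypothetical jar my-udfs.jar containing a hypothetical class com.example.hive.NormalizeUrl; the ADD JAR and CREATE TEMPORARY FUNCTION statements themselves are standard HiveQL:

ADD JAR hdfs:///libs/my-udfs.jar;  -- hypothetical jar location
CREATE TEMPORARY FUNCTION normalize_url AS 'com.example.hive.NormalizeUrl';
-- the function is then usable like any built-in
SELECT normalize_url(page_url) FROM page_view_logs LIMIT 10;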
Typical use cases:
Data warehouses and data lakes
- Building enterprise data warehouses (EDW)
- Serving as the query engine over a data lake
- Storing and analyzing historical data
Data processing pipelines
- Running ETL (extract, transform, load) jobs
- Data cleansing and format conversion
- Data quality validation and monitoring
Offline analysis and reporting
- Generating daily, weekly, and monthly business reports
- Analyzing user behavior paths and interactions
- Computing key performance indicators (KPIs)
Exploratory data analysis
- Letting data scientists probe data sets quickly
- Supporting ad-hoc queries that respond flexibly to new analysis needs (a small sketch follows this list)
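For instance, an exploratory ad-hoc query against the raw log table from the case study below might look like:

-- Quick exploration: which user agents generated the most traffic yesterday?
SELECT user_agent, COUNT(*) AS hits
FROM page_view_logs
WHERE dt = '2024-01-01'
GROUP BY user_agent
ORDER BY hits DESC
LIMIT 20;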
Hive in practice: "Website Page View Statistics"
1. Environment and Table Setup
Designing a sensible Hive table layout is the prerequisite for efficient statistics. The relevant DDL statements follow:
-- Raw log table (stores access records imported from Kafka or the file system)
CREATE TABLE IF NOT EXISTS page_view_logs (
    log_id BIGINT COMMENT 'log ID',
    page_url STRING COMMENT 'page URL',
    user_id STRING COMMENT 'user ID',
    event_type STRING COMMENT 'event type',
    server_time TIMESTAMP COMMENT 'server-side timestamp',
    client_time TIMESTAMP COMMENT 'client-side timestamp',
    ip_address STRING COMMENT 'IP address',
    user_agent STRING COMMENT 'user agent',
    other_info STRING COMMENT 'additional information'
)
COMMENT 'raw page view log table'
PARTITIONED BY (dt STRING COMMENT 'date partition')
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;
-- Daily aggregate table (long-term storage of the aggregated results)
CREATE TABLE IF NOT EXISTS page_view_daily_stats (
    stat_date STRING COMMENT 'statistics date',
    page_url STRING COMMENT 'page URL',
    pv_count BIGINT COMMENT 'page views',
    uv_count BIGINT COMMENT 'unique visitors',
    avg_pv_per_user DOUBLE COMMENT 'average views per visitor',
    peak_hour INT COMMENT 'peak traffic hour',
    create_time TIMESTAMP COMMENT 'row creation time'
)
COMMENT 'daily page view statistics'
PARTITIONED BY (stat_month STRING COMMENT 'month partition')
STORED AS ORC;  -- columnar ORC; no row-format delimiters needed
-- Hourly aggregate table (supports near-real-time analysis)
CREATE TABLE IF NOT EXISTS page_view_hourly_stats (
    stat_hour STRING COMMENT 'statistics hour',
    page_url STRING COMMENT 'page URL',
    pv_count BIGINT COMMENT 'page views',
    uv_count BIGINT COMMENT 'unique visitors',
    create_time TIMESTAMP COMMENT 'row creation time'
)
COMMENT 'hourly page view statistics'
PARTITIONED BY (stat_date STRING COMMENT 'date partition')
STORED AS ORC;
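Before loading any data, it is worth sanity-checking the layout; a minimal sketch using standard Hive introspection commands:

SHOW TABLES LIKE 'page_view*';               -- all three tables should appear
DESCRIBE FORMATTED page_view_daily_stats;    -- verify ORC storage and the stat_month partition key
SHOW PARTITIONS page_view_logs;              -- empty until the first load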
2. Data Import and Loading
External data can be loaded into Hive tables in several ways; the common methods are shown below:
-- Load a data file from HDFS directly into a partition
-- (note: LOAD DATA INPATH moves the source file into the table directory)
LOAD DATA INPATH '/user/hive/warehouse/logs/page_views_20240101.log'
INTO TABLE page_view_logs
PARTITION (dt='2024-01-01');
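When the file still lives on the client's local filesystem, the LOCAL keyword copies it in instead of moving it within HDFS (local path hypothetical):

LOAD DATA LOCAL INPATH '/tmp/page_views_20240101.log'  -- hypothetical local path
INTO TABLE page_view_logs
PARTITION (dt='2024-01-01');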
-- Recommended: manage the data with an external table, declaring the partitioning,
-- field delimiter, and storage location, so the files stay shareable with other systems
CREATE EXTERNAL TABLE IF NOT EXISTS page_view_logs_external (
    log_id BIGINT,
    page_url STRING,
    user_id STRING,
    event_type STRING,
    server_time TIMESTAMP,
    client_time TIMESTAMP,
    ip_address STRING,
    user_agent STRING,
    other_info STRING
)
COMMENT 'external page view log table'
PARTITIONED BY (dt STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION '/data/logs/page_views';
Because dropping an external table removes only the metadata, this avoids losing data to an accidental DROP, and it keeps data management flexible.
-- Register partitions that already exist on disk so Hive can see them
MSCK REPAIR TABLE page_view_logs_external;
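MSCK REPAIR scans the whole table location; when an ingestion job already knows which partition just landed, registering it explicitly is cheaper. A minimal sketch, assuming the standard dt= directory layout:

ALTER TABLE page_view_logs_external
ADD IF NOT EXISTS PARTITION (dt='2024-01-01')
LOCATION '/data/logs/page_views/dt=2024-01-01';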
3. Core Analysis Scripts
Daily statistics
-- Populate the daily statistics partition. A LATERAL VIEW cannot host a correlated
-- subquery in Hive, so the peak hour is computed in a ranked subquery and joined
-- to the per-page daily aggregates instead.
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION (stat_month='2024-01')
SELECT
    daily.dt AS stat_date,
    daily.page_url,
    daily.pv_count,                                   -- total page views
    daily.uv_count,                                   -- unique visitors
    ROUND(daily.pv_count / daily.uv_count, 2) AS avg_pv_per_user,
    peak.peak_hour,                                   -- hour with the most views
    CURRENT_TIMESTAMP AS create_time
FROM (
    SELECT
        dt,
        page_url,
        COUNT(*) AS pv_count,
        COUNT(DISTINCT user_id) AS uv_count
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
    GROUP BY dt, page_url
) daily
JOIN (
    -- hour of day with the highest page-view count for each page
    SELECT page_url, peak_hour
    FROM (
        SELECT
            page_url,
            CAST(DATE_FORMAT(server_time, 'HH') AS INT) AS peak_hour,
            ROW_NUMBER() OVER (PARTITION BY page_url ORDER BY COUNT(*) DESC) AS rn
        FROM page_view_logs
        WHERE dt = '2024-01-01'
          AND event_type = 'page_view'
        GROUP BY page_url, DATE_FORMAT(server_time, 'HH')
    ) ranked
    WHERE rn = 1
) peak
ON daily.page_url = peak.page_url;

-- Inspect the daily results: top 10 pages by page views
SELECT * FROM page_view_daily_stats
WHERE stat_month = '2024-01'
ORDER BY pv_count DESC
LIMIT 10;

Hourly (near-real-time) statistics
-- Populate the hourly page view aggregates
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION (stat_date='2024-01-01')
SELECT
    CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') AS stat_hour,
    page_url,
    COUNT(*) AS pv_count,
    COUNT(DISTINCT user_id) AS uv_count,
    CURRENT_TIMESTAMP AS create_time
FROM page_view_logs
WHERE dt = '2024-01-01'
  AND event_type = 'page_view'
  AND server_time >= '2024-01-01 00:00:00'
  AND server_time <  '2024-01-02 00:00:00'
GROUP BY DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), page_url;

-- Inspect the hourly details, ordered by hour and traffic
SELECT * FROM page_view_hourly_stats
WHERE stat_date = '2024-01-01'
ORDER BY stat_hour, pv_count DESC;

4. Advanced Behavior Analysis
Mining user navigation paths
-- Build per-user view sequences and compute page-to-page transitions
-- (which page a user came from when arriving at each page)
WITH user_sessions AS (
    SELECT
        user_id,
        server_time,
        page_url,
        -- LAG returns the page this user viewed immediately before the current one
        LAG(page_url) OVER (PARTITION BY user_id ORDER BY server_time) AS prev_page
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
),
-- Count transitions between each pair of pages
page_transitions AS (
    SELECT
        COALESCE(prev_page, 'ENTRY') AS from_page,  -- no previous page marks an entry
        page_url AS to_page,
        COUNT(*) AS transition_count
    FROM user_sessions
    GROUP BY COALESCE(prev_page, 'ENTRY'), page_url
)
-- Output: source page, target page, transition count, and share per source page
SELECT
    from_page,
    to_page,
    transition_count,
    ROUND(transition_count * 100.0 / SUM(transition_count) OVER (PARTITION BY from_page), 2) AS percentage
FROM page_transitions
ORDER BY from_page, transition_count DESC;
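The same session CTE answers the complementary question of where users drop off: swapping LAG for LEAD marks each user's last page view of the day as an exit. A minimal sketch along the same lines:

-- Exit analysis: pages after which a user viewed nothing else that day
WITH user_sessions AS (
    SELECT
        user_id,
        page_url,
        LEAD(page_url) OVER (PARTITION BY user_id ORDER BY server_time) AS next_page
    FROM page_view_logs
    WHERE dt = '2024-01-01'
      AND event_type = 'page_view'
)
SELECT
    page_url,
    COUNT(*) AS views,
    SUM(CASE WHEN next_page IS NULL THEN 1 ELSE 0 END) AS exits,
    ROUND(SUM(CASE WHEN next_page IS NULL THEN 1 ELSE 0 END) * 100.0 / COUNT(*), 2) AS exit_rate
FROM user_sessions
GROUP BY page_url
ORDER BY exits DESC;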
-- Hourly traffic trend for the hottest pages
-- Take the top 5 pages by PV for the day and break their traffic down by hour
SELECT
    page_url,
    DATE_FORMAT(server_time, 'HH') AS hour,
    COUNT(*) AS pv_count,                -- page views
    COUNT(DISTINCT user_id) AS uv_count  -- unique visitors
FROM page_view_logs
WHERE dt = '2024-01-01'
  AND event_type = 'page_view'
  AND page_url IN (
      -- subquery: the 5 most-viewed pages of the day
      SELECT page_url
      FROM page_view_daily_stats
      WHERE stat_month = '2024-01'       -- partition filter for pruning
        AND stat_date = '2024-01-01'
      ORDER BY pv_count DESC
      LIMIT 5
  )
GROUP BY page_url, DATE_FORMAT(server_time, 'HH')
ORDER BY page_url, hour;
5. Calling Hive from a Java Application via JDBC
The following Java example connects to Hive and runs the page view statistics job end to end. It assumes HiveServer2 is listening on localhost:10000 and that the hive-jdbc driver (plus its Hadoop dependencies) is on the classpath.
import java.sql.*;

public class HivePageViewAnalysis {

    // JDBC driver class and connection URL for HiveServer2
    private static final String DRIVER_NAME = "org.apache.hive.jdbc.HiveDriver";
    private static final String CONNECTION_URL = "jdbc:hive2://localhost:10000/analytics_db";

    public static void main(String[] args) {
        Connection connection = null;
        Statement statement = null;
        ResultSet resultSet = null;
        try {
            // 1. Load the Hive JDBC driver
            Class.forName(DRIVER_NAME);

            // 2. Connect to HiveServer2
            connection = DriverManager.getConnection(CONNECTION_URL, "hive", "");
            statement = connection.createStatement();

            // 3. Run the daily statistics SQL, writing the result into the partition
            String dailyStatsSQL =
                "INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='2024-01') " +
                "SELECT " +
                "  dt AS stat_date, " +
                "  page_url, " +
                "  COUNT(*) AS pv_count, " +
                "  COUNT(DISTINCT user_id) AS uv_count, " +
                "  ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) AS avg_pv_per_user, " +
                "  0 AS peak_hour, " + // simplified; peak-hour logic can be added later
                "  CURRENT_TIMESTAMP AS create_time " +
                "FROM page_view_logs " +
                "WHERE dt = '2024-01-01' " +
                "  AND event_type = 'page_view' " +
                "GROUP BY dt, page_url";
            System.out.println("Running daily statistics job...");
            statement.execute(dailyStatsSQL);
            System.out.println("Daily statistics job finished.");

            // 4. Query the aggregated results
            String querySQL =
                "SELECT stat_date, page_url, pv_count, uv_count, avg_pv_per_user " +
                "FROM page_view_daily_stats " +
                "WHERE stat_month = '2024-01' " +
                "ORDER BY pv_count DESC " +
                "LIMIT 10";
            resultSet = statement.executeQuery(querySQL);

            // 5. Print the result set
            System.out.println("=== Top 10 pages by views ===");
            System.out.println("date\t\tpage URL\t\tPV\tUV\tavg PV/user");
            System.out.println("------------------------------------------------------------");
            while (resultSet.next()) {
                String statDate = resultSet.getString("stat_date");
                String pageUrl = resultSet.getString("page_url");
                long pvCount = resultSet.getLong("pv_count");
                long uvCount = resultSet.getLong("uv_count");
                double avgPv = resultSet.getDouble("avg_pv_per_user");
                System.out.printf("%s\t%s\t%d\t%d\t%.2f%n",
                        statDate, pageUrl, pvCount, uvCount, avgPv);
            }
        } catch (ClassNotFoundException e) {
            System.err.println("Hive JDBC driver not found: " + e.getMessage());
        } catch (SQLException e) {
            System.err.println("Database operation failed: " + e.getMessage());
        } finally {
            // Release JDBC resources
            try {
                if (resultSet != null) resultSet.close();
                if (statement != null) statement.close();
                if (connection != null) connection.close();
            } catch (SQLException e) {
                System.err.println("Failed to close resources: " + e.getMessage());
            }
        }
    }
}
6. Shell Script Scheduling
Daily statistics job script
In production this script is typically invoked nightly by a scheduler; for example, a crontab entry such as 0 2 * * * /opt/scripts/daily_pageview_stats.sh $(date -d yesterday +%F) (path hypothetical) runs it at 02:00 for the previous day.
#!/bin/bash
# Daily Hive statistics job
# Usage: ./daily_pageview_stats.sh 2024-01-01

# Argument check
if [ $# -ne 1 ]; then
    echo "Usage: $0 <date>"
    echo "Example: $0 2024-01-01"
    exit 1
fi

STAT_DATE=$1
STAT_MONTH=$(date -d "$STAT_DATE" +%Y-%m)

echo "Starting page view statistics job for date: $STAT_DATE"
# Run the Hive SQL
hive -e "
SET hive.exec.dynamic.partition = true;
SET hive.exec.dynamic.partition.mode = nonstrict;

-- Daily aggregation
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION(stat_month='$STAT_MONTH')
SELECT
    dt AS stat_date,
    page_url,
    COUNT(*) AS pv_count,
    COUNT(DISTINCT user_id) AS uv_count,
    ROUND(COUNT(*) / COUNT(DISTINCT user_id), 2) AS avg_pv_per_user,
    0 AS peak_hour,
    CURRENT_TIMESTAMP AS create_time
FROM page_view_logs
WHERE dt = '$STAT_DATE'
  AND event_type = 'page_view'
GROUP BY dt, page_url;

-- Hourly aggregation
INSERT OVERWRITE TABLE page_view_hourly_stats PARTITION(stat_date='$STAT_DATE')
SELECT
    CONCAT(DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), ':00') AS stat_hour,
    page_url,
    COUNT(*) AS pv_count,
    COUNT(DISTINCT user_id) AS uv_count,
    CURRENT_TIMESTAMP AS create_time
FROM page_view_logs
WHERE dt = '$STAT_DATE'
  AND event_type = 'page_view'
GROUP BY DATE_FORMAT(server_time, 'yyyy-MM-dd HH'), page_url;
"
Performance tuning
The following session-level settings are commonly prepended to the statistics SQL to improve throughput:

-- Enable parallel execution of independent stages
SET hive.exec.parallel = true;
SET hive.exec.parallel.thread.number = 8;

-- Convert joins against small tables into map joins automatically
SET hive.auto.convert.join = true;

-- Merge small files produced by map/reduce output to keep file counts down
SET hive.merge.mapfiles = true;
SET hive.merge.mapredfiles = true;
SET hive.merge.smallfiles.avgsize = 128000000;
SET hive.merge.size.per.task = 256000000;

-- Compress job output with Snappy to save storage
SET hive.exec.compress.output = true;
SET mapred.output.compression.codec = org.apache.hadoop.io.compress.SnappyCodec;
Exporting results to MySQL
-- External table backed by MySQL via Hive's JDBC storage handler
-- (requires the JDBC storage handler and the MySQL driver jar on the Hive classpath;
-- Hive 3.x syntax shown)
CREATE EXTERNAL TABLE page_view_stats_mysql (
    stat_date STRING,
    page_url STRING,
    pv_count BIGINT,
    uv_count BIGINT,
    avg_pv_per_user DOUBLE,
    peak_hour INT,
    create_time TIMESTAMP
)
STORED BY 'org.apache.hive.storage.jdbc.JdbcStorageHandler'
TBLPROPERTIES (
    "hive.sql.database.type" = "MYSQL",
    "hive.sql.jdbc.driver" = "com.mysql.jdbc.Driver",
    "hive.sql.jdbc.url" = "jdbc:mysql://localhost:3306/analytics_db",
    "hive.sql.dbcp.username" = "root",
    "hive.sql.dbcp.password" = "password",
    "hive.sql.table" = "page_view_stats"
);
-- Push the daily statistics into the MySQL target table
-- (JDBC-backed tables accept INSERT INTO rather than OVERWRITE; tools such as
-- Sqoop are a common alternative for bulk exports)
INSERT INTO TABLE page_view_stats_mysql
SELECT
    stat_date,
    page_url,
    pv_count,
    uv_count,
    avg_pv_per_user,
    peak_hour,
    create_time
FROM page_view_daily_stats
WHERE stat_month = '2024-01';
-- Script variant of the daily aggregation: computes the real peak hour per page
-- instead of the placeholder 0 ($STAT_DATE and $STAT_MONTH are substituted by the shell)
INSERT OVERWRITE TABLE page_view_daily_stats PARTITION (stat_month='$STAT_MONTH')
SELECT
    daily.dt AS stat_date,
    daily.page_url,
    daily.pv_count,
    daily.uv_count,
    ROUND(daily.pv_count / daily.uv_count, 2) AS avg_pv_per_user,
    peak.peak_hour,
    CURRENT_TIMESTAMP AS create_time
FROM (
    SELECT dt, page_url,
           COUNT(*) AS pv_count,
           COUNT(DISTINCT user_id) AS uv_count
    FROM page_view_logs
    WHERE dt = '$STAT_DATE'
      AND event_type = 'page_view'
    GROUP BY dt, page_url
) daily
JOIN (
    -- hour of day with the highest page-view count for each page
    SELECT page_url, peak_hour
    FROM (
        SELECT page_url,
               CAST(DATE_FORMAT(server_time, 'HH') AS INT) AS peak_hour,
               ROW_NUMBER() OVER (PARTITION BY page_url ORDER BY COUNT(*) DESC) AS rn
        FROM page_view_logs
        WHERE dt = '$STAT_DATE'
          AND event_type = 'page_view'
        GROUP BY page_url, DATE_FORMAT(server_time, 'HH')
    ) ranked
    WHERE rn = 1
) peak
ON daily.page_url = peak.page_url;
# Back in the shell script: check the exit status once hive -e returns
if [ $? -eq 0 ]; then
    echo "Page view statistics job succeeded"
else
    echo "Page view statistics job failed"
    exit 1
fi
Workflow overview:
1. Environment: make sure the Hadoop cluster is healthy and the HDFS and Hive services are up.
2. Table initialization: run the DDL in Hive to define schemas and storage formats.
3. Data import: upload raw log files to HDFS and load them with LOAD DATA or via external tables.
4. Analysis: run the aggregation HiveQL scripts to produce the daily access metrics.
5. Export: insert the results into MySQL so front ends and other systems can query them.
6. Verification: confirm success from the command-line output or job logs.