1. Offline case walkthrough and testing (create tables, load data, export the output); a shell script can run the whole flow automatically (optional).
2. Application of Hive in an offline analysis platform.
Ideas:
a. Decide what kind of table to create (ETL filtering/cleaning: write an MR program to filter out empty values and dirty data, and to parse/complete fields).
b. According to the business modules (e.g. user module, order module, geographic-dimension module), create intermediate tables plus large and small tables, so the statistical-analysis stage avoids repeatedly loading the full data volume.
【Case】
1. Create a secondary partition table
@Create a database
------ run in the Hive CLI ------
-- Create the database that holds the tracking tables
create database if not exists db_track;
//Create a secondary partition table
-- Two-level partition table for the raw tracking logs.
-- Named test3 to match the LOAD DATA / desc formatted statements below
-- and the HDFS location /test/test3.
create table test3 (
    id           string,
    url          string,
    referer      string,
    keyword      string,
    type         string,
    guid         string,
    pageId       string,
    moduleId     string,
    linkId       string,
    attachedInfo string,
    sessionId    string,
    trackerU     string,
    trackerType  string,
    ip           string,
    trackerSrc   string,
    -- ... remaining log fields elided in the original notes ...
    provinceId   string,
    cityId       string,
    fee          string,
    buttonPosition string
)
-- two-level partitioning: by day, then by hour
-- NOTE(review): `date` is a reserved word in newer Hive versions;
-- consider renaming the partition column or backquoting it there.
partitioned by (date string, hour string)
row format delimited fields terminated by '\t'
location '/test/test3';
@Use `desc formatted test3` to view the table and partition information
@Load data into the partition table
-- Load one hourly local log file into the matching (date, hour) partition
load data local inpath '/home/vampire/test3/20150828/2015082818'
into table test3 partition (date="20150828", hour="18");
load data local inpath '/home/vampire/test3/20150828/2015082819'
into table test3 partition (date="20150828", hour="19");
@Create another table (a temporary/external table) to receive the required field information (as needed)
-- Result table for the PV/UV/IP statistics.
-- (Name kept as "resoult" — a typo of "result" — because later
-- statements reference it by this name.)
-- The original notes put a semicolon before the ROW FORMAT clause,
-- which orphaned it; the clause must precede the terminator.
create table resoult (
    date string,
    hour string,
    pv   int,
    uv   int,
    ip   int
)
row format delimited fields terminated by '\t';
Remember to run `desc formatted resoult` to check the table's basic information;
@Query the required data and insert the result into the result table;
-- pv = page views (count of url hits), uv = unique visitors (distinct guid),
-- ip = distinct source IPs, computed per (date, hour) partition.
insert overwrite table resoult
select
    date,
    hour,
    count(url)          as pv,
    count(distinct guid) as uv,
    count(distinct ip)   as ip
from test3
group by date, hour;
Hive multiple ways to create tables: http://blog.csdn.net/qq_39532946/article/details/76595182
Example:
-- CTAS: create and populate test4 in a single statement
create table test4
row format delimited fields terminated by ','
as
select
    date,
    hour,
    count(url)          as pv,
    count(distinct guid) as uv,
    count(distinct ip)   as ip
from test3
group by date, hour;
@Export the statistics table to MySQL (sqoop supports data transfer and table CRUD operations between MySQL/HDFS/Hive);
------ run in the MySQL client ------
-- MySQL target table for the sqoop export below.
-- `date` and `hour` are backquoted defensively since both are
-- keywords in some MySQL modes.
create table dairy (
    `date` varchar(20),
    `hour` varchar(20),
    pv int,
    uv int,
    ip int
);
------ sqoop export ------
# Export the Hive statistics table (test4) into the MySQL table "dairy".
# Confirm the warehouse directory for --export-dir with
# `desc formatted test4` in Hive before running.
bin/sqoop export \
  --connect jdbc:mysql://vampire04:3306/test \
  --username root \
  --password 123456 \
  --table dairy \
  --num-mappers 1 \
  --export-dir /user/hive/warehouse/test.db/test4 \
  --fields-terminated-by ','
@A shell script combined with a cron schedule can run periodically to overwrite or append the latest data to the tracking table.
-- Skeleton of the tracking-log table used by the scheduled load script
-- below (named testlog to match the `hive -e` statement in the script).
create table testlog (
    id            string,
    productId     string,
    curMerchantId string,
    -- ... remaining fields elided in the original notes ...
    resultSum     string,
    currentPage   string,
    linkPosition  string,
    buttonPosition string
)
partitioned by (date string, hour string)
row format delimited fields terminated by '\t';
#!/bin/sh
# Load yesterday's hourly tracking logs into the partitioned Hive table.
# Source the environment so hive and friends are on PATH; the leading
# ". " (dot, space) executes the file in the current shell.
. /etc/profile

# Hive installation directory
HIVE_DIR=/opt/modules/cdh/hive-0.13.1-cdh5.3.6
# Root directory of the raw track logs, one sub-directory per day
DATA_LOG=/home/vampire/tracklogs
# Yesterday's date in yyyymmdd form, e.g. 20150828
# (try: echo $(date --date="1 day ago" +%Y%m%d))
YESTERDAY=$(date --date="1 day ago" +%Y%m%d)

# One loop iteration per hourly log file under yesterday's directory.
for LOG_DAY in $(ls "$DATA_LOG/$YESTERDAY")
do
    # File names look like yyyymmddHH: chars 0-7 are the date, 8-9 the hour.
    # NOTE(review): ${var:offset:length} is a bash-ism; run with bash or
    # a sh that supports it — confirm against the target system.
    DATE=${LOG_DAY:0:8}
    HOUR=${LOG_DAY:8:2}
    # While testing, prefix the next command with echo to dry-run it.
    "$HIVE_DIR/bin/hive" -e "load data local inpath '$DATA_LOG/$YESTERDAY/${DATE}${HOUR}' into table testdb.testlog partition (date='$DATE',hour='$HOUR')"
done
=== crontab scheduled task
# Run the load script every day at 00:30.
# A crontab time spec has exactly 5 fields (min hour dom mon dow);
# the original entry had a stray sixth asterisk.
30 0 * * * /bin/sh /home/vampire/loaddata.sh