1. The TPC-DS tool can be downloaded from:
http://www.tpc.org/tpc_documents_current_versions/current_specifications.asp
1. Installation dependencies
Yum-y install gcc gcc-c++ libstdc++-devel Bison BYACC Flex
2. Installation
unzip a3083c5a-55ae-49bc-8d6f-cc2ab508f898-tpc-ds-tool.zip
cd v2.3.0/tools
make
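A quick check that the build succeeded (the generators are produced in the same tools directory):
ls -l dsdgen dsqgen   # dsdgen generates the data; dsqgen generates the benchmark queries
./dsdgen -help        # should print the usage text if the binary built correctly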
3. Generating data
Generate 10 TB of data:
./dsdgen -scale 10000 -dir /dfs/data
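The output can be verified with (path as in the command above):
ls -lh /dfs/data/*.dat   # one .dat file per table when dsdgen is run without -parallel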
Background data generation
100 GB data generation. The chunks can be produced on different machines at the same time by varying the number behind -child; the degree of parallelism (-parallel) is up to you. For example, it is 10 here, so make sure all 10 child jobs are run, otherwise the data set will be incomplete. (A loop version is sketched after the commands below.)
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 1 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 2 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 3 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 4 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 5 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 6 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 7 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 8 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 9 > nohup.log 2>&1 &
nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child 10 > nohup.log 2>&1 &
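The ten commands above can also be launched in one loop. A minimal sketch (same flags as above; each child gets its own log file so the concurrent outputs do not clobber one another):
for c in $(seq 1 10); do
  nohup ./dsdgen -scale 100 -dir /dfs/data/ -parallel 10 -child "$c" > "dsdgen_${c}.log" 2>&1 &
done
wait   # block until every child finishes, so the data set is complete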
1) Upload the local data to HDFS
2) Start the upload with the hadoop shell command:
3) nohup hadoop fs -put /dfs/data/* /tpc_ds > nohup.log 2>&1 &
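Note that the /tpc_ds target directory must exist before the put; afterwards the upload can be sanity-checked:
hadoop fs -mkdir -p /tpc_ds   # run before the put: create the target directory
hadoop fs -ls /tpc_ds         # spot-check that the .dat files arrived
hadoop fs -du -s -h /tpc_ds   # the total size should match what dsdgen produced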
Create the tables in Hive
git clone https://github.com/hortonworks/hive-testbench.git
[root@namenode01 text]# pwd
/root/hive-testbench/ddl-tpcds/text
The table creation statements are already provided in this directory; adapt them to your own setup as needed:
alltables.sql  analyze_everything.sql
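The ready-made DDL can be applied with the hive CLI. A sketch, assuming alltables.sql takes DB and LOCATION substitution variables as in hive-testbench (check the file itself before running):
hive -f alltables.sql -d DB=tpc_ds -d LOCATION=/tpc_ds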
create database tpc_ds;
create database tpc_ds2;
use tpc_ds;
drop table if exists call_center;
create external table call_center (
cc_call_center_sk bigint
, cc_call_center_id string
, cc_rec_start_date string
, cc_rec_end_date string
, cc_closed_date_sk bigint
, cc_open_date_sk bigint
, cc_name string
, cc_class string
, cc_employees int
, cc_sq_ft int
, cc_hours string
, cc_manager string
, cc_mkt_id int
, cc_mkt_class string
, cc_mkt_desc string
, cc_market_manager string
, cc_division int
, cc_division_name string
, cc_company int
, cc_company_name string
, cc_street_number string
, cc_street_name string
, cc_street_type string
, cc_suite_number string
, cc_city string
, cc_county string
, cc_state string
, cc_zip string
, cc_country string
, cc_gmt_offset double
, cc_tax_percentage double
)
row format delimited fields terminated by '|'
stored as textfile;
The DDL for the remaining tables follows the same pattern.
Load the data:
load data inpath '/tpc_ds/call_center*.dat' into table call_center;
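Every table is loaded the same way, so a loop saves typing. A minimal sketch with an abridged table list (the full TPC-DS schema has 24 tables; beware that a short name's glob can also match a longer table's files, e.g. store* also matches store_sales_*.dat, so pick the patterns carefully):
for t in call_center date_dim item promotion inventory store_sales; do
  hive --database tpc_ds -e "load data inpath '/tpc_ds/${t}*.dat' into table ${t};"
done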
Create the Parquet columnar tables
use tpc_ds2;
create external table call_center (
cc_call_center_sk bigint
, cc_call_center_id string
, cc_rec_start_date string
, cc_rec_end_date string
, cc_closed_date_sk bigint
, cc_open_date_sk bigint
, cc_name string
, cc_class string
, cc_employees int
, cc_sq_ft int
, cc_hours string
, cc_manager string
, cc_mkt_id int
, cc_mkt_class string
, cc_mkt_desc string
, cc_market_manager string
, cc_division int
, cc_division_name string
, cc_company int
, cc_company_name string
, cc_street_number string
, cc_street_name string
, cc_street_type string
, cc_suite_number string
, cc_city string
, cc_county string
, cc_state string
, cc_zip string
, cc_country string
, cc_gmt_offset double
, cc_tax_percentage double
)
stored as parquet;
Load the data into the Parquet table:
insert overwrite table call_center select * from tpc_ds.call_center;
The other tables are loaded the same way; performance can then be tested.
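As above, the conversion can be scripted; a sketch with the same abridged table list:
for t in call_center date_dim item promotion inventory store_sales; do
  hive -e "insert overwrite table tpc_ds2.${t} select * from tpc_ds.${t};"
done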
SQL statement (TPC-DS query 53):
select * from
(select i_manufact_id,
        sum(ss_sales_price) sum_sales,
        avg(sum(ss_sales_price)) over (partition by i_manufact_id) avg_quarterly_sales
 from item, store_sales, date_dim, store
 where ss_item_sk = i_item_sk and
       ss_sold_date_sk = d_date_sk and
       ss_store_sk = s_store_sk and
       d_month_seq in (1212, 1212+1, 1212+2, 1212+3, 1212+4, 1212+5,
                       1212+6, 1212+7, 1212+8, 1212+9, 1212+10, 1212+11) and
       ((i_category in ('Books', 'Children', 'Electronics') and
         i_class in ('personal', 'portable', 'reference', 'self-help') and
         i_brand in ('scholaramalgamalg #14', 'scholaramalgamalg #7',
                     'exportiunivamalg #9', 'scholaramalgamalg #9')) or
        (i_category in ('Women', 'Music', 'Men') and
         i_class in ('accessories', 'classical', 'fragrances', 'pants') and
         i_brand in ('amalgimporto #1', 'edu packscholar #1',
                     'exportiimporto #1', 'importoamalg #1')))
 group by i_manufact_id, d_qoy) tmp1
where case when avg_quarterly_sales > 0
           then abs(sum_sales - avg_quarterly_sales) / avg_quarterly_sales
           else null end > 0.1
order by avg_quarterly_sales, sum_sales, i_manufact_id
limit 100;
2. Clearing the Linux cache
Run as root between test runs so that each query starts with a cold OS cache:
echo 3 > /proc/sys/vm/drop_caches
3. Execution time
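A minimal sketch of a cold-cache timed run, combining steps 2 and 3 (the file name query53.sql is illustrative; it would contain the query above):
sync && echo 3 > /proc/sys/vm/drop_caches    # flush dirty pages, then drop the page cache
time hive --database tpc_ds2 -f query53.sql  # report the wall-clock execution time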