Pig has a built-in TOP function, but I could not get it to work here and I don't know why. When I have time I will look at the Pig source code to find out.
-- Top-K per group in Pig Latin.
--
-- Loads the cleaned log, keeps only records whose log_type is '1', counts
-- records per (hour, app_category_id), fans each count out to every distinct
-- ad_network_id seen in that group, and finally keeps the two rows with the
-- largest pv for each (hour, ad_network_id).
--
-- Parameters:
--   date        - substituted into the default input path; must be supplied
--                 on the command line (-param date=YYYY-MM-DD) unless
--                 cleanedLog is overridden.
--   cleanedLog  - input glob of cleaned log part files.
--   output_path - declared for the result location (see note at the end).
--
-- NOTE: relation, field and UDF aliases are case-sensitive in Pig, as are
-- built-in function names (SUBSTRING, COUNT_STAR). Keywords are not.

SET job.name 'top_k';
SET job.priority HIGH;

--REGISTER piggybank.jar;
-- NOTE(review): jar file name assumed to be all lower case - confirm against the deployed artifact.
REGISTER wizad-etl-udf-0.1.jar;

--DEFINE SequenceFileLoader org.apache.pig.piggybank.storage.SequenceFileLoader();
DEFINE SequenceFileLoader com.vpon.wizad.etl.pig.SequenceFileCSVLoader();

--%default cleanedLog /user/wizad/data/wizad/cleaned/2014-07-30/*/part*
%default cleanedLog /user/wizad/data/wizad/cleaned/$date/*/part*
%default output_path /user/wizad/tmp/hour_count

origin_cleaned_data = LOAD '$cleanedLog' USING SequenceFileLoader
    AS (ad_network_id:chararray,
        wizad_ad_id:chararray,
        guid:chararray,
        id:chararray,
        create_time:chararray,
        action_time:chararray,
        log_type:chararray,
        ad_id:chararray,
        positioning_method:chararray,
        location_accuracy:chararray,
        lat:chararray,
        lon:chararray,
        cell_id:chararray,
        lac:chararray,
        mcc:chararray,
        mnc:chararray,
        ip:chararray,
        connection_type:chararray,
        imei:chararray,
        android_id:chararray,
        android_advertising_id:chararray,
        udid:chararray,
        openudid:chararray,
        idfa:chararray,
        mac_address:chararray,
        uid:chararray,
        density:chararray,
        screen_height:chararray,
        screen_width:chararray,
        user_agent:chararray,
        app_id:chararray,
        app_category_id:chararray,
        device_model_id:chararray,
        carrier_id:chararray,
        os_id:chararray,
        device_type:chararray,
        os_version:chararray,
        country_region_id:chararray,
        province_region_id:chararray,
        city_region_id:chararray,
        ip_lat:chararray,
        ip_lon:chararray,
        quadkey:chararray);

-- Keep only records whose log_type equals '1' (the relation name suggests
-- these are "show" events - TODO confirm). The literal must not contain
-- padding spaces, otherwise nothing matches.
show_log = FILTER origin_cleaned_data BY log_type == '1';

-- Extract the columns needed for the analysis and derive a new field:
-- characters 11-12 of create_time, exposed as hour.
-- NOTE(review): assumes create_time looks like 'yyyy-MM-dd HH:mm:ss' - confirm.
original_hour = FOREACH show_log GENERATE
    ad_network_id,
    wizad_ad_id,
    guid,
    app_category_id,
    log_type,
    SUBSTRING(create_time, 11, 13) AS hour;

-- Group by (hour, app_category_id).
hour_group = GROUP original_hour BY (hour, app_category_id);

hour_count = FOREACH hour_group {
    --guid_data = $1.guid;
    --uniq_guid = DISTINCT guid_data; -- de-duplicate, then count the unique values.
    ad_network_ids = original_hour.ad_network_id;
    uniq_ad_network_ids = DISTINCT ad_network_ids;
    -- Count the records in each group and split uniq_ad_network_ids into
    -- individual records. For example, if uniq_ad_network_ids is {3,5},
    -- the group (xx) now yields two records: (xx,3) and (xx,5).
    GENERATE FLATTEN(group), COUNT_STAR(original_hour) AS pv, FLATTEN(uniq_ad_network_ids);
}

DESCRIBE hour_count;
-- Expected structure:
-- hour_count: {group::hour:chararray,group::app_category_id:chararray,pv:long,uniq_ad_network_ids::ad_network_id:chararray}

group_hour_count = GROUP hour_count BY (hour, ad_network_id);

top_2_data = FOREACH group_hour_count {
    --top_dataset = TOP(2, hour_count.pv, hour_count); -- TOP did not work here; reason not yet investigated.
    --hour_data = hour_count;
    -- Top-K implemented as ORDER followed by LIMIT, which returns the first K rows.
    order_hour_count = ORDER hour_count BY pv DESC;
    top2_hour_count = LIMIT order_hour_count 2;
    --GENERATE group, top2_hour_count.pv, top2_hour_count.app_category_id; -- note: the last two are separate bags.
    GENERATE FLATTEN(top2_hour_count);
}

-- NOTE(review): top_2_data is never stored or dumped, so Pig will not launch
-- a job for it. Uncomment to persist the result:
--STORE top_2_data INTO '$output_path';
Copyright notice: This is an original blog article and may not be reproduced without the author's consent.
Pig top-K: for each (hour, ad_network_id) group, return the two records with the largest count (using SUBSTRING, ORDER, COUNT_STAR, LIMIT).