Pig Analysis Script (Hadoop)

Source: Internet
Author: User
-- Location-transition analysis.
-- For every subscriber (imsi) the records are ordered by time, consecutive
-- records are paired, and for each location the three most frequent
-- follow-up locations are reported together with their relative frequency
-- (pair count / total count for the starting location).
--
-- Note: Pig keywords are case-insensitive, but relation aliases, field names
-- and function names are case-sensitive, so the spelling of every alias below
-- must stay consistent.

-- Read data: one pipe-delimited record per line with fields imsi, time, loc.
-- PigStorage only accepts a single-character delimiter.
data = LOAD '/user/mapred/pigdata.txt' USING PigStorage('|') AS (imsi:chararray, time:chararray, loc:chararray);


-- Convert the time format
-- piggybank provides the date/time UDFs; they depend on joda-time.
REGISTER /home/mapred/software/hadoops/pig/pig-0.11.1/contrib/piggybank/java/piggybank.jar;
REGISTER /home/mapred/practise/joda-time-2.0.jar;


DEFINE CustomFormatToISO org.apache.pig.piggybank.evaluation.datetime.convert.CustomFormatToISO();


-- Only the first 13 characters of the time field are passed to the converter,
-- matching the length of the 'yyyy-MM-dd HH' pattern. In the pattern, MM is
-- the month; lower-case mm would mean minutes.
toiso = FOREACH data GENERATE imsi, CustomFormatToISO(SUBSTRING(time, 0, 13), 'yyyy-MM-dd HH') AS time:chararray, loc;


-- Group the records by subscriber
grp = GROUP toiso BY imsi;


-- Pair each record with the record that follows it
REGISTER /home/mapred/practise/datafu-1.2.0.jar;
DEFINE markovPairs datafu.pig.stats.MarkovPairs();


pairs = FOREACH grp
{
    -- MarkovPairs pairs elements in bag order, so sort by time first.
    sorted = ORDER toiso BY time;
    pair = markovPairs(sorted);
    GENERATE FLATTEN(pair) AS (data:tuple(imsi, time, loc), next:tuple(imsi, time, loc));
}


-- Expand the paired tuples into flat columns
prj = FOREACH pairs GENERATE data.imsi AS imsi, data.time AS time, next.time AS next_time, data.loc AS loc, next.loc AS next_loc;




DEFINE ISODaysBetween org.apache.pig.piggybank.evaluation.datetime.diff.ISODaysBetween();


-- Keep only the pairs for which ISODaysBetween reports 0 days between the
-- two timestamps.
flt = FILTER prj BY ISODaysBetween(next_time, time) == 0L;




-- Calculate the total number of records for each starting location


total_count = FOREACH (GROUP flt BY loc) GENERATE group AS loc, COUNT(flt) AS total;


-- Calculate the number of records for each (loc, next_loc) pair
pairs_count = FOREACH (GROUP flt BY (loc, next_loc)) GENERATE FLATTEN(group) AS (loc, next_loc), COUNT(flt) AS cnt;




-- Replicated join: the relation listed last (total_count) is the one held in
-- memory, so it must be the small side.
jnd = JOIN pairs_count BY loc, total_count BY loc USING 'replicated';


-- Share of each (loc, next_loc) pair among all records starting at loc
prob = FOREACH jnd GENERATE pairs_count::loc AS loc, pairs_count::next_loc AS next_loc, (double)cnt / (double)total AS probability;


-- For every starting location keep the three most probable next locations
top3 = FOREACH (GROUP prob BY loc)
{
    sorted = ORDER prob BY probability DESC;
    top = LIMIT sorted 3;
    GENERATE FLATTEN(top);
};


STORE top3 INTO 'output';


-- Print the stored result
cat output;



Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any content on this page confusing, please send us an email, and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support, 6 Free Tickets per Quarter, Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.