# Big data — hash-partitioned query counting.
# Based on: "How to quickly tackle 99% of massive-data processing interview
# questions" — http://blog.csdn.net/v_july_v/article/details/7382693
import heapq
import operator
4: def hashfiles ():
5:
6: files = []
7: for in range (0, 10):
8: '. txt 'w ')
9:
Ten: queryfile = File ('./data/queryfile.txt 'R ')
One : for in queryfile:
: files[hash (query)%10].write (query)
:
: queryfile.close ()
:
: for in files:
: f.close ()
:
: def sortqueriesinfiles ():
: files = []
: For in range (0, 10):
: '. txt 'r+')
At :
: for in files:
: D = {}
: for in F:
+ : query = Query.strip ()
: if in D:
: d[query] + = 1
: Else:
To : D[query] = 1
: operator. Itemgetter (1))
:
: f.seek (0, 0)
: f.truncate ()
In Sorted_d:
PNS: ' \ t '\ n')
: f.close ()
:
Max : def iteratefiles (f):
A : for in F:
A : query, Count = Line.split (' \ t ', 1)
: yield (-int(count), query)
:
: def mergefiles ():
* : files = []
: for in range (0, 10):
() : '. txt 'r')
£ º
: dest_file = file (' dest.txt 'w ')
Wuyi:
(in) :
: Print Line
Wu : ' \ n ')
:
: dest_file.close ()
£ º
"for "in files:
: f.close ()
:
A : if' __main__ ':
:
: hashfiles ()
: sortqueriesinfiles ()
: mergefiles ()