How can we find the most frequently occurring URLs among 1 million URLs?
This is a phone interview question from Alibaba. The first solution I give here is somewhat limited; I will write up the optimizations later.
Experimental data, collected from Baidu with Python:
# -*- coding: utf-8 -*-
"""
Spyder Editor

This is a temporary script file.
"""
import urllib2
import re
import os

# connect to a URL; each search returns about 200 links
file_url = open('url.txt', 'ab+')

# something to put in the search box; it can be set to a different number
# so that the results of each search are not the same
search = '000000'
url = "http://www.baidu.com/s?wd=" + search

def setUrlToFile():
    website = urllib2.urlopen(url)
    # read the html code
    html = website.read()
    # use re.findall to get all the links
    links = re.findall('"((http|ftp)s?://.*?)"', html)
    for s in links:
        print s[0]
        if len(s[0]) < 256:
            file_url.write(s[0] + '\r\n')

# collect the experimental data: roughly 5000 searches at ~200 links each
# is on the order of 1 million URLs (adjust the count as needed)
for i in range(5000):
    setUrlToFile()

file_url.close()

### you need to re-open the file and then read it
file_url = open('url.txt', 'r')
file_lines = len(file_url.readlines())
print "there are %d url in %s" % (file_lines, file_url)
file_url.close()
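The link extraction above is just the regular expression "((http|ftp)s?://.*?)" applied to the downloaded HTML. Since the rest of the post works in C++, here is a minimal, self-contained sketch of the same extraction step using std::regex; the HTML fragment and the variable names are made up for illustration and are not part of the original post.

// extract_urls.cpp : the same quoted-URL pattern as the Python script, applied to a sample string
#include <iostream>
#include <regex>
#include <string>

int main()
{
    // made-up HTML fragment standing in for a downloaded results page
    std::string html =
        "<a href=\"http://www.baidu.com/link?url=abc\">a</a>"
        "<a href=\"https://example.com/page\">b</a>";

    // a quoted http/https/ftp/ftps URL, exactly the pattern used in the Python script
    std::regex url_re("\"((http|ftp)s?://.*?)\"");

    for (std::sregex_iterator it(html.begin(), html.end(), url_re), end; it != end; ++it)
        std::cout << it->str(1) << '\n';   // capture group 1 is the URL itself

    return 0;
}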
The C++ program reads url.txt into a map<string, int>, sorts the map entries by value, and takes the first 100. It runs in about 55 s, which is still quite fast; URL length is limited to less than 256 characters:
// ComputeTime.h
#pragma once
/*
// class for measuring the running time of a code segment
*/
#ifndef ComputeTime_h
#define ComputeTime_h
#include <iostream>

// unit: millisecond
class ComputeTime
{
private:
    int Initialized;
    __int64 Frequency;
    __int64 BeginTime;
public:
    bool Avaliable();
    double End();
    bool Begin();
    ComputeTime();
    virtual ~ComputeTime();
};
#endif

// ComputeTime.cpp
#include "stdafx.h"
#include "ComputeTime.h"
#include <iostream>
#include <Windows.h>

ComputeTime::ComputeTime()
{
    Initialized = QueryPerformanceFrequency((LARGE_INTEGER*)&Frequency);
}

ComputeTime::~ComputeTime()
{
}

bool ComputeTime::Begin()
{
    if (!Initialized)
        return 0;
    return QueryPerformanceCounter((LARGE_INTEGER*)&BeginTime);
}

double ComputeTime::End()
{
    if (!Initialized)
        return 0;
    __int64 endtime;
    QueryPerformanceCounter((LARGE_INTEGER*)&endtime);
    __int64 elapsed = endtime - BeginTime;
    return ((double)elapsed / (double)Frequency) * 1000.0;   // unit: millisecond
}

bool ComputeTime::Avaliable()
{
    return Initialized;
}

// sortUrl.cpp : defines the entry point of the console application
#include "stdafx.h"
#include <utility>
#include <vector>
#include <map>
#include <fstream>
#include <iostream>
#include <string>
#include <algorithm>
#include "ComputeTime.h"
using namespace std;

map<string, int> urlfrequency;
typedef pair<string, int> PAIR;

// order entries by descending frequency
struct CmpByValue
{
    bool operator()(const PAIR& lhs, const PAIR& rhs)
    {
        return lhs.second > rhs.second;
    }
};

// copy the map entries into a vector, sort the vector by value, and print the top 100
void find_largeTH(map<string, int> urlfrequency)
{
    vector<PAIR> url_quency_vec(urlfrequency.begin(), urlfrequency.end());
    sort(url_quency_vec.begin(), url_quency_vec.end(), CmpByValue());
    // url_quency_vec.size()
    for (int i = 0; i != 100; ++i)
    {
        cout << url_quency_vec[i].first << endl;
        cout << url_quency_vec[i].second << endl;
    }
}

// build the url frequency map: insert a new url with count 1, or increment the existing count
void insertUrl(string url)
{
    pair<map<string, int>::iterator, bool> Insert_Pair;
    Insert_Pair = urlfrequency.insert(map<string, int>::value_type(url, 1));
    if (Insert_Pair.second == false)
    {
        ++(Insert_Pair.first->second);
    }
}

int _tmain(int argc, _TCHAR* argv[])
{
    fstream URLfile;
    char buffer[1024];
    URLfile.open("url.txt", ios::in | ios::out | ios::binary);
    if (!URLfile.is_open())
    {
        cout << "Error opening file";
        exit(1);
    }
    else
    {
        cout << "open file success!" << endl;
    }

    ComputeTime cp;
    cp.Begin();
    int i = 0;
    while (!URLfile.eof())
    {
        URLfile.getline(buffer, 1024);
        // cout << buffer << endl;
        string temp(buffer);
        // cout << i++ << endl;
        insertUrl(temp);
    }
    find_largeTH(urlfrequency);
    cout << "running time: " << cp.End() << " ms" << endl;
    getchar();
    // system("pause");
    return 0;
}
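One direction for the optimization promised at the top: copying every map entry into a vector and fully sorting it costs O(n log n) in the number of distinct URLs, but only the top 100 are needed. A bounded min-heap of size 100 reduces the selection step to O(n log 100). The sketch below is my own illustration of that idea, not the author's follow-up code; it keeps the file name url.txt but otherwise uses its own names, and it uses std::unordered_map instead of std::map because ordering by key is not required.

// topk_urls.cpp : count URL frequencies, then keep only the 100 most frequent with a min-heap
#include <fstream>
#include <functional>
#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main()
{
    std::unordered_map<std::string, int> freq;   // url -> occurrence count

    std::ifstream in("url.txt");
    std::string line;
    while (std::getline(in, line))
    {
        if (!line.empty() && line.back() == '\r')   // the collector wrote "\r\n" line endings
            line.pop_back();
        if (!line.empty())
            ++freq[line];
    }

    // min-heap of (count, url): the least frequent of the current candidates sits on top
    using Entry = std::pair<int, std::string>;
    std::priority_queue<Entry, std::vector<Entry>, std::greater<Entry>> heap;
    const std::size_t K = 100;

    for (const auto& kv : freq)
    {
        heap.emplace(kv.second, kv.first);
        if (heap.size() > K)
            heap.pop();                             // evict the least frequent candidate
    }

    // pop into a vector so the output runs from most to least frequent
    std::vector<Entry> top;
    while (!heap.empty())
    {
        top.push_back(heap.top());
        heap.pop();
    }
    for (auto it = top.rbegin(); it != top.rend(); ++it)
        std::cout << it->second << "  " << it->first << std::endl;

    return 0;
}

At the scale of this experiment the difference is likely modest, since the run time is probably dominated by reading the file and building the map rather than by the final sort, but the bounded heap becomes important once the number of distinct URLs is too large to sort comfortably as a whole.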
Experimental results: (the original post shows a screenshot of the program's output here).