搜尋引擎的那些事(多線程web遍歷)

來源:互聯網
上載者:User

【 聲明:著作權所有,歡迎轉載,請勿用於商業用途。  聯絡信箱:feixiaoxing@163.com】 

    上面一篇部落格當中,我們可以利用單一的線程完成網頁的下載。今天,我們打算在此基礎上完成多線程的訪問和載入操作。使用多線程,倒不是因為這項技術有多牛,而是因為我們想充分利用線程的阻塞時間:當一個線程在等待網路回應時,其他線程可以繼續工作。這樣在單位時間內就能完成更多的下載操作,至少可以協助我們提高一部分效率。

    要做到這一點的話,首先要對我們原來的代碼進行少許的改造。怎麼改造呢?首先就是我們需要把網頁分成已訪問和待訪問兩種。一方面,我們對網頁進行下載,另一方面我們可以利用網頁中的連結資訊下載新的網頁。如果把所有網頁看成是一個隊列,那麼就有一個front、一個end。每一個線程不斷地從end處讀取一個已下載的檔案,擷取到其中的http地址之後,再把對應的新網頁下載並插入到front處,整個過程是一個while死迴圈過程。

/* index to read html file */static int get_read_index(){int index;WaitForSingleObject(h_index, INFINITE);if(end == front){ReleaseSemaphore(h_index, 1, NULL);return -1;}index = end ++;ReleaseSemaphore(h_index, 1, NULL);return index;}/* index to write html file */static int get_write_index(){int index;WaitForSingleObject(h_index, INFINITE);index = front ++;ReleaseSemaphore(h_index, 1, NULL);return index;}

    上面的過程就是記錄索引的過程。當然,我們還需要進行另外一項改造工作,那就是把原來的函數工作模式修改成while(1)的線程工作模式。對於線程來說,就是不斷地讀取隊列、不斷地分析檔案、不斷地插入隊列,就是這麼簡單。

/*
 * Worker thread entry: download page and its linked pages.
 *
 * Loops forever: takes the next read index (polling every 100 ms while
 * get_read_index() reports -1), loads "E:/download/<index>.html" into memory,
 * hands the text to get_http_and_download() and frees it again.
 *
 * param: unused.
 * The function never returns.
 */
DWORD WINAPI download_page_entry(LPVOID param)
{
    /* get_file_content() takes a void**, so the buffer must be a void*;
     * passing the address of a char* is an incompatible pointer type. */
    void* buffer;
    int size;
    char name[64];
    int index;

    (void)param;

    while (1) {
        while (-1 == (index = get_read_index())) {
            Sleep(100);
        }

        /* snprintf always NUL-terminates, so no memset is needed first. */
        snprintf(name, sizeof name, "E:/download/%d.html", index);

        if (OK == get_file_content(name, &buffer, &size)) {
            get_http_and_download(buffer);
            free(buffer);
        }
    }
}

    上面兩段代碼就是我們所做的基本修改動作。當然還有其他一些注意事項,比如說,注意第一個網頁的下載過程、在產生可執行檔的時候一定要選用MultithreadDebugDll選項,編寫代碼的時候注意進行拷機測試。最後的話,就啥也不說了,貼上源碼。當然,還是和原來一樣,如果大家希望可以執行這段代碼的話,一定要先在E盤建立一個download目錄,其他的就什麼也不用管了。

#include <stdio.h>#include <windows.h>#include <wininet.h>#include <assert.h>#ifdef ERROR#undef ERROR#endif#define U8 unsigned char#define U32 unsigned int#define STATUS unsigned int#define OK 0#define ERROR (~0L)#define MAX_BLOCK_SIZE 1024#define MAX_DOMAIN_NAME_LENGTH 64#define MAX_THREAD_NUMBER (8)#pragma comment(lib, "wininet.lib")static STATUS download_web_page(const char* url, const char* path);static int end = 0;static int front = 1;static HANDLE h_index;/* index to read html file */static int get_read_index(){int index;WaitForSingleObject(h_index, INFINITE);if(end == front){ReleaseSemaphore(h_index, 1, NULL);return -1;}index = end ++;ReleaseSemaphore(h_index, 1, NULL);return index;}/* index to write html file */static int get_write_index(){int index;WaitForSingleObject(h_index, INFINITE);index = front ++;ReleaseSemaphore(h_index, 1, NULL);return index;}/* get length of html file */static int get_file_size(const char* path){HANDLE hFile;int size = 0;hFile = CreateFile(path, FILE_READ_EA, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);if (hFile != INVALID_HANDLE_VALUE)    {size = GetFileSize(hFile, NULL);        CloseHandle(hFile);    }return size;}/* get all data from html file */static STATUS get_file_content(const char* path, void** pp_buffer, int* size){int length;char* buffer;HANDLE hFile;if(NULL == path){return ERROR;}if(NULL == pp_buffer){return ERROR;}if(NULL == size){return ERROR;}length = get_file_size(path);if(0 == length){return ERROR;}buffer = (char*) malloc(length +1);if(NULL == buffer){return ERROR;}buffer[length] = '\0';hFile = fopen(path, "r+b");if(NULL == hFile){free(buffer);return ERROR;}fread(buffer, 1, length, hFile);fclose(hFile);*pp_buffer = buffer;*size = length;return OK;}/* show all http name, sometimes just for debug use */static void print_http_name(const char* buffer, int size){while(size --){printf("%c", *buffer ++);}printf("\n");}static void download_linked_page(const char* url, int size){char* data;char name[64];print_http_name(url, 
size);data = (char*)malloc(size + 1);if(NULL == data){return;}data[size] = '\0';memmove(data, url, size);memset(name, 0, 64);sprintf(name, "E:/download/%d.html", get_write_index());download_web_page(data, name);/*  free data memroy, which contained http domain name */free(data);}/* get http form html file, then download it by its name*/static void get_http_and_download(const char* buffer){const char* prev;const char* next;char letter;int count;if(NULL == buffer){return;}next = buffer;while(1){next = strstr(next, "http://");if(NULL == next){break;}count = MAX_DOMAIN_NAME_LENGTH;prev = next;next += strlen("http://");while(1){if(!count){break;}count --;letter = *next;if('"' == letter || '\'' == letter || ')' ==  letter || '>' == letter){break;}next ++;}if(count){download_linked_page(prev, next - prev);}}}/* implement page download */static STATUS download_web_page(const char* url, const char* path){U8 buffer[MAX_BLOCK_SIZE];U32 iNumber;FILE* hFile;HINTERNET hSession;HINTERNET hUrl;STATUS result;hSession = InternetOpen("RookIE/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);if(NULL == hSession){return ERROR;}hUrl = InternetOpenUrl(hSession, url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);if(NULL == hUrl){result = ERROR;goto error1;}hFile = fopen(path, "wb");if(NULL == hFile){result = ERROR;goto error2;}iNumber = 1;while(iNumber > 0){InternetReadFile(hUrl, buffer, MAX_BLOCK_SIZE -1, &iNumber);fwrite(buffer, sizeof(char), iNumber, hFile);}fclose(hFile);result = OK;error2:InternetCloseHandle(hUrl);error1:InternetCloseHandle(hSession);return result;}/* download page and its linked pages */DWORD WINAPI download_page_entry(LPVOID param){char* buffer;int size;char name[64];int index;while(1){while( -1 == (index = get_read_index())){Sleep(100);}memset(name, 0, 64);sprintf(name, "E:/download/%d.html", index);if(OK == get_file_content(name, &buffer, &size)){get_http_and_download(buffer);free(buffer);}}}/* entry of programme */int main(int argc, char* argv[]){int index;HANDLE 
h_download[MAX_THREAD_NUMBER];h_index = CreateSemaphore(NULL, 1, 1, NULL);if(NULL == h_index){assert(0);}/* 0.html is just the start page */download_web_page("http://book.dangdang.com", "E:/download/0.html");for(index = 0; index < MAX_THREAD_NUMBER; index ++){h_download[index] = CreateThread(NULL, 0, download_page_entry, 0, 0, NULL);if(NULL == h_download[index]){assert(0);}}WaitForMultipleObjects(MAX_THREAD_NUMBER, h_download, TRUE, INFINITE);CloseHandle(h_index);return 1;}

 

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.