搜尋引擎的那些事(摘取價格資料)

來源:互聯網
上載者:User

【 聲明:著作權,歡迎轉載,請勿用於商業用途。  聯絡信箱:feixiaoxing@163.com】

    下載網頁不難,提取資料其實也不難。前面,我們說到了如何在噹噹網頁中提取title。當然了,不僅僅是噹噹網頁可以提取title,幾乎所有的網頁都可以提取標題。因為噹噹是一家電商網站,所以基本上其標題資訊和它賣的商品是分不開的。但是,現在我們已經不滿足於此了,我們還想從網頁中提取價格資訊,那應該怎麼做呢?

    要從網頁中提取價格資訊,關鍵是要尋找規律,怎麼樣又好又快地將價格資訊找出來。我們可以隨便找一個噹噹的網頁,查看一下它的原始碼資訊,就會發現這樣的資料,大家可以看一下,

     <p>當 當 價:<b id="d_price" class="d_price "><span class="yen">&yen;</span>171.00</b><span class="break"></span></p>

    幾乎所有噹噹網頁商品中都會保留這樣格式的資訊,但是數字當然不一樣了。我們可以想到一個比較簡單的提取方法,就是分成下面兩個步驟:(1)尋找“當 當 價”的起始位置;(2)尋找到起始位置後,尋找第一個數字資訊,就可以發現171.00這些資料了,也就是我們需要的定價資料資訊。

#include <stdio.h>#include <windows.h>#include <wininet.h>#include <assert.h>#ifdef ERROR#undef ERROR#endif#define U8 unsigned char#define U32 unsigned int#define STATUS unsigned int#define OK 0#define ERROR (~0L)#define MAX_BLOCK_SIZE 1024#define HTTP_NAME_ADDRESS "http://product.dangdang.com/main/product.aspx?product_id=22560249&ref=book-11712-3032_1-63349-0"#pragma comment(lib, "wininet.lib")/* show file content */static void show_file_content(char* buffer, int size){while(size --){printf("%c", *buffer++);}}/* find pattern content */static STATUS find_pattern_content(char* buffer, char* start, char* end,  char** pp_buffer, int* size){char* prev;char* next;if(NULL == buffer){return ERROR;}if(NULL == start || NULL == end){return ERROR;}if(NULL == pp_buffer || 0 == size){return ERROR;}next = strstr(buffer, start);if(NULL == next){return ERROR;}prev = next;next += strlen(start);next = strstr(next, end);if(NULL == next){return ERROR;}*pp_buffer = prev + strlen(start);*size = next - (prev + strlen(start));return OK;}/* get length of html file */static int get_file_size(const char* path){HANDLE hFile;int size = 0;hFile = CreateFile(path, FILE_READ_EA, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);if (hFile != INVALID_HANDLE_VALUE)    {size = GetFileSize(hFile, NULL);        CloseHandle(hFile);    }return size;}/* get all data from html file */static STATUS get_file_content(const char* path, void** pp_buffer, int* size){int length;char* buffer;HANDLE hFile;if(NULL == path){return ERROR;}if(NULL == pp_buffer){return ERROR;}if(NULL == size){return ERROR;}length = get_file_size(path);if(0 == length){return ERROR;}buffer = (char*) malloc(length +1);if(NULL == buffer){return ERROR;}buffer[length] = '\0';hFile = fopen(path, "r+b");if(NULL == hFile){free(buffer);return ERROR;}fread(buffer, 1, length, hFile);fclose(hFile);*pp_buffer = buffer;*size = length;return OK;}/* implement page download */static STATUS download_web_page(const char* url, const char* path){U8 
buffer[MAX_BLOCK_SIZE];U32 iNumber;FILE* hFile;HINTERNET hSession;HINTERNET hUrl;STATUS result;hSession = InternetOpen("RookIE/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);if(NULL == hSession){return ERROR;}hUrl = InternetOpenUrl(hSession, url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);if(NULL == hUrl){result = ERROR;goto error1;}hFile = fopen(path, "wb");if(NULL == hFile){result = ERROR;goto error2;}iNumber = 1;while(iNumber > 0){InternetReadFile(hUrl, buffer, MAX_BLOCK_SIZE -1, &iNumber);fwrite(buffer, sizeof(char), iNumber, hFile);}fclose(hFile);result = OK;error2:InternetCloseHandle(hUrl);error1:InternetCloseHandle(hSession);return result;}static STATUS is_char_digital(char value){if(value >= '0' && value <= '9'){return OK;}return ERROR;}/* get product price */static STATUS find_product_price(char* buffer,  char* str, int len){char* prev;char* next;if(NULL == buffer){return ERROR;}if(NULL == str || 0 == len){return ERROR;}memset(str, 0, len);next = strstr(buffer, "當 當 價:");if(NULL == next){return ERROR;}next += strlen("當 當 價:");while(ERROR == is_char_digital(*next)){next ++;}prev = next;while('<' != *next){next ++;}memmove(str, prev, next - prev);return OK;}/* entry of programme */int main(int argc, char* argv[]){char* buffer;char* begin;int length;int size;char price[16];/* 0.html is just the start page */download_web_page(HTTP_NAME_ADDRESS, "E:/0.html");if(OK == get_file_content("E:/0.html", &buffer, &size)){memset(price, 0, 16);if(OK == find_pattern_content(buffer, "<title>", "</title>", &begin, &length))          {  printf("商品:");            show_file_content(begin, length);printf("\n");        } if(OK == find_product_price(buffer, price, 16)){printf("噹噹價:%s\n", price);}free(buffer);}return 1;}

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.