Web crawler (WebCrawler), part 1: fetching web page content over HTTP

Source: Internet
Author: User

Crawling the content of web pages over HTTP with C++ on Windows:

Let's start with two important libraries (both are typically open-source packages on Linux; on Windows you call them through their dynamic-link libraries, i.e. DLLs): libcurl and pthreads (Pthreads_dll). libcurl can be thought of as a command-line browser in library form: you fetch specific web content by calling its built-in functions such as curl_easy_setopt. (To compile and link the imported curl library correctly you also need another package, c-ares.) pthreads is a multithreading control library that provides mutex variables with lock and unlock operations, as well as functions for allocating work to threads.

Download: click the Open link. To import the external dynamic-link libraries correctly in Visual Studio, follow these steps: 1. Project → Properties → Configuration Properties → C/C++ → General → Additional Include Directories (add the path to the include directory). 2. Project → Properties → Configuration Properties → Linker → General → Additional Library Directories (add the path that contains the .lib files). 3. Project → Properties → Configuration Properties → Linker → Input → Additional Dependencies (add libcurld.lib; pthreadVC2.lib; ws2_32.lib; winmm.lib; wldap32.lib; areslib.lib). 4. Project → Properties → Configuration Properties → C/C++ → Preprocessor → Preprocessor Definitions (add _CONSOLE; BUILDING_LIBCURL; HTTP_ONLY).

The specific implementation process is described:

1: A custom HashTable structure stores the strings that have been obtained. It is implemented as the HashTable class, which contains a std::set of hash values, provides add and find operations, and includes several common string hash functions.

Code:

// HashTable.h
#ifndef HashTable_H
#define HashTable_H

#include <set>
#include <string>
#include <vector>

/**
 * Remembers strings by the value of their ELF hash.
 *
 * Only the 32-bit hash values are stored, never the strings themselves, so
 * two different strings that happen to share an ELF hash cannot be told
 * apart by Find().
 *
 * NOTE(review): identifier casing was reconstructed from how the names are
 * used elsewhere in the article (e.g. `HashTable`, `Find`); confirm against
 * the original sources.
 */
class HashTable
{
public:
    HashTable(void);
    ~HashTable(void);

    /**
     * Hashes `str` with ELFHash and records the hash value.
     * @return the hash value that was recorded.
     */
    unsigned int ForceAdd(const std::string& str);

    /**
     * Looks up the ELFHash value of `str` among the recorded values.
     * @return the hash value when it is present, otherwise -1 converted to
     *         unsigned int.
     */
    unsigned int Find(const std::string& str);

    /* Common string hash functions. */
    unsigned int RSHash(const std::string& str);
    unsigned int JSHash(const std::string& str);
    unsigned int PJWHash(const std::string& str);
    unsigned int ELFHash(const std::string& str);
    unsigned int BKDRHash(const std::string& str);
    unsigned int SDBMHash(const std::string& str);
    unsigned int DJBHash(const std::string& str);
    unsigned int DEKHash(const std::string& str);
    unsigned int BPHash(const std::string& str);
    unsigned int FNVHash(const std::string& str);
    unsigned int APHash(const std::string& str);

private:
    std::set<unsigned int> HashFunctionResultSet;  // hash values recorded by ForceAdd()
    std::vector<unsigned int> hhh;                 // only its size was read (and discarded) by the implementation shown
};

#endif
Hashtable.cpp#include "HashTable.h" hashtable::hashtable (void) {}hashtable::~hashtable (void) {}unsigned int Hashtable::forceadd (const std::string& str) {unsigned int i=elfhash (str); Hashfunctionresultset.insert (i); return i;} unsigned int hashtable::find (const std::string& str) {int ff=hhh.size (); const unsigned int i=elfhash (str); std::set <unsigned int>::const_iterator it;if (hashfunctionresultset.size () >0) {it=hashfunctionresultset.find (i); if (It==hashfunctionresultset.end ()) return-1;} else{return-1;} return i;} /* Several common string hash methods implement functions */unsigned int hashtable::aphash (const std::string& str) {unsigned int hash=0xaaaaaaaa;for ( std::size_t i=0;i<str.length (); i++) {hash^= ((i & 1) = = 0)?  ((hash << 7) ^ str[i] * (hash >> 3)): ((hash << one) + str[i] ^ (hash >> 5)));} return hash;}   unsigned int hashtable::bkdrhash (const std::string& str) {unsigned int seed=131; 131 1313 13131 131313 etcunsigned int hash=0;for (std::size_t i=0;i<str.length (); i++) {hash= (hash*seed) +str[i];} return hash;} unsigned int hashtable::bphash (const std::string& str) {unsigned int hash = 0;for (std::size_t i = 0; i < str.length ( ); i++) {hash = hash << 7 ^ str[i];} return hash;} unsigned int HashTable::D ekhash (const std::string& str) {unsigned int hash = static_cast<unsigned int> ( Str.length ()); for (std::size_t i = 0; i < str.length (); i++) {hash = ((hash << 5) ^ (hash >>)) ^ str[i];} return hash;}    unsigned int HashTable::D jbhash (const std::string& str) {unsigned int hash = 5381;    for (std::size_t i = 0; i < str.length (); i++) {hash = ((hash << 5) + hash) + str[i]; } return hash;} unsigned int hashtable::elfhash (const std::string& str) {unsigned int hash=0;unsigned int x=0;for (std::size_t i = 0; i < Str.length (); i++) {hash= (hash<<4) +str[i];if ((x = hash & 0xf0000000l)! 
= 0) hash^= (x>>24); hash&=~x;} return hash;} unsigned int HASHTABLE::FNVHAsh (const std::string& str) {const unsigned int fnv_prime = 0X811C9DC5;    unsigned int hash = 0;         for (std::size_t i = 0; i < str.length (); i++) {hash *= fnv_prime;    Hash ^= str[i]; } return hash;} unsigned int hashtable::jshash (const std::string& str) {unsigned int hash = 1315423911;for (std::size_t i = 0; i < St R.length (); i++) {Hash ^= (hash << 5) + str[i] + (hash >> 2));} return hash;} unsigned int HashTable::P jwhash (const std::string& str) {unsigned int bitsinunsignedint = (unsigned int) (sizeof ( unsigned int) * 8); unsigned int threequarters = (unsigned int) ((Bitsinunsignedint * 3)/4); unsigned int oneeighth = (unsigned int) (BITSINUNSIGNEDINT/8); unsigned int highbits = (unsigned int) (0xFFFFFFFF) << (bitsinunsignedint-oneeighth); unsigned int hash = 0;      unsigned int test = 0; for (std::size_t i = 0; i < str.length (); i++) {hash = (hash << oneeighth) + str[i];  if (test = hash & highbits)! = 0) hash = ((hash ^ (test >> threequarters)) & (~highbits)); } return hash;}    unsigned int hashtable::rshash (const std::string& str) {unsigned int b = 378551;    unsigned int a = 63689;        unsigned int hash = 0;for (std::size_t i = 0; i < str.length (); i++) {hash = hash * a + str[i]; A = a * b;} return hash;} unsigned int hashtable::sdbmhash (const std::string& str) {unsigned int hash = 0;for (std::size_t i = 0; i < Str.lengt H (); i++) {hash = Str[i] + (hash << 6) + (hash << +)-hash;} return hash;}


2: Implement mutual exclusion between threads (the ID of the thread performing the current operation is also provided so that the locking mechanism can work). This is implemented in the SingleTone class. The only way to obtain an object of this class is through its static function Instance, which creates the single, unique instance. The basic HashTable operations are carried out under mutual exclusion; locking and unlocking are implemented by the Mutex class. See the code below:

// Mutex.h
#ifndef mutex_H
#define mutex_H
#pragma once

#include "pthread.h"

/**
 * Scoped lock for a pthread mutex: the constructor locks the mutex and the
 * destructor unlocks it, so the mutex is held exactly as long as the Mutex
 * object is alive.
 *
 * The referenced pthread_mutex_t must already be initialised and must
 * outlive this object; only a reference to it is stored.
 */
class Mutex
{
    pthread_mutex_t& m_mutex;

    // Not copyable: a copy would unlock the same mutex a second time when it
    // is destroyed. Declared private and left undefined on purpose.
    Mutex(const Mutex&);
    Mutex& operator=(const Mutex&);

public:
    /** Locks `m`; blocks until the lock is acquired. */
    explicit Mutex(pthread_mutex_t& m) : m_mutex(m)
    {
        pthread_mutex_lock(&m_mutex);
    }

    /** Unlocks the mutex that was locked by the constructor. */
    ~Mutex(void)
    {
        pthread_mutex_unlock(&m_mutex);
    }
};

#endif


Singletone.h#ifndef singletone_h#define singletone_h#include <string> #include <list> #include < map> #include "Constants.h" #include "HashTable.h" #include "pthread.h" #include "curl/curl.h" class singletone{ Public:static singletone* Instance (); void Push_back (std::string s); void Pop_back (); int size (); STD::LIST&LT;STD:: String>::reference back (); Std::list<std::string>::iterator begin (); std::list<std::string>:: Iterator end (); void Push_front (std::string s); bool empty (); unsigned int get_m_uniquemap_forceadd (const std::string & key,const std::string& URL); unsigned int get_m_uniquemap_find (const std::string& Key,const std::string & URL); HashTable get_m_uniquemap (const std::string& key); void Set_m_uniquemap (const std::string& key,hashtable& hash); curl* Getpcurl ();p rotected:singletone () ~singletone ();p thread_mutex_t m_singleton_mutex;private:static SingleTone * M_psingletone;std::list<std::string> M_linkstack;std::map<std::string,hAshtable> M_uniquemap; CURL *m_pcurl;}; #endif
// SingleTone.cpp
#include "SingleTone.h"
#include "mutex.h"

SingleTone* SingleTone::m_pSingleTone = NULL;

/** Initialises the mutex and creates the shared CURL easy handle. */
SingleTone::SingleTone()
{
    pthread_mutex_init(&m_singleton_mutex, NULL);
    m_pCurl = curl_easy_init();
}

/** Releases the CURL handle (if one was created) and destroys the mutex. */
SingleTone::~SingleTone()
{
    if (m_pCurl != NULL)
    {
        curl_easy_cleanup(m_pCurl);
        m_pCurl = NULL;
    }
    pthread_mutex_destroy(&m_singleton_mutex);
}

/**
 * Returns the unique instance, creating it on the first call.
 * NOTE(review): the NULL check and the allocation are not synchronised, so
 * the first call should happen before any worker threads are started.
 */
SingleTone* SingleTone::Instance()
{
    if (m_pSingleTone == NULL)
    {
        m_pSingleTone = new SingleTone();
    }
    return m_pSingleTone;
}

/** Appends `s` to the link list while holding the mutex. */
void SingleTone::push_back(std::string s)
{
    Mutex m(m_singleton_mutex);
    m_LinkStack.push_back(s);
}

/** Removes the last link while holding the mutex. The list must not be empty. */
void SingleTone::pop_back()
{
    Mutex m(m_singleton_mutex);
    m_LinkStack.pop_back();
}

/** Number of links currently stored (read without taking the mutex). */
int SingleTone::size()
{
    return static_cast<int>(m_LinkStack.size());
}

/** Iterator to the first link (no locking). */
std::list<std::string>::iterator SingleTone::begin()
{
    return m_LinkStack.begin();
}

/**
 * Reference to the last link. The mutex is held only while the reference is
 * obtained, not while the caller uses it. The list must not be empty.
 */
std::list<std::string>::reference SingleTone::back()
{
    Mutex m(m_singleton_mutex);
    return m_LinkStack.back();
}

/** Past-the-end iterator of the link list (no locking). */
std::list<std::string>::iterator SingleTone::end()
{
    return m_LinkStack.end();
}

/** Prepends `s` to the link list while holding the mutex. */
void SingleTone::push_front(std::string s)
{
    Mutex m(m_singleton_mutex);
    m_LinkStack.push_front(s);
}

/** True when no links are stored (read without taking the mutex). */
bool SingleTone::empty()
{
    return m_LinkStack.empty();
}

/**
 * Records `url` in the table stored under `key`, creating an empty table for
 * `key` first if none exists.
 * @return the value returned by HashTable::ForceAdd.
 */
unsigned int SingleTone::get_m_UniqueMap_ForceAdd(const std::string& key, const std::string& url)
{
    Mutex m(m_singleton_mutex);
    return m_UniqueMap[key].ForceAdd(url);
}

/**
 * Looks `url` up in the table stored under `key`.
 *
 * Takes the same mutex as get_m_UniqueMap_ForceAdd so a lookup cannot run
 * while another thread is inserting, and searches the stored table in place
 * instead of copying it.
 *
 * @return the value returned by HashTable::Find, or -1 converted to unsigned
 *         int when no table exists for `key` (the same value Find reports
 *         for a miss).
 */
unsigned int SingleTone::get_m_UniqueMap_Find(const std::string& key, const std::string& url)
{
    Mutex m(m_singleton_mutex);
    std::map<std::string, HashTable>::iterator it = m_UniqueMap.find(key);
    if (it == m_UniqueMap.end())
    {
        return static_cast<unsigned int>(-1);
    }
    return it->second.Find(url);
}

/**
 * Returns a copy of the table stored under `key`; an empty table is created
 * for `key` if none exists.
 */
HashTable SingleTone::get_m_UniqueMap(const std::string& key)
{
    Mutex m(m_singleton_mutex);
    return m_UniqueMap[key];
}

/** Replaces the table stored under `key` with a copy of `hash`. */
void SingleTone::set_m_UniqueMap(const std::string& key, HashTable& hash)
{
    Mutex m(m_singleton_mutex);
    m_UniqueMap[key] = hash;
}

/** Returns the CURL handle created by the constructor (may be NULL if curl_easy_init failed). */
CURL* SingleTone::getpCurl()
{
    return m_pCurl;
}

3: Fetch web content over HTTP. The features include retrieving the content of the initial page, setting the URL, and other functions. This process requires mutual exclusion, so the SingleTone class is used here.

Code:

Http.h#ifndef http_h#define http_h#include "curl/curl.h" #include "pthread.h" #include <string>using namespace Std;class http{public:http (void); ~http (void); bool Initcurl (void); bool Initcurl (const std::string& URL, std::string& szbuffer); bool Deinitcurl (); void SetUrl (const std::string& URL); string SetUrl (); const string GetBuffer ();p rivate:static void writer (void* buffer,size_t size,size_t nmemb,void* f); int SetBuffer (char* buffer,size_ T size,size_t Nmemb); CURL *m_pcurl;char m_errorbuffer[curl_error_size];string m_szbuffer;string m_szurl;pthread_mutex_t M_http_mutex;}; #endif
#include "Http.h" #include "SingleTone.h" #include "mutex.h" http::http (void) {m_pcurl=singletone::instance () Getpcurl ();} Http::~http (void) {}bool http::initcurl (void) {return false;} int Http::setbuffer (char *buffer, size_t size, size_t nmemb) {int result = 0;if (buffer!=null) {m_szbuffer.append (buffer, s ize * nmemb); result = Size * NMEMB;}       buffer = NULL; return result;} void Http::writer (void *buffer, size_t size, size_t nmemb,void* f) {static_cast

Here m_szbuffer stores the content of the web page. The content of the initial web page is stored in the parameter passed to the init function (InitCurl).




Web crawler webcrawler (1)-http Web content Crawl

Contact Us

The content source of this page is from Internet, which doesn't represent Alibaba Cloud's opinion; products and services mentioned on that page don't have any relationship with Alibaba Cloud. If the content of the page makes you feel confusing, please write us an email, we will handle the problem within 5 days after receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.