完善自己的 HTML 解析器(html parse)

來源:互聯網
上載者:User

       說來慚愧,我發現自己先前寫的那個 HTML 解析器(html parse)過於理想化:用來解析 XML 還差不多,要解析現今的 HTML 恐怕不太可能。所以我把程式碼重寫了一遍,將資料結構改成雙向鏈結串列,讓解析速度更快,並盡可能地對 HTML 進行容錯處理。實驗後發現確實可以解析一般的 HTML 字串,不必再嚴格符合 XHTML 標準。此外,鏈式呼叫時不需要再傳入 dom 型別的參數,寫起來也更加簡便。

        好了,廢話不多說,看代碼:

#include <cctype>
#include <cstddef>
#include <cstring>
#include <iostream>
#include <string>
#ifdef _WIN32
#include "windows.h"
#include "wininet.h"
#pragma comment(lib,"wininet.lib")
#endif
using namespace std;

// ---------------------------------------------------------------------------
// A small, forgiving HTML parser.
//
// dom      owns a copy of the document text plus a doubly linked list of nodes
//          in document order.
// node     describes one tag as an (offset, length) span into the dom's text.
// nodecollect owns *copies* of nodes selected by a query.
//
// Ownership: every char* returned by node is allocated with new[] and must be
// released by the caller with delete[]; every nodecollect* returned by a query
// is allocated with new and must be released with delete.
// ---------------------------------------------------------------------------

#ifdef _WIN32
// Converts a UTF-8 byte string to UTF-16. Returns an empty string on failure.
wstring UTF8ToUnicode( const string& str )
{
    // With cbMultiByte == -1 the returned length includes the terminating NUL.
    const int unicodeLen = ::MultiByteToWideChar( CP_UTF8, 0, str.c_str(), -1, NULL, 0 );
    if( unicodeLen <= 0 ){
        return wstring();
    }
    wstring buffer( static_cast<size_t>( unicodeLen ), L'\0' );
    ::MultiByteToWideChar( CP_UTF8, 0, str.c_str(), -1, &buffer[0], unicodeLen );
    return wstring( buffer.c_str() );   // drops the trailing NUL written by the API
}

// Converts UTF-16 to the system ANSI code page. Returns an empty string on failure.
string UnicodeToANSI( const wstring& str )
{
    const int textLen = ::WideCharToMultiByte( CP_ACP, 0, str.c_str(), -1, NULL, 0, NULL, NULL );
    if( textLen <= 0 ){
        return string();
    }
    string buffer( static_cast<size_t>( textLen ), '\0' );
    ::WideCharToMultiByte( CP_ACP, 0, str.c_str(), -1, &buffer[0], textLen, NULL, NULL );
    return string( buffer.c_str() );
}
#endif

// ----------------------------- local helpers -------------------------------

// True for the whitespace characters that may separate tokens inside a tag.
static bool isblankch(char c)
{
    return c == ' ' || c == '\t' || c == '\r' || c == '\n' || c == '\f';
}

// Case-insensitive (ASCII) equality of two NUL-terminated strings.
static bool sameword(const char *a, const char *b)
{
    while (*a != 0 && *b != 0) {
        if (tolower((unsigned char)*a) != tolower((unsigned char)*b)) {
            return false;
        }
        ++a;
        ++b;
    }
    return *a == *b;
}

// True when s begins with word, compared case-insensitively (ASCII).
static bool sameprefix(const char *s, const char *word)
{
    for (; *word != 0; ++s, ++word) {
        if (tolower((unsigned char)*s) != tolower((unsigned char)*word)) {
            return false;
        }
    }
    return true;
}

// Lower-cases a NUL-terminated string in place.
static void lowercase(char *s)
{
    for (; *s != 0; ++s) {
        *s = (char)tolower((unsigned char)*s);
    }
}

// Returns a new[]-allocated, NUL-terminated copy of at most n characters of
// src (copying stops early at a NUL).
static char *newcopy(const char *src, int n)
{
    int m = 0;
    while (m < n && src[m] != 0) {
        ++m;
    }
    char *out = new char[m + 1];
    memcpy(out, src, (size_t)m);
    out[m] = 0;
    return out;
}

// Advances the quote state by one character. quote is 0 outside a quoted
// string, otherwise the quote character that opened it. A quote character
// preceded by a backslash is ignored, and a quote of the other kind inside a
// quoted string is plain text (so title="it's" stays balanced).
static void trackquote(char &quote, char c, char prev)
{
    if ((c != '"' && c != '\'') || prev == '\\') {
        return;
    }
    if (quote == 0) {
        quote = c;
    } else if (quote == c) {
        quote = 0;
    }
}

// Length of the opening tag at the start of html[0..len): everything up to and
// including the first '>' that is outside quotes. When there is no such '>'
// the whole span (up to a NUL) counts as the opening tag.
static int opentaglength(const char *html, int len)
{
    char quote = 0;
    int i = 0;
    for (; i < len && html[i] != 0; ++i) {
        trackquote(quote, html[i], i > 0 ? html[i - 1] : 0);
        if (html[i] == '>' && quote == 0) {
            return i + 1;
        }
    }
    return i;
}

// p points at "<!". Returns the position just past the construct, or the
// terminating NUL when it is never closed. "<!--" comments end at the first
// "-->" regardless of quotes; other declarations (e.g. DOCTYPE) end at the
// first '>' outside quotes.
static const char *skipdeclaration(const char *p)
{
    if (p[2] == '-' && p[3] == '-') {
        const char *close = strstr(p + 2, "-->");
        return close != NULL ? close + 3 : p + strlen(p);
    }
    char quote = 0;
    const char *q = p + 2;
    for (; *q != 0; ++q) {
        trackquote(quote, *q, *(q - 1));
        if (*q == '>' && quote == 0) {
            return q + 1;
        }
    }
    return q;
}

// True when c can be the first character of a tag name, i.e. "<c" really
// opens a tag rather than being a literal '<' in text such as "a < b".
static bool startsname(char c)
{
    return c != 0 && !isblankch(c) && c != '<' && c != '>';
}

// ------------------------------- classes -----------------------------------

class node;
class dom;

// An owning, ordered collection of node copies produced by a query.
class nodecollect{
private:
    node *n;        // first element
    node *last;     // last element, kept so add() is O(1)
    int length;
    // Not copyable: the collection owns its nodes.
    nodecollect(const nodecollect&);
    nodecollect& operator=(const nodecollect&);
public:
    nodecollect();
    ~nodecollect();
    int getlength();
    void add(node *nn);     // stores a copy of *nn
    node* item(int i);      // NULL when i is out of range
};

// One tag of the document, described as a span of the owning dom's text.
class node{
private:
    int start;      // offset of '<' in the document text
    int len;        // length of the element; 0 while a raw-text element is still open
    char name[20];  // tag name, truncated to 19 characters
public:
    node();
    char* nodehtml();                  // the opening tag only
    char* innerhtml();                 // markup between opening and closing tag
    char* outerhtml();                 // the whole element
    char* innertext();                 // innerhtml with tags removed
    char* getattr(const char* str);    // attribute value, "" when absent
    char* tagname();                   // points at internal storage, do not free
    void setname(const char *str);
    node* getparent();
    nodecollect* getchild();           // all nodes fully contained in this one
    node* getnext();
    node* getprevious();
    nodecollect* getbytagname(const char *tagname);
    node *next;
    node *previous;
    void setstart(int i);
    void setlen(int i);
    int getstart();
    int getlen();
    dom *d;
};

// A parsed document.
class dom{
private:
    char *text;     // private copy of the loaded document
    node *start;    // first node in document order
    node *end;      // last node in document order
    int count;
    int parse(char *s);
    node* addnode(const char *at);
    void clear();
    // Not copyable: the document owns its text and nodes.
    dom(const dom&);
    dom& operator=(const dom&);
public:
    dom();
    ~dom();
    char *gettext();                   // NULL until load() has been called
    void load(const char *str);
    node* getitem(int i);              // NULL when i is out of range
    int getcount();
    node *getbyid(const char* id);     // NULL when no element has that id
#ifdef _WIN32
    void praseurl(const char *url);    // downloads url (UTF-8) and loads it
#endif
    nodecollect* getbytagname(const char *tagname);
};

// --------------------------------- dom -------------------------------------

dom::dom() : text(NULL), start(NULL), end(NULL), count(0)
{
}

dom::~dom()
{
    clear();
}

// Releases the document text and every node.
void dom::clear()
{
    node *cur = start;
    while (cur != NULL) {
        node *following = cur->next;
        delete cur;
        cur = following;
    }
    delete[] text;
    text = NULL;
    start = NULL;
    end = NULL;
    count = 0;
}

// Replaces the current document with a parsed copy of str (NULL loads "").
void dom::load(const char* str)
{
    if (str == NULL) {
        str = "";
    }
    // Copy first: str may point into the text that clear() is about to free.
    const size_t l = strlen(str);
    char *copy = new char[l + 1];
    memcpy(copy, str, l + 1);
    clear();
    text = copy;
    parse(text);
}

int dom::getcount()
{
    return count;
}

char *dom::gettext()
{
    return text;
}

node* dom::getitem(int i)
{
    if (i < 0) {
        return NULL;
    }
    node *cur = start;
    while (cur != NULL && i > 0) {
        cur = cur->next;
        --i;
    }
    return cur;
}

// Returns the first node whose id attribute equals id (ASCII case-insensitive).
node *dom::getbyid(const char *id)
{
    if (id == NULL) {
        return NULL;
    }
    for (node *cur = start; cur != NULL; cur = cur->next) {
        char *value = cur->getattr("id");
        const bool hit = sameword(value, id);
        delete[] value;
        if (hit) {
            return cur;
        }
    }
    return NULL;
}

// Returns copies of all nodes whose tag name equals tagname (case-insensitive).
nodecollect* dom::getbytagname(const char *tagname)
{
    nodecollect *found = new nodecollect;
    if (tagname == NULL) {
        return found;
    }
    for (node *cur = start; cur != NULL; cur = cur->next) {
        if (sameword(cur->tagname(), tagname)) {
            found->add(cur);
        }
    }
    return found;
}

#ifdef _WIN32
// Downloads url, converts the body from UTF-8 to the ANSI code page and loads
// it. On any network failure whatever was received (possibly nothing) is loaded.
void dom::praseurl(const char *url)
{
    string body;
    // The A variants are used explicitly because url is a narrow string.
    HINTERNET session = InternetOpenA("sx", INTERNET_OPEN_TYPE_PRECONFIG, 0, 0, 0);
    if (session != NULL) {
        HINTERNET request = InternetOpenUrlA(session, url, 0, 0, INTERNET_FLAG_RELOAD, 0);
        if (request != NULL) {
            char chunk[4096];
            DWORD got = 0;
            // Stop on failure as well as on end of data, otherwise a failed
            // read would spin forever.
            while (InternetReadFile(request, chunk, sizeof(chunk), &got) && got != 0) {
                body.append(chunk, got);
            }
            InternetCloseHandle(request);
        }
        InternetCloseHandle(session);
    }
    const string converted = UnicodeToANSI(UTF8ToUnicode(body));
    load(converted.c_str());
}
#endif

// Appends a node for the tag starting at `at` to the end of the list.
node* dom::addnode(const char *at)
{
    node *created = new node;
    created->setstart((int)(at - text));
    created->setlen(0);
    created->d = this;
    created->setname(at + 1);
    created->next = NULL;
    created->previous = end;
    if (end != NULL) {
        end->next = created;
    } else {
        start = created;
    }
    end = created;
    ++count;
    return created;
}

// Scans the document once and builds the node list. Returns the node count.
//
// State:
//   intag   - between the '<' of an opening tag and its closing '>'
//   rawtext - inside <script>/<style>, where markup is not interpreted
//   quote   - active quote character; only tracked while intag or rawtext,
//             and while it is set '<' and '>' are ordinary characters
int dom::parse(char *s)
{
    const char *p = s;
    char quote = 0;
    bool intag = false;
    bool rawtext = false;

    while (*p != 0) {
        if (intag || rawtext) {
            trackquote(quote, *p, p > text ? *(p - 1) : 0);
        }
        if (quote != 0) {
            ++p;
            continue;
        }

        if (*p == '>') {
            // Only a '>' that terminates an opening tag is meaningful; a stray
            // '>' in text must not touch the previous node.
            if (intag) {
                intag = false;
                if (!rawtext) {
                    end->setlen((int)(p + 1 - text) - end->getstart());
                }
                lowercase(end->tagname());
            }
            ++p;
            continue;
        }
        if (*p != '<') {
            ++p;
            continue;
        }

        if (p[1] == '!') {
            if (!rawtext) {
                // Comment or declaration: one node covering the whole construct.
                node *decl = addnode(p);
                const char *stop = skipdeclaration(p);
                decl->setlen((int)(stop - p));
                p = stop;
                intag = false;
                continue;   // re-test *p: stop may be the terminating NUL
            }
            ++p;
            continue;
        }

        if (p[1] == '/') {
            // Closing tag: read its name (bounded), then skip to its '>'.
            char closing[20] = {0};
            size_t used = 0;
            const char *q = p + 2;
            while (*q != 0 && *q != '>' && !isblankch(*q)) {
                if (used < sizeof(closing) - 1) {
                    closing[used++] = *q;
                }
                ++q;
            }
            // Inside <script>/<style> only the matching closing tag counts.
            if (rawtext && !sameword(closing, end->tagname())) {
                ++p;
                continue;
            }
            while (*q != 0 && *q != '>') {
                ++q;
            }
            intag = false;
            rawtext = false;
            if (*q == 0) {
                break;      // unterminated closing tag at end of input
            }
            // Close the nearest preceding element of that name that is still
            // open, i.e. whose length is still just its opening tag. A closing
            // tag without such an element is ignored.
            for (node *open = end; open != NULL; open = open->previous) {
                if (sameword(open->tagname(), closing) &&
                    opentaglength(text + open->getstart(), open->getlen()) == open->getlen()) {
                    open->setlen((int)(q + 1 - text) - open->getstart());
                    break;
                }
            }
            p = q + 1;
            continue;
        }

        if (!rawtext && startsname(p[1])) {
            node *opened = addnode(p);
            intag = true;
            rawtext = sameword(opened->tagname(), "script") ||
                      sameword(opened->tagname(), "style");
        }
        ++p;
    }
    return count;
}

// --------------------------------- node ------------------------------------

node::node() : start(0), len(0), next(NULL), previous(NULL), d(NULL)
{
    memset(name, 0, sizeof(name));
}

// Sets the tag name from the text following '<'. The name ends at whitespace,
// '/', '>', '!' or the end of the string and is truncated to fit. A leading
// '!' (comment/declaration) yields the name "!".
void node::setname(const char *str)
{
    memset(name, 0, sizeof(name));
    if (str == NULL) {
        return;
    }
    if (*str == '!') {
        name[0] = '!';
        return;
    }
    size_t used = 0;
    while (used < sizeof(name) - 1 && *str != 0 && !isblankch(*str) &&
           *str != '/' && *str != '>' && *str != '!') {
        name[used++] = *str++;
    }
}

// Innermost preceding node that fully contains this one, or NULL.
node *node::getparent()
{
    node *parent = NULL;
    node *cur = (d != NULL) ? d->getitem(0) : NULL;
    for (; cur != NULL && cur->getstart() < start; cur = cur->next) {
        if (cur->getstart() + cur->getlen() > start + len) {
            parent = cur;
        }
    }
    return parent;
}

// Copies of every following node that lies completely inside this one.
nodecollect* node::getchild()
{
    nodecollect *inside = new nodecollect;
    node *cur = (d != NULL) ? d->getitem(0) : NULL;
    while (cur != NULL && cur->getstart() <= start) {
        cur = cur->next;
    }
    for (; cur != NULL && cur->getstart() + cur->getlen() < start + len; cur = cur->next) {
        inside->add(cur);
    }
    return inside;
}

// Copies of the contained nodes whose tag name equals tagname (case-insensitive).
nodecollect* node::getbytagname(const char *tagname)
{
    nodecollect *matches = new nodecollect;
    if (tagname == NULL) {
        return matches;
    }
    nodecollect *inside = getchild();
    for (int i = 0; i < inside->getlength(); ++i) {
        node *candidate = inside->item(i);
        if (sameword(candidate->tagname(), tagname)) {
            matches->add(candidate);
        }
    }
    delete inside;
    return matches;
}

char *node::tagname()
{
    return name;
}

void node::setstart(int i)
{
    start = i;
}

void node::setlen(int i)
{
    len = i;
}

int node::getstart()
{
    return start;
}

int node::getlen()
{
    return len;
}

// The opening tag, e.g. "<a href=x>" for "<a href=x>text</a>".
char *node::nodehtml()
{
    const char *base = (d != NULL) ? d->gettext() : NULL;
    if (base == NULL || len <= 0) {
        return newcopy("", 0);
    }
    return newcopy(base + start, opentaglength(base + start, len));
}

// The complete element text.
char *node::outerhtml()
{
    const char *base = (d != NULL) ? d->gettext() : NULL;
    if (base == NULL || len <= 0) {
        return newcopy("", 0);
    }
    return newcopy(base + start, len);
}

// Value of the attribute named str, or "" when the tag has no such attribute.
// The name must be preceded by whitespace and followed directly by '='; the
// match is case-sensitive. Quoted values run to the matching unescaped quote,
// unquoted values to the next whitespace, '>' or '/'.
char *node::getattr(const char* str)
{
    const char *base = (d != NULL) ? d->gettext() : NULL;
    if (base == NULL || str == NULL || *str == 0 || len <= 0) {
        return newcopy("", 0);
    }
    const char *tag = base + start;
    const int taglen = opentaglength(tag, len);
    const int keylen = (int)strlen(str);
    char quote = 0;
    for (int i = 0; i + keylen < taglen; ++i) {
        trackquote(quote, tag[i], i > 0 ? tag[i - 1] : 0);
        // Never match inside another attribute's quoted value.
        if (quote != 0 || i == 0 || !isblankch(tag[i - 1])) {
            continue;
        }
        if (strncmp(tag + i, str, (size_t)keylen) != 0 || tag[i + keylen] != '=') {
            continue;
        }
        const char *value = tag + i + keylen + 1;
        const char *stop = tag + taglen;
        if (value < stop && (*value == '"' || *value == '\'')) {
            const char opener = *value++;
            const char *first = value;
            while (value < stop && !(*value == opener && *(value - 1) != '\\')) {
                ++value;
            }
            return newcopy(first, (int)(value - first));
        }
        const char *first = value;
        while (value < stop && !isblankch(*value) && *value != '>' && *value != '/') {
            ++value;
        }
        return newcopy(first, (int)(value - first));
    }
    return newcopy("", 0);
}

// Markup between the opening tag and the closing tag; "" for comments and for
// elements that have no closing tag.
char *node::innerhtml()
{
    const char *base = (d != NULL) ? d->gettext() : NULL;
    if (base == NULL || len <= 0 || name[0] == '!') {
        return newcopy("", 0);
    }
    const char *outer = base + start;
    const int open = opentaglength(outer, len);
    // A closed element ends with its closing tag, so the last "</" starts it.
    for (int i = len - 2; i >= open; --i) {
        if (outer[i] == '<' && outer[i + 1] == '/') {
            return newcopy(outer + open, i - open);
        }
    }
    return newcopy("", 0);
}

// innerhtml() with tags removed. The content of a <script> element is kept
// verbatim, both for a script node itself and for scripts nested in this node.
char *node::innertext()
{
    char *html = innerhtml();
    if (html[0] == 0 || sameword(name, "script")) {
        return html;    // ownership passes to the caller
    }
    const size_t total = strlen(html);
    char *plain = new char[total + 1];
    size_t written = 0;
    // Absolute document offset of html[0], used to find nested script nodes.
    const int origin = start + opentaglength(d->gettext() + start, len);

    size_t i = 0;
    while (i < total) {
        const char next = html[i + 1];
        const bool tag = html[i] == '<' && (next == '/' || next == '!' || startsname(next));
        if (!tag) {
            plain[written++] = html[i++];
            continue;
        }
        if (next == '!') {
            i = (size_t)(skipdeclaration(html + i) - html);
            continue;
        }
        if (sameprefix(html + i + 1, "script")) {
            node *nested = NULL;
            for (node *cur = d->getitem(0); cur != NULL; cur = cur->next) {
                if (cur->getstart() == origin + (int)i) {
                    nested = cur;
                    break;
                }
            }
            if (nested != NULL && nested->getlen() > 0) {
                char *content = nested->innertext();
                const size_t n = strlen(content);
                if (written + n <= total) {     // cannot exceed the input, but stay safe
                    memcpy(plain + written, content, n);
                    written += n;
                }
                delete[] content;
                i += (size_t)nested->getlen();
                continue;
            }
        }
        // Ordinary tag: skip to its closing '>' outside quotes.
        char quote = 0;
        size_t j = i;
        for (; j < total; ++j) {
            trackquote(quote, html[j], j > 0 ? html[j - 1] : 0);
            if (html[j] == '>' && quote == 0) {
                break;
            }
        }
        i = (j < total) ? j + 1 : total;
    }
    plain[written] = 0;
    delete[] html;
    return plain;
}

// Last node that ends at or before the start of this one, or NULL.
node* node::getprevious()
{
    node *before = NULL;
    node *cur = (d != NULL) ? d->getitem(0) : NULL;
    for (; cur != NULL; cur = cur->next) {
        if (cur->getstart() == start && cur->getlen() == len) {
            break;
        }
        if (start >= cur->getstart() + cur->getlen()) {
            before = cur;
        }
    }
    return before;
}

// First node that starts at or after the end of this one, or NULL.
node* node::getnext()
{
    node *cur = (d != NULL) ? d->getitem(0) : NULL;
    for (; cur != NULL; cur = cur->next) {
        // The "> start" test keeps a zero-length node from returning itself.
        if (cur->getstart() > start && start + len <= cur->getstart()) {
            return cur;
        }
    }
    return NULL;
}

// ------------------------------ nodecollect --------------------------------

nodecollect::nodecollect() : n(NULL), last(NULL), length(0)
{
}

nodecollect::~nodecollect()
{
    node *cur = n;
    while (cur != NULL) {
        node *following = cur->next;
        delete cur;
        cur = following;
    }
}

int nodecollect::getlength()
{
    return length;
}

node *nodecollect::item(int i)
{
    if (i < 0) {
        return NULL;
    }
    node *cur = n;
    while (cur != NULL && i > 0) {
        cur = cur->next;
        --i;
    }
    return cur;
}

// Appends a copy of *nn; the copy's next/previous link within this collection.
void nodecollect::add(node *nn)
{
    if (nn == NULL) {
        return;
    }
    node *copy = new node;
    copy->setstart(nn->getstart());
    copy->setlen(nn->getlen());
    copy->setname(nn->tagname());
    copy->d = nn->d;
    copy->next = NULL;
    copy->previous = last;
    if (last != NULL) {
        last->next = copy;
    } else {
        n = copy;
    }
    last = copy;
    ++length;
}

 

上面是標頭檔以及實作,下面給出一個範例:

main(){
 dom d;
 d.load("<body>wef<script d=sd>gh</script>1'<img id=we/>sx<a>df<script d='d'>'<a></a>'1>22<3</script></body>");
 // for(int i=0;i<d.getcount();i++){
 cout<<d.getbyid("we")->outerhtml()<<endl;
 // }
}

       有什麼不足之處,還望大家多多指正。

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.