標籤:xmlhttp 亂碼 Regex asp adodb.stream
Asp 使用 Microsoft.XMLHTTP 抓取網頁內容,並過濾需要的內容
Asp 使用 Microsoft.XMLHTTP 抓取網頁內容無亂碼處理,並過濾需要的內容
樣本源碼:
<% Dim xmlUrl,http,strHTML,strBody xmlUrl = Request.QueryString("u") REM 非同步讀取XML源 Set http = server.CreateObject("Microsoft.XMLHTTP") http.Open "POST",xmlUrl,false http.setrequestheader "User-Agent", "Mozilla/4.0" http.setrequestheader "Connection", "Keep-Alive" http.setRequestHeader "Content-Type", "application/x-www-form-urlencoded" http.Send() strHTML = BytesToBstr(http.ResponseBody) set http = nothing REM 抓取主要內容 strBody = GetBody(strHTML,"<div id=""Div_newsContentc"" class=""cnt"">","</div>",0,0) strBody =Replace(strBody,"(本文首發於","") strBody =Replace(strBody,"財富動力網</a>,轉載請註明出處。)","") strBody =Replace(strBody,"本文首發於,轉載請註明出處。)","") strBody =Replace(strBody,"財富動力網</a>:http://www.927953.com","") strBody =Replace(strBody,"本文首發於","") Response.Write RegRemoveHref(strBody)REM 擷取對應網址響應的HTMLFunction BytesToBstr(body) dim objstream set objstream = Server.CreateObject("adodb.stream") objstream.Type = 1 objstream.Mode =3 objstream.Open objstream.Write body objstream.Position = 0 objstream.Type = 2 objstream.Charset = "UTF-8" ‘轉換原來預設的UTF-8編碼轉換成GB2312編碼,否則直接用 ‘XMLHTTP調用有中文字元的網頁得到的將是亂碼 BytesToBstr = objstream.ReadText objstream.Close set objstream = nothingEnd FunctionREM 使用Regex,抓取之內標記的內容Function GetBody(ConStr,StartStr,OverStr,IncluL,IncluR) If ConStr="$False$" or ConStr="" or IsNull(ConStr)=True Or StartStr="" or IsNull(StartStr)=True Or OverStr="" or IsNull(OverStr)=True Then GetBody="$False$" Exit Function End If Dim ConStrTemp Dim Start,Over ConStrTemp=Lcase(ConStr) StartStr=Lcase(StartStr) OverStr=Lcase(OverStr) Start = InStrB(1, ConStrTemp, StartStr, vbBinaryCompare) If Start<=0 then GetBody="$False$" Exit Function Else If IncluL=False Then Start=Start+LenB(StartStr) End If End If Over=InStrB(Start,ConStrTemp,OverStr,vbBinaryCompare) If Over<=0 Or Over<=Start then GetBody="$False$" Exit Function Else If IncluR=True Then Over=Over+LenB(OverStr) End If End If GetBody=MidB(ConStr,Start,Over-Start)End FunctionREM 過濾a超連結Function RegRemoveHref(HTMLstr) Set ra = New RegExp ra.IgnoreCase = True ra.Global = True ra.Pattern = "<a[^>]+>(.+?)<\/a>" RegRemoveHref = Replace(ra.replace(HTMLstr,"$1"),"href=""http://www.927953.com""","") END Function%>
如下: