c#網頁抓取

來源:互聯網
上載者:User

我先來了。
首先mshtml很有用,對於html元素的解析很強大。比如:
using mshtml;
/// <summary>
/// Loads <paramref name="html"/> into an MSHTML document and rewrites either the
/// <c>href</c> of every entry in <c>doc.links</c> or the <c>src</c> of every entry in
/// <c>doc.images</c> to an absolute URL resolved against <paramref name="relativeLocation"/>.
/// </summary>
/// <param name="html">HTML markup to parse.</param>
/// <param name="relativeLocation">Base URI that relative URLs are resolved against.</param>
/// <param name="_htmlTag">
/// <c>HtmlTag.link</c> rewrites links; any other value rewrites images.
/// </param>
/// <returns>The <c>innerHTML</c> of the parsed document's body after rewriting.</returns>
/// <exception cref="ArgumentNullException"><paramref name="relativeLocation"/> is null.</exception>
private string ConvertToAbsoluteUrls(string html, Uri relativeLocation, HtmlTag _htmlTag)
{
    if (relativeLocation == null)
    {
        throw new ArgumentNullException("relativeLocation");
    }

    IHTMLDocument2 doc = new HTMLDocumentClass();
    doc.write(new object[] { html });
    doc.close();

    if (_htmlTag == HtmlTag.link)
    {
        foreach (IHTMLAnchorElement anchor in doc.links)
        {
            IHTMLElement element = (IHTMLElement)anchor;
            // NOTE(review): flag 2 is documented as returning the attribute text as written
            // in the markup rather than the resolved URL; confirm against the MSHTML docs.
            // "as string" avoids an InvalidCastException if the COM call returns a non-string.
            string href = element.getAttribute("href", 2) as string;
            Uri addr;
            // A URL that cannot be combined with the base is left untouched instead of
            // aborting the whole conversion with a UriFormatException.
            if (href != null && Uri.TryCreate(relativeLocation, href, out addr))
            {
                anchor.href = addr.AbsoluteUri;
            }
        }
    }
    else
    {
        foreach (IHTMLImgElement image in doc.images)
        {
            IHTMLElement element = (IHTMLElement)image;
            string src = element.getAttribute("src", 2) as string;
            Uri addr;
            if (src != null && Uri.TryCreate(relativeLocation, src, out addr))
            {
                image.src = addr.AbsoluteUri;
            }
        }
    }

    return doc.body.innerHTML;
}

-----------------------------------------

其實網爬程式有很多,需要注意的地方也有很多,
1,要注意內容的格式,有的內容帶有格式,比如換行,縮排等,像這樣的內容一般需要把內容裡面的<table>,<tr>,<td>等一系列的HTML元素一起抓取下來儲存到資料庫。
2,帶圖片的需要下載圖片到本地,用正則驗證圖片格式。
下面是我自己寫的一個抓取網路新聞的類。

C# code

// Plain data holder describing one resource reference found in a piece of content.
// The fields are public (not properties) and are assigned one by one by the code that builds it.
struct resinfo
    {
        /// <summary>
        /// The complete original path/URL of the resource.
        /// </summary>
        public string orgurl;
        /// <summary>
        /// The original file name.
        /// NOTE(review): presumably without the extension, since the extension is stored separately — confirm at the call site.
        /// </summary>
        public string orgname;
        /// <summary>
        /// The original file extension.
        /// NOTE(review): presumably without the leading dot — confirm at the call site.
        /// </summary>
        public string extname;
        /// <summary>
        /// The new file name.
        /// </summary>
        public string newname;
    }
    /// <summary>
    /// 擷取(網頁)內容中的遠端資源
    /// </summary>
    public class RemoteResource
    {
        private int SeriesNum;
        private string FileNum;
        private string restype = ".gif|.jpg|.bmp|.png|.jpeg";
        private string _remoteurl;
        private string _localurl;
        private string _localpath;
        private string _content = "";
        private bool _rename;
        private bool bcomp = false;
        /// <summary>
        /// 建構函式
        /// </summary>
        /// <param name="Content">包含要擷取遠端資源的內容</param>
        /// <param name="LocalURLDirectory">要將檔案儲存到本機伺服器的虛擬目錄,用於替換原來的遠程連結地址,如:http://www.Com2000888.com/remoteres,可以為空白,也可以為../一個或多個。</param>
        /// <param name="LocalPhysicalDirectory">要將檔案儲存到本機伺服器的磁碟路徑,如:C:\Inetpub\wwwroot\remoteres,如果不存在可以建立</param>
        /// <param name="RemoteUrl">用於處理相對路徑(如src="../images/Com2000888.gif")的資源,如果為空白,則只取完整路徑的資源,以http(或https,ftp,rtsp,mms)://開頭</param>
        /// <param name="RenameFile">是否要重新命名資源檔,如為false則自動覆蓋重名檔案</param>
        public RemoteResource(string Content,string LocalURLDirectory,string LocalPhysicalDirectory,string RemoteUrl,bool RenameFile)
        {
            _content = Content;
             _localurl= LocalURLDirectory.Trim();
             _localpath = LocalPhysicalDirectory.Trim();
            if (RemoteUrl == null)
                _remoteurl = "";
            else
                _remoteurl = RemoteUrl.Trim();
            if (_remoteurl.Equals(""))
                bcomp = true;
            if (_localpath.Equals(""))
                throw new NullReferenceException ("本地的實體路徑不可為空!");
            _rename = RenameFile;
            SeriesNum = 1;
           // FileNum = Com2000888.Common.Rand.Number(6);
            _localpath = _localpath.Replace("/", "\\");
            _localurl = _localurl.Replace("\\", "/");
            _remoteurl = _remoteurl.Replace("\\", "/");
            _localpath = _localpath.TrimEnd('\\');
            _localurl = _localurl.TrimEnd('/');
            if (!Directory.Exists(_localpath))
                Directory.CreateDirectory(_localpath);
        }
        /// <summary>
        /// 要擷取的資源檔副檔名,副檔名不要加點(.),如{"gif","jpg","png"},預設的下載檔案有gif,jpg,bmp,png
        /// </summary>
        public string[] FileExtends
        {
            set
            {
                restype = "";
                string[] flexs = value;
                for(int i=0;i<flexs.Length;i++)
                {
                    if (i > 0)
                        restype += "|";
                    restype += "." + flexs[i].TrimStart('.');
                }
            }
        }
        /// <summary>
        /// 擷取遠端資源的路徑
        /// </summary>
        private IList<resinfo> ObtainResURL()
        {
            IList<resinfo> list = new List<resinfo>();
            string pattern = "src\\s?=\\s?['\"]?(?<resurl>.+?(" + restype.Replace(".", "\\.") + "))";
            //string pattern = "[=\\(]['\"\\ ]??(?<resurl>[^<>\"]+?(" + restype.Replace(".","\\.") + "))";
            if (bcomp)
                pattern = @"(http|https|ftp|rtsp|mms)://\S+(" + restype.Replace(".", "\\.") + ")";
            Regex reg = new Regex(pattern, RegexOptions.Compiled | RegexOptions.IgnoreCase);
            Match m = reg.Match(_content);
            while (m.Success)
            {
                string url = "";
                if (bcomp)
                {
                    url = m.Value;
                }
                else
                {
                    url = m.Groups["resurl"].Value;
                }
                bool bsame = false;
                foreach (resinfo res in list)
                {
                    if (res.orgurl.Equals(url))
                    {
                        bsame = true;
                        break;
                    }
                }
                if (!bsame)
                {
                    #region 加入資源清單
                    string name = "";
                    string curl = url.Replace("\\", "/").Trim();
                    if (curl.IndexOf("/") >= 0)
                    {
                        name = curl.Substring(curl.LastIndexOf("/") + 1);
                    }
                    else
                    {
                        name = url;
                    }
                    int pos = name.LastIndexOf(".");
                    resinfo r;
                    r.orgurl = url;
                    r.orgname = name.Substring(0, pos);
                    r.extname = name.Substring(pos + 1);
                    r.newname = "";
                    list.Add(r);
                    #endregion 加入資源清單
                }
                m = m.NextMatch();
            }
            return list;
        }
        /// <summary>
        /// 儲存遠程圖片並替換原文內容
        /// </summary>
        public void FetchResource()
        {
            WebClient wb = new WebClient();
            IList<resinfo> list = ObtainResURL();
            if(!_localurl.Equals(""))
                _localurl += "/";
            foreach (resinfo r in list)
            {
                try
                {
                    string url = UtilityPage.StickUrl(_remoteurl, r.orgurl);
                    string newurl = "",newpath="";
                    if (_rename)
                    {
                        #region 產生新檔案名稱
                        string newname = FileNum + SeriesNum.ToString().PadLeft(3, '0') + "." + r.extname;

                        while (File.Exists(_localpath +"\\"+ newname))
                        {
                            SeriesNum++;
                            newname = FileNum + SeriesNum.ToString().PadLeft(3, '0') + "." + r.extname;
                        }
                        newpath = _localpath + "\\" + newname;
                        newurl = _localurl + newname;
                        wb.DownloadFile(url,newpath);
                        #endregion
                    }
                    else
                    {
                        newurl = _localurl + r.orgname + "." + r.extname;
                        wb.DownloadFile(url, _localpath + "\\" + r.orgname + "." + r.extname);
                    }
                    #region 替換檔案名稱
                    _content = _content.Replace(r.orgurl,newurl);
                    #endregion 替換檔案名稱
                    SeriesNum++;
                }
                catch
                { }
            }
            if (wb != null)
                wb.Dispose();
        }
        /// <summary>
        /// 擷取內容
        /// </summary>
        public string Content
        {
            get { return _content; }
        }
    }

把html裡面的相對連結改為絕對連結。

C# code

        static public string FormatUrlInHTML_New(Uri bsUrl, string strHTML)
        {
            string Ptn = "<[\\S\\s][^>]*(\\ssrc=|\\svalue=|\\shref=)('|\")?([^>\"'\\s]*\\.(gif|jpg|bmp|jpeg|psd|png|svg|dxf|wmf|tiff|swf))('|\")?[\\S\\s][^>]*>";

          
            foreach (Match match in Regex.Matches(strHTML, Ptn, RegexOptions.IgnoreCase))
            {
                string text2 = match.Groups[0].Value;//原連結
                string text1 = match.Groups[3].Value;
                Uri addr = new Uri( bsUrl , text1);

                string text3 = text2.Replace(text1, addr.AbsoluteUri);//新連結
                strHTML = strHTML.Replace(text2, text3);
            }
            return strHTML;
        }

我這裡提供一個專門解析頁面的類,Winista.HtmlParser.dll

http://topic.csdn.net/u/20100504/15/379579b9-7cf0-4400-8fdb-995a644f7917.html?26912

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里云無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.