Regex抓取網頁資訊

來源:互聯網
上載者:User
聲明:此 Regex 只適用於 .NET。使用流程為:發送 HTTP 請求取得整個 HTML 網頁,然後從該 HTML 頁面中抓取想要的資料。

第一部分:發送httpWebRequest 請求

C#代碼

// Build the request for the target URL ("URL" is a placeholder for the real address).
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("URL");

// Browser type (User-Agent). This must be set BEFORE GetResponse(): the request
// headers are sent when the response is requested, so assigning it afterwards
// is too late to affect what the server sees.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)";

// The full HTML of the returned page.
String htmlStr;

// Dispose the response and the reader so the underlying connection is released.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("UTF-8")))
{
    htmlStr = reader.ReadToEnd();
}

第二部分:根據返回的html擷取有用資料,此方法適用於所有想通過ID或Class等等的標籤找到html的需求,拿下面一個方法為例

C#代碼

  /// <summary>
  /// Extracts the color names from a page's HTML.
  /// The first element whose class is <c>DetailsC_Sku</c> is treated as the size
  /// block and removed from the HTML; the next element matching the same pattern
  /// is treated as the color block. The text of every anchor match inside the
  /// color block is appended to the result, each followed by a comma.
  /// </summary>
  /// <param name="htmlStr">The full HTML of the page; may be null or empty.</param>
  /// <returns>
  /// The anchor texts, each followed by "," (so a non-empty result ends with a
  /// trailing comma), or an empty string when the input is null/empty or either
  /// <c>DetailsC_Sku</c> block cannot be found.
  /// </returns>
  public String getColor(String htmlStr)
  {
      // Regex.Match throws on null input; treat missing HTML as "no colors".
      if (String.IsNullOrEmpty(htmlStr))
      {
          return "";
      }

      // Matches an element whose class attribute is DetailsC_Sku, through a closing
      // tag of the same name. The optional quote captured in <Quote> must be
      // repeated after the value; <Nested> is pushed/popped on same-name open/close
      // tags. To select by id instead, replace the class part of the pattern with
      // [iI][dD].
      string skuBlockPattern =
          @"<(?<HtmlTag>[\w]+)[^>]*\s[cC][lL][aA][sS][sS]=(?<Quote>"
          + "[\"']?)DetailsC_Sku(?(Quote)"
          + @"\k<Quote>)"
          + "[\"']?[^>]*>"
          + @"((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|.*?)*</\k<HtmlTag>>";

      // Singleline lets '.' span line breaks inside the element body.
      Regex skuBlockRegex = new Regex(skuBlockPattern, RegexOptions.Singleline);

      // First DetailsC_Sku block: the size selector.
      string sizeHtml = skuBlockRegex.Match(htmlStr).Value;
      if (String.IsNullOrEmpty(sizeHtml))
      {
          return "";
      }

      // Remove the size block so the same pattern now finds the color block.
      string remainingHtml = htmlStr.Replace(sizeHtml, "");
      string colorHtml = skuBlockRegex.Match(remainingHtml).Value;
      if (String.IsNullOrEmpty(colorHtml))
      {
          return "";
      }

      // Collect the text of every <a> match in the color block.
      StringBuilder colors = new StringBuilder();
      foreach (Match anchor in Regex.Matches(colorHtml, @"<a.*?>[\s\S]*?<\/a>"))
      {
          colors.Append(RemoveHtml(anchor.Value)).Append(",");
      }

      return colors.ToString();
  }

C#代碼

// Regexes used by RemoveHtml, built once instead of being recompiled on every call.
// Singleline is required on the style/script patterns so that '.' also matches
// line breaks; without it multi-line <style>/<script> blocks are not removed and
// their CSS/JS text ends up in the result.
private static readonly Regex RemoveHtmlStyleRegex = new Regex(@"<style(.*?)</style>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
private static readonly Regex RemoveHtmlScriptRegex = new Regex(@"<script(.*?)</script>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
private static readonly Regex RemoveHtmlTagRegex = new Regex(@"<[^>]+>", RegexOptions.Compiled | RegexOptions.IgnoreCase);
private static readonly Regex RemoveHtmlNbspRegex = new Regex("\\&nbsp\\;", RegexOptions.Compiled | RegexOptions.IgnoreCase);
// Matches runs of two or more whitespace characters, or a literal " ;".
// NOTE(review): the " ;" alternative may be a mangled entity from the original article - confirm.
private static readonly Regex RemoveHtmlSpaceRegex = new Regex("\\s{2,}|\\ \\;", RegexOptions.Compiled | RegexOptions.IgnoreCase);

/// <summary>
/// Strips HTML markup from a string and returns the remaining text.
/// Style and script blocks are removed together with their content, all other
/// tags are removed, <c>&amp;nbsp;</c> entities become spaces, and whitespace
/// runs are collapsed to a single space.
/// </summary>
/// <param name="src">The HTML fragment to clean; may be null or empty.</param>
/// <returns>The trimmed text content, or an empty string for null/empty input.</returns>
public string RemoveHtml(string src)
{
    // Regex.Replace throws on null input; treat it as empty text.
    if (string.IsNullOrEmpty(src))
    {
        return string.Empty;
    }

    // Order matters: drop style/script blocks (including content) before the
    // generic tag stripper would remove only their tags.
    src = RemoveHtmlStyleRegex.Replace(src, string.Empty);
    src = RemoveHtmlScriptRegex.Replace(src, string.Empty);
    src = RemoveHtmlTagRegex.Replace(src, string.Empty);
    src = RemoveHtmlNbspRegex.Replace(src, " ");
    src = RemoveHtmlSpaceRegex.Replace(src, " ");
    return src.Trim();
}
  • 相關文章

    聯繫我們

    該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關。如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件後我們將在 5 個工作日內處理。

    如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

    A Free Trial That Lets You Build Big!

    Start building with 50+ products and up to 12 months usage for Elastic Compute Service

    • Sales Support

      1 on 1 presale consultation

    • After-Sales Support

      24/7 Technical Support 6 Free Tickets per Quarter Faster Response

    • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.