聲明:此 Regex 只適用於 .NET。使用流程為:先發送 HTTP 請求取得整個 HTML 網頁,再從該 HTML 頁面中抓取想要的資料。
第一部分:發送httpWebRequest 請求
C#代碼
// Create the request for the target URL (replace "URL" with the real address).
HttpWebRequest request = (HttpWebRequest)WebRequest.Create("URL");

// Browser type (User-Agent). This must be assigned BEFORE GetResponse() is called;
// once the request has been sent it is too late to change its headers.
request.UserAgent = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; SLCC1; .NET CLR 2.0.50727; .NET CLR 3.0.04506; .NET CLR 3.5.21022; .NET CLR 1.0.3705; .NET CLR 1.1.4322)";

// The HTML page returned by the server.
String htmlStr;

// Dispose the response and the reader so the underlying connection is released.
using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
using (StreamReader reader = new StreamReader(response.GetResponseStream(), Encoding.GetEncoding("UTF-8")))
{
    htmlStr = reader.ReadToEnd();
}
第二部分:根據返回的 HTML 擷取有用資料。此方法適用於所有想通過 ID 或 class 等屬性找出對應 HTML 區塊的需求,以下面的方法為例:
C#代碼
/// <summary>
/// Gets the colors listed in the page.
/// Locates the first element whose class attribute value begins with
/// "DetailsC_Sku", removes every occurrence of that markup from the page, then
/// locates the next such element in what remains and collects the text of each
/// anchor-like match inside it.
/// </summary>
/// <param name="htmlStr">The full HTML of the page.</param>
/// <returns>
/// The text of every match, each followed by a comma (so a non-empty result ends
/// with a trailing comma); an empty string when either block cannot be found.
/// </returns>
public String getColor(String htmlStr)
{
    // Pattern pieces for "an element whose class starts with DetailsC_Sku, up to
    // its matching closing tag" (uses .NET balancing groups to track nesting).
    // To match on id instead, swap the [cC][lL][aA][sS][sS] part for [iI][dD].
    string tagAndAttribute = @"<(?<HtmlTag>[\w]+)[^>]*\s[cC][lL][aA][sS][sS]=(?<Quote>";
    string attributeValue = "[\"']?)DetailsC_Sku(?(Quote)";
    string closingQuote = @"\k<Quote>)";
    string restOfOpenTag = "[\"']?[^>]*>";
    string balancedContent = @"((?<Nested><\k<HtmlTag>[^>]*>)|</\k<HtmlTag>>(?<-Nested>)|.*?)*</\k<HtmlTag>>";
    string skuBlockPattern = tagAndAttribute + attributeValue + closingQuote + restOfOpenTag + balancedContent;

    // First matching block (presumably the size selector, judging by the
    // original variable name -- verify against the target page).
    string firstBlock = Regex.Match(htmlStr, skuBlockPattern, RegexOptions.Singleline).Value;
    if (String.IsNullOrEmpty(firstBlock))
    {
        return "";
    }

    // Drop the first block so the same pattern now finds the following one.
    string remainingHtml = htmlStr.Replace(firstBlock, "");
    string colorBlock = Regex.Match(remainingHtml, skuBlockPattern, RegexOptions.Singleline).Value;
    if (String.IsNullOrEmpty(colorBlock))
    {
        return "";
    }

    // Collect the visible text of every <a ...>...</a> match in the color block.
    StringBuilder colors = new StringBuilder();
    foreach (Match anchor in Regex.Matches(colorBlock, @"<a.*?>[\s\S]*?<\/a>"))
    {
        colors.Append(RemoveHtml(anchor.Value)).Append(",");
    }

    return colors.ToString();
}
C#代碼
// Regexes are built once and reused; constructing RegexOptions.Compiled
// instances on every call is expensive.

// Any HTML tag.
private static readonly Regex HtmlTagRegex =
    new Regex(@"<[^>]+>", RegexOptions.Compiled | RegexOptions.IgnoreCase);

// The non-breaking-space entity. (The published pattern had the entity decoded
// to a literal space, so it matched "space + semicolon" instead.)
private static readonly Regex HtmlSpaceRegex =
    new Regex("&nbsp;", RegexOptions.Compiled | RegexOptions.IgnoreCase);

// Runs of two or more whitespace characters.
private static readonly Regex WhitespaceRunRegex =
    new Regex(@"\s{2,}", RegexOptions.Compiled);

// Whole <style> / <script> elements. Singleline lets '.' cross line breaks so
// multi-line blocks are removed together with their contents.
private static readonly Regex StyleBlockRegex =
    new Regex(@"<style(.*?)</style>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);
private static readonly Regex ScriptBlockRegex =
    new Regex(@"<script(.*?)</script>", RegexOptions.Compiled | RegexOptions.IgnoreCase | RegexOptions.Singleline);

/// <summary>
/// Strips HTML markup from a string and returns the remaining text.
/// Style and script elements are removed together with their contents, all other
/// tags are removed, "&amp;nbsp;" entities become spaces, and runs of whitespace
/// are collapsed to a single space.
/// </summary>
/// <param name="src">The HTML fragment to clean; may be null or empty.</param>
/// <returns>The trimmed plain text, or an empty string for null/empty input.</returns>
public string RemoveHtml(string src)
{
    if (string.IsNullOrEmpty(src))
    {
        return string.Empty;
    }

    // Remove style/script blocks first, while their tags are still present;
    // otherwise only the tags would be stripped and the CSS/JS would remain.
    src = StyleBlockRegex.Replace(src, string.Empty);
    src = ScriptBlockRegex.Replace(src, string.Empty);
    src = HtmlTagRegex.Replace(src, string.Empty);

    // Turn entities into real spaces before collapsing whitespace runs, so
    // consecutive "&nbsp;" entities also end up as a single space.
    src = HtmlSpaceRegex.Replace(src, " ");
    src = WhitespaceRunRegex.Replace(src, " ");

    return src.Trim();
}