C#版採集程式源碼介紹
來源:互聯網
上載者:User
// Author's note (translated from the original article): I wrote this scraper because my
// job required it. If it has caused trouble for your site, I apologise here; I am just an
// ordinary programmer.
//
// The listing uses types from these namespaces, which must be imported by the file that
// hosts it: System, System.Collections, System.Collections.Generic, System.Data, System.IO,
// System.Net, System.Text, System.Text.RegularExpressions, System.Windows.Forms and
// System.Xml.Serialization. Class1 and SqlHelper are project types defined elsewhere.
//
// NOTE(review): the Chinese text inside the path and regex literals is reproduced exactly
// as published. It may have been altered by the publishing site (script / full-width
// conversion) - confirm against the live pages before relying on the patterns.
namespace CJ
{
    /// <summary>
    /// Form that crawls a magazine site through rotating HTTP proxies. Each page is cached
    /// on disk, article text is queued for XML serialization, and INSERT statements are
    /// accumulated and executed as one batch at the end of the crawl.
    /// </summary>
    public partial class Form1 : Form
    {
        // Text file with one proxy address per line.
        private const string ProxyListPath = "e:\\test.txt";

        // Root directory under which every downloaded page and asset is stored.
        private const string OutputRoot = "E:\\觀2";

        // Compiled once; the lazy quantifiers keep each match confined to a single element.
        private static readonly Regex TagPattern = new Regex("<[^>]*>");
        private static readonly Regex ScriptBlockPattern = new Regex(@"<script\b[^>]*>[\s\S]*?</script\s*>", RegexOptions.IgnoreCase);
        private static readonly Regex ScriptHrefPattern = new Regex(@"href\s*=\s*[""']?\s*\w*script\s*:", RegexOptions.IgnoreCase);
        private static readonly Regex EventAttributePattern = new Regex(@"\bon\w+\s*=", RegexOptions.IgnoreCase);
        private static readonly Regex IframePattern = new Regex(@"<iframe[\s\S]+?</iframe\s*>", RegexOptions.IgnoreCase);
        private static readonly Regex FramesetPattern = new Regex(@"<frameset[\s\S]+?</frameset\s*>", RegexOptions.IgnoreCase);

        // Index into al of the proxy currently in use.
        public int proxy = 0;

        // Running primary keys for the four tables (mzinedl, mzinexl, mzineinfo, mzinewz).
        public int keyi = 0;
        public int keyj = 0;
        public int keym = 0;
        public int keyn = 0;

        // Number of pages processed so far (cached or fetched).
        public int sum = 0;

        // Scratch state shared between the crawl levels. two() reads dirname as set by one().
        public string newurl = "";
        public string cururl = "";
        public string dirname = "";
        public string curdir = "";
        public string responseFromServer = "";
        public string filename = "";
        public string sql = "";

        // Last section name seen by three(); reused for articles that follow it.
        public string mulu = "";

        // Pending SQL batch and pending XML documents.
        StringBuilder sbs = new StringBuilder();
        List<Class1> cls = new List<Class1>();

        // Proxy addresses loaded from ProxyListPath.
        public ArrayList al = new ArrayList();

        public string insertdl = "insert into mzinedl values(";
        public string insertxl = "insert into mzinexl values(";
        public string insertinfo = "insert into mzineinfo values(";
        public string insertwz = "insert into mzinewz values(";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Writes text to a new file; does nothing when the file already exists.
        /// </summary>
        /// <param name="FILE_NAME">Path of the file to create.</param>
        /// <param name="data">Text to write.</param>
        public void TextToFile(string FILE_NAME, string data)
        {
            if (File.Exists(FILE_NAME))
            {
                return;
            }

            using (StreamWriter sw = File.CreateText(FILE_NAME))
            {
                sw.Write(data);
            }
        }

        /// <summary>
        /// Downloads a file through the current proxy unless it is already on disk.
        /// Connection failures and timeouts switch to the next proxy and retry; any other
        /// web error (for example a 404) abandons the download.
        /// </summary>
        /// <param name="PageUrl">URL of the file.</param>
        /// <param name="filename">Directory the file is saved into (created if missing).</param>
        public void DownFile(string PageUrl, string filename)
        {
            Directory.CreateDirectory(filename);

            string targetPath = filename + "\\" + PageUrl.Substring(PageUrl.LastIndexOf('/') + 1);
            if (File.Exists(targetPath))
            {
                return;
            }

            // Retry in a loop rather than by recursion so that a long run of dead proxies
            // cannot exhaust the stack.
            while (true)
            {
                try
                {
                    using (WebClient wc = new WebClient())
                    {
                        wc.Proxy = CurrentProxy();
                        wc.DownloadFile(PageUrl, targetPath);
                    }
                    return;
                }
                catch (WebException ex)
                {
                    bool proxyProblem = ex.Status == WebExceptionStatus.ConnectFailure
                        || ex.Status == WebExceptionStatus.Timeout;
                    if (!proxyProblem)
                    {
                        return;
                    }
                    UseNextProxy();
                }
            }
        }

        /// <summary>
        /// Loads the proxy list from a text file, one address per line. Blank lines are
        /// skipped. The list replaces the contents of <c>al</c> instead of being appended,
        /// so repeated reloads do not accumulate duplicates.
        /// </summary>
        /// <param name="FILE_NAME">Path of the proxy list file.</param>
        /// <returns>The freshly loaded list, which is also stored in <c>al</c>.</returns>
        public ArrayList ReadIPproxy(string FILE_NAME)
        {
            ArrayList loaded = new ArrayList();
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                string line;
                while ((line = sr.ReadLine()) != null)
                {
                    line = line.Trim();
                    if (line.Length > 0)
                    {
                        loaded.Add(line);
                    }
                }
            }

            al = loaded;
            return al;
        }

        /// <summary>
        /// Executes every queued SQL statement as one batch and then empties the queue so
        /// that a later run does not execute the same statements again. Does nothing when
        /// no statement is queued.
        /// </summary>
        public void Executesql()
        {
            if (sbs.Length == 0)
            {
                return;
            }

            SqlHelper.ExecuteNonQuery(SqlHelper.sqlstr, CommandType.Text, sbs.ToString(), null);
            sbs.Length = 0;
        }

        /// <summary>
        /// Reads a whole text file.
        /// </summary>
        /// <param name="FILE_NAME">Path of the file to read.</param>
        /// <returns>The file contents.</returns>
        public string FileToText(string FILE_NAME)
        {
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Queues one SQL statement for the next <see cref="Executesql"/> call.
        /// </summary>
        /// <param name="sql">Statement to queue.</param>
        public void SaveSqls(string sql)
        {
            sbs.Append(sql).Append(" ");
        }

        /// <summary>
        /// Fetches a page through the current proxy, switching proxy and retrying until a
        /// usable body is received. A body that is empty or contains "refresh" is treated
        /// as a failed attempt.
        /// </summary>
        /// <param name="PageUrl">URL to fetch.</param>
        /// <returns>The page text, or an empty string when the server answers with a
        /// protocol error (such as 404).</returns>
        public string ToServer(string PageUrl)
        {
            // Iterative retry: the result of a successful retry is actually returned, and
            // the stack does not grow with the number of attempts.
            while (true)
            {
                string body;
                try
                {
                    WebRequest request = WebRequest.Create(PageUrl);
                    request.Proxy = CurrentProxy();
                    request.Timeout = 1000 * 60;

                    using (WebResponse response = request.GetResponse())
                    using (Stream dataStream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(dataStream, System.Text.Encoding.Default))
                    {
                        body = reader.ReadToEnd();
                    }
                }
                catch (WebException ex)
                {
                    if (ex.Status == WebExceptionStatus.ProtocolError)
                    {
                        return "";
                    }
                    UseNextProxy();
                    continue;
                }
                catch (IOException)
                {
                    // Connection dropped while the body was being read.
                    UseNextProxy();
                    continue;
                }

                if (body != "" && !body.Contains("refresh"))
                {
                    return body;
                }
                UseNextProxy();
            }
        }

        /// <summary>
        /// Serializes every queued article to its XML file, skipping files that exist.
        /// </summary>
        public void SaveXmls()
        {
            if (cls.Count == 0)
            {
                return;
            }

            XmlSerializer xs = new XmlSerializer(typeof(Class1));
            foreach (Class1 article in cls)
            {
                string pathxml = article.address;
                if (File.Exists(pathxml))
                {
                    continue;
                }

                using (Stream stream = new FileStream(pathxml, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                {
                    xs.Serialize(stream, article);
                }
            }
        }

        /// <summary>
        /// Strips HTML tags, including tags whose attributes span several lines.
        /// </summary>
        /// <param name="Html">Markup to clean.</param>
        /// <returns>The input with every tag removed.</returns>
        public static string Remove(string Html)
        {
            return TagPattern.Replace(Html, string.Empty);
        }

        /// <summary>
        /// Removes script elements together with their content. Each element is matched
        /// on its own, so text between two separate script blocks is preserved.
        /// </summary>
        /// <param name="content">Markup to clean.</param>
        /// <returns>The input without script elements.</returns>
        public static string FilterScript(string content)
        {
            return ScriptBlockPattern.Replace(content, string.Empty);
        }

        /// <summary>
        /// Best-effort removal of active content: script elements, script-scheme href
        /// prefixes, inline event-handler attributes, iframes and framesets.
        /// This is regex based and must not be relied on as a security boundary.
        /// </summary>
        /// <param name="html">Markup to clean.</param>
        /// <returns>The cleaned markup.</returns>
        public string wipeScript(string html)
        {
            html = ScriptBlockPattern.Replace(html, "");
            html = ScriptHrefPattern.Replace(html, "");
            // Event attributes are renamed rather than removed, as in the original.
            html = EventAttributePattern.Replace(html, " _disibledevent=");
            html = IframePattern.Replace(html, "");
            html = FramesetPattern.Replace(html, "");
            return html;
        }

        /// <summary>
        /// Level 0: loads the magazine index, records each category and crawls it, then
        /// flushes the queued XML files and SQL batch and reports completion.
        /// </summary>
        /// <param name="urlpri">URL of the index page.</param>
        public void HtmlSource(string urlpri)
        {
            string html = LoadPage(urlpri, OutputRoot, OutputRoot + "\\magazine.html");
            if (html == "")
            {
                return;
            }

            MatchCollection mc = Regex.Matches(html, @"href=""/magazine/(.*)""><b>(.*)</b>", RegexOptions.IgnoreCase);
            foreach (Match m in mc)
            {
                newurl = m.Groups[1].Value;
                dirname = m.Groups[2].Value;

                int key = ++keyi;
                sql = insertdl + key + "," + SqlLiteral(dirname) + ")";
                SaveSqls(sql);

                cururl = urlpri + newurl;
                curdir = OutputRoot + "\\" + dirname;
                one(cururl, curdir, key);
            }

            SaveXmls();
            Executesql();
            this.textBox1.Text = sum.ToString();
            MessageBox.Show("採集成功!");
        }

        /// <summary>
        /// Level 1: loads a category page and crawls every magazine linked from it.
        /// </summary>
        /// <param name="urlpri">URL of the category page.</param>
        /// <param name="_dirname">Directory for this category.</param>
        /// <param name="_key">Key of the category row.</param>
        public void one(string urlpri, string _dirname, int _key)
        {
            string cachePath = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf('/') + 1);
            string html = LoadPage(urlpri, _dirname, cachePath);
            if (html == "")
            {
                return;
            }

            MatchCollection mc = Regex.Matches(html, @"href=""\.\./(.*list.html)""[\s\S]*?《(.*?)》", RegexOptions.IgnoreCase);
            foreach (Match m in mc)
            {
                newurl = m.Groups[1].Value;
                dirname = m.Groups[2].Value;
                cururl = "http://www.zydg.net/magazine/" + newurl;
                curdir = _dirname + "\\" + dirname;
                two(cururl, curdir, _key);
            }
        }

        /// <summary>
        /// Level 2: loads a magazine page, records its publication details and crawls
        /// every issue linked from it.
        /// </summary>
        /// <param name="urlpri">URL of the magazine's issue list.</param>
        /// <param name="_dirname">Directory for this magazine.</param>
        /// <param name="_key">Key of the owning category row.</param>
        public void two(string urlpri, string _dirname, int _key)
        {
            // one() stores the magazine title in the dirname field just before calling us;
            // capture it now because the issue loop below overwrites that field.
            string magazineName = dirname;

            // The cache file is named after the last directory segment of the URL.
            string parentUrl = urlpri.Substring(0, urlpri.LastIndexOf('/'));
            string cachePath = _dirname + "\\" + parentUrl.Substring(parentUrl.LastIndexOf('/') + 1) + ".html";

            string html = LoadPage(urlpri, _dirname, cachePath);
            if (html == "")
            {
                return;
            }

            Match mc = Regex.Match(html, @"刊\s+期:(.*?)<br>[\s\S]*?編\s+輯:(.*?)<br>[\s\S]*?出\s+版: (.*?)<br>[\s\S]*?聯絡電話:(.*?)<br>[\s\S]*?E-mail: (.*?)<br>[\s\S]*?社\s+址:(.*?)<br>[\s\S]*?郵\s+編: (.*?)<br>[\s\S]*?郵發代號:(.*?)<br>[\s\S]*?國外發行代號: (.*?)<br>[\s\S]*?國際標準刊號:(.*?)<br>[\s\S]*?國內統一刊號: (.*?)</td>", RegexOptions.IgnoreCase);
            Match content = Regex.Match(html, @"刊\s+物\s+簡\s+介\s+:::...([\s\S]*?)...:::\s+收錄期號列表", RegexOptions.Multiline);

            int key = ++keyj;
            StringBuilder insert = new StringBuilder(insertxl);
            insert.Append(key).Append(",").Append(_key).Append(",").Append(SqlLiteral(magazineName));
            for (int i = 1; i <= 11; i++)
            {
                insert.Append(",").Append(SqlLiteral(mc.Groups[i].Value));
            }
            insert.Append(",").Append(SqlLiteral(Remove(content.Groups[1].Value))).Append(")");
            sql = insert.ToString();
            SaveSqls(sql);

            string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf('/') + 1);
            MatchCollection mc2 = Regex.Matches(html, @"href='(.*?)'\s+target.*>(.*?)</a>", RegexOptions.IgnoreCase);
            foreach (Match m2 in mc2)
            {
                newurl = m2.Groups[1].Value;
                dirname = m2.Groups[2].Value.Replace("年", "-").Replace("第", "").Replace("期", "");
                cururl = baseUrl + newurl;
                curdir = _dirname + "\\" + dirname;
                three(cururl, curdir, key, dirname);
            }
        }

        /// <summary>
        /// Level 3: loads an issue page, downloads its cover image, records the issue and
        /// each listed article, and crawls the articles.
        /// </summary>
        /// <param name="urlpri">URL of the issue's table of contents.</param>
        /// <param name="_dirname">Directory for this issue.</param>
        /// <param name="_key">Key of the owning magazine row.</param>
        /// <param name="qishu">Issue label stored with the row.</param>
        public void three(string urlpri, string _dirname, int _key, string qishu)
        {
            string cachePath = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf('/') + 1);
            string html = LoadPage(urlpri, _dirname, cachePath);
            if (html == "")
            {
                return;
            }

            string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf('/') + 1);

            Match m = Regex.Match(html, @"src='face_(.*?)'", RegexOptions.IgnoreCase);
            string coverSuffix = m.Groups[1].Value;
            if (coverSuffix.Trim() != "")
            {
                DownFile(baseUrl + "face_" + coverSuffix, _dirname);
            }

            int key = ++keym;
            sql = insertinfo + key + "," + _key + "," + SqlLiteral(qishu) + "," + SqlLiteral(_dirname + "\\" + "face_" + coverSuffix) + ")";
            SaveSqls(sql);

            // The second alternative captures a bracketed section heading as group 3.
            // The published pattern read "& lt;font[^>]*?>[(.+?)]": the leading "& lt;" is
            // HTML-escaping residue for "<", and the unescaped brackets formed a character
            // class, so group 3 did not exist and the section name was always empty.
            // NOTE(review): the bracket style on the live pages is unknown, so both ASCII
            // and full-width square brackets are accepted - confirm against real markup.
            MatchCollection mc2 = Regex.Matches(html, @"href='(\d+.html?)'[\s\S]*?<font\s+color=black>(.*?)</a>|<font[^>]*?>[\[[](.+?)[\]]]", RegexOptions.IgnoreCase);
            foreach (Match m2 in mc2)
            {
                newurl = m2.Groups[1].Value;

                // Articles inherit the most recent section heading.
                string muName = m2.Groups[3].Value;
                if (muName == "")
                {
                    muName = mulu;
                }

                string lstr = m2.Groups[2].Value;
                if (lstr != "")
                {
                    // Link text is split at its first and last dot into two stored parts.
                    string s1 = lstr;
                    string s2 = "";
                    int firstDot = lstr.IndexOf('.');
                    if (firstDot >= 0)
                    {
                        s1 = lstr.Substring(0, firstDot);
                        s2 = lstr.Substring(lstr.LastIndexOf('.') + 1);
                    }

                    int k2 = ++keyn;
                    sql = insertwz + k2 + "," + key + "," + SqlLiteral(muName) + "," + SqlLiteral(s1) + "," + SqlLiteral(s2) + ")";
                    SaveSqls(sql);

                    cururl = baseUrl + newurl;
                    curdir = _dirname;
                    four(cururl, curdir, k2);
                }

                mulu = muName;
            }
        }

        /// <summary>
        /// Level 4: loads an article page, queues its cleaned text for XML serialization
        /// and downloads the images and PDF files it links to.
        /// </summary>
        /// <param name="urlpri">URL of the article.</param>
        /// <param name="_dirname">Directory of the owning issue.</param>
        /// <param name="_key">Key of the article row.</param>
        public void four(string urlpri, string _dirname, int _key)
        {
            string pageName = urlpri.Substring(urlpri.LastIndexOf('/') + 1);
            string html = LoadPage(urlpri, _dirname, _dirname + "\\" + pageName);
            if (html == "")
            {
                return;
            }

            Match m = Regex.Match(html, @"本文開始-->(?<text>[\s\S]*?)<!--本文結束", RegexOptions.IgnoreCase);
            string c = Remove(FilterScript(m.Groups["text"].Value));

            // Same name as the page with an .xml extension; also works when the page name
            // has no extension of its own.
            string pathxml = _dirname + "\\" + Path.ChangeExtension(pageName, "xml");
            cls.Add(new Class1(_key, c, pathxml));

            string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf('/') + 1);

            // The PDF link is captured in a group named "pdfs" so the lookup below finds it.
            MatchCollection mc = Regex.Matches(html, @"(<img\s+src=""(?<imgs>.*)""\s+hspace|HreF=""(?<pdfs>[^>]*PDF)"")", RegexOptions.IgnoreCase);
            foreach (Match m2 in mc)
            {
                string imgurl = m2.Groups["imgs"].Value.Trim();
                if (imgurl != "")
                {
                    DownFile(baseUrl + imgurl, _dirname);
                }

                string pdfurl = m2.Groups["pdfs"].Value.Trim();
                if (pdfurl != "")
                {
                    DownFile(baseUrl + pdfurl, _dirname);
                }
            }
        }

        // Loads the proxy list and starts the crawl from the site's magazine index.
        private void btnOK_Click(object sender, EventArgs e)
        {
            al = ReadIPproxy(ProxyListPath);
            HtmlSource("http://www.zydg.net/magazine/");
        }

        // Closes the application.
        private void button1_Click(object sender, EventArgs e)
        {
            Application.Exit();
        }

        // Returns the page text from the disk cache when present, otherwise fetches it and
        // caches a non-empty result. Updates the filename/responseFromServer fields and sum.
        private string LoadPage(string url, string directory, string cachePath)
        {
            filename = cachePath;
            Directory.CreateDirectory(directory);

            responseFromServer = File.Exists(cachePath) ? FileToText(cachePath) : ToServer(url);
            sum++;

            if (responseFromServer != "")
            {
                TextToFile(cachePath, responseFromServer);
            }
            return responseFromServer;
        }

        // Builds a WebProxy for the current index, reloading the list and wrapping to the
        // first entry when the index has run past the end.
        private WebProxy CurrentProxy()
        {
            if (proxy < 0 || proxy >= al.Count)
            {
                al = ReadIPproxy(ProxyListPath);
                proxy = 0;
            }

            if (al.Count == 0)
            {
                throw new InvalidOperationException("No proxy addresses were found in " + ProxyListPath);
            }

            return new WebProxy(al[proxy].ToString(), true);
        }

        // Advances to the next proxy; CurrentProxy handles running off the end of the list.
        private void UseNextProxy()
        {
            proxy++;
        }

        // Quotes a value as a SQL string literal, doubling embedded apostrophes so scraped
        // text cannot terminate the literal. Parameterized commands would be preferable,
        // but SqlHelper's parameter API is not visible from this file.
        private static string SqlLiteral(string value)
        {
            return "'" + (value ?? "").Replace("'", "''") + "'";
        }
    }
}