C#版採集程式源碼介紹

來源:互聯網
上載者:User
因為工作需要,自己寫了一個採集程式,如果冒犯了你的網站,我在這裡說一聲對不起 !!  哎~!我只是一個普通的程式員.
namespace CJ
{
    /// <summary>
    /// Main form of the scraper. Starting from the magazine index page it walks
    /// category -> magazine -> issue -> article pages, caches every fetched page
    /// on disk, queues INSERT statements for the four <c>mzine*</c> tables and
    /// collects article bodies that are later serialized to XML files.
    /// </summary>
    public partial class Form1 : Form
    {
        /// <summary>Text file with one proxy address per line.</summary>
        private const string ProxyListPath = "e:\\test.txt";

        /// <summary>Root directory of the on-disk page cache.</summary>
        private const string RootDirectory = "E:\\觀2";

        /// <summary>Index into <see cref="al"/> of the proxy currently in use.</summary>
        public int proxy = 0;

        // Running row ids for mzinedl, mzinexl, mzineinfo and mzinewz respectively.
        public int keyi = 0;
        public int keyj = 0;
        public int keym = 0;
        public int keyn = 0;

        /// <summary>Number of pages processed so far (fetched or read from cache).</summary>
        public int sum = 0;

        // Scratch state shared by the crawl methods. These stay public fields for
        // backward compatibility; the crawl methods copy what they need into locals
        // before recursing because nested calls overwrite them.
        public string newurl = "";
        public string cururl = "";
        public string dirname = "";
        public string curdir = "";
        public string responseFromServer = "";
        public string filename = "";
        public string sql = "";

        /// <summary>Last section heading seen by <see cref="three"/>; reused for following articles.</summary>
        public string mulu = "";

        /// <summary>Accumulates every queued SQL statement until <see cref="Executesql"/> runs.</summary>
        StringBuilder sbs = new StringBuilder();

        /// <summary>Article bodies waiting to be written by <see cref="SaveXmls"/>.</summary>
        List<Class1> cls = new List<Class1>();

        /// <summary>Proxy addresses loaded by <see cref="ReadIPproxy"/>.</summary>
        public ArrayList al = new ArrayList();

        public string insertdl = "insert into mzinedl values(";
        public string insertxl = "insert into mzinexl values(";
        public string insertinfo = "insert into mzineinfo values(";
        public string insertwz = "insert into mzinewz values(";

        public Form1()
        {
            InitializeComponent();
        }

        /// <summary>
        /// Writes a page to disk unless the file already exists.
        /// </summary>
        /// <param name="FILE_NAME">Path of the file to create.</param>
        /// <param name="data">Text to write.</param>
        public void TextToFile(string FILE_NAME, string data)
        {
            if (File.Exists(FILE_NAME))
            {
                return;
            }

            using (StreamWriter sw = File.CreateText(FILE_NAME))
            {
                sw.Write(data);
            }
        }

        /// <summary>
        /// Downloads a file through the current proxy into a directory, skipping
        /// the download when the target file already exists.
        /// </summary>
        /// <param name="PageUrl">URL of the file to download.</param>
        /// <param name="filename">Directory the file is saved into (created if missing).</param>
        /// <remarks>
        /// Connect failures and timeouts switch to the next proxy and retry without
        /// limit; any other <see cref="WebException"/> (e.g. HTTP 404) gives up silently.
        /// </remarks>
        public void DownFile(string PageUrl, string filename)
        {
            if (!Directory.Exists(filename))
            {
                Directory.CreateDirectory(filename);
            }

            string leafName = PageUrl.Substring(PageUrl.LastIndexOf("/") + 1);
            string targetPath = filename + "\\" + leafName;
            if (File.Exists(targetPath))
            {
                return;
            }

            // Iterative retry: the previous recursive retry could exhaust the stack
            // when many proxies in a row were dead.
            while (true)
            {
                try
                {
                    using (WebClient wc = new WebClient())
                    {
                        wc.Proxy = CurrentProxy();
                        wc.DownloadFile(PageUrl, targetPath);
                    }
                    return;
                }
                catch (WebException ex)
                {
                    bool proxyProblem = ex.Status == WebExceptionStatus.ConnectFailure
                        || ex.Status == WebExceptionStatus.Timeout;
                    if (!proxyProblem)
                    {
                        // Protocol errors (file not found) and anything else: skip this file.
                        return;
                    }

                    RotateProxy();
                }
            }
        }

        /// <summary>
        /// Reads proxy addresses (one per line) and appends them to <see cref="al"/>.
        /// Blank lines are skipped and surrounding whitespace is trimmed.
        /// </summary>
        /// <param name="FILE_NAME">Path of the proxy list file.</param>
        /// <returns>The shared <see cref="al"/> list after appending.</returns>
        public ArrayList ReadIPproxy(string FILE_NAME)
        {
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                string input;
                while ((input = sr.ReadLine()) != null)
                {
                    string address = input.Trim();
                    if (address.Length == 0)
                    {
                        continue;
                    }

                    al.Add(address);
                }
            }

            return al;
        }

        /// <summary>
        /// Builds a <see cref="WebProxy"/> for the currently selected proxy address.
        /// </summary>
        /// <exception cref="InvalidOperationException">No proxy address has been loaded.</exception>
        private WebProxy CurrentProxy()
        {
            if (al.Count == 0)
            {
                throw new InvalidOperationException("No proxy addresses loaded from " + ProxyListPath);
            }

            if (proxy < 0 || proxy >= al.Count)
            {
                proxy = 0;
            }

            return new WebProxy(al[proxy].ToString(), true);
        }

        /// <summary>
        /// Advances to the next proxy. When the list is exhausted it is reloaded from
        /// <see cref="ProxyListPath"/> and the index wraps to zero, instead of
        /// appending a duplicate copy of the list on every wrap.
        /// </summary>
        private void RotateProxy()
        {
            proxy++;
            if (proxy >= al.Count)
            {
                al.Clear();
                al = ReadIPproxy(ProxyListPath);
                proxy = 0;
            }
        }

        /// <summary>
        /// Executes all SQL queued by <see cref="SaveSqls"/> as one text command.
        /// </summary>
        public void Executesql()
        {
            SqlHelper.ExecuteNonQuery(SqlHelper.sqlstr, CommandType.Text, sbs.ToString(), null);
        }

        /// <summary>
        /// Reads a whole text file.
        /// </summary>
        /// <param name="FILE_NAME">Path of the file to read.</param>
        /// <returns>The file content.</returns>
        public string FileToText(string FILE_NAME)
        {
            using (StreamReader sr = File.OpenText(FILE_NAME))
            {
                return sr.ReadToEnd();
            }
        }

        /// <summary>
        /// Queues one SQL statement for <see cref="Executesql"/>.
        /// </summary>
        /// <param name="sql">Statement to append.</param>
        public void SaveSqls(string sql)
        {
            sbs.Append(sql).Append(" ");
        }

        /// <summary>
        /// Escapes a value for embedding inside a single-quoted SQL string literal by
        /// doubling single quotes. The values come from scraped pages, so an
        /// unescaped quote would break (or hijack) the generated statement.
        /// </summary>
        private static string SqlQuote(string value)
        {
            if (value == null)
            {
                return "";
            }

            return value.Replace("'", "''");
        }

        /// <summary>
        /// Fetches a page through the current proxy, switching proxy and retrying
        /// until a usable response arrives.
        /// </summary>
        /// <param name="PageUrl">URL to fetch.</param>
        /// <returns>
        /// The page text, or an empty string when the server answers with a protocol
        /// error (e.g. HTTP 404).
        /// </returns>
        /// <remarks>
        /// A response that is empty or contains "refresh" is treated as a bad proxy
        /// answer and retried. Retries are unbounded, as in the original design.
        /// </remarks>
        public string ToServer(string PageUrl)
        {
            while (true)
            {
                string body;
                try
                {
                    WebRequest request = WebRequest.Create(PageUrl);
                    request.Proxy = CurrentProxy();
                    request.Timeout = 1000 * 60;

                    using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
                    using (Stream dataStream = response.GetResponseStream())
                    using (StreamReader reader = new StreamReader(dataStream, System.Text.Encoding.Default))
                    {
                        body = reader.ReadToEnd();
                    }
                }
                catch (WebException ex)
                {
                    if (ex.Status == WebExceptionStatus.ProtocolError)
                    {
                        return "";
                    }

                    RotateProxy();
                    continue;
                }
                catch (IOException)
                {
                    // Connection dropped while reading the body: try the next proxy.
                    RotateProxy();
                    continue;
                }

                if (body == "" || body.Contains("refresh"))
                {
                    RotateProxy();
                    continue;
                }

                return body;
            }
        }

        /// <summary>
        /// Serializes every collected article to its XML file, skipping files that
        /// already exist.
        /// </summary>
        public void SaveXmls()
        {
            XmlSerializer xs = new XmlSerializer(typeof(Class1));
            foreach (Class1 item in cls)
            {
                string pathxml = item.address;
                if (File.Exists(pathxml))
                {
                    continue;
                }

                using (Stream stream = new FileStream(pathxml, FileMode.Create, FileAccess.Write, FileShare.ReadWrite))
                {
                    xs.Serialize(stream, item);
                }
            }
        }

        /// <summary>
        /// Strips every HTML tag from a string.
        /// </summary>
        /// <param name="Html">Markup to clean.</param>
        /// <returns>The input with all <c>&lt;...&gt;</c> runs removed.</returns>
        public static string Remove(string Html)
        {
            string regesstr = "<.*?>";
            return Regex.Replace(Html, regesstr, string.Empty, RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Removes <c>&lt;script&gt;</c> elements, including their content.
        /// </summary>
        /// <param name="content">Markup to clean.</param>
        /// <returns>The input without script elements.</returns>
        public static string FilterScript(string content)
        {
            // Lazy match per element: the earlier greedy "(\s*|.)*" form removed
            // everything between the first <script> and the last </script> and could
            // backtrack catastrophically on unterminated script tags.
            string regexstr = @"<script\b[^>]*>[\s\S]*?</script\s*>";
            return Regex.Replace(content, regexstr, string.Empty, RegexOptions.IgnoreCase);
        }

        /// <summary>
        /// Neutralizes dangerous markup: script elements, script-scheme hrefs,
        /// inline <c>on...=</c> event attributes, iframes and framesets.
        /// </summary>
        /// <param name="html">Markup to clean.</param>
        /// <returns>The cleaned markup.</returns>
        public string wipeScript(string html)
        {
            // All patterns are bounded/lazy so that each match covers one construct
            // rather than the span from the first to the last occurrence in the page.
            RegexOptions options = RegexOptions.IgnoreCase;

            // <script>...</script> elements
            html = Regex.Replace(html, @"<script\b[^<>]*>[\s\S]*?</script\s*>", "", options);
            // href=javascript: style attribute prefixes on links
            html = Regex.Replace(html, @"href *= *[""']?[^""'<>]*?script *:", "", options);
            // on... event attributes of other elements
            html = Regex.Replace(html, @"\bon\w+\s*=", " _disibledevent=", options);
            // iframe elements
            html = Regex.Replace(html, @"<iframe\b[\s\S]*?</iframe\s*>", "", options);
            // frameset elements
            html = Regex.Replace(html, @"<frameset\b[\s\S]*?</frameset\s*>", "", options);
            return html;
        }

        /// <summary>
        /// Returns the page for <paramref name="url"/>, reading it from the cache file
        /// when present and fetching it otherwise; a non-empty page is written to the
        /// cache. Counts the page in <see cref="sum"/>.
        /// </summary>
        /// <param name="url">Page URL.</param>
        /// <param name="directory">Cache directory (created if missing).</param>
        /// <param name="cachePath">Cache file path inside <paramref name="directory"/>.</param>
        /// <returns>The page text, or an empty string when it could not be obtained.</returns>
        private string LoadPage(string url, string directory, string cachePath)
        {
            if (!Directory.Exists(directory))
            {
                Directory.CreateDirectory(directory);
            }

            string page = File.Exists(cachePath) ? FileToText(cachePath) : ToServer(url);
            sum++;
            if (page != "")
            {
                TextToFile(cachePath, page);
            }

            return page;
        }

        /// <summary>
        /// Entry point of the crawl: processes the magazine index page, descends into
        /// every category, then writes the XML files and executes the queued SQL.
        /// </summary>
        /// <param name="urlpri">URL of the index page.</param>
        public void HtmlSource(string urlpri)
        {
            filename = RootDirectory + "\\magazine.html";
            responseFromServer = LoadPage(urlpri, RootDirectory, filename);
            string page = responseFromServer;
            if (page == "")
            {
                return;
            }

            MatchCollection mc = Regex.Matches(page, @"href=""/magazine/(.*)""><b>(.*)</b>", RegexOptions.IgnoreCase);
            foreach (Match m in mc)
            {
                newurl = m.Groups[1].Value;
                dirname = m.Groups[2].Value;
                int key = ++keyi;
                sql = insertdl + key + ",'" + SqlQuote(dirname) + "')";
                SaveSqls(sql);
                cururl = urlpri + newurl;
                curdir = RootDirectory + "\\" + dirname;
                one(cururl, curdir, key);
            }

            SaveXmls();
            Executesql();
            this.textBox1.Text = sum.ToString();
            MessageBox.Show("採集成功!");
        }

        /// <summary>
        /// Processes one category page and descends into every magazine it lists.
        /// </summary>
        /// <param name="urlpri">URL of the category page.</param>
        /// <param name="_dirname">Cache directory for this category.</param>
        /// <param name="_key">Row id of the category in <c>mzinedl</c>.</param>
        public void one(string urlpri, string _dirname, int _key)
        {
            filename = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
            responseFromServer = LoadPage(urlpri, _dirname, filename);
            string page = responseFromServer;
            if (page == "")
            {
                return;
            }

            MatchCollection mc = Regex.Matches(page, @"href=""\.\./(.*list.html)""[\s\S]*?《(.*?)》", RegexOptions.IgnoreCase);
            foreach (Match m in mc)
            {
                newurl = m.Groups[1].Value;
                dirname = m.Groups[2].Value;
                cururl = "http://www.zydg.net/magazine/" + newurl;
                curdir = _dirname + "\\" + dirname;
                two(cururl, curdir, _key);
            }
        }

        /// <summary>
        /// Processes one magazine page: queues its <c>mzinexl</c> row and descends
        /// into every issue it lists.
        /// </summary>
        /// <param name="urlpri">URL of the magazine's issue-list page.</param>
        /// <param name="_dirname">Cache directory for this magazine.</param>
        /// <param name="_key">Row id of the parent category.</param>
        /// <remarks>
        /// The magazine title is taken from the <see cref="dirname"/> field, which the
        /// caller (<see cref="one"/>) sets immediately before calling this method.
        /// </remarks>
        public void two(string urlpri, string _dirname, int _key)
        {
            // Cache file is named after the URL's parent folder, e.g. ".../abc/list.html" -> "abc.html".
            string parentUrl = urlpri.Substring(0, urlpri.LastIndexOf("/"));
            filename = _dirname + "\\" + parentUrl.Substring(parentUrl.LastIndexOf("/") + 1) + ".html";
            responseFromServer = LoadPage(urlpri, _dirname, filename);
            string page = responseFromServer;
            if (page == "")
            {
                return;
            }

            Match mc = Regex.Match(page, @"刊\s+期:(.*?)<br>[\s\S]*?編\s+輯:(.*?)<br>[\s\S]*?出\s+版: (.*?)<br>[\s\S]*?聯絡電話:(.*?)<br>[\s\S]*?E-mail: (.*?)<br>[\s\S]*?社\s+址:(.*?)<br>[\s\S]*?郵\s+編: (.*?)<br>[\s\S]*?郵發代號:(.*?)<br>[\s\S]*?國外發行代號: (.*?)<br>[\s\S]*?國際標準刊號:(.*?)<br>[\s\S]*?國內統一刊號: (.*?)</td>", RegexOptions.IgnoreCase);
            Match content = Regex.Match(page, @"刊\s+物\s+簡\s+介\s+:::...([\s\S]*?)...:::\s+收錄期號列表", RegexOptions.Multiline);
            int key = ++keyj;

            // Row layout: id, category id, title, the 11 captured detail fields, introduction.
            StringBuilder row = new StringBuilder();
            row.Append(insertxl).Append(key).Append(",").Append(_key);
            row.Append(",'").Append(SqlQuote(dirname)).Append("'");
            for (int g = 1; g <= 11; g++)
            {
                row.Append(",'").Append(SqlQuote(mc.Groups[g].Value)).Append("'");
            }
            row.Append(",'").Append(SqlQuote(Remove(content.Groups[1].Value))).Append("')");
            sql = row.ToString();
            SaveSqls(sql);

            string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);
            MatchCollection mc2 = Regex.Matches(page, @"href='(.*?)'\s+target.*>(.*?)</a>", RegexOptions.IgnoreCase);
            foreach (Match m2 in mc2)
            {
                newurl = m2.Groups[1].Value;
                // Issue label such as "2005年第3期" becomes "2005-3".
                dirname = m2.Groups[2].Value.Replace("年", "-").Replace("第", "").Replace("期", "");
                cururl = baseUrl + newurl;
                curdir = _dirname + "\\" + dirname;
                three(cururl, curdir, key, dirname);
            }
        }

        /// <summary>
        /// Processes one issue page: downloads the cover image, queues the
        /// <c>mzineinfo</c> row and one <c>mzinewz</c> row per article, and descends
        /// into every article.
        /// </summary>
        /// <param name="urlpri">URL of the issue page.</param>
        /// <param name="_dirname">Cache directory for this issue.</param>
        /// <param name="_key">Row id of the parent magazine.</param>
        /// <param name="qishu">Issue label stored with the row.</param>
        public void three(string urlpri, string _dirname, int _key, string qishu)
        {
            filename = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
            responseFromServer = LoadPage(urlpri, _dirname, filename);
            string page = responseFromServer;
            if (page == "")
            {
                return;
            }

            string baseUrl = urlpri.Substring(0, urlpri.LastIndexOf("/") + 1);

            Match m = Regex.Match(page, @"src='face_(.*?)'", RegexOptions.IgnoreCase);
            string coverSuffix = m.Groups[1].Value;
            if (coverSuffix.Trim() != "")
            {
                string photoName = baseUrl + "face_" + coverSuffix;
                DownFile(photoName, _dirname);
            }

            int key = ++keym;
            sql = insertinfo + key + "," + _key + ",'" + SqlQuote(qishu) + "','" + SqlQuote(_dirname + "\\" + "face_" + coverSuffix) + "')";
            SaveSqls(sql);

            // First alternative: article link (group 1 = page, group 2 = "title.author").
            // Second alternative: section heading "<font ...>[name]" (group 3).
            // The published pattern read "& lt;font[^>]*?>[(.+?)]": the "<" was
            // HTML-mangled and the unescaped brackets formed a character class, so
            // group 3 never existed and the section name was always empty.
            MatchCollection mc2 = Regex.Matches(page, @"href='(\d+.html?)'[\s\S]*?<font\s+color=black>(.*?)</a>|<font[^>]*?>\[(.+?)\]", RegexOptions.IgnoreCase);
            foreach (Match m2 in mc2)
            {
                newurl = m2.Groups[1].Value;
                string muName = m2.Groups[3].Value;
                if (muName == "")
                {
                    // Article rows inherit the most recent section heading.
                    muName = mulu;
                }

                string lstr = m2.Groups[2].Value;
                if (lstr != "")
                {
                    string s1 = lstr;
                    string s2 = "";
                    if (lstr.Contains("."))
                    {
                        s1 = lstr.Substring(0, lstr.IndexOf("."));
                        s2 = lstr.Substring(lstr.LastIndexOf(".") + 1);
                    }

                    int k2 = ++keyn;
                    sql = insertwz + k2 + "," + key + ",'" + SqlQuote(muName) + "','" + SqlQuote(s1) + "','" + SqlQuote(s2) + "')";
                    SaveSqls(sql);
                    cururl = baseUrl + newurl;
                    curdir = _dirname;
                    four(cururl, curdir, k2);
                }

                mulu = muName;
            }
        }

        /// <summary>
        /// Processes one article page: extracts and cleans the body text, queues it
        /// for XML serialization and downloads referenced images and PDF files.
        /// </summary>
        /// <param name="urlpri">URL of the article page.</param>
        /// <param name="_dirname">Cache directory for the article's issue.</param>
        /// <param name="_key">Row id of the article in <c>mzinewz</c>.</param>
        public void four(string urlpri, string _dirname, int _key)
        {
            filename = _dirname + "\\" + urlpri.Substring(urlpri.LastIndexOf("/") + 1);
            responseFromServer = LoadPage(urlpri, _dirname, filename);
            string page = responseFromServer;
            if (page == "")
            {
                return;
            }

            Match m = Regex.Match(page, @"本文開始-->(?<text>[\s\S]*?)<!--本文結束", RegexOptions.IgnoreCase);
            string content = m.Groups["text"].Value;
            string c = Remove(FilterScript(content));

            // Page name including its trailing dot ("123."), so appending "xml" yields "123.xml".
            int lastSlash = urlpri.LastIndexOf("/");
            string xmlname = urlpri.Substring(lastSlash + 1, urlpri.LastIndexOf(".") - lastSlash);
            string pathxml = _dirname + "\\" + xmlname + "xml";
            cls.Add(new Class1(_key, c, pathxml));

            string zhuurl = urlpri.Substring(0, lastSlash + 1);
            // The PDF capture is now a named group; previously Groups["pdfs"] referred
            // to a group that did not exist, so PDF files were never downloaded.
            MatchCollection mc = Regex.Matches(page, @"(<img\s+src=""(?<imgs>.*)""\s+hspace|HreF=""(?<pdfs>[^>]*PDF)"")", RegexOptions.IgnoreCase);
            foreach (Match m2 in mc)
            {
                string imgurl = m2.Groups["imgs"].Value.Trim();
                if (imgurl != "")
                {
                    DownFile(zhuurl + imgurl, _dirname);
                }

                string pdfurl = m2.Groups["pdfs"].Value.Trim();
                if (pdfurl != "")
                {
                    DownFile(zhuurl + pdfurl, _dirname);
                }
            }
        }

        /// <summary>Loads the proxy list and starts the crawl.</summary>
        private void btnOK_Click(object sender, EventArgs e)
        {
            al = ReadIPproxy(ProxyListPath);
            HtmlSource("http://www.zydg.net/magazine/");
        }

        /// <summary>Closes the application.</summary>
        private void button1_Click(object sender, EventArgs e)
        {
            Application.Exit();
        }
    }
}
相關文章

聯繫我們

該頁面正文內容均來源於網絡整理,並不代表阿里雲官方的觀點,該頁面所提到的產品和服務也與阿里雲無關,如果該頁面內容對您造成了困擾,歡迎寫郵件給我們,收到郵件我們將在5個工作日內處理。

如果您發現本社區中有涉嫌抄襲的內容,歡迎發送郵件至: info-contact@alibabacloud.com 進行舉報並提供相關證據,工作人員會在 5 個工作天內聯絡您,一經查實,本站將立刻刪除涉嫌侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.