This is a data acquisition (web scraping) program I wrote myself. It collects the candidate information published on a recruitment website. This is my first blog post, so please don't laugh if it is badly written.
First, create a database table whose columns correspond to the fields declared in the code below.
The code is as follows:
/// <summary>
/// Form that walks the listing pages of www.xcjob.cn, opens each candidate's
/// detail page, copies the cell texts into the static fields below and stores
/// one row per candidate in the Access database <c>data.mdb</c>.
/// </summary>
public partial class Form2 : Form
{
    // Number of listing pages requested by Button1_Click.
    private const int PageCount = 50;

    // Site root prepended to the detail-page path taken from each listing row.
    private const string SiteRoot = "http://www.xcjob.cn";

    // Listing page URL; the page number is appended to it.
    private const string ListPageUrl = "http://www.xcjob.cn/renli.asp?pageno=";

    // Number of leading characters dropped from each href before SiteRoot is prepended.
    // NOTE(review): presumably this removes the "about:blank" prefix (11 characters) the
    // browser adds when it resolves relative links in a document written into a blank
    // page - confirm against a real listing row.
    private const int HrefPrefixLength = 11;

    public Form2()
    {
        InitializeComponent();
    }

    // Name
    public static string XM = "";
    // Age
    public static string nl = "";
    // Gender
    public static string XB = "";
    // Height
    public static string SG = "";
    // Political outlook
    public static string mm = "";
    // Ethnicity
    public static string MZ = "";
    // Degree
    public static string XL = "";
    // Marital status
    public static string HK = "";
    // Subject of study
    public static string ZY = "";
    // Work experience (first of two "work experience" fields; read from row 5, cell 3)
    public static string gzjy = "";
    // Current employer
    public static string zzdw = "";
    // Current job title
    public static string zzzw = "";
    // Work experience (second of two "work experience" fields; read from row 7, cell 1)
    public static string gzjl = "";
    // Salary required
    public static string YX = "";
    // Nature of work
    public static string gzxz = "";
    // Job intention
    public static string Qzyx = "";
    // Specific positions
    public static string jtzw = "";
    // Expected place of work
    public static string qwgzd = "";
    // Education, language proficiency, technical expertise
    public static string QT = "";

    /// <summary>
    /// Click handler: collects every listing page from 1 to <see cref="PageCount"/>
    /// and reports completion in the label and in a message box.
    /// </summary>
    // NOTE(review): the control is spelled "Label1" as in this listing; the designer
    // file is not visible here, so confirm the actual field name (the designer default
    // would be "label1").
    private void Button1_Click(object sender, EventArgs e)
    {
        Label1.Text = "Collecting data ...";
        // The loop below runs inside this handler, so ask the label to repaint now;
        // otherwise the progress text would not be drawn until the handler returns.
        Label1.Refresh();

        // Walk the listing pages one by one.
        for (int i = 1; i <= PageCount; i++)
        {
            CJ(ListPageUrl + i);
        }

        Label1.Text = "Congratulations on your collection!";
        MessageBox.Show("Congratulations on your collection!");
    }

    /// <summary>
    /// Collects one listing page: cuts the first table that follows the "keywords:"
    /// marker out of the page source, then for every row that contains a link stores
    /// the link text in <see cref="XM"/> and collects the linked detail page.
    /// </summary>
    /// <param name="Url">Address of the listing page.</param>
    private void CJ(string Url)
    {
        // Page source (HTML); empty when the download failed.
        string pageHtml = YM(Url);

        // Narrow the source down to the table that holds the listing.
        // NOTE(review): "keywords:" is kept exactly as printed in this listing; the page
        // is decoded as GB2312, so the real marker may be Chinese text - confirm.
        int bodyStart = FindFrom(pageHtml, "<body", 0);
        int keywordsStart = FindFrom(pageHtml, "keywords:", bodyStart);
        int tableStart = FindFrom(pageHtml, "<table", keywordsStart);
        int tableEnd = FindFrom(pageHtml, "</table>", tableStart);
        if (tableEnd < 0)
        {
            System.Diagnostics.Trace.WriteLine("CJ: listing table not found in " + Url);
            return;
        }

        string tableHtml = pageHtml.Substring(tableStart, tableEnd - tableStart);

        // Parse the fragment and enumerate its rows.
        HtmlElementCollection rows = htmltr_content(tableHtml, "tr");
        if (rows == null)
        {
            return;
        }

        foreach (HtmlElement tr in rows)
        {
            try
            {
                HtmlElementCollection anchors = tr.GetElementsByTagName("a");
                if (anchors.Count == 0)
                {
                    // Row without a link: nothing to collect.
                    continue;
                }

                // Name
                XM = anchors[0].InnerText ?? "";

                // URL of the details page
                string href = anchors[0].GetAttribute("href");
                Content(SiteRoot + href.Substring(HrefPrefixLength));
            }
            catch (Exception ex)
            {
                // Best effort: a malformed row is reported and skipped.
                LogFailure("CJ", ex);
            }
        }
    }

    /// <summary>
    /// Collects one detail page: cuts the first table that follows the "Browse Times"
    /// marker out of the page source, copies fixed row/cell positions into the static
    /// fields and inserts the record.
    /// </summary>
    /// <param name="URL">Address of the detail page.</param>
    private void Content(string URL)
    {
        try
        {
            string pageHtml = YM(URL);

            // Narrow the source down to the table that holds the details.
            // NOTE(review): "Browse Times" is kept exactly as printed in this listing;
            // the real marker may be Chinese text - confirm.
            int bodyStart = FindFrom(pageHtml, "<body", 0);
            int markerStart = FindFrom(pageHtml, "Browse Times", bodyStart);
            int tableStart = FindFrom(pageHtml, "<table", markerStart);
            int tableEnd = FindFrom(pageHtml, "</table>", tableStart);
            if (tableEnd < 0)
            {
                System.Diagnostics.Trace.WriteLine("Content: detail table not found in " + URL);
                return;
            }

            // + 8 keeps the closing "</table>" tag inside the fragment.
            string tableHtml = pageHtml.Substring(tableStart, tableEnd - tableStart + 8);

            HtmlElementCollection tables = htmltr_content(tableHtml, "table");
            if (tables == null)
            {
                return;
            }

            foreach (HtmlElement tr in tables)
            {
                try
                {
                    // Age
                    nl = CellText(tr, 1, 1);
                    // Gender and height share one cell: the first character is the
                    // gender, the text from index 11 onwards is the height.
                    string xbSg = CellText(tr, 1, 3);
                    XB = xbSg.Substring(0, 1);
                    SG = xbSg.Substring(11);
                    // Political outlook
                    mm = CellText(tr, 2, 1);
                    // Ethnicity
                    MZ = CellText(tr, 2, 3);
                    // Degree
                    XL = CellText(tr, 3, 1);
                    // Marital status
                    HK = CellText(tr, 3, 3);
                    // Subject of study
                    ZY = CellText(tr, 5, 1);
                    // Work experience (first field)
                    gzjy = CellText(tr, 5, 3);
                    // Current employer
                    zzdw = CellText(tr, 6, 1);
                    // Current job title
                    zzzw = CellText(tr, 6, 3);
                    // Work experience (second field). The listing assigned gzjy here a
                    // second time, which overwrote it and left gzjl permanently empty.
                    gzjl = CellText(tr, 7, 1);
                    // Salary required
                    YX = CellText(tr, 9, 1);
                    // Nature of work
                    gzxz = CellText(tr, 9, 3);
                    // Job intention
                    Qzyx = CellText(tr, 10, 1);
                    // Specific positions
                    jtzw = CellText(tr, 10, 3);
                    // Expected place of work
                    qwgzd = CellText(tr, 11, 1);
                    // Education, language proficiency, technical expertise
                    QT = CellText(tr, 13, 1);

                    Insert();
                }
                catch (Exception ex)
                {
                    // Best effort: a table without the expected rows/cells is reported
                    // and skipped, and no record is inserted for it.
                    LogFailure("Content (table)", ex);
                }
            }
        }
        catch (Exception ex)
        {
            LogFailure("Content", ex);
        }
    }

    /// <summary>
    /// Inserts the current values of the static fields as one row into the database.
    /// Failures are logged and otherwise ignored.
    /// </summary>
    private void Insert()
    {
        const string connectionString = "Provider=Microsoft.Jet.OLEDB.4.0;Data Source=data.mdb";

        // Table and column names contain spaces, so they are bracketed; the values are
        // passed as positional parameters instead of being spliced into the statement.
        // NOTE(review): the names are kept as printed in this listing, which lists
        // "work experience" twice (10th and 13th column) - reconcile them with the
        // actual table definition before running.
        const string sql =
            "INSERT INTO [talent information] ([name], [age], [sex], [height], [political appearance], [ethnicity], [education], [marital status], [specialty], " +
            "[work experience], [in-service units], [in-job positions], [work experience], [salary requirements], [job nature], [job search intention], [specific positions], [expectations workplace], [other]) " +
            "VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, " +
            "?, ?, ?, ?, ?, ?, ?, ?, ?)";

        // The listing spliced the age into the statement without quotes, so the column
        // is presumably numeric: send an integer, or NULL when the text is not a number.
        int age;
        object ageValue = DBNull.Value;
        if (int.TryParse(nl, System.Globalization.NumberStyles.Integer,
                         System.Globalization.CultureInfo.InvariantCulture, out age))
        {
            ageValue = age;
        }

        try
        {
            using (OleDbConnection con = new OleDbConnection(connectionString))
            using (OleDbCommand com = new OleDbCommand(sql, con))
            {
                // OLE DB binds parameters by position, so the order below must match
                // the column list above; the parameter names are informational only.
                AddText(com, "@name", XM);
                com.Parameters.AddWithValue("@age", ageValue);
                AddText(com, "@sex", XB);
                AddText(com, "@height", SG);
                AddText(com, "@political", mm);
                AddText(com, "@ethnicity", MZ);
                AddText(com, "@education", XL);
                AddText(com, "@marital", HK);
                AddText(com, "@specialty", ZY);
                AddText(com, "@experience1", gzjy);
                AddText(com, "@employer", zzdw);
                AddText(com, "@position", zzzw);
                AddText(com, "@experience2", gzjl);
                AddText(com, "@salary", YX);
                AddText(com, "@nature", gzxz);
                AddText(com, "@intention", Qzyx);
                AddText(com, "@specific", jtzw);
                AddText(com, "@workplace", qwgzd);
                AddText(com, "@other", QT);

                con.Open();
                com.ExecuteNonQuery();
            }
        }
        catch (Exception ex)
        {
            // Best effort: a failed insert is reported and the crawl continues.
            LogFailure("Insert", ex);
        }
    }

    /// <summary>
    /// Writes an HTML fragment into a fresh in-memory document and returns the
    /// elements with the requested tag name.
    /// </summary>
    /// <param name="strweb">HTML fragment to parse.</param>
    /// <param name="TJ">Tag name to look up, e.g. "tr" or "table".</param>
    /// <returns>The matching elements, or <c>null</c> when the fragment could not be parsed.</returns>
    private HtmlElementCollection htmltr_content(String strweb, String TJ)
    {
        try
        {
            // A WebBrowser control is used as the HTML parser. It is not disposed here
            // because the returned elements come from its document.
            // NOTE(review): one control is created per call and never released;
            // presumably disposing it would invalidate the returned collection - confirm
            // before changing the lifetime.
            WebBrowser webb = new WebBrowser();
            webb.Navigate("about:blank");

            // Start a new, empty document and write the fragment into it.
            HtmlDocument htmlDoc = webb.Document.OpenNew(true);
            htmlDoc.Write(strweb);

            return htmlDoc.GetElementsByTagName(TJ);
        }
        catch (Exception ex)
        {
            LogFailure("htmltr_content", ex);
            return null;
        }
    }

    /// <summary>
    /// Downloads a page and returns its source decoded as GB2312.
    /// </summary>
    /// <param name="Url">Address of the page.</param>
    /// <returns>The page source, or an empty string when the download failed.</returns>
    private string YM(String Url)
    {
        string strResult = "";
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(Url);
            request.Method = "GET";

            // Dispose the response, its stream and the reader as soon as the body is read.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            using (Stream streamReceive = response.GetResponseStream())
            using (StreamReader streamReader = new StreamReader(streamReceive, Encoding.GetEncoding("GB2312")))
            {
                strResult = streamReader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            // Best effort: callers treat an empty string as "nothing to collect".
            LogFailure("YM", ex);
        }
        return strResult;
    }

    // Returns the index of marker in text at or after startIndex, or -1 when it is
    // absent or when a previous search in the chain already failed (startIndex < 0).
    private static int FindFrom(string text, string marker, int startIndex)
    {
        if (string.IsNullOrEmpty(text) || startIndex < 0 || startIndex > text.Length)
        {
            return -1;
        }
        return text.IndexOf(marker, startIndex, StringComparison.Ordinal);
    }

    // Returns the text of the given cell of the given row inside table ("" when the
    // cell has no text). Throws when the row or cell does not exist.
    private static string CellText(HtmlElement table, int rowIndex, int cellIndex)
    {
        HtmlElement row = table.GetElementsByTagName("tr")[rowIndex];
        string text = row.GetElementsByTagName("TD")[cellIndex].InnerText;
        return text ?? "";
    }

    // Appends a text parameter to the command, sending "" instead of a null reference.
    private static void AddText(OleDbCommand command, string name, string value)
    {
        command.Parameters.AddWithValue(name, value ?? "");
    }

    // Reports a swallowed failure on the trace listeners so it is not lost silently.
    private static void LogFailure(string where, Exception ex)
    {
        System.Diagnostics.Trace.WriteLine(where + ": " + ex);
    }
}
This program is not especially well written: everything is fetched one item at a time in loops, so it is not very efficient. Experts are welcome to suggest improvements, for example by using multithreading.