C# Baidu search results XPath analysis

Last Update:2017-05-24 Source: Internet

Author: User

Tags xpath

Developer on Alibaba Cloud: Build your first app with APIs, SDKs, and tutorials on Alibaba Cloud. Read more ＞

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;

namespace XPathGet
{
    /// <summary>
    /// Console scraper: downloads the first result pages of a Baidu search, extracts
    /// title / link / abstract / display URL for organic and ad results with XPath,
    /// and writes the raw pages and the extracted rows to files on drive D:.
    /// </summary>
    class Program
    {
        // Result-page URL template: {0} = search keyword, {1} = zero-based page index
        // (the trailing "0" turns the index into Baidu's "pn" offset: 0, 10, 20, ...).
        private const string SearchUrlFormat = "http://www.baidu.com/s?wd={0}&pn={1}0&oq={0}&ie=utf-8&usm=4";

        // Host prepended to relative "Location" headers when following a redirect.
        private const string BaiduHost = "http://www.baidu.com";

        // Number of result pages fetched by Main.
        private const int PageCount = 5;

        // Ad result containers: {0} is appended to the '300'-id branch, {1} to the '400'-id branch.
        // Both union branches start with "//"; without it the second branch is evaluated
        // relative to the document root's children and cannot reach nested result divs.
        private const string AdXPathFormat =
            "//div[@id='content_left']//div[contains(@id,'300')]{0}" +
            "|//div[@id='content_left']//div[contains(@id,'400')]{1}";

        // Organic result containers: {0} is the sub-path inside each container.
        private const string OtherXPathFormat =
            "//div[@id='content_left']//div[contains(@class,'c-container')]{0}";

        // NOTE(review): the four literals below look machine-translated; the text that is
        // matched against the page (marker and robots notice) presumably has to be Baidu's
        // original Chinese wording to ever match - verify against a live result page.
        private const string LatestInfoMarker = "Latest relevant information";
        private const string NewInfoCite = "new info";
        private const string RobotsNoticeText = "due to the presence of the website's robots.txt file restriction directive";
        private const string PageFooterFormat = "above for the {0} page search results.";

        #region WebClient creation

        /// <summary>
        /// WebClient that disables automatic redirects, enables gzip/deflate decompression,
        /// sends a desktop Chrome user agent, and exposes the last response so callers can
        /// inspect its status code and headers.
        /// </summary>
        public class WebClientBD : System.Net.WebClient
        {
            /// <summary>The response of the most recent request, or null if none was received.</summary>
            public WebResponse Response { get; private set; }

            /// <summary>Creates the request and applies the redirect, compression and user-agent settings.</summary>
            protected override System.Net.WebRequest GetWebRequest(Uri address)
            {
                WebRequest baseRequest = base.GetWebRequest(address);
                HttpWebRequest request = baseRequest as HttpWebRequest;
                if (request == null)
                {
                    // Not an HTTP(S) request: there is nothing HTTP-specific to configure.
                    return baseRequest;
                }

                // Redirects are followed manually (see LastUrl) so the Location header stays visible.
                request.AllowAutoRedirect = false;
                request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
                return request;
            }

            /// <summary>Gets the response and records it in <see cref="Response"/>.</summary>
            /// <exception cref="WebException">The request failed; any error response is still recorded.</exception>
            protected override WebResponse GetWebResponse(WebRequest request)
            {
                try
                {
                    this.Response = base.GetWebResponse(request);
                }
                catch (WebException ex)
                {
                    // Previously every exception was swallowed and a possibly-null Response was
                    // returned. Keep the error response for inspection, but surface the failure.
                    this.Response = ex.Response;
                    throw;
                }

                return this.Response;
            }
        }

        /// <summary>
        /// Follows up to two HTTP 302 hops by hand and returns the final target URL.
        /// </summary>
        /// <param name="url">The URL to resolve.</param>
        /// <returns>
        /// The resolved URL; null when the first response is not a 302 (or no target could be
        /// determined); or a string starting with "error:" followed by the exception message
        /// when a request fails.
        /// </returns>
        public static string LastUrl(string url)
        {
            try
            {
                string redirectLocal;
                using (WebClientBD wc = CreateClient())
                {
                    wc.DownloadData(url);
                    HttpWebResponse response = wc.Response as HttpWebResponse;
                    if (response == null || response.StatusCode != HttpStatusCode.Found)
                    {
                        return null;
                    }

                    string location = response.Headers["Location"];
                    if (string.IsNullOrEmpty(location))
                    {
                        return null;
                    }

                    // Relative redirect targets are resolved against the Baidu host.
                    redirectLocal = location.StartsWith("http", StringComparison.Ordinal)
                        ? location
                        : BaiduHost + location;
                }

                using (WebClientBD wc = CreateClient())
                {
                    wc.DownloadData(redirectLocal);
                    HttpWebResponse response = wc.Response as HttpWebResponse;
                    if (response == null)
                    {
                        return null;
                    }

                    if (response.StatusCode == HttpStatusCode.Found)
                    {
                        // Second hop: report its target without following further.
                        return response.Headers["Location"];
                    }

                    if (response.StatusCode == HttpStatusCode.OK)
                    {
                        return redirectLocal;
                    }

                    return null;
                }
            }
            catch (Exception ex)
            {
                return "error:" + ex.Message;
            }
        }

        #endregion

        /// <summary>Downloads <paramref name="url"/> and decodes the body as UTF-8.</summary>
        /// <param name="url">The page to download.</param>
        /// <returns>The page source as a string.</returns>
        public static string GetHtmlSource(string url)
        {
            using (WebClientBD wc = CreateClient())
            {
                byte[] pageData = wc.DownloadData(url);

                // If the text comes out garbled, the page may be GB2312 rather than UTF-8.
                return Encoding.UTF8.GetString(pageData);
            }
        }

        /// <summary>
        /// Fetches <see cref="PageCount"/> result pages for a fixed keyword, saves each raw page,
        /// and writes the extracted organic and ad rows to two text files.
        /// </summary>
        static void Main(string[] args)
        {
            string key = "purchasing";
            List<string> otherLastList = new List<string>();
            List<string> adLastList = new List<string>();

            for (int pnIndex = 0; pnIndex < PageCount; pnIndex++)
            {
                string htmlPageSource = GetHtmlSource(string.Format(SearchUrlFormat, key, pnIndex.ToString()));

                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlPageSource);

                AppendOtherResults(doc, otherLastList);
                otherLastList.Add(string.Format(PageFooterFormat, pnIndex + 1));

                // Keep the raw page next to the extracted data for later inspection.
                string path = @"D:\\infolist_page" + (pnIndex + 1) + ".html";
                File.WriteAllText(path, htmlPageSource, Encoding.UTF8);

                AppendAdResults(doc, adLastList);
                adLastList.Add(string.Format(PageFooterFormat, pnIndex + 1));
            }

            File.WriteAllLines(@"D:\\infolist.txt", otherLastList.ToArray(), Encoding.UTF8);
            File.WriteAllLines(@"D:\\infolist2.txt", adLastList.ToArray(), Encoding.UTF8);
        }

        // Creates a client that authenticates with the default network credentials.
        private static WebClientBD CreateClient()
        {
            WebClientBD client = new WebClientBD();
            client.Credentials = CredentialCache.DefaultCredentials;
            return client;
        }

        // Appends one "title|link|abstract|cite<TAB>" row per organic result in doc to sink.
        private static void AppendOtherResults(HtmlDocument doc, List<string> sink)
        {
            // Title text and title link come from the same anchor nodes.
            List<HtmlNode> titles = SelectOrEmpty(doc, string.Format(OtherXPathFormat, "/h3/a[1]"));

            string contentXPath = string.Join("|", new[]
            {
                string.Format(OtherXPathFormat, "//div[@class='c-abstract']"),
                // A bare string predicate (['...']) is always true and would match every div,
                // so the class test is spelled out with @class=.
                string.Format(OtherXPathFormat, "//div[@class='c-span18 c-span-last']/p[1]"),
                string.Format(OtherXPathFormat, "//div[@class='c-offset']"),
                string.Format(OtherXPathFormat, "//div[@class='op_dict_content']"),
                // The notice text contains an apostrophe, so the XPath literal uses double quotes.
                string.Format(OtherXPathFormat, "//p[contains(text(),\"" + RobotsNoticeText + "\")]")
            });
            List<HtmlNode> contents = SelectOrEmpty(doc, contentXPath);

            string citeXPath =
                string.Format(OtherXPathFormat, "//span[@class='c-showurl']") + "|" +
                string.Format(OtherXPathFormat, "//a[@class='c-showurl']");
            List<HtmlNode> cites = SelectOrEmpty(doc, citeXPath);

            List<string> titleList = titles.Select(n => CleanText(n.InnerText)).ToList();
            List<string> linkList = titles.Select(n => CleanText(n.GetAttributeValue("href", ""))).ToList();
            List<string> contentList = contents.Select(n => CleanText(n.InnerText)).ToList();
            List<string> citeList = cites.Select(n => CleanText(n.InnerText)).ToList();

            // Every result has a title, but a "latest relevant information" result has no cite,
            // so the cite list can be one short. Insert a placeholder at that result's position
            // so titles and cites line up again.
            if (titleList.Count != citeList.Count)
            {
                if (citeList.Count < titles.Count &&
                    titles[citeList.Count].InnerText.Contains(LatestInfoMarker))
                {
                    // The cite-less result is the last one: append the placeholder.
                    citeList.Add(NewInfoCite);
                }
                else
                {
                    for (int i = 0; i < titleList.Count; i++)
                    {
                        if (titleList[i].Contains(LatestInfoMarker))
                        {
                            // Clamp the position so Insert cannot run past the end of the list.
                            citeList.Insert(Math.Min(i, citeList.Count), NewInfoCite);
                        }
                    }
                }
            }

            for (int j = 0; j < titleList.Count; j++)
            {
                // Abstract and cite lists can still be shorter than the title list; missing
                // fields are written as empty instead of throwing.
                sink.Add(titleList[j] + "|" + linkList[j] + "|" +
                         ItemOrEmpty(contentList, j) + "|" + ItemOrEmpty(citeList, j) + "\t");
            }
        }

        // Appends one "title|link|content|cite<TAB>" row per ad result in doc to sink.
        private static void AppendAdResults(HtmlDocument doc, List<string> sink)
        {
            // Title text and title link come from the same anchor nodes.
            List<HtmlNode> titles = SelectOrEmpty(doc, string.Format(AdXPathFormat, "/div[1]/h3/a[1]", "/div[1]/h3/a[1]"));
            List<HtmlNode> contents = SelectOrEmpty(doc, string.Format(AdXPathFormat, "/div[2]", "/div[2]"));
            List<HtmlNode> cites = SelectOrEmpty(doc, string.Format(AdXPathFormat, "/div[2]//a/span[1]", "/div[3]/a/span"));

            for (int i = 0; i < titles.Count; i++)
            {
                string title = CleanText(titles[i].InnerText);
                string link = CleanText(titles[i].GetAttributeValue("href", ""));
                string content = i < contents.Count ? CleanText(contents[i].InnerText) : string.Empty;
                string cite = i < cites.Count ? CleanText(cites[i].InnerText) : string.Empty;
                sink.Add(title + "|" + link + "|" + content + "|" + cite + "\t");
            }
        }

        // Runs an XPath query and returns the matches as a list. SelectNodes yields null
        // (not an empty collection) when nothing matches, e.g. on a page without ads.
        private static List<HtmlNode> SelectOrEmpty(HtmlDocument doc, string xpath)
        {
            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
            return nodes == null ? new List<HtmlNode>() : nodes.ToList();
        }

        // Returns items[index], or an empty string when index is past the end of the list.
        private static string ItemOrEmpty(List<string> items, int index)
        {
            return index < items.Count ? items[index] : string.Empty;
        }

        // Trims the text and removes "&nbsp;" entities, line feeds and spaces.
        // NOTE(review): in the extracted source this chain read Replace(" ")...Replace(""),
        // and Replace with an empty search string throws ArgumentException. "&nbsp;" is the
        // presumed original first argument - confirm against the author's code.
        private static string CleanText(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return string.Empty;
            }

            return text.Trim()
                       .Replace("&nbsp;", string.Empty)
                       .Replace("\n", string.Empty)
                       .Replace(" ", string.Empty);
        }
    }
}

C# Baidu search results XPath analysis

This article is an English version of an article originally written in Chinese on aliyun.com and is provided for information purposes only. This website makes no representation or warranty of any kind, either expressed or implied, as to the accuracy, completeness, ownership or reliability of the article or any translations thereof. If you have any concerns or complaints relating to the article, please send an email, providing a detailed description of the concern or complaint, to info-contact@alibabacloud.com. A staff member will contact you within 5 working days. Once verified, infringing content will be removed immediately.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More