using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;

namespace xpathget
{
    /// <summary>
    /// Console tool that downloads the first pages of a Baidu search, extracts
    /// organic results and ads with XPath, and writes them to text files.
    /// </summary>
    class Program
    {
        /// <summary>Prefix used to turn a relative redirect target into an absolute URL.</summary>
        private const string BaiduRoot = "http://www.baidu.com";

        /// <summary>
        /// Paging URL: {0} is the URL-escaped keyword, {1} is the zero-based page
        /// index (the trailing literal 0 turns it into pn = index * 10).
        /// </summary>
        private const string SearchUrlTemplate =
            "http://www.baidu.com/s?wd={0}&pn={1}0&oq={0}&ie=utf-8&usm=4";

        private const string SearchKeyword = "purchasing";

        /// <summary>Number of result pages fetched by <see cref="Main"/>.</summary>
        private const int PageCount = 5;

        /// <summary>
        /// Ad containers: divs under #content_left whose id contains '300' or '400'.
        /// {0}/{1} are the sub-paths appended to each alternative.
        /// NOTE(review): the second alternative had no leading "//" in the garbled
        /// source, which would never match from the document root - confirm.
        /// </summary>
        private const string AdXPathTemplate =
            "//div[@id='content_left']//div[contains(@id,'300')]{0}" +
            "|//div[@id='content_left']//div[contains(@id,'400')]{1}";

        /// <summary>Organic result containers; {0} is the sub-path appended to the container.</summary>
        private const string ResultXPathTemplate =
            "//div[@id='content_left']//div[contains(@class,'c-container')]{0}";

        // NOTE(review): this marker looks like a translation of the text shown on the
        // live page; confirm the exact text the page uses before relying on it.
        private const string LatestInfoMarker = "Latest relevant information";

        /// <summary>Placeholder cite used for the result block that has no cite element.</summary>
        private const string LatestInfoCite = "new info";

        private const string PageFooterFormat = "above for the {0} page search results.";

        private const string OutputDirectory = @"D:\";

        #region webclient Create

        /// <summary>
        /// <see cref="WebClient"/> that does not follow redirects automatically and
        /// exposes the last <see cref="WebResponse"/> so callers can inspect the
        /// status code and the Location header themselves.
        /// </summary>
        public class WebClientBD : System.Net.WebClient
        {
            /// <summary>The most recent response (or error response) seen by this client.</summary>
            public WebResponse Response { get; private set; }

            /// <summary>
            /// Creates the request and, for HTTP requests, disables auto-redirect,
            /// enables gzip/deflate decompression and sets a browser user agent.
            /// </summary>
            protected override System.Net.WebRequest GetWebRequest(Uri address)
            {
                WebRequest request = base.GetWebRequest(address);
                HttpWebRequest httpRequest = request as HttpWebRequest;
                if (httpRequest != null)
                {
                    httpRequest.AllowAutoRedirect = false;
                    httpRequest.AutomaticDecompression =
                        DecompressionMethods.Deflate | DecompressionMethods.GZip;
                    httpRequest.UserAgent =
                        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 " +
                        "(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
                }

                return request;
            }

            /// <summary>
            /// Captures the response in <see cref="Response"/>. On a
            /// <see cref="WebException"/> the error response (if any) is kept for
            /// inspection and the exception is rethrown instead of being swallowed.
            /// </summary>
            protected override WebResponse GetWebResponse(WebRequest request)
            {
                try
                {
                    this.Response = base.GetWebResponse(request);
                }
                catch (WebException ex)
                {
                    this.Response = ex.Response;
                    throw;
                }

                return this.Response;
            }
        }

        /// <summary>
        /// Resolves the final target of a redirecting URL by following at most two
        /// HTTP 302 hops manually.
        /// </summary>
        /// <param name="url">The URL to resolve.</param>
        /// <returns>
        /// The resolved URL; <c>null</c> when the first response is not a 302 or the
        /// second response is neither 302 nor 200; or a string starting with
        /// <c>"error:"</c> when the request fails.
        /// </returns>
        public static string Lasturl(string url)
        {
            try
            {
                string redirectLocal;
                using (WebClientBD wc = CreateClient())
                {
                    wc.DownloadData(url);
                    HttpWebResponse response = wc.Response as HttpWebResponse;
                    if (response == null)
                    {
                        return "error:no HTTP response was received";
                    }

                    if (response.StatusCode != HttpStatusCode.Found)
                    {
                        return null;
                    }

                    string location = response.Headers["Location"];
                    if (string.IsNullOrEmpty(location))
                    {
                        return "error:redirect response has no Location header";
                    }

                    // Relative redirect targets are resolved against the Baidu root.
                    redirectLocal = location.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                        ? location
                        : BaiduRoot + location;
                }

                using (WebClientBD wc = CreateClient())
                {
                    wc.DownloadData(redirectLocal);
                    HttpWebResponse response = wc.Response as HttpWebResponse;
                    if (response == null)
                    {
                        return "error:no HTTP response was received";
                    }

                    if (response.StatusCode == HttpStatusCode.Found)
                    {
                        return response.Headers["Location"];
                    }

                    if (response.StatusCode == HttpStatusCode.OK)
                    {
                        return redirectLocal;
                    }

                    return null;
                }
            }
            catch (Exception ex)
            {
                // Callers receive failures as an "error:"-prefixed string, not an exception.
                return "error:" + ex.Message;
            }
        }

        #endregion

        /// <summary>Downloads <paramref name="url"/> and decodes the body as UTF-8.</summary>
        /// <param name="url">The page to download.</param>
        /// <returns>The page source as a string.</returns>
        public static string Gethtmlsource(string url)
        {
            using (WebClientBD wc = CreateClient())
            {
                byte[] pageData = wc.DownloadData(url);

                // If the text comes out garbled, the page may use another encoding (e.g. GB2312).
                return Encoding.UTF8.GetString(pageData);
            }
        }

        /// <summary>
        /// Fetches <see cref="PageCount"/> result pages for <see cref="SearchKeyword"/>,
        /// saves each raw page as HTML, and writes the extracted organic results and
        /// ads to two text files in <see cref="OutputDirectory"/>.
        /// </summary>
        static void Main(string[] args)
        {
            List<string> resultLines = new List<string>();
            List<string> adLines = new List<string>();
            string escapedKey = Uri.EscapeDataString(SearchKeyword);

            for (int pnIndex = 0; pnIndex < PageCount; pnIndex++)
            {
                int pageNumber = pnIndex + 1;
                string html = Gethtmlsource(string.Format(SearchUrlTemplate, escapedKey, pnIndex));

                // The garbled source also called doc.Load("d:\\rootinfo.html") right after
                // LoadHtml, which would discard the downloaded page; it is treated as a
                // leftover debugging line and not executed.
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(html);

                AppendOrganicResults(doc, resultLines);
                resultLines.Add(string.Format(PageFooterFormat, pageNumber));

                string pagePath = Path.Combine(OutputDirectory, "infolist_page" + pageNumber + ".html");
                File.WriteAllText(pagePath, html, Encoding.UTF8);

                AppendAdResults(doc, adLines);
                adLines.Add(string.Format(PageFooterFormat, pageNumber));
            }

            File.WriteAllLines(Path.Combine(OutputDirectory, "infolist.txt"), resultLines.ToArray(), Encoding.UTF8);
            File.WriteAllLines(Path.Combine(OutputDirectory, "infolist2.txt"), adLines.ToArray(), Encoding.UTF8);
        }

        /// <summary>Creates a client that authenticates with the default credentials.</summary>
        private static WebClientBD CreateClient()
        {
            WebClientBD client = new WebClientBD();
            client.Credentials = CredentialCache.DefaultCredentials;
            return client;
        }

        /// <summary>Appends one "title|link|content|cite" line per organic result on the page.</summary>
        private static void AppendOrganicResults(HtmlDocument doc, List<string> lines)
        {
            IList<HtmlNode> titleNodes = SelectOrEmpty(doc, string.Format(ResultXPathTemplate, "/h3/a[1]"));

            // NOTE(review): the 'c-span18 c-span-last' predicate had lost its "@class="
            // in the garbled source; restored to match the sibling predicates - confirm.
            // The robots.txt alternative matches on the token "robots.txt" because the
            // source literal contained an apostrophe inside a single-quoted XPath string.
            string contentXPath = string.Join("|", new[]
            {
                string.Format(ResultXPathTemplate, "//div[@class='c-abstract']"),
                string.Format(ResultXPathTemplate, "//div[@class='c-span18 c-span-last']/p[1]"),
                string.Format(ResultXPathTemplate, "//div[@class='c-offset']"),
                string.Format(ResultXPathTemplate, "//div[@class='op_dict_content']"),
                string.Format(ResultXPathTemplate, "//p[contains(text(),'robots.txt')]")
            });
            string citeXPath =
                string.Format(ResultXPathTemplate, "//span[@class='c-showurl']") + "|" +
                string.Format(ResultXPathTemplate, "//a[@class='c-showurl']");

            List<string> contents = SelectOrEmpty(doc, contentXPath)
                .Select(node => CleanText(node.InnerText))
                .ToList();
            List<string> cites = SelectOrEmpty(doc, citeXPath)
                .Select(node => CleanText(node.InnerText))
                .ToList();

            if (titleNodes.Count != cites.Count)
            {
                InsertLatestInfoCite(titleNodes, cites);
            }

            for (int j = 0; j < titleNodes.Count; j++)
            {
                string title = CleanText(titleNodes[j].InnerText);
                string link = CleanText(titleNodes[j].GetAttributeValue("href", ""));
                lines.Add(title + "|" + link + "|" + ItemOrEmpty(contents, j) + "|" + ItemOrEmpty(cites, j) + "\t");
            }
        }

        /// <summary>
        /// Every result has a title, but the "latest info" block has no cite element,
        /// so the cite list is one short. Adds a placeholder cite at that block's
        /// position to keep titles and cites aligned.
        /// </summary>
        private static void InsertLatestInfoCite(IList<HtmlNode> titleNodes, List<string> cites)
        {
            // If the first title without a cite is the "latest info" block, it is the
            // last result and the placeholder simply goes at the end.
            int firstUncited = cites.Count;
            if (firstUncited < titleNodes.Count &&
                titleNodes[firstUncited].InnerText.Contains(LatestInfoMarker))
            {
                cites.Add(LatestInfoCite);
                return;
            }

            for (int i = 0; i < titleNodes.Count; i++)
            {
                if (titleNodes[i].InnerText.Contains(LatestInfoMarker))
                {
                    // Clamp so a short cite list cannot make Insert throw.
                    cites.Insert(Math.Min(i, cites.Count), LatestInfoCite);
                }
            }
        }

        /// <summary>Appends one "title|link|content|cite" line per ad on the page.</summary>
        private static void AppendAdResults(HtmlDocument doc, List<string> lines)
        {
            IList<HtmlNode> titleNodes = SelectOrEmpty(
                doc, string.Format(AdXPathTemplate, "/div[1]/h3/a[1]", "/div[1]/h3/a[1]"));
            IList<HtmlNode> contentNodes = SelectOrEmpty(
                doc, string.Format(AdXPathTemplate, "/div[2]", "/div[2]"));
            IList<HtmlNode> citeNodes = SelectOrEmpty(
                doc, string.Format(AdXPathTemplate, "/div[2]//a/span[1]", "/div[3]/a/span"));

            for (int i = 0; i < titleNodes.Count; i++)
            {
                string title = CleanText(titleNodes[i].InnerText);
                string link = CleanText(titleNodes[i].GetAttributeValue("href", ""));
                string content = i < contentNodes.Count ? CleanText(contentNodes[i].InnerText) : string.Empty;
                string cite = i < citeNodes.Count ? CleanText(citeNodes[i].InnerText) : string.Empty;
                lines.Add(title + "|" + link + "|" + content + "|" + cite + "\t");
            }
        }

        /// <summary>Runs an XPath query and returns an empty list instead of null when nothing matches.</summary>
        private static IList<HtmlNode> SelectOrEmpty(HtmlDocument doc, string xpath)
        {
            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
            if (nodes == null)
            {
                return new HtmlNode[0];
            }

            return nodes;
        }

        /// <summary>
        /// Trims the text and removes spaces and newlines.
        /// NOTE(review): "&amp;nbsp;" removal is a guess at one of the garbled Replace
        /// arguments in the source (one of them was an empty string, which throws).
        /// </summary>
        private static string CleanText(string text)
        {
            if (string.IsNullOrEmpty(text))
            {
                return string.Empty;
            }

            return text.Trim()
                .Replace("&nbsp;", string.Empty)
                .Replace("\n", string.Empty)
                .Replace(" ", string.Empty);
        }

        /// <summary>Returns <c>items[index]</c>, or an empty string when the index is out of range.</summary>
        private static string ItemOrEmpty(IList<string> items, int index)
        {
            return index >= 0 && index < items.Count ? items[index] : string.Empty;
        }
    }
}
// C#: Baidu search results XPath analysis