C#: XPath analysis of Baidu search results

Source: Internet
Author: User
Tags: xpath

using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Net;
using System.Text;
using System.Threading.Tasks;
using HtmlAgilityPack;

namespace XPathGet
{
    /// <summary>
    /// Console tool that downloads several Baidu result pages for a keyword,
    /// extracts title / link / abstract / cite for organic and ad results via
    /// XPath, and writes them to text files on drive D:.
    /// </summary>
    class Program
    {
        /// <summary>
        /// Paging URL template: {0} is the keyword and {1} the zero-based page
        /// index. The literal "0" after {1} multiplies the index by ten
        /// (presumably Baidu's result offset — confirm).
        /// Variant without the trailing zero:
        /// http://www.baidu.com/s?wd={0}&amp;pn={1}&amp;oq={0}&amp;ie=utf-8&amp;usm=4
        /// </summary>
        private const string Address = "http://www.baidu.com/s?wd={0}&pn={1}0&oq={0}&ie=utf-8&usm=4";

        /// <summary>Number of result pages fetched by <see cref="Main"/>.</summary>
        private const int PageCount = 5;

        /// <summary>
        /// XPath template for ad blocks ({0}/{1} are per-field suffixes).
        /// The second alternative now starts with "//": without it the path was
        /// relative to the document root and could never match.
        /// </summary>
        private const string AdXPath =
            "//div[@id='content_left']//div[contains(@id,'300')]{0}" +
            "|//div[@id='content_left']//div[contains(@id,'400')]{1}";

        /// <summary>XPath template for organic result containers ({0} is a per-field suffix).</summary>
        private const string OtherXPath = "//div[@id='content_left']//div[contains(@class,'c-container')]{0}";

        // NOTE(review): these two literals look machine-translated; Baidu result
        // titles are presumably Chinese, so the marker likely needs the original
        // Chinese wording to ever match — confirm against the original article.
        private const string LatestInfoMarker = "Latest relevant information";
        private const string NewInfoCite = "new info";

        #region WebClient creation

        /// <summary>
        /// WebClient that disables automatic redirects, enables gzip/deflate
        /// decompression, sends a desktop Chrome User-Agent, and exposes the
        /// last <see cref="WebResponse"/> so callers can inspect status and headers.
        /// </summary>
        public class WebClientBD : System.Net.WebClient
        {
            /// <summary>Response of the most recent request, or null if none was received.</summary>
            public WebResponse Response { get; private set; }

            /// <summary>Creates the request and applies the HTTP-specific settings.</summary>
            protected override System.Net.WebRequest GetWebRequest(Uri address)
            {
                WebRequest baseRequest = base.GetWebRequest(address);
                HttpWebRequest request = baseRequest as HttpWebRequest;
                if (request == null)
                {
                    // Non-HTTP scheme: nothing to configure (the original dereferenced null here).
                    return baseRequest;
                }

                // Redirects are followed manually by LastUrl so the Location header stays visible.
                request.AllowAutoRedirect = false;
                request.AutomaticDecompression = DecompressionMethods.Deflate | DecompressionMethods.GZip;
                request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36";
                return request;
            }

            /// <summary>
            /// Captures the response. Stays best-effort like the original, but instead
            /// of silently swallowing every exception it keeps the error response
            /// (if any) so the status code can still be inspected.
            /// </summary>
            protected override WebResponse GetWebResponse(WebRequest request)
            {
                try
                {
                    this.Response = base.GetWebResponse(request);
                }
                catch (WebException ex)
                {
                    // ex.Response is null when no HTTP response was received at all.
                    this.Response = ex.Response;
                }

                return this.Response;
            }
        }

        /// <summary>
        /// Follows up to two HTTP 302 redirects starting at <paramref name="url"/>.
        /// </summary>
        /// <param name="url">Absolute URL to request.</param>
        /// <returns>
        /// The final target URL; null when the first response is not a 302 or the
        /// second response is neither 302 nor 200; or a string starting with
        /// "error:" when a request fails.
        /// </returns>
        public static string LastUrl(string url)
        {
            try
            {
                string redirectLocal;
                using (WebClientBD wc = new WebClientBD())
                {
                    wc.Credentials = CredentialCache.DefaultCredentials;
                    wc.DownloadData(url);

                    HttpWebResponse first = wc.Response as HttpWebResponse;
                    if (first == null)
                    {
                        return "error:no HTTP response received";
                    }

                    if (first.StatusCode != HttpStatusCode.Found)
                    {
                        return null;
                    }

                    string location = first.Headers["Location"];
                    if (string.IsNullOrEmpty(location))
                    {
                        return "error:redirect response has no Location header";
                    }

                    // Relative redirects are resolved against the Baidu host.
                    redirectLocal = location.StartsWith("http", StringComparison.OrdinalIgnoreCase)
                        ? location
                        : "http://www.baidu.com" + location;
                }

                using (WebClientBD wc = new WebClientBD())
                {
                    wc.Credentials = CredentialCache.DefaultCredentials;
                    wc.DownloadData(redirectLocal);

                    HttpWebResponse second = wc.Response as HttpWebResponse;
                    if (second == null)
                    {
                        return "error:no HTTP response received";
                    }

                    if (second.StatusCode == HttpStatusCode.Found)
                    {
                        return second.Headers["Location"];
                    }

                    return second.StatusCode == HttpStatusCode.OK ? redirectLocal : null;
                }
            }
            catch (Exception ex)
            {
                return "error:" + ex.Message;
            }
        }

        #endregion

        /// <summary>Downloads <paramref name="url"/> and decodes the body as UTF-8.</summary>
        /// <param name="url">Absolute URL to download.</param>
        /// <returns>The response body as text.</returns>
        public static string GetHtmlSource(string url)
        {
            using (WebClientBD wc = new WebClientBD())
            {
                wc.Credentials = CredentialCache.DefaultCredentials;

                // If the text comes out garbled, switch between "utf-8" and "GB2312".
                Encoding enc = Encoding.GetEncoding("utf-8");
                byte[] pageData = wc.DownloadData(url);
                return enc.GetString(pageData);
            }
        }

        /// <summary>
        /// Fetches <see cref="PageCount"/> result pages for a fixed keyword, saves each
        /// raw page to D:, and writes the extracted organic and ad rows to
        /// infolist.txt and infolist2.txt.
        /// </summary>
        static void Main(string[] args)
        {
            string key = "purchasing";
            List<string> otherLines = new List<string>();
            List<string> adLines = new List<string>();

            for (int pnIndex = 0; pnIndex < PageCount; pnIndex++)
            {
                // The keyword is URL-encoded so non-ASCII or reserved characters survive the query string.
                string pageUrl = string.Format(Address, Uri.EscapeDataString(key), pnIndex.ToString());
                string htmlPageSource = GetHtmlSource(pageUrl);

                // The original text also called doc.Load("d:\\rootinfo.html", Encoding.UTF8)
                // right after LoadHtml, which would discard the downloaded page. It looks like
                // a commented-out offline-debugging line and is omitted — NOTE(review): confirm.
                HtmlDocument doc = new HtmlDocument();
                doc.LoadHtml(htmlPageSource);

                string pageFooter = string.Format("above for the {0} page search results.", pnIndex + 1);

                otherLines.AddRange(BuildOtherLines(doc));
                otherLines.Add(pageFooter);

                string path = @"D:\\infolist_page" + (pnIndex + 1) + ".html";
                File.WriteAllText(path, htmlPageSource, Encoding.UTF8);

                adLines.AddRange(BuildAdLines(doc));
                adLines.Add(pageFooter);
            }

            File.WriteAllLines(@"D:\\infolist.txt", otherLines.ToArray(), Encoding.UTF8);
            File.WriteAllLines(@"D:\\infolist2.txt", adLines.ToArray(), Encoding.UTF8);
        }

        /// <summary>Builds one "title|link|content|cite" row per organic result on the page.</summary>
        private static List<string> BuildOtherLines(HtmlDocument doc)
        {
            // Title text and title link come from the same anchor, so it is selected once.
            List<HtmlNode> titleNodes = SelectOrEmpty(doc, string.Format(OtherXPath, "/h3/a[1]"));

            string contentXPath = string.Join("|", new[]
            {
                string.Format(OtherXPath, "//div[@class='c-abstract']"),
                // NOTE(review): the predicate is a bare string literal (always true);
                // @class='c-span18 c-span-last' was probably intended — confirm.
                string.Format(OtherXPath, "//div['c-span18 c-span-last']/p[1]"),
                string.Format(OtherXPath, "//div[@class='c-offset']"),
                string.Format(OtherXPath, "//div[@class='op_dict_content']"),
                // The source literal contained an apostrophe inside a single-quoted XPath
                // string, which is invalid XPath; matching on "robots.txt" keeps the intent.
                string.Format(OtherXPath, "//p[contains(text(),'robots.txt')]")
            });
            string citeXPath =
                string.Format(OtherXPath, "//span[@class='c-showurl']") + "|" +
                string.Format(OtherXPath, "//a[@class='c-showurl']");

            List<string> contents = SelectOrEmpty(doc, contentXPath).Select(n => Clean(n.InnerText)).ToList();
            List<string> cites = SelectOrEmpty(doc, citeXPath).Select(n => Clean(n.InnerText)).ToList();

            // Every result has a title, but "latest info" results have no cite;
            // pad the cite list so the two stay aligned by index.
            AlignCites(titleNodes.Select(n => n.InnerText).ToList(), cites);

            List<string> lines = new List<string>(titleNodes.Count);
            for (int j = 0; j < titleNodes.Count; j++)
            {
                lines.Add(
                    Clean(titleNodes[j].InnerText) + "|" +
                    Clean(titleNodes[j].GetAttributeValue("href", "")) + "|" +
                    At(contents, j) + "|" +
                    At(cites, j) + "\t");
            }

            return lines;
        }

        /// <summary>Builds one "title|link|content|cite" row per ad result on the page.</summary>
        private static List<string> BuildAdLines(HtmlDocument doc)
        {
            List<HtmlNode> titleNodes = SelectOrEmpty(doc, string.Format(AdXPath, "/div[1]/h3/a[1]", "/div[1]/h3/a[1]"));
            List<string> contents = SelectOrEmpty(doc, string.Format(AdXPath, "/div[2]", "/div[2]"))
                .Select(n => Clean(n.InnerText)).ToList();
            List<string> cites = SelectOrEmpty(doc, string.Format(AdXPath, "/div[2]//a/span[1]", "/div[3]/a/span"))
                .Select(n => Clean(n.InnerText)).ToList();

            List<string> lines = new List<string>(titleNodes.Count);
            for (int j = 0; j < titleNodes.Count; j++)
            {
                lines.Add(
                    Clean(titleNodes[j].InnerText) + "|" +
                    Clean(titleNodes[j].GetAttributeValue("href", "")) + "|" +
                    At(contents, j) + "|" +
                    At(cites, j) + "\t");
            }

            return lines;
        }

        /// <summary>
        /// Inserts <see cref="NewInfoCite"/> placeholders so <paramref name="cites"/> lines up
        /// with <paramref name="rawTitles"/>. Raw (un-cleaned) titles are used because
        /// cleaning strips spaces, which the marker itself may contain.
        /// </summary>
        private static void AlignCites(IList<string> rawTitles, List<string> cites)
        {
            if (rawTitles.Count == cites.Count)
            {
                return;
            }

            // The title at the first index without a cite is the marker: append the placeholder.
            if (cites.Count < rawTitles.Count && rawTitles[cites.Count].Contains(LatestInfoMarker))
            {
                cites.Add(NewInfoCite);
                return;
            }

            for (int i = 0; i < rawTitles.Count; i++)
            {
                if (rawTitles[i].Contains(LatestInfoMarker))
                {
                    // Clamp: List.Insert throws when the index is past the end.
                    cites.Insert(Math.Min(i, cites.Count), NewInfoCite);
                }
            }
        }

        /// <summary>Runs an XPath query; returns an empty list where SelectNodes would return null.</summary>
        private static List<HtmlNode> SelectOrEmpty(HtmlDocument doc, string xpath)
        {
            HtmlNodeCollection nodes = doc.DocumentNode.SelectNodes(xpath);
            return nodes == null ? new List<HtmlNode>() : nodes.ToList();
        }

        /// <summary>Trims the text and removes spaces, newlines and non-breaking-space entities.</summary>
        private static string Clean(string text)
        {
            if (text == null)
            {
                return string.Empty;
            }

            // NOTE(review): the third Replace in the source had an empty search string, which
            // throws ArgumentException; "&nbsp;" is presumably what the page rendering ate — confirm.
            return text.Trim()
                .Replace(" ", string.Empty)
                .Replace("\n", string.Empty)
                .Replace("&nbsp;", string.Empty);
        }

        /// <summary>Returns items[index], or an empty string when the index is out of range.</summary>
        private static string At(IList<string> items, int index)
        {
            return index >= 0 && index < items.Count ? items[index] : string.Empty;
        }
    }
}

  

C#: XPath analysis of Baidu search results

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find any of the content on this page confusing, please email us, and we will handle the problem within 5 days of receiving your email.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support · 6 Free Tickets per Quarter · Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.