Simple crawling and presentation of mini-programs (applets)

Source: Internet
Author: User

Simple crawling and presentation of mini-programs (applets)

Preface: to increase website traffic with an applet navigation page, I looked for a data source and found www.xcxdh666.com, an applet navigation website.

 

Analyzing the web page

1. The website actually loads its data with asynchronous, paged requests, so there is no need to parse the HTML with XPath; we can analyze the request URL directly.


 

2. Click "load more" and inspect the request that is sent. It uses the pageNum and category parameters.


 

3. Therefore, request the URL directly with those parameters; the response is a JSON result that can be deserialized.


 

Writing the code

1. First, create the types that receive the response.

/// <summary>
/// One applet entry of the crawled page-list response (an element of <c>Result.dataList</c>).
/// NOTE(review): the lower-case property names presumably mirror the JSON field names;
/// confirm against the deserializer before renaming any of them.
/// </summary>
public class XcxApplet
{
    /// <summary>Numeric identifier of the entry.</summary>
    public int id { get; set; }
    /// <summary>Category label; the crawler substitutes "Other" when it is null or empty.</summary>
    public string categoryName { get; set; }
    /// <summary>Applet name; the crawler uses it to detect duplicates.</summary>
    public string name { get; set; }
    /// <summary>Image path that the crawler appends to the remote image host (presumably the scan/QR code).</summary>
    public string saomaUrl { get; set; }
    /// <summary>Text that the crawler stores as the applet's summary.</summary>
    public string sum { get; set; }
    /// <summary>Logo image path that the crawler appends to the remote image host.</summary>
    public string logoUrl { get; set; }
}

    public class Result
    {
        public List <XcxApplet> dataList {get; set;}
        public string category {get; set;}
        public int status {get; set;}
        public int pageNum {get; set;}
    }


2. Wrap the page request in a helper method
  

   public static string GetPostPage (this string posturl, string postData)
            {
                Encoding encoding = Encoding.UTF8;
                byte [] data = null;
                if (! string.IsNullOrEmpty (postData)) data = encoding.GetBytes (postData);
                try
                {
                    // Setting parameters
                    var request = WebRequest.Create (posturl) as HttpWebRequest;
                    if (request == null) return string.Empty;
                    var cookieContainer = new CookieContainer ();
                    request.CookieContainer = cookieContainer;
                    request.AllowAutoRedirect = true;
                    request.Method = "POST";
                    request.ContentType = "application / x-www-form-urlencoded";
                    if (data! = null)
                    {
                        request.ContentLength = data.Length;
                        Stream outstream = request.GetRequestStream ();
                        outstream.Write (data, 0, data.Length);
                        outstream.Close ();
                    }
                    // Send request and get corresponding response data
                    var response = request.GetResponse () as HttpWebResponse;
                    if (response == null) return string.Empty;

                    // It is not until the request.GetResponse () program starts sending Post requests to the target webpage
                    Stream instream = response.GetResponseStream ();
                    if (instream == null) return string.Empty;
                    var sr = new StreamReader (instream, encoding);
                    // Return to result page (html) code
                    string content = sr.ReadToEnd ();
                    string err = string.Empty;
                    //Response.Write(content);
                    return content;
                }
                catch (Exception ex)
                {
                    string err = ex.Message;
                    return string.Empty;
                }
            }
3. Image URL handling: the idea is to download each returned image URL and either save it locally or upload it to your own image server. I use Qiniu Cloud to store the images here; you can change this to download the image to local storage and return the local URL instead.
  

/// <summary>
/// Downloads the image at <paramref name="imgurl"/> and uploads it to the Qiniu bucket,
/// using the last path segment of the URL as the storage key.
/// </summary>
/// <param name="imgurl">URL of the image to copy.</param>
/// <returns>
/// The URL of the stored image under <c>http://img.siyouku.cn/</c>, or an empty string when
/// <paramref name="imgurl"/> is null or empty, the download or upload fails, or the upload
/// response carries no key.
/// </returns>
public string QiniuUplod(string imgurl)
{
    if (string.IsNullOrEmpty(imgurl))
    {
        return "";
    }

    var accessKey = "Your accesskey";
    var secretKey = "Your secretkey";

    // The Mac is required to generate the upload token.
    // In practice, supply your own AccessKey and SecretKey.
    Mac mac = new Mac(accessKey, secretKey);
    string bucket = "siyouku";
    // Everything after the last '/' (the whole string when there is no '/').
    string saveKey = imgurl.Substring(imgurl.LastIndexOf('/') + 1);

    // Make sure the AccessKey and bucket are correct before use; the original author notes
    // that this call otherwise throws (e.g. code 612 / 631 errors).
    Qiniu.Common.Config.AutoZone(accessKey, bucket, false);

    // Upload policy, see https://developer.qiniu.com/kodo/manual/put-policy
    PutPolicy putPolicy = new PutPolicy();
    // For "overwrite" uploads (replace an existing file with the same key) the scope would be
    // "bucket:key" instead:
    //   putPolicy.Scope = bucket + ":" + saveKey;
    putPolicy.Scope = bucket;
    // Validity period of the upload policy (and therefore of the generated token).
    putPolicy.SetExpires(3600);
    // Optionally delete the file automatically N days after upload (default: never deleted):
    //   putPolicy.DeleteAfterDays = 1;

    // Generate the upload token, see https://developer.qiniu.com/kodo/manual/upload-token
    string jstr = putPolicy.ToJsonString();
    string token = Auth.CreateUploadToken(mac, jstr);

    try
    {
        var wReq = System.Net.WebRequest.Create(imgurl) as System.Net.HttpWebRequest;
        if (wReq == null)
        {
            return "";
        }

        // Dispose the response as well as its stream so the connection is released.
        using (var resp = wReq.GetResponse() as System.Net.HttpWebResponse)
        {
            if (resp == null)
            {
                return "";
            }

            using (var stream = resp.GetResponseStream())
            {
                // Do not use UploadManager.UploadStream here: the network stream is not
                // seekable (Stream.Length is unavailable). Use FormUploader or ResumableUploader.
                FormUploader fu = new FormUploader();
                var result = fu.UploadStream(stream, saveKey, token);
                var x = Newtonsoft.Json.JsonConvert.DeserializeObject<QiniuResult>(result.Text);
                if (x?.key == null)
                {
                    // No key in the response: the upload did not succeed, so do not
                    // hand back a URL that points at nothing.
                    return "";
                }

                return $"http://img.siyouku.cn/{x.key}";
            }
        }
    }
    catch (Exception ex)
    {
        // Deliberate best-effort: callers treat "" as "no image". Trace the failure for diagnosis.
        System.Diagnostics.Trace.TraceWarning("QiniuUplod failed for {0}: {1}", imgurl, ex.Message);
        return "";
    }
}


4. Finally, the main method that drives the requests
 

/// <summary>
/// Crawls the paged applet list at https://www.xcxdh666.com/pageList.htm and stores every applet
/// whose name is not yet in <c>Db.Applet</c>, copying its scan-code and logo images via
/// <c>QiniuUplod</c>. Changes are saved once per page.
/// </summary>
/// <returns>A content result reporting the elapsed time in milliseconds.</returns>
public ActionResult GetxcxList()
{
    // Number of list pages requested (pageNum 0..53).
    const int pageCount = 54;

    Stopwatch watch = Stopwatch.StartNew(); // measures how long the crawl takes

    // Names already handled in this run. The database check alone cannot see rows that were
    // added but not yet saved, so a name repeated within one page would be inserted twice.
    var seenNames = new System.Collections.Generic.HashSet<string>();

    for (int j = 0; j < pageCount; j++)
    {
        string url = $"https://www.xcxdh666.com/pageList.htm?pageNum={j}";

        // GetPostPage reports failure as an empty string, not null.
        var str = url.GetPostPage(null);
        if (string.IsNullOrEmpty(str))
        {
            continue;
        }

        var result = str.JsonConvert<Result>(); // string deserialization extension method
        if (result == null || result.dataList == null)
        {
            continue;
        }

        foreach (var item in result.dataList)
        {
            if (item == null)
            {
                continue;
            }

            string name = item.name;
            // Skip duplicates, both within this run and against rows already stored.
            if (!seenNames.Add(name) || Db.Applet.Any(a => a.Name == name))
            {
                continue;
            }

            var applet = new Applet()
            {
                CategoryName = string.IsNullOrEmpty(item.categoryName) ? "Other" : item.categoryName,
                Name = name,
                SaomiaoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.saomaUrl}"),
                Summary = item.sum,
                LogoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.logoUrl}"),
                SortNum = j,
                CreateUser = "wenqing",
                CreateTime = DateTime.Now
            };
            Db.Applet.Add(applet);
        }

        Db.SaveChanges();
    }

    watch.Stop();
    return Content("Crawl completed! This request takes a total of time:" + watch.ElapsedMilliseconds);
}
    }


 

OK, that completes the crawl.

The result is displayed at http://siyouku.cn/Applet

 

Blogger URL: http://www.siyouku.cn

Permanent address of this article (with more detail): http://siyouku.cn/article/6806.html

Related Article

Contact Us

The content of this page comes from the Internet and does not represent Alibaba Cloud's opinion; the products and services mentioned on this page have no relationship with Alibaba Cloud. If you find the content of this page confusing, please write us an email and we will handle the problem within 5 days of receiving it.

If you find any instances of plagiarism from the community, please send an email to: info-contact@alibabacloud.com and provide relevant evidence. A staff member will contact you within 5 working days.

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

  • Sales Support

    1 on 1 presale consultation

  • After-Sales Support

    24/7 Technical Support 6 Free Tickets per Quarter Faster Response

  • Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.