Simple crawling and display of mini-programs (capturing mini-program listings)
Preface: I wanted to add a mini-program navigation page to increase my website's traffic, so I found the mini-program navigation site www.xcxdh666.com to crawl.
Analyzing the web page: 1. Inspection shows that the site actually loads its data through asynchronous, paged requests, so there is no need to parse the HTML with XPath; we can analyze the request URL directly.
2. Click "load more" and look at the request that is sent. It uses the pageNum and category parameters.
3. Therefore, request that URL directly with the parameters included; analysis shows that the response is a JSON result.
Writing the code: 1. First, create the types that receive the JSON response.
/// <summary>
/// A single mini-program entry, used as the element type of <c>Result.dataList</c>.
/// NOTE(review): the lower-case property names presumably mirror the JSON keys
/// returned by the site; confirm before renaming any of them.
/// </summary>
public class XcxApplet
{
    /// <summary>Numeric identifier of the entry.</summary>
    public int id { get; set; }

    /// <summary>Category label; may be null or empty (callers substitute a default).</summary>
    public string categoryName { get; set; }

    /// <summary>Display name; used by the crawler to detect duplicates.</summary>
    public string name { get; set; }

    /// <summary>Relative path of the scan-code image, appended to the image host by the crawler.</summary>
    public string saomaUrl { get; set; }

    /// <summary>Summary text of the entry.</summary>
    public string sum { get; set; }

    /// <summary>Relative path of the logo image, appended to the image host by the crawler.</summary>
    public string logoUrl { get; set; }
}
/// <summary>
/// Envelope for one page of the paged JSON response; the crawler deserializes
/// each response body into this type.
/// </summary>
public class Result
{
    /// <summary>The entries contained in this page.</summary>
    public List<XcxApplet> dataList { get; set; }

    /// <summary>Category value carried by the response.</summary>
    public string category { get; set; }

    /// <summary>Status value carried by the response.</summary>
    public int status { get; set; }

    /// <summary>Page number carried by the response.</summary>
    public int pageNum { get; set; }
}
2. Wrap the page request in a helper method.
/// <summary>
/// Sends an HTTP POST to <paramref name="posturl"/> and returns the response body as text.
/// </summary>
/// <param name="posturl">The URL to post to.</param>
/// <param name="postData">
/// Form-encoded request body. When null or empty, no body is written.
/// </param>
/// <returns>
/// The response body decoded as UTF-8, or <see cref="string.Empty"/> when the request
/// cannot be created, no response stream is available, or any exception is thrown.
/// This method never returns null.
/// </returns>
public static string GetPostPage(this string posturl, string postData)
{
    Encoding encoding = Encoding.UTF8;
    byte[] data = string.IsNullOrEmpty(postData) ? null : encoding.GetBytes(postData);

    try
    {
        // Set up the request.
        var request = WebRequest.Create(posturl) as HttpWebRequest;
        if (request == null)
        {
            return string.Empty;
        }

        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        // The media type must not contain spaces around the slash.
        request.ContentType = "application/x-www-form-urlencoded";

        if (data != null)
        {
            request.ContentLength = data.Length;
            using (Stream outstream = request.GetRequestStream())
            {
                outstream.Write(data, 0, data.Length);
            }
        }

        // Send the request and read the response. Every disposable is wrapped in
        // `using` so the connection is released even if reading throws.
        using (var response = request.GetResponse() as HttpWebResponse)
        {
            if (response == null)
            {
                return string.Empty;
            }

            using (Stream instream = response.GetResponseStream())
            {
                if (instream == null)
                {
                    return string.Empty;
                }

                using (var reader = new StreamReader(instream, encoding))
                {
                    // Return the body of the result page.
                    return reader.ReadToEnd();
                }
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort contract: callers get an empty string on any failure.
        // Record the reason instead of dropping it silently.
        System.Diagnostics.Trace.TraceWarning($"GetPostPage: POST to {posturl} failed: {ex.Message}");
        return string.Empty;
    }
}
3. Image URL handling: the idea is to download the image behind each returned URL and either save it locally or upload it to an image server. Here I use Qiniu Cloud to store the images; you can change this to download the file locally and return the local URL instead.
/// <summary>
/// Downloads the image at <paramref name="imgurl"/> and uploads it to the Qiniu bucket,
/// using the last path segment of the URL as the object key.
/// </summary>
/// <param name="imgurl">URL of the image to copy.</param>
/// <returns>
/// The public URL of the uploaded image (<c>http://img.siyouku.cn/{key}</c>), or an empty
/// string when <paramref name="imgurl"/> is null or empty, or when the download or upload fails.
/// </returns>
public string QiniuUplod(string imgurl)
{
    // Nothing to fetch; report failure the same way the catch block below does
    // instead of throwing NullReferenceException from Substring.
    if (string.IsNullOrEmpty(imgurl))
    {
        return "";
    }

    var accessKey = "Your accesskey";
    var secretKey = "Your secretkey";
    // This Mac is required to generate the (upload) credentials.
    // In practice, set your own AccessKey and SecretKey.
    Mac mac = new Mac(accessKey, secretKey);
    string bucket = "siyouku";
    // Everything after the last '/' becomes the object key.
    string saveKey = imgurl.Substring(imgurl.LastIndexOf('/') + 1);
    // Make sure AK and BUCKET are correct before use, otherwise this call throws
    // (for example code 612 / 631 errors).
    Qiniu.Common.Config.AutoZone(accessKey, bucket, false);
    // Upload policy, see https://developer.qiniu.com/kodo/manual/put-policy
    PutPolicy putPolicy = new PutPolicy();
    // Scope is the bucket alone. To overwrite an existing file with the same key,
    // the scope would have to be bucket + ":" + saveKey instead.
    putPolicy.Scope = bucket;
    // Validity period of the upload policy (and of the generated token), in seconds.
    putPolicy.SetExpires(3600);
    // Generate the upload token, see https://developer.qiniu.com/kodo/manual/upload-token
    string jstr = putPolicy.ToJsonString();
    string token = Auth.CreateUploadToken(mac, jstr);

    try
    {
        var wReq = System.Net.WebRequest.Create(imgurl) as System.Net.HttpWebRequest;
        if (wReq == null)
        {
            return "";
        }

        using (var resp = wReq.GetResponse() as System.Net.HttpWebResponse)
        {
            if (resp == null)
            {
                return "";
            }

            using (var stream = resp.GetResponseStream())
            {
                // Do not use UploadManager.UploadStream here: this stream does not support
                // seeking (Stream.Length is unavailable). Use FormUploader or ResumableUploader.
                FormUploader fu = new FormUploader();
                var result = fu.UploadStream(stream, saveKey, token);
                var x = Newtonsoft.Json.JsonConvert.DeserializeObject<QiniuResult>(result.Text);
                if (x == null)
                {
                    return "";
                }

                return $"http://img.siyouku.cn/{x.key}";
            }
        }
    }
    catch (Exception ex)
    {
        // Best-effort: a failed image copy yields an empty URL, but leave a trace of why.
        System.Diagnostics.Trace.TraceWarning($"QiniuUplod: copying {imgurl} failed: {ex.Message}");
        return "";
    }
}
4. Finally, the main method that performs the crawl.
/// <summary>
/// Crawls every page of the mini-program list, copies each new entry's images to
/// Qiniu, and stores the entry in <c>Db.Applet</c>. Entries whose name already
/// exists are skipped.
/// </summary>
/// <returns>A content result reporting the elapsed time in milliseconds.</returns>
public ActionResult GetxcxList()
{
    // Number of pages requested (pageNum 0 .. pageCount - 1).
    const int pageCount = 54;

    Stopwatch watch = Stopwatch.StartNew(); // Measures how long the crawl takes.

    // Names queued for insertion during this run. Db.Applet.Any only sees saved rows,
    // so without this set two same-named items on one page would both be inserted.
    var queuedNames = new System.Collections.Generic.HashSet<string>();

    for (int j = 0; j < pageCount; j++)
    {
        // e.g. https://www.xcxdh666.com/pageList.htm?pageNum=0
        string url = $"https://www.xcxdh666.com/pageList.htm?pageNum={j}";
        var str = url.GetPostPage(null); // HttpWebRequest-based page request

        // GetPostPage reports failure with an empty string, not null.
        if (string.IsNullOrEmpty(str))
        {
            continue;
        }

        var result = str.JsonConvert<Result>(); // Deserialization extension method on string
        if (result == null || result.dataList == null)
        {
            // Skip this page rather than re-processing the previous page's data.
            continue;
        }

        foreach (var item in result.dataList)
        {
            if (item == null || queuedNames.Contains(item.name))
            {
                continue;
            }

            // Skip entries that were stored by an earlier page or an earlier run.
            if (Db.Applet.Any(existing => existing.Name == item.name))
            {
                continue;
            }

            var applet = new Applet()
            {
                CategoryName = string.IsNullOrEmpty(item.categoryName) ? "Other" : item.categoryName,
                Name = item.name,
                SaomiaoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.saomaUrl}"),
                Summary = item.sum,
                LogoUrl = QiniuUplod($"http://img.xcxdh666.com/wxappnav/{item.logoUrl}"),
                SortNum = j,
                CreateUser = "wenqing",
                CreateTime = DateTime.Now
            };
            Db.Applet.Add(applet);
            queuedNames.Add(item.name);
        }

        // Persist once per page so later pages see these rows in the duplicate check.
        Db.SaveChanges();
    }

    watch.Stop();
    return Content("Crawl completed! This request takes a total of time:" + watch.ElapsedMilliseconds);
}
}
OK, that completes the crawl.
The results are displayed at http://siyouku.cn/Applet
Author's blog: http://www.siyouku.cn
Permanent, up-to-date link to this article: http://siyouku.cn/article/6806.html