Development of View two officially started, and I ran into trouble right at the beginning: the statistics module needs to extract the search keyword from the referring search link, so at first I matched the keyword part out with a regular expression and then used Uri.UnescapeDataString() to convert the URL encoding into text.
At first everything went very smoothly, but as soon as I added Baidu and NetEase search to the rules, errors began to appear. I guessed it was related to encoding, because Google has always used UTF-8 while domestic sites mostly tend to use GB2312, so I was not very worried about this problem.
Little did I know how serious this problem was; it almost made me lose confidence in researching program algorithms. Today I heard someone say that I have a lot of guts: I dared to remove all the linked events, and if the same event existed elsewhere there would be serious problems. I have always thought of myself as a fairly cautious person who guards against trouble in advance; before I did that bold thing I carefully asked whether the same event existed, and I only began after receiving a definite reply. Although there is not much code, I dare say that the errors that can be anticipated, and even those that cannot, I have basically handled.
Back to the point: a search on the internet showed that there is really too little information about this for ASP.NET. I then looked at the ASP and PHP approaches and found that they are very complex and error-prone, some even building a GB2312-to-UTF-8 mapping table.
I firmly believed that Microsoft would provide a perfect solution, so I began checking MSDN, starting from the Encoding class, and found that it has a method for converting between encodings. But another problem arose: what I get is a URL-encoded string, and I do not know whether it is UTF-8 or GB2312, so I cannot first convert it to text with UnescapeDataString and then transcode it, because that immediately produces an error.
So I started to analyze the encoding principles of GB2312 and UTF-8 and found how they relate in .NET: one encodes a character in two bytes, the other in three. I therefore tried decomposing the GB2312 URL encoding, parsing every two values as hexadecimal (base 16) into a byte array, converting that to UTF-8 bytes with the normal transcoding, then into chars, and finally into normal text.
The result came immediately: incredibly, it worked. I then kept up the effort and wrote a method to determine the search engine type, and with that everything was solved.
Here is the code; I hope it is useful to friends who need it:
Program code
using System;
using System.Collections.Generic;
using System.Configuration;
using System.Data;
using System.Text;
using System.Text.RegularExpressions;
using System.Web;
using System.Web.Security;
using System.Web.UI;
using System.Web.UI.HtmlControls;
using System.Web.UI.WebControls;
using System.Web.UI.WebControls.WebParts;
/// <summary>
/// Search engine processing: identifies the search engine behind a referrer URL,
/// extracts and decodes the search keyword, and recognises crawler user agents.
/// </summary>
/// <remarks>
/// An instance keeps the result of the most recent detection in
/// <see cref="Enginename"/>, <see cref="Coding"/> and <see cref="Regexword"/>;
/// every call to <see cref="Engineregex"/> or <see cref="Searchkey"/> overwrites them.
/// </remarks>
public class Exjudgesystem
{
    public Exjudgesystem()
    {
    }

    #region Initialization Variable
    // Search engine features: { name looked for in the URL, query-string encoding, keyword parameter name }.
    // Order matters: the first entry whose name occurs in the URL wins.
    private static readonly string[][] _enginers = new string[][] {
        new string[]{"Google", "UTF8", "Q"},
        new string[]{"Baidu", "gb2312", "WD"},
        new string[]{"Yahoo", "UTF8", "P"},
        new string[]{"Yisou", "UTF8", "search"},
        new string[]{"Live", "UTF8", "Q"},
        new string[]{"Tom", "gb2312", "word"},
        new string[]{"163", "gb2312", "Q"},
        new string[]{"Iask", "gb2312", "K"},
        new string[]{"Soso", "gb2312", "W"},
        new string[]{"Sogou", "gb2312", "Query"},
        new string[]{"Zhongsou", "gb2312", "W"},
        new string[]{"3721", "gb2312", "P"},
        new string[]{"Openfind", "UTF8", "Q"},
        new string[]{"AllTheWeb", "UTF8", "Q"},
        new string[]{"Lycos", "UTF8", "Query"},
        new string[]{"Onseek", "UTF8", "Q"}
    };

    // Substrings that identify a crawler in a user-agent string.
    private static readonly string[] _botlist = new string[] {
        "Google", "Baidu", "MSN", "Yahoo", "Tmcrawler", "Iask", "Sogou"
    };

    // Search engine name
    private string _enginename = "";

    /// <summary>Name of the engine found by the last detection, or "" when none matched.</summary>
    public string Enginename
    {
        get
        {
            return _enginename;
        }
    }

    // Search engine coding
    private string _coding = "UTF8";

    /// <summary>Query-string encoding of the detected engine ("UTF8" or "gb2312").</summary>
    public string Coding
    {
        get
        {
            return _coding;
        }
    }

    // Search engine keyword query parameter name
    private string _regexword = "";

    /// <summary>Name of the query parameter that carries the keyword for the detected engine.</summary>
    public string Regexword
    {
        get
        {
            return _regexword;
        }
    }

    // Pattern built by the last successful detection; "" when no engine matched.
    private string _regex = "";
    #endregion

    #region Search engine keyword
    /// <summary>
    /// Detects the search engine referenced by <paramref name="myString"/> and builds the
    /// regular expression used to pull the keyword out of the URL.
    /// </summary>
    /// <param name="myString">Referrer URL (any casing). May be null or empty.</param>
    public void Engineregex(string myString)
    {
        // Start from a clean state: the pattern must not accumulate across calls and a
        // failed detection must not report the engine found by an earlier call.
        _enginename = "";
        _coding = "UTF8";
        _regexword = "";
        _regex = "";

        string[] engine = FindEngine(myString);
        if (engine == null)
        {
            return;
        }

        _enginename = engine[0];
        _coding = engine[1];
        _regexword = engine[2];
        // "<engine>." somewhere, then "?", "/" or "&" followed by "<param>=" or "<param>:",
        // then the keyword up to the next "&". Table values are escaped so they are always
        // taken literally.
        _regex = "(" + Regex.Escape(_enginename) + @"\.+.*[?/&]" + Regex.Escape(_regexword) + @"[=:])(?<key>[^&]*)";
    }

    /// <summary>
    /// Extracts the decoded search keyword from a search-engine referrer URL.
    /// </summary>
    /// <param name="myString">Referrer URL.</param>
    /// <returns>
    /// The decoded keyword ("" when the engine is recognised but the keyword parameter is
    /// absent); the input unchanged when no known engine is recognised.
    /// </returns>
    public string Searchkey(string myString)
    {
        Engineregex(myString);
        if (_enginename.Length == 0)
        {
            return myString;
        }

        Match match = Regex.Match(myString, _regex, RegexOptions.IgnoreCase);
        // "+" stands for a space in a query string; translate it before percent-decoding
        // so that an escaped "%2B" still yields a literal plus sign.
        string keyword = match.Groups["key"].Value.Replace("+", " ");

        if (string.Equals(_coding, "gb2312", StringComparison.OrdinalIgnoreCase))
        {
            return getutf8string(keyword);
        }
        return Uri.UnescapeDataString(keyword);
    }

    /// <summary>
    /// Decodes a percent-encoded string whose escaped bytes are GB2312 text.
    /// </summary>
    /// <param name="myString">Percent-encoded text, e.g. "%D6%D0%CE%C4".</param>
    /// <returns>The decoded text; the input unchanged when it is null, empty or contains no "%".</returns>
    /// <remarks>
    /// NOTE: Encoding.GetEncoding("GB2312") is built in on .NET Framework; on .NET Core / .NET 5+
    /// the code-page provider has to be registered by the host application first.
    /// </remarks>
    public string getutf8string(string myString)
    {
        if (string.IsNullOrEmpty(myString) || myString.IndexOf('%') < 0)
        {
            // Nothing to decode, so the encoding is not even looked up.
            return myString;
        }
        return DecodePercentEncoded(myString, Encoding.GetEncoding("GB2312"));
    }

    /// <summary>
    /// Decodes one percent-encoded GB2312 word (for example "%D6%D0") to text.
    /// Kept for existing callers; it performs the same decoding as <see cref="getutf8string"/>.
    /// </summary>
    /// <param name="myString">Percent-encoded GB2312 bytes.</param>
    /// <returns>The decoded text.</returns>
    public string Gb2312toutf8(string myString)
    {
        return getutf8string(myString);
    }
    #endregion

    /// <summary>
    /// Judges whether a user-agent string belongs to a known search engine crawler.
    /// </summary>
    /// <param name="systeminfo">User-agent string. May be null or empty.</param>
    /// <returns>The matching crawler name, or the literal string "null" when none matches.</returns>
    public string Iscrawler(string systeminfo)
    {
        if (!string.IsNullOrEmpty(systeminfo))
        {
            foreach (string bot in _botlist)
            {
                // Ordinal comparison avoids culture-specific lower-casing surprises.
                if (systeminfo.IndexOf(bot, StringComparison.OrdinalIgnoreCase) >= 0)
                {
                    return bot;
                }
            }
        }
        return "null";
    }

    // Picks the engine entry for a URL: the part before the query string is preferred so
    // that a keyword such as "google" searched on another engine does not decide the match.
    private static string[] FindEngine(string url)
    {
        if (string.IsNullOrEmpty(url))
        {
            return null;
        }

        int queryStart = url.IndexOf('?');
        if (queryStart > 0)
        {
            string[] hit = FirstEngineIn(url.Substring(0, queryStart));
            if (hit != null)
            {
                return hit;
            }
        }
        // Fall back to the whole string, which is what was searched historically.
        return FirstEngineIn(url);
    }

    // Returns the first table entry whose name occurs in the text (case-insensitive), or null.
    private static string[] FirstEngineIn(string text)
    {
        foreach (string[] engine in _enginers)
        {
            if (text.IndexOf(engine[0], StringComparison.OrdinalIgnoreCase) >= 0)
            {
                return engine;
            }
        }
        return null;
    }

    // Turns every valid "%XX" escape into one byte, passes other characters through in the
    // given encoding, and decodes the resulting byte sequence as a whole. Decoding all bytes
    // together handles single-byte and double-byte characters in any mix.
    private static string DecodePercentEncoded(string text, Encoding encoding)
    {
        List<byte> buffer = new List<byte>(text.Length);
        int i = 0;
        while (i < text.Length)
        {
            char c = text[i];
            if (c == '%' && i + 2 < text.Length + 0
                && Uri.IsHexDigit(text[i + 1]) && Uri.IsHexDigit(text[i + 2]))
            {
                buffer.Add((byte)((Uri.FromHex(text[i + 1]) << 4) | Uri.FromHex(text[i + 2])));
                i += 3;
            }
            else if (c < 0x80)
            {
                // Plain ASCII (including a stray "%") is copied as is.
                buffer.Add((byte)c);
                i += 1;
            }
            else
            {
                // Unescaped non-ASCII text: keep surrogate pairs together when encoding.
                int length = (char.IsHighSurrogate(c) && i + 1 < text.Length && char.IsLowSurrogate(text[i + 1])) ? 2 : 1;
                buffer.AddRange(encoding.GetBytes(text.Substring(i, length)));
                i += length;
            }
        }
        return encoding.GetString(buffer.ToArray());
    }
}
After getting off work this afternoon, I picked up a wallet on the subway; I opened it and saw there were several hundred in it. Hey! My heart itched, but our primary school teachers taught us to hand in what we find, so I stood foolishly at the station entrance for a long time waiting for the owner to come back for the wallet. No one came, so I had to give the wallet to the subway staff. A civilized society has to start with each one of us, not with flashy nonsense talked every day.