Crawler combined with sqlmapapi to judge injection

Source: Internet
Author: User
Tags: url, example


Recently I have been working on something painful: crawlers and scanning, where each crawled URL is handed to sqlmapapi for scanning. There is not much material on this at the moment, but you can still find some, for example:

Using sqlmapapi.py for batch scanning in practice

Let's look at the encapsulated sqlmapapi class:

#!/usr/bin/python
# -*- coding: utf-8 -*-
import requests
import time
import json


class AutoSqli(object):
    """Use sqlmapapi to interact with the server started by sqlmapapi.py. By Manning."""

    def __init__(self, server='', target='', data='', referer='', cookie=''):
        super(AutoSqli, self).__init__()
        self.server = server
        if self.server[-1] != '/':
            self.server = self.server + '/'
        self.target = target
        self.taskid = ''
        self.engineid = ''
        self.status = ''
        self.data = data
        self.referer = referer
        self.cookie = cookie
        self.start_time = time.time()

    # Create a scan task
    def task_new(self):
        self.taskid = json.loads(
            requests.get(self.server + 'task/new').text)['taskid']
        print 'Created new task: ' + self.taskid
        # Everything else is driven by this taskid
        if len(self.taskid) > 0:
            return True
        return False

    # Delete a scan task
    def task_delete(self):
        if json.loads(requests.get(self.server + 'task/' + self.taskid + '/delete').text)['success']:
            print '[%s] Deleted task' % (self.taskid)
            return True
        return False

    # Start the scan task
    def scan_start(self):
        headers = {'Content-Type': 'application/json'}
        # Address to be scanned
        payload = {'url': self.target}
        url = self.server + 'scan/' + self.taskid + '/start'
        # e.g. http://127.0.0.1:8557/scan/xxxxxxxxxx/start
        t = json.loads(
            requests.post(url, data=json.dumps(payload), headers=headers).text)
        self.engineid = t['engineid']
        if len(str(self.engineid)) > 0 and t['success']:
            print 'Started scan'
            return True
        return False

    # Scan task status
    def scan_status(self):
        self.status = json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/status').text)['status']
        if self.status == 'running':
            return 'running'
        elif self.status == 'terminated':
            return 'terminated'
        else:
            return 'error'

    # Scan task details
    def scan_data(self):
        self.data = json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/data').text)['data']
        if len(self.data) == 0:
            print 'not injection:\t'
        else:
            print 'injection:\t' + self.target

    # Scan options; the main parameter settings go here
    def option_set(self):
        headers = {'Content-Type': 'application/json'}
        option = {"options": {"smart": True}}  # ... further options elided in the original
        url = self.server + 'option/' + self.taskid + '/set'
        t = json.loads(
            requests.post(url, data=json.dumps(option), headers=headers).text)
        print t

    # Stop the scan task
    def scan_stop(self):
        json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/stop').text)['success']

    # Kill the scan task process
    def scan_kill(self):
        json.loads(
            requests.get(self.server + 'scan/' + self.taskid + '/kill').text)['success']

    def run(self):
        if not self.task_new():
            return False
        self.option_set()
        if not self.scan_start():
            return False
        while True:
            if self.scan_status() == 'running':
                time.sleep(10)
            elif self.scan_status() == 'terminated':
                break
            else:
                break
            print time.time() - self.start_time
            if time.time() - self.start_time > 3000:
                error = True
                self.scan_stop()
                self.scan_kill()
                break
        self.scan_data()
        self.task_delete()
        print time.time() - self.start_time


if __name__ == '__main__':
    t = AutoSqli('http://127.0.0.1:8774', 'http://192.168.3.171/1.php?id=1')
    t.run()
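Since the whole point is batch scanning, the natural way to drive this class is with a list of targets. A minimal sketch, assuming the class above is saved as autosql.py and the targets sit one per line in a urls.txt file (both names are mine, not from the original):

from autosql import AutoSqli

API_SERVER = 'http://127.0.0.1:8775'  # wherever the sqlmapapi server is listening

with open('urls.txt') as f:           # one target URL per line (hypothetical input file)
    targets = [line.strip() for line in f if line.strip()]

for target in targets:
    # Each target gets its own task; AutoSqli creates, runs and deletes it
    AutoSqli(API_SERVER, target).run()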

 

The AutoSqli class works through the following steps:

A GET request to create a task and get back its task id
A POST request to set the options for that task id
A POST request to that task id to start scanning the specified url
A GET request to that task id to get the scan status
A GET request to that task id to fetch the test results
A GET request to that task id to delete the task
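For reference, the same life cycle can be walked through by hand with nothing but requests, which makes it clear what the class actually sends over the wire. A rough sketch (the endpoint paths follow lib/utils/api.py; the server address and target URL are placeholders, and the sqlmapapi server must already be running, e.g. started with python sqlmapapi.py -s):

import json
import time
import requests

server = 'http://127.0.0.1:8775'                # sqlmapapi server (placeholder)
target = 'http://192.168.3.171/1.php?id=1'      # page to test (placeholder)
headers = {'Content-Type': 'application/json'}

taskid = requests.get(server + '/task/new').json()['taskid']              # 1. create task
requests.post(server + '/option/' + taskid + '/set',                      # 2. set options
              data=json.dumps({'options': {'smart': True}}), headers=headers)
requests.post(server + '/scan/' + taskid + '/start',                      # 3. start scan
              data=json.dumps({'url': target}), headers=headers)

while True:                                                               # 4. poll status
    status = requests.get(server + '/scan/' + taskid + '/status').json()['status']
    if status != 'running':
        break
    time.sleep(5)

data = requests.get(server + '/scan/' + taskid + '/data').json()['data']  # 5. fetch results
print 'injection' if data else 'not injection'
requests.get(server + '/task/' + taskid + '/delete')                      # 6. clean up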

Looking at the server class in lib/utils/api.py, you can see that all interaction with the service happens by submitting data to the server, and the handlers fall into three groups:

Users' methods
Admin functions
sqlmap core interact functions

The endpoints that can be called are as follows.

User methods

@get("/task/new")
@get("/task/<taskid>/delete")

Admin functions

@get("/admin/<taskid>/list")
@get("/admin/<taskid>/flush")

Core interaction functions

@get("/option/<taskid>/list")
@post("/option/<taskid>/get")
@post("/option/<taskid>/set")
@post("/scan/<taskid>/start")
@get("/scan/<taskid>/stop")
@get("/scan/<taskid>/kill")
@get("/scan/<taskid>/status")
@get("/scan/<taskid>/data")
@get("/scan/<taskid>/log/<start>/<end>")
@get("/scan/<taskid>/log")
@get("/download/<taskid>/<target>/<filename:path>")
Finally, the code decides whether there is an injection vulnerability: if the dictionary returned by the data endpoint has a non-empty 'data' value, the target is injectable.
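In other words, the decision boils down to whether the 'data' list in the /scan/<taskid>/data response is empty. A trimmed-down version of that check, as a sketch (the helper name is mine; server and taskid are assumed to come from the code above, and on my reading sqlmapapi wraps results in a dict with 'success', 'data' and 'error' keys):

import requests


def is_injectable(server, taskid):
    # Hypothetical helper mirroring scan_data() above:
    # a non-empty 'data' list means sqlmap found at least one injection point.
    resp = requests.get(server + '/scan/' + taskid + '/data').json()
    return len(resp.get('data', [])) > 0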

Then comes the crawler that feeds URLs into it:

#!/usr/bin/python
# vim: set fileencoding=utf-8:
import sys
import re
import urllib2
from BeautifulSoup import BeautifulSoup
# from autosql import AutoSqli  # the AutoSqli class above


class SpriderUrl:
    # initialize
    def __init__(self, url):
        self.url = url
        # self.con = Db_Connector('sprider.ini')

    # Get the first list of URLs from the target url
    def get_self(self):
        urls = []
        try:
            body_text = urllib2.urlopen(self.url).read()
        except:
            print "[*] Web Get Error: checking the Url"
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            # Get the target url, but it still needs processing
            _url = link.get('href')
            # First check that it is not None and does not start with a meaningless
            # token; also skip file suffixes that should not be crawled
            if _url is None or re.match('^(javascript|:;|#)', _url) or \
                    re.search('\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', _url):
                continue
            # Then check whether it starts with http|https; for those, make sure the
            # link stays on the target site, so the crawler does not leave the site
            if re.match('^(http|https)', _url):
                if not re.match('^' + self.url, _url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(self.url + _url)
        rst = list(set(urls))
        for rurl in rst:
            try:
                # Recurse; the obvious drawback is that all pages are crawled repeatedly.
                # Then submit the url to autosql:
                # AutoSqli('http://127.0.0.1:8775', rurl).run()
                self.sprider_self_all(rurl)
            except:
                print "spider error"

    def sprider_self_all(self, domain):
        urls = []
        try:
            body_text = urllib2.urlopen(domain).read()
        except:
            print "[*] Web Get Error: checking the Url"
            sys.exit(0)
        soup = BeautifulSoup(body_text)
        links = soup.findAll('a')
        for link in links:
            # Get the target url, but it still needs processing
            _url = link.get('href')
            # Same checks as above: skip None values, meaningless prefixes and
            # file suffixes that should not be crawled
            try:
                if _url is None or re.match('^(javascript|:;|#)', str(_url)) or \
                        re.search('\.(jpg|png|bmp|mp3|wma|wmv|gz|zip|rar|iso|pdf|txt|db)$', str(_url)):
                    continue
            except TypeError:
                print "[*] Type is Error!: " + str(_url)
                continue
            # Stay on the target site; do not crawl beyond it
            if re.match('^(http|https)', _url):
                if not re.match('^' + self.url, _url):
                    continue
                else:
                    urls.append(_url)
            else:
                urls.append(self.url + _url)
        res = list(set(urls))
        for rurl in res:
            try:
                print rurl
                # AutoSqli('http://127.0.0.1:8775', rurl).run()
            except:
                print "spider error"


spi = "http://0day5.com/"
t = SpriderUrl(spi)
# first crawl
t.get_self()

 

The better way is to store the crawled URLs in a database and use it to check whether a URL has already been seen:

for rurl in res:
    if self.con.find_item("select * from url_sprider where url='" + rurl + "' and domain='" + self.url + "'"):
        continue
    else:
        try:
            self.con.insert_item("insert into url_sprider(url,tag,domain)values('" + rurl + "',0,'" + self.url + "')")
        except:
            print "[*] insert into is Error!"
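The Db_Connector class is not shown here, and the string concatenation above is itself injectable, so purely as an illustration here is a rough equivalent built on the standard sqlite3 module with parameterized queries (table and column names follow the snippet above; res and self.url come from the crawler loop; this is a sketch, not the author's Db_Connector):

import sqlite3

con = sqlite3.connect('sprider.db')
con.execute("create table if not exists url_sprider(url text, tag integer, domain text)")

for rurl in res:
    # Skip URLs already recorded for this domain
    cur = con.execute("select 1 from url_sprider where url=? and domain=?", (rurl, self.url))
    if cur.fetchone():
        continue
    try:
        con.execute("insert into url_sprider(url, tag, domain) values(?, 0, ?)", (rurl, self.url))
        con.commit()
    except sqlite3.Error:
        print "[*] insert into is Error!"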

We are still sorting out the crawler side; two recent notes:

1. Many crawlers have very obvious fingerprints, so specify an appropriate User-Agent instead of the default one.

2. Some WAFs can be bypassed this way, since they tend to trust search-engine spiders: what if the request looks like it comes from Baidu?

The improved header handling:

import random

USER_AGENTS = [
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
    "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
    "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
    "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
    "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
    "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
    "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
    "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
    "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
]

REFERERS = [
    "https://www.baidu.com",
    "http://www.baidu.com",
    "https://www.google.com.hk",
    "http://www.so.com",
    "http://www.sogou.com",
    "http://www.soso.com",
    "http://www.bing.com",
]

default_cookies = {}

# Random User-Agent and Referer
default_headers = {
    'User-Agent': random.choice(USER_AGENTS),
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Cache-Control': 'max-age=0',
    'Referer': random.choice(REFERERS),
    'Accept-Charset': 'GBK,utf-8;q=0.7,*;q=0.3',
}
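For the headers to matter they have to be attached to every request the spider makes. A small sketch of plugging the dictionaries above into the urllib2-based fetches (rebuilding the headers per request so the User-Agent and Referer rotate is my own assumption; the original only picks them once at import time):

import random
import urllib2


def fetch(url):
    # Rebuild the header set on every call so UA / Referer rotate per request
    headers = {
        'User-Agent': random.choice(USER_AGENTS),
        'Referer': random.choice(REFERERS),
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    }
    req = urllib2.Request(url, headers=headers)
    return urllib2.urlopen(req, timeout=10).read()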

 

Then there is still the filtering problem, that is, similarity checking, so that an effective crawler feeds back more accurate results. The algorithm here relies on splitting the URL and hashing the pieces, and it fits this kind of requirement well. A URL is split into three dimensions: the first is the netloc, the second is the length of each path segment, and the third is the sorted list of parameter names in the query string. The three dimensions are combined into one data structure, and that tuple is what gets hashed and compared.

#!/usr/bin/env python
# coding:utf-8
import time
import os
import urlparse
import hashlib
import sys
sys.path.append("..")
from config.config import *

reload(sys)
sys.setdefaultencoding("utf-8")

SIMILAR_SET = set()
REPEAT_SET = set()

Be clear about the difference between what the crawler focuses on and what it filters out:

Focus: if the keyword is in the url, return True; otherwise return False.

Filter: if the keyword is in the url, return False; otherwise return True.

def format(url):
    '''
    Build a triple for the url: the first item is the netloc, the second item is
    the length of each segment of the path, and the third item is the name of each
    parameter in the query (parameters are sorted alphabetically, to avoid treating
    the same url with a different parameter order as a new one).
    '''
    if urlparse.urlparse(url)[2] == '':
        url = url + '/'
    url_structure = urlparse.urlparse(url)
    netloc = url_structure[1]
    path = url_structure[2]
    query = url_structure[4]
    temp = (netloc,
            tuple([len(i) for i in path.split('/')]),
            tuple(sorted([i.split('=')[0] for i in query.split('&')])))
    # print temp
    return temp


def check_netloc_is_ip(netloc):
    '''If the url's netloc is in ip format return True, otherwise return False'''
    flag = 0
    t = netloc.split('.')
    for i in t:
        try:
            int(i)
            flag += 1
        except Exception, e:
            break
    if flag == 4:
        return True
    return False


def url_domain_control(url, keyword):
    '''
    URL domain control (focus)
    True: the url matches the domain check
    False: the url does not match the domain check
    1. keyword can be a list or a str
    2. if the url netloc is in ip format, return True
    '''
    t = format(url)
    if check_netloc_is_ip(t[0]):
        return True
    elif str(type(keyword)) == "<type 'list'>":
        for i in keyword:
            if i.lower() in t[0].lower():
                return True
    elif str(type(keyword)) == "<type 'str'>":
        if keyword.lower() in t[0].lower():
            return True
    return False


def url_domain_control_ignore(url, keyword):
    '''
    URL domain control (filter)
    True: the ignored keyword is not in the url
    False: the ignored keyword is in the url
    Example: ignore "blog"; if "blog" appears in the netloc of the domain, return False
    '''
    t = format(url)
    for i in keyword:
        if i in t[0].lower():
            return False
    return True


def url_similar_control(url):
    '''
    URL similarity control
    True: the url is not a duplicate
    False: the url is a duplicate
    '''
    t = format(url)
    if t not in SIMILAR_SET:
        SIMILAR_SET.add(t)
        return True
    return False


def url_format_control(url):
    '''
    URL format control (filter)
    True: the url matches the format check
    False: the url does not match the format check
    '''
    if '}' not in url and '20140901' not in url and url[0].lower() == 'h' \
            and '//' not in url and len(format(url)[1]) < 6:
        if len(format(url)[2]) > 0:
            for i in format(url)[2]:
                if len(i) > 20:
                    return False
        if 'viewthread' in url or 'forumdisplay' in url:
            return False
        return True
    return False


def url_custom_control(url):
    '''URL custom keyword control (filter): True passes, False is filtered out'''
    for i in CUSTOM_KEY:
        if i in url:
            return False
    return True


def url_custom_focus_control(url, focuskey):
    '''URL custom keyword control (focus): True matches the focus policy, False does not'''
    if len(focuskey) == 0:
        return True
    for i in focuskey:
        if i in url:
            return True
    return False


def url_repeat_control(url):
    '''
    URL repetition control
    True: the url is not a duplicate
    False: the url is a duplicate
    '''
    if url not in REPEAT_SET:
        REPEAT_SET.add(url)
        return True
    return False


def url_filter_similarity(url, keyword, ignore_keyword, focuskey):
    if url_format_control(url) and url_similar_control(url) \
            and url_domain_control(url, keyword) \
            and url_domain_control_ignore(url, IGNORE_KEY_WORD) \
            and url_custom_control(url) and url_custom_focus_control(url, focuskey):
        return True
    else:
        return False


def convert(url, keyword, ignore_keyword, focuskey):
    if url_format_control(url) and url_repeat_control(url) \
            and url_domain_control(url, keyword) \
            and url_domain_control_ignore(url, IGNORE_KEY_WORD) \
            and url_custom_control(url) and url_custom_focus_control(url, focuskey):
        return True
    else:
        return False


if __name__ == "__main__":
    print url_format_control("http://www.gznu.edu.cn")
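A quick sanity check of the dedup idea, assuming the format() and url_similar_control() functions above (the example URLs are made up): two URLs that differ only in parameter values and order collapse to the same triple, so only the first one survives.

print format('http://example.com/news/view.php?id=1&page=2')
# -> ('example.com', (0, 4, 8), ('id', 'page'))
print format('http://example.com/news/view.php?page=9&id=42')
# -> the same triple: parameter values and their order are ignored

print url_similar_control('http://example.com/news/view.php?id=1&page=2')   # True, first sighting
print url_similar_control('http://example.com/news/view.php?page=9&id=42')  # False, treated as a duplicate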

 
