For a work project that uses deep learning to identify malicious binary files, I needed to crawl some malware samples as training data.
# -*- coding: utf-8 -*-
"""Crawler that bulk-downloads malware sample files from malwaredb.malekal.com.

Each index page lists links of the form
``<a href="./files.php?file=<hash>">``.  The crawler extracts the file
hashes, fetches each sample, and stores it locally as ``<hash>.zip``.
"""
import logging
import re

import requests

# Module-level logger writing to a fixed log file.
logger = logging.getLogger("rrjia")
_formatter = logging.Formatter(
    "%(asctime)s - %(name)s - %(levelname)s - %(message)s")
_file_handler = logging.FileHandler("/home/rrjia/python/test.log")
_file_handler.setFormatter(_formatter)
logger.addHandler(_file_handler)
logger.setLevel("INFO")

BASE_URL = "http://malwaredb.malekal.com"
DOWNLOAD_DIR = "/home/rrjia/testdata/"

# Pre-compiled pattern for the sample links on an index page
# (hoisted out of the loop; raw string so the backslashes are literal).
_LINK_RE = re.compile(r'<a href="\./files\.php\?file=\w+">', re.S)


def extract_file_urls(html_text):
    """Return the absolute sample-download URLs found in an index page.

    :param html_text: raw HTML of one index page.
    :return: list of ``<BASE_URL>/files.php?file=<hash>`` URLs.
    """
    urls = []
    for link in _LINK_RE.findall(html_text):
        # link looks like: <a href="./files.php?file=<hash>">
        # splitting on "=" leaves the hash (plus trailing quote/bracket)
        # in the third field.
        file_hash = link.split("=")[2].replace('"', "").replace(">", "")
        urls.append(BASE_URL + "/files.php?file=" + file_hash)
    return urls


def download_page(page, count=1):
    """Download every sample linked from index page *page*.

    Generalizes the original hard-coded "page 0" logic and the
    commented-out copy for pages 1..827 into a single code path.

    :param page: 0 for the front page, otherwise fetches
        ``index.php?page=<page>``.
    :param count: running file counter carried across pages (log output).
    :return: ``(next_count, error_count)`` tuple.
    """
    url = BASE_URL if page == 0 else BASE_URL + "/index.php?page=" + str(page)
    html = requests.get(url)
    file_urls = extract_file_urls(html.text)
    logger.info("%d page contains %d virus file" % (page, len(file_urls)))

    error_count = 0
    for each in file_urls:
        try:
            content = requests.get(each, timeout=120).content
            file_name = each.split("=")[1]
            # The site serves the samples as zip archives; the `with`
            # block closes the file, so no explicit close() is needed.
            with open(DOWNLOAD_DIR + str(file_name) + ".zip", "wb+") as out:
                out.write(content)
            logger.info("success download %d page %d file " % (page, count)
                        + str(file_name) + ".zip")
        except Exception:
            # Best-effort crawl: record the failure (with traceback and
            # the failing URL) and keep going.
            error_count += 1
            logger.exception("this url error download failed: %s" % each)
        count += 1
    return count, error_count


if __name__ == "__main__":
    # Download the front page (page 0).  The site has ~828 index pages;
    # uncomment the loop below to crawl them all.
    count, errors = download_page(0)
    # for page in range(1, 828):
    #     count, page_errors = download_page(page, count)
    #     errors += page_errors
A Python crawler for downloading virus sample files in bulk.