Remove duplicate IPs and store them in the database:
import MySQLdb

# Open a database connection
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset='utf8', db='db_websiterecommend')
cur = db.cursor()

# Execute the SQL statement and fetch every visiting IP
sql = 'SELECT ip FROM t_useripvisittrace'
cur.execute(sql)
data = cur.fetchall()

user_ip = []
for l in data:
    user_ip.append(l[0])

# Converting the list to a set removes duplicate IPs
userip = list(set(user_ip))

value = []
for i in range(len(userip)):
    value.append((i, userip[i]))

# Store the deduplicated (index, ip) pairs back into the database
cur.executemany('INSERT INTO t_userip_list VALUES (%s,%s)', value)
db.commit()
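The same deduplication can also be pushed into SQL itself, skipping the round trip through a Python set. A minimal sketch under the same schema assumptions (the numbering column simply mirrors the (index, ip) pairs built above):

import MySQLdb

db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset='utf8', db='db_websiterecommend')
cur = db.cursor()

# SELECT DISTINCT collapses duplicate IPs on the database side
cur.execute('SELECT DISTINCT ip FROM t_useripvisittrace')
rows = cur.fetchall()
cur.executemany('INSERT INTO t_userip_list VALUES (%s,%s)',
                [(i, r[0]) for i, r in enumerate(rows)])
db.commit()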
Classify each visited page according to the structure of the site:
# -*- coding: utf-8 -*-
import MySQLdb
import requests
from lxml import etree

# Open a database connection
count = 0
url = 'http://www.tipdm.org'
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset='utf8', db='db_websiterecommend')
cur = db.cursor()
cur1 = db.cursor()

sql = 'SELECT page_path FROM t_useripvisittrace'
sql1 = 'SELECT url_list FROM urls'
cur.execute(sql)
cur1.execute(sql1)

data = cur.fetchall()
print(len(data))

insert = 'INSERT INTO t_url_classify VALUES (%s,%s)'
for each in data:
    path = each[0]
    if path == '/' or path == '/index.jhtml':
        # The site root and the index page are both labelled as the home page
        cur.execute(insert, (path, 'Home Page'))
    elif 'index' in path:
        # Column index pages: scrape the category label from the page itself
        html = requests.get(url + path)
        selector = etree.HTML(html.text)
        content = selector.xpath('/html/body/div[6]/div[2]/div[1]/div/a[2]/text()')
        cur.execute(insert, (path, content[0] if content else 'other'))
    elif '.jhtml' in path:
        # Article pages use a slightly different layout, hence a different XPath
        html = requests.get(url + path)
        selector = etree.HTML(html.text)
        content = selector.xpath('/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()')
        cur.execute(insert, (path, content[0] if content else 'other'))
    else:
        cur.execute(insert, (path, 'other'))
    db.commit()
    count += 1
    print(count)

print('Finish')
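Both XPath expressions are tied to the current page layout of www.tipdm.org, so it is worth probing a single page before running the full loop; xpath() returns a list, and an empty list means the layout no longer matches. A small check, with the sample URL purely illustrative:

import requests
from lxml import etree

# Fetch one page and see what the category XPath actually returns
html = requests.get('http://www.tipdm.org/index.jhtml')  # sample URL
selector = etree.HTML(html.text)
content = selector.xpath('/html/body/div[6]/div[2]/div[1]/div/a[2]/text()')
# xpath() returns a list; fall back to 'other' when nothing matches
print(content[0].strip() if content else 'other')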
Use pandas to read the database and compute statistics:
import pandas as pd
from sqlalchemy import create_engine

# Credentials and database match the MySQLdb connections used above
engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/db_websiterecommend?charset=utf8')
sql = pd.read_sql('t_useripvisittrace', engine, chunksize=10000)
output = 'C:\\Users\\lenovo\\Desktop\\count_.xls'

create_engine establishes the connection; the connection string reads "database type (mysql) + driver (pymysql) + user:password@host:port/database name", with the encoding fixed to utf8. t_useripvisittrace is the table being read, engine is the connection to the database, and chunksize specifies that 10,000 records are read at a time. At this point sql is a generator of chunks and no data has actually been read yet.

import MySQLdb

# Open a database connection
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset='utf8', db='db_websiterecommend')
cur = db.cursor()

value = []
for i in sql:
    # Count visits per IP within this chunk
    s = i['ip'].value_counts()
    print(type(s))
    # Collect (ip, visit_count) pairs for insertion
    for ip, cnt in s.items():
        value.append((ip, int(cnt)))

cur.executemany('INSERT INTO userip VALUES (%s,%s)', value)
db.commit()
print(value)
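One caveat: value_counts() is computed per 10,000-row chunk, so an IP whose visits straddle a chunk boundary ends up with two partial counts in the table. A minimal sketch of summing the per-chunk counts before writing them out, assuming the same connection string and an ip column (writing .xls needs the xlwt package on older pandas; swap the extension to .xlsx with openpyxl otherwise):

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://root:123456@127.0.0.1:3306/db_websiterecommend?charset=utf8')
chunks = pd.read_sql('t_useripvisittrace', engine, chunksize=10000)

# Accumulate counts across chunks so no IP is split into partial totals
counts = pd.Series(dtype='int64')
for chunk in chunks:
    counts = counts.add(chunk['ip'].value_counts(), fill_value=0)

# Write the aggregated per-IP visit counts to the spreadsheet
counts.astype(int).rename('visits').to_excel('C:\\Users\\lenovo\\Desktop\\count_.xls')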
Web Page Behavior Analysis