標籤:
去掉重複的IP,並且存入資料庫:
# Deduplicate the visitor IPs from the trace table and store each unique
# IP with a sequential id in t_userIP_list.
import MySQLdb

# Open the database connection.
db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()

# Pull every recorded visitor IP from the visit-trace table.
cur.execute('select ip from t_useripvisittrace')
user_ip = [row[0] for row in cur.fetchall()]

# set() drops the duplicates; enumerate() pairs each unique IP with a
# sequential numeric id for the bulk insert.
value = list(enumerate(set(user_ip)))
cur.executemany('insert into t_userIP_list values(%s,%s)', value)
db.commit()
db.close()  # release the connection (the original never closed it)
根據網站情況,進行網站的分類:
# -*- coding: utf-8 -*-
"""Classify every visited page path and store (path, category) rows.

The two home-page paths map to '首頁'; index/article pages are fetched
from the live site to read their breadcrumb category; everything else
falls back to '其他'.
"""
import MySQLdb
import requests
from lxml import etree

BASE_URL = 'http://www.tipdm.org'

db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()

cur.execute('select page_path from t_useripvisittrace')
paths = cur.fetchall()
print(len(paths))


def _scrape_category(page_path, xpath_expr):
    """Fetch the page and return its breadcrumb category text, or '其他'."""
    resp = requests.get(BASE_URL + page_path)
    texts = etree.HTML(resp.text).xpath(xpath_expr)
    # xpath() returns a list of text nodes; the original passed the raw
    # list as a SQL parameter, which MySQLdb cannot serialize -- take the
    # first node and fall back to the catch-all category when absent.
    return texts[0] if texts else '其他'


count = 0
for (path,) in paths:
    if path in ('/', '/index.jhtml'):
        category = '首頁'
    elif 'index' in path:
        category = _scrape_category(
            path, '/html/body/div[6]/div[2]/div[1]/div/a[2]/text()')
    elif '.jhtml' in path:
        category = _scrape_category(
            path, '/html/body/div[5]/div[2]/div[1]/div[1]/a[2]/text()')
    else:
        category = '其他'
    # Insert exactly one row per path.  The original re-ran executemany
    # over the ever-growing accumulated list on every iteration, which
    # inserted each earlier row again and again (O(n^2) duplicate rows).
    cur.execute('insert into t_url_classify values(%s,%s)', (path, category))
    count += 1

db.commit()
db.close()
print(count)
print('finish')
使用pandas讀取資料庫進行統計:
# Count visits per IP by streaming the trace table through pandas, then
# store the aggregated (ip, visit_count) pairs in the userip table.
import pandas as pd
from sqlalchemy import create_engine
import MySQLdb

# Connection URL: "dialect+driver://user:password@host:port/database".
# chunksize makes read_sql return a lazy iterator of 10000-row
# DataFrames instead of loading the whole table at once.
engine = create_engine(
    'mysql+pymysql://root:[email protected]:3306/db_websiterecommend?charset=utf8')
chunks = pd.read_sql('t_useripvisittrace', engine, chunksize=10000)

db = MySQLdb.connect(host='localhost', user='root', passwd='123456',
                     port=3306, charset="utf8", db="db_websiterecommend")
cur = db.cursor()

# Sum the per-chunk counts so an IP whose visits span chunk boundaries
# is counted once with its full total (the original never aggregated).
counts = pd.Series(dtype='int64')
for chunk in chunks:
    counts = counts.add(chunk['ip'].value_counts(), fill_value=0)

value = [(ip, int(n)) for ip, n in counts.items()]
# Fixed placeholder string: the original 'values(%,%s)' used a bare '%',
# which is not a valid paramstyle marker -- both columns need %s.
cur.executemany('insert into userip values(%s,%s)', value)
db.commit()
db.close()
print(value)  # print() function (original used Python-2 print statements)
網頁行為分析