# Notebook setup: imports and matplotlib configuration.
# In the flattened original every statement after the first `#` had been
# swallowed into a comment, so none of the matplotlib setup actually ran.
import pandas as pd  # tabular data loading and manipulation
import matplotlib as mpl  # explicit handle for rcParams
import matplotlib.pyplot as plt  # plotting API used by the chart cells
from pylab import *  # kept from the original; NOTE(review): star import pollutes the namespace

# Put SimHei first in the sans-serif family so CJK text in labels can be
# rendered (assumes the SimHei font is installed on this machine).
mpl.rcParams['font.sans-serif'] = ['SimHei']
# Render minus signs with the ASCII hyphen instead of U+2212, which the
# font selected above may not contain.
mpl.rcParams['axes.unicode_minus'] = False

# IPython/Jupyter magic from the original notebook cell. It is not valid
# Python, so it is kept as a comment; re-enable it inside a notebook if needed.
# %matplotlib inline
Data reading and indexing
# Load the CSV into a DataFrame; the path is relative to the working directory.
bra = pd.read_csv('data/bra.csv')
# Preview the first five rows (shown as cell output in a notebook).
bra.head()
Select columns
# Attribute-style access returns the single column `content` as a Series.
bra.content
# Indexing with a list of column names returns a DataFrame restricted to
# those columns; head() limits the preview to the first five rows.
bra[['creationTime', 'productColor']].head()
Select rows
# Slice rows with plain [] indexing. The stop value is excluded, so with the
# default integer index produced by read_csv this shows rows 1 through 5.
bra[1:6]
Select rows and columns
# `.ix` was removed in pandas 1.0. The two calls below reproduce what it did
# here: rows were looked up by index label, and the integer column selectors
# fell back to positions because the column labels are strings.
bra.loc[[2, 3], bra.columns[[1, 3]]]  # rows labelled 2 and 3, 2nd and 4th columns
bra.loc[1:5, ['productColor']]  # label slices include the end label (rows 1..5)

# iloc: purely positional selection of rows and columns.
bra.iloc[[2, 3], [1, 3]]

# loc: label-based selection; columns may be given as a list ...
bra.loc[1:5, ['content', 'creationTime', 'productSize']]
# ... or as a label slice, which includes both end columns.
bra.loc[1:5, 'content':'userClientShow']
Data preprocessing: missing values
# Summary statistics for the numeric columns (count, mean, std, min,
# quartiles, max) as a first look at the data distribution.
bra.describe()

# Distinct values that occur in the userClientShow column.
bra['userClientShow'].unique()

# Number of missing values before filling.
bra['userClientShow'].isnull().sum()

# Replace missing values with the placeholder '不详'.
# Assign the result back instead of calling fillna(..., inplace=True) on the
# column selection: the chained in-place form operates on an intermediate
# object and does not reliably update `bra` (it is a no-op under pandas
# Copy-on-Write).
bra['userClientShow'] = bra['userClientShow'].fillna('不详')

# Number of missing values after filling (expected to be 0).
bra['userClientShow'].isnull().sum()
New column
# Inspect the column dtypes before conversion.
bra.dtypes

# Parse creationTime into datetime64 values so the datetime accessors work.
bra['creationTime'] = pd.to_datetime(bra['creationTime'])
bra.dtypes

# New column `hour`: hour-of-day of each timestamp. The vectorised `.dt`
# accessor replaces the original per-row Python list comprehension.
bra['hour'] = bra['creationTime'].dt.hour
bra
String manipulation
# Distinct raw values of productSize.
bra.productSize.unique()

# Build the `cup` column: take the first run of ASCII letters found in each
# productSize value, then rewrite letter sizes onto the cup scale.
cup = bra.productSize.str.findall(r'[a-zA-Z]+').str[0]
# The replacements are literal and order-dependent: 'XL' first becomes 'XC'
# through the L -> C step and only then 'D'.
cup2 = cup.str.replace('M', 'B', regex=False)
cup3 = cup2.str.replace('L', 'C', regex=False)
cup4 = cup3.str.replace('XC', 'D', regex=False)
bra['cup'] = cup4
bra.head()

# Distinct values of the derived cup column.
bra['cup'].unique()
Data conversion
# Distinct raw values of productColor.
bra.productColor.unique()

# (keyword, normalised label) pairs, checked in this order; the first keyword
# found wins, so a value containing both '红' and '粉' maps to '红色'.
_COLOR_KEYWORDS = (
    ('黑', '黑色'),
    ('肤', '肤色'),
    ('蓝', '蓝色'),
    ('红', '红色'),
    ('紫', '紫色'),
    ('白', '白色'),
    ('粉', '粉色'),
    ('灰', '灰色'),
    ('绿', '绿色'),
    ('青', '青色'),
)


def getColor(s):
    """Map a free-form colour description to a normalised colour label.

    Returns the label of the first keyword in ``_COLOR_KEYWORDS`` contained
    in ``s``. If no keyword matches, ``s`` is returned unchanged. Non-string
    values (for example NaN from a missing cell) are also returned unchanged
    rather than raising ``TypeError`` on the ``in`` test.
    """
    if not isinstance(s, str):
        return s
    for keyword, label in _COLOR_KEYWORDS:
        if keyword in s:
            return label
    return s


# New column `color`: productColor normalised through getColor.
bra['color'] = bra['productColor'].map(getColor)
bra

# Distinct values of the normalised color column.
bra.color.unique()
Visualization of data
# Minimal line-plot demo on hand-written sample data.
x = [1991, 1992, 1993, 1994, 1995, 1996, 1997]
y = [23, 56, 38, 29, 34, 56, 92]
plt.plot(x, y)  # draw y against x on the current figure

# Same plot on a new figure with an explicit size (inches) and resolution.
plt.figure(figsize=(8, 6), dpi=80)
plt.plot(x, y)
# Number of rows per hour-of-day; the result is indexed by hour.
hour = bra.groupby('hour')['hour'].count()
hour

plt.xlim(0, 25)  # x axis from 0 to 25
# Passing the Series alone plots its values against its index (the hour).
plt.plot(hour, linestyle='solid', color='royalblue', marker='8')
# Number of rows for each distinct cup value.
cup_style = bra.groupby('cup')['cup'].count()
cup_style

# Vertical bar chart of the counts, one bar per cup value.
plt.figure(figsize=(8, 6), dpi=80)
labels = list(cup_style.index)
positions = range(len(labels))
plt.xlabel('cup')
plt.ylabel('count')
plt.bar(positions, cup_style, color='royalblue', alpha=0.7)  # alpha sets the opacity
plt.xticks(positions, labels, fontsize=12)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='y', alpha=0.6)
plt.legend(['user-count'])
# Write each count just above its bar. Dedicated loop names are used so the
# demo lists `x` and `y` defined earlier are not overwritten.
for pos, count in enumerate(cup_style):
    plt.text(pos, count, count, ha='center', va='bottom')
# Number of rows for each distinct normalised colour.
color_style = bra.groupby('color')['color'].count()
color_style

# Horizontal bar chart of the counts, one bar per colour label.
plt.figure(figsize=(8, 6), dpi=80)
plt.subplot(facecolor='gainsboro', alpha=0.2)
# Bar colours are applied in list order.
# NOTE(review): the list is hard-coded and not keyed to the labels, so a bar's
# colour only matches its label if this order happens to follow
# color_style.index — confirm against the data.
colors = [
    'brown', 'orange', 'gray', 'white', 'pink', 'purple', 'red',
    'green', 'wheat', 'blue', 'gold', 'springgreen', 'black',
]
labels = list(color_style.index)
plt.xlabel('count')
plt.ylabel('color')
plt.title('Color Distribution')
plt.barh(range(len(labels)), color_style, color=colors, alpha=1)
plt.yticks(range(len(labels)), labels, fontsize=12)
plt.grid(color='#95a5a6', linestyle='--', linewidth=1, axis='x', alpha=0.4)

# Final look at the first 30 rows, including the derived columns.
bra.head(30)
Python data analysis: a worked example