Python Data Analysis Overview
The meaning and goal of data analysis
Statistical analysis method
Extracting useful information
Research, generalization, summary
Python and data analytics
Python: created by Guido van Rossum during the Christmas holiday of 1989
Features: simple and concise; high development efficiency; slow execution (relative to C++ and Java); "glue language" characteristics (integrates easily with C)
Data analysis: NumPy, scipy, matplotlib, Pandas, Scikit-learn, Keras
The Python data analysis family of libraries
NumPy (Numerical Python): the basic data structures. It is an open-source numerical computation extension for Python. It can store and manipulate large matrices far more efficiently than Python's own nested lists (which can also be used to represent matrices). It is said that NumPy turns Python into a free and more powerful equivalent of MATLAB.
SciPy: powerful scientific computing methods (matrix analysis, signal and image analysis, mathematical analysis, ...)
Matplotlib: a rich visualization suite
pandas: the basic data analysis suite. The tool was created to solve data analysis tasks. pandas incorporates a number of libraries and standard data models, providing the tools needed to manipulate large datasets efficiently. It provides many functions and methods that let us process data quickly and easily. pandas was originally developed as a financial data analysis tool, so it has good support for time series analysis. The name "pandas" comes from "panel data" and "Python data analysis".
Scikit-learn: A powerful data analysis modeling Library
Keras: (deep) artificial neural networks
Python Environment setup
Platforms: Windows, Linux, MacOS
Scientific Computing tools: Anaconda
Basic techniques for data analysis in Python
I. NumPy
Keywords: open source, numerical computing extension
Functions: ndarray, multidimensional operations, linear algebra
Ndarray
# encoding=utf-8
import numpy as np


def main():
    """Build an ndarray from a nested list and print its basic attributes."""
    lst = [[1, 2, 3], [2, 4, 6]]
    print(type(lst))

    np_lst = np.array(lst)
    # Show the type of the converted array (not the source list again).
    print(type(np_lst))

    # np.float was only an alias of the builtin float and was removed in
    # NumPy 1.24; np.float64 is the dtype it resolved to.
    np_lst = np.array(lst, dtype=np.float64)
    # Commonly used dtypes:
    #   bool
    #   int8, int16, int32, int64
    #   uint8, uint16, uint32, uint64
    #   float16/32/64, complex64/128
    print(np_lst.shape)     # number of rows and columns
    print(np_lst.ndim)      # number of dimensions
    print(np_lst.dtype)     # element data type
    print(np_lst.itemsize)  # storage size of one element, in bytes
    print(np_lst.size)      # total number of elements
Some kinds of array
# encoding=utf-8
import numpy as np


def main():
    """Print examples of constant-filled and randomly generated arrays."""
    print(np.zeros([2, 4]))
    print(np.ones([3, 5]))

    print("Rand:")
    print(np.random.rand())      # one uniform random number in [0, 1)
    print(np.random.rand(2, 4))  # 2x4 array of uniform random numbers

    print("RandInt:")
    print(np.random.randint(1, 10, 3))  # 3 random integers in [1, 10)

    print("Randn:")
    print(np.random.randn(2, 4))  # standard normal random numbers

    print("Choice:")
    print(np.random.choice([10, 20, 30]))  # random pick from the given values

    print("Distribute:")
    # Other distributions are available too, e.g. Beta, Dirichlet, etc.
    print(np.random.beta(1, 10, 100))
Operation
# encoding=utf-8
import numpy as np


def main():
    """Demonstrate element-wise math, reductions, arithmetic and joining."""
    print(np.arange(1, 11).reshape([2, 5]))
    lst = np.arange(1, 11).reshape([2, 5])

    # Element-wise functions
    print("Exp:")
    print(np.exp(lst))
    print("Exp2:")
    print(np.exp2(lst))
    print("Sqrt:")
    print(np.sqrt(lst))
    print("Sin:")
    print(np.sin(lst))
    print("Log:")
    print(np.log(lst))

    # NOTE(review): the tail of this literal was lost in the source text; the
    # values after 15 continue the visible pattern -- confirm against the
    # original article.
    lst = np.array([[[1, 2, 3, 4], [4, 5, 6, 7]],
                    [[7, 8, 9, 10], [10, 11, 12, 13]],
                    [[14, 15, 16, 17], [18, 19, 20, 21]]])
    print(lst)

    # Reductions
    print("Sum:")
    print(lst.sum())         # sum of all elements
    print(lst.sum(axis=0))   # sum over the outermost axis
    print(lst.sum(axis=1))   # sum over the second axis
    print(lst.sum(axis=-1))  # sum over the innermost axis
    print("Max:")
    print(lst.max())
    print("Min:")
    print(lst.min())

    # Element-wise arithmetic between two vectors
    lst1 = np.array([10, 20, 30, 40])
    lst2 = np.array([4, 3, 2, 1])
    print("Add:")
    print(lst1 + lst2)
    print("Sub:")
    print(lst1 - lst2)
    print("Mul:")
    print(lst1 * lst2)
    print("Div:")
    print(lst1 / lst2)
    print("Square:")
    print(lst1 ** lst2)  # element-wise power
    print("Dot:")
    print(np.dot(lst1.reshape([2, 2]), lst2.reshape([2, 2])))

    # Joining and splitting; the arrays to join must be passed as ONE
    # sequence argument, hence the inner tuple.
    print("Cancatenate")
    print(np.concatenate((lst1, lst2), axis=0))
    print(np.vstack((lst1, lst2)))  # stack as rows
    print(np.hstack((lst1, lst2)))  # join column-wise
    print(np.split(lst1, 2))        # split the vector into 2 parts
    print(np.copy(lst1))            # copy the vector
Linear algebra
# encoding=utf-8
import numpy as np
from numpy.linalg import det, eig, inv, solve


def main():
    """Print basic linear-algebra results for a 2x2 matrix."""
    # Linear algebra
    print(np.eye(3))  # 3x3 identity matrix

    lst = np.array([[1, 2], [3, 4]])
    print("Inv:")
    print(inv(lst))
    print("T:")
    print(lst.transpose())
    print("Det:")
    print(det(lst))
    print("Eig:")
    print(eig(lst))  # eigenvalues and eigenvectors

    # Solve the linear system lst @ x = y for x.
    y = np.array([[5], [7]])
    print("Solve")
    print(solve(lst, y))
Others
# encoding=utf-8
import numpy as np


def main():
    """Print an FFT, a correlation-coefficient matrix and a polynomial."""
    # Other
    print("FFT:")
    print(np.fft.fft(np.array([1, 1, 1, 1, 1, 1, 1, 1, 1])))

    print("Coef:")
    print(np.corrcoef([1, 0, 1], [0, 2, 1]))

    print("Poly:")
    print(np.poly1d([2, 1, 3]))  # univariate polynomial: 2x^2 + x + 3
II. matplotlib
Keywords: Drawing library
Line
# encoding=utf-8
import numpy as np
import matplotlib.pyplot as plt


def main():
    """Plot cos and sin on [-pi, pi] with centred axes, a fill and a note."""
    # Line
    # NOTE(review): the sample count was garbled in the source text; 256 is
    # assumed -- confirm against the original article.
    x = np.linspace(-np.pi, np.pi, 256, endpoint=True)
    c, s = np.cos(x), np.sin(x)
    plt.plot(x, c)

    plt.figure(1)
    plt.plot(x, c, color="blue", linewidth=1.5, linestyle="-",
             label="cos", alpha=0.6)
    plt.plot(x, s, "r*", label="sin", alpha=0.6)
    plt.title("Cos & Sin", size=16)

    ax = plt.gca()  # axis editor
    # Hide the right/top borders and move the left/bottom ones to the origin.
    ax.spines["right"].set_color("none")
    ax.spines["top"].set_color("none")
    ax.spines["left"].set_position(("data", 0))
    ax.spines["bottom"].set_position(("data", 0))
    ax.xaxis.set_ticks_position("bottom")
    ax.yaxis.set_ticks_position("left")

    # Tick labels are raw strings containing LaTeX math markup.
    plt.xticks([-np.pi, -np.pi / 2, 0, np.pi / 2, np.pi],
               [r'$-\pi$', r'$-\pi/2$', r'$0$', r'$\pi/2$', r'$\pi$'])
    plt.yticks(np.linspace(-1, 1, 5, endpoint=True))
    for label in ax.get_xticklabels() + ax.get_yticklabels():
        # NOTE(review): the font size was garbled in the source text; 16 is
        # assumed (it matches the title size).
        label.set_fontsize(16)
        label.set_bbox(dict(facecolor="white", edgecolor="None", alpha=0.2))

    plt.legend(loc="upper left")
    plt.grid()
    # Optional: restrict the visible range with plt.axis([-2, 1, -0.5, 1])

    # Fill: y1 is the boolean array |x| < 0.5 (treated as 0/1), y2 is the
    # cosine curve, and only the x-ranges where c > 0.5 are filled.
    plt.fill_between(x, np.abs(x) < 0.5, c, where=c > 0.5,
                     color="green", alpha=0.25)

    # Dashed vertical marker at x = 1 with an annotated arrow.
    t = 1
    plt.plot([t, t], [0, np.cos(t)], "y", linewidth=3, linestyle="--")
    # NOTE(review): the arrowstyle value was garbled in the source text;
    # "->" is assumed.
    plt.annotate("cos(1)", xy=(t, np.cos(1)), xycoords="data",
                 xytext=(+10, +30), textcoords="offset points",
                 arrowprops=dict(arrowstyle="->",
                                 connectionstyle="arc3,rad=.2"))
    # NOTE(review): the snippet is cut off after the annotation in the source
    # text; show() is added so the figure is actually displayed.
    plt.show()
plt.fill_between(x, np.abs(x) < 0.5, c, c > 0.5, color="green", alpha=0.25)
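The first argument, x, gives the x-axis coordinates. The second argument is np.abs(x) < 0.5, where np.abs(x) is the absolute value of x; the comparison yields a boolean (0/1) array that serves as the first boundary curve. The third argument, c, is the y-axis curve that forms the second boundary, and the fourth, c > 0.5, is the condition that selects where filling takes place.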
When np.abs(x) < 0.5 is True (1), the fill runs between y = 1 and the curve c (this is the x-range between -0.5 and 0.5, where c > 0.5 always holds), so the two small blocks at the top of the graph are filled. When np.abs(x) < 0.5 is False (0), i.e. np.abs(x) >= 0.5, the fill starts from y = 0 and goes up to the curve, and of course only the area where c > 0.5 is filled; these are the two large symmetric regions in the diagram.
Many types of figures
# encoding=utf-8
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm


def main():
    """Draw a gallery of plot types in one figure, save it and show it.

    Subplots: scatter, bar, pie, polar, heatmap, 3D scatter and a filled
    contour ("hot map"). The figure is written to ./data/fig.png.
    """
    import os

    fig = plt.figure()

    # Scatter
    ax = fig.add_subplot(3, 3, 1)
    # NOTE(review): the point count was lost in the source text; 128 assumed.
    n = 128
    X = np.random.normal(0, 1, n)
    Y = np.random.normal(0, 1, n)
    T = np.arctan2(Y, X)  # colour each point by its angle
    # plt.axes([0.025, 0.025, 0.95, 0.95])
    plt.scatter(X, Y, s=75, c=T, alpha=.5)
    plt.xlim(-1.5, 1.5)
    plt.xticks([])
    plt.ylim(-1.5, 1.5)
    plt.yticks([])
    plt.axis()
    plt.title("Scatter")
    plt.xlabel("x")
    plt.ylabel("y")

    # Bar
    fig.add_subplot(332)
    n = 10
    X = np.arange(n)
    Y1 = (1 - X / float(n)) * np.random.uniform(0.5, 1, n)
    Y2 = (1 - X / float(n)) * np.random.uniform(0.5, 1, n)
    plt.bar(X, +Y1, facecolor='#9999ff', edgecolor='white')
    plt.bar(X, -Y2, facecolor='#ff9999', edgecolor='white')
    # Label every bar with its height.
    for x, y in zip(X, Y1):
        plt.text(x + 0.4, y + 0.05, '%.2f' % y, ha='center', va='bottom')
    for x, y in zip(X, Y2):
        plt.text(x + 0.4, -y - 0.05, '%.2f' % y, ha='center', va='top')

    # Pie
    fig.add_subplot(333)
    # NOTE(review): the slice count was lost in the source text; 20 assumed.
    n = 20
    Z = np.ones(n)
    Z[-1] *= 2  # make the last sector twice as large
    fractions = [i / float(n) for i in range(n)]
    # explode pushes each sector away from the centre
    plt.pie(Z, explode=Z * .05,
            colors=['%f' % frac for frac in fractions],
            labels=['%.2f' % frac for frac in fractions])
    plt.gca().set_aspect('equal')  # keep the pie round
    plt.xticks([])
    plt.yticks([])

    # Polar
    fig.add_subplot(334, polar=True)
    # NOTE(review): the sample count was lost in the source text; 20 assumed.
    n = 20
    theta = np.arange(0, 2 * np.pi, 2 * np.pi / n)
    radii = 10 * np.random.rand(n)
    plt.polar(theta, radii)
    # plt.plot(theta, radii)

    # Heatmap
    fig.add_subplot(335)
    data = np.random.rand(5, 10)
    cmap = cm.Blues
    plt.imshow(data, interpolation='nearest', cmap=cmap, aspect='auto',
               vmin=0, vmax=1)

    # 3D
    # Presumably only needed on older Matplotlib releases, where this import
    # registers the "3d" projection.
    from mpl_toolkits.mplot3d import Axes3D  # noqa: F401
    ax = fig.add_subplot(336, projection="3d")
    ax.scatter(1, 1, 3, s=100)

    # Hot map
    fig.add_subplot(313)

    def f(x, y):
        """Return the surface height at (x, y)."""
        return (1 - x / 2 + x ** 5 + y ** 3) * np.exp(-x ** 2 - y ** 2)

    n = 256
    x = np.linspace(-3, 3, n * 2)
    y = np.linspace(-3, 3, n)
    X, Y = np.meshgrid(x, y)
    plt.contourf(X, Y, f(X, Y), 8, alpha=.75, cmap=plt.cm.hot)

    # savefig does not create missing directories, so make sure it exists.
    os.makedirs("./data", exist_ok=True)
    plt.savefig("./data/fig.png")
    plt.show()
Python data Analysis I