First, an introduction
The data needed for data mining is often spread across several datasets; data integration is the process of merging those datasets into a single, consistent data store.
For a pandas DataFrame, the join key is sometimes found in the index rather than in a column.
Third, a code example
# coding:utf-8
"""Pandas data-integration walkthrough (notebook export).

Demonstrates, cell by cell: database-style merges, merging on an index,
concatenation along an axis, combining overlapping data, reshaping with
stack/unstack, long <-> wide pivoting, and removing duplicate rows.

Bare expressions are kept as they were in the notebook; they display a value
when run cell-by-cell and are no-ops when the file is run as a script.
"""
# In[2]:
from pathlib import Path

import numpy as np
import pandas as pd
from pandas import DataFrame, Series

# ### Merging DataFrames
# 1
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df2 = DataFrame({'key': ['a', 'b', 'd'],
                 'data2': range(3)})
df1
# In[8]:
df2
# In[9]:
pd.merge(df1, df2)
# In[10]:
# By default every overlapping column name is used as a join key. To avoid
# joining on all of them, name the key column(s) explicitly with `on`.
pd.merge(df1, df2, on='key')
# In[12]:
df3 = DataFrame({'lkey': ['b', 'b', 'a', 'c', 'a', 'a', 'b'],
                 'data1': range(7)})
df4 = DataFrame({'rkey': ['a', 'b', 'd'],
                 'data2': range(3)})
df3
# In[13]:
df4
# In[14]:
# The default is an inner join, i.e. the intersection of the keys.
pd.merge(df3, df4, left_on='lkey', right_on='rkey')
# In[15]:
pd.merge(df1, df2, how='outer')
# In[16]:
df1 = DataFrame({'key': ['b', 'b', 'a', 'c', 'a', 'b'],
                 'data1': range(6)})
df2 = DataFrame({'key': ['a', 'b', 'a', 'b', 'd'],
                 'data2': range(5)})
df1
# In[17]:
df2
# In[18]:
pd.merge(df1, df2, on='key', how='left')
# In[19]:
pd.merge(df1, df2, on='key', how='right')
# In[20]:
pd.merge(df1, df2, how='inner')

# Joining on several keys
left = DataFrame({'key1': ['foo', 'foo', 'bar'],
                  'key2': ['one', 'two', 'one'],
                  'lval': [1, 2, 3]})
right = DataFrame({'key1': ['foo', 'foo', 'bar', 'bar'],
                   'key2': ['one', 'one', 'one', 'two'],
                   'rval': [4, 5, 6, 7]})
pd.merge(left, right, on=['key1', 'key2'], how='outer')
# In[27]:
# The overlapping non-key column `key2` gets the default _x / _y suffixes.
pd.merge(left, right, on='key1')
# In[24]:
pd.merge(left, right, on='key1', suffixes=('_left', '_right'))

# In[28]:
# ### Merging on the index
# 1
left1 = DataFrame({'key': ['a', 'b', 'a', 'a', 'b', 'c'],
                   'value': range(6)})
right1 = DataFrame({'group_val': [3.5, 7]}, index=['a', 'b'])
left1
# In[29]:
right1
# In[31]:
pd.merge(left1, right1, left_on='key', right_index=True)
# In[32]:
pd.merge(left1, right1, left_on='key', right_index=True, how='outer')
# In[34]:
# 2 -- hierarchical index on the right-hand side
lefth = DataFrame({'key1': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada'],
                   'key2': [2000, 2001, 2002, 2001, 2002],
                   'data': np.arange(5.)})
righth = DataFrame(np.arange(12).reshape((6, 2)),
                   index=[['Nevada', 'Nevada', 'Ohio', 'Ohio', 'Ohio', 'Ohio'],
                          [2001, 2000, 2000, 2000, 2001, 2002]],
                   columns=['event1', 'event2'])
lefth
# In[35]:
righth
# In[36]:
pd.merge(lefth, righth, left_on=['key1', 'key2'], right_index=True)
# In[37]:
left2 = DataFrame([[1., 2.], [3., 4.], [5., 6.]],
                  index=['a', 'c', 'e'],
                  columns=['Ohio', 'Nevada'])
left2
# In[38]:
right2 = DataFrame([[7., 8.], [9., 10.], [11., 12.], [13, 14]],
                   index=['b', 'c', 'd', 'e'],
                   columns=['Missouri', 'Alabama'])
right2
# In[39]:
pd.merge(left2, right2, how='outer', left_index=True, right_index=True)
# In[40]:
left2.join(right2, how='outer')
# In[41]:
left1.join(right1, on='key')
# In[42]:
another = DataFrame([[7., 8.], [9., 10.], [11., 12.], [16., 17.]],
                    index=['a', 'c', 'e', 'f'],
                    columns=['New York', 'Oregon'])
left2.join([right2, another])
# In[43]:
left2.join([right2, another], how='outer')

# In[3]:
# ### Concatenating along an axis
# 1
arr = np.arange(12).reshape((3, 4))
arr
# In[4]:
np.concatenate([arr, arr], axis=1)
# In[8]:
# 2
s1 = Series([0, 1], index=['a', 'b'])
s2 = Series([2, 3, 4], index=['c', 'd', 'e'])
s3 = Series([5, 6], index=['f', 'g'])
# In[9]:
pd.concat([s1, s2, s3])
# In[10]:
pd.concat([s1, s2, s3], axis=1)
# In[11]:
s4 = pd.concat([s1 * 5, s3])
s4
# In[12]:
pd.concat([s1, s4], axis=1)
# In[13]:
pd.concat([s1, s4], axis=1, join='inner')
# In[14]:
# `join_axes` was removed from pd.concat; reindexing the result selects the
# same row labels.
pd.concat([s1, s4], axis=1).reindex(['a', 'c', 'b', 'e'])
# In[15]:
result = pd.concat([s1, s1, s3], keys=['one', 'two', 'three'])
result
# In[16]:
result.unstack()
# In[17]:
# 4
pd.concat([s1, s2, s3], axis=1, keys=['one', 'two', 'three'])
# In[18]:
df1 = DataFrame(np.arange(6).reshape(3, 2),
                index=['a', 'b', 'c'],
                columns=['one', 'two'])
df2 = DataFrame(5 + np.arange(4).reshape(2, 2),
                index=['a', 'c'],
                columns=['three', 'four'])
# In[19]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'])
# In[20]:
pd.concat({'level1': df1, 'level2': df2}, axis=1)
# In[21]:
pd.concat([df1, df2], axis=1, keys=['level1', 'level2'],
          names=['upper', 'lower'])
# In[22]:
# 5
df1 = DataFrame(np.random.randn(3, 4), columns=['a', 'b', 'c', 'd'])
df2 = DataFrame(np.random.randn(2, 3), columns=['b', 'd', 'a'])
# In[23]:
df1
# In[24]:
df2
# In[25]:
pd.concat([df1, df2], ignore_index=True)

# In[26]:
# ### Combining overlapping data
# 1
a = Series([np.nan, 2.5, np.nan, 3.5, 4.5, np.nan],
           index=['f', 'e', 'd', 'c', 'b', 'a'])
b = Series(np.arange(len(a), dtype=np.float64),
           index=['f', 'e', 'd', 'c', 'b', 'a'])
# Positional assignment must go through .iloc: with a string index, b[-1]
# would be interpreted as a label rather than "the last element".
b.iloc[-1] = np.nan
a
# In[27]:
b
# In[29]:
# Where a is NaN take the value from b, otherwise keep a.
np.where(pd.isnull(a), b, a)
# In[30]:
# 2
b.iloc[:-2].combine_first(a.iloc[2:])
# In[31]:
# 3
df1 = DataFrame({'a': [1., np.nan, 5., np.nan],
                 'b': [np.nan, 2., np.nan, 6.],
                 'c': range(2, 18, 4)})
df2 = DataFrame({'a': [5., 4., np.nan, 3., 7.],
                 'b': [np.nan, 3., 4., 6., 8.]})
df1.combine_first(df2)

# In[32]:
# ### Reshaping with a hierarchical index
# 1
data = DataFrame(np.arange(6).reshape((2, 3)),
                 index=pd.Index(['Ohio', 'Colorado'], name='state'),
                 columns=pd.Index(['one', 'two', 'three'], name='number'))
data
# In[33]:
result = data.stack()
result
# In[34]:
result.unstack()
# In[38]:
result.unstack(0)
# In[36]:
result.unstack('state')
# In[39]:
# 2
s1 = Series([0, 1, 2, 3], index=['a', 'b', 'c', 'd'])
s2 = Series([4, 5, 6], index=['c', 'd', 'e'])
data2 = pd.concat([s1, s2], keys=['one', 'two'])
data2.unstack()
# In[40]:
# Round trip without the NaN cells that unstack() introduced. The explicit
# dropna() gives the same result whether or not stack() drops them itself.
data2.unstack().stack().dropna()
# In[41]:
# Round trip keeping the NaN cells. Newer pandas spells this
# future_stack=True; older releases do not know that keyword and use
# dropna=False instead.
try:
    data2.unstack().stack(future_stack=True)
except TypeError:
    data2.unstack().stack(dropna=False)
# In[42]:
# 3
df = DataFrame({'left': result, 'right': result + 5},
               columns=pd.Index(['left', 'right'], name='side'))
df
# In[43]:
df.unstack('state')
# In[44]:
df.unstack('state').stack('side')

# In[46]:
# ### Converting between long and wide format
# This section needs the macrodata sample file; skip it (rather than abort
# the remaining cells) when the file is not present.
MACRODATA_PATH = Path('data/macrodata.csv')
if MACRODATA_PATH.exists():
    # 1
    data = pd.read_csv(MACRODATA_PATH)
    # Build quarterly periods from the year/quarter columns via strings such
    # as "1959Q1"; the PeriodIndex(year=..., quarter=...) form is deprecated.
    quarter_labels = (data.year.astype(int).astype(str)
                      + 'Q'
                      + data.quarter.astype(int).astype(str))
    periods = pd.PeriodIndex(quarter_labels, freq='Q', name='date')
    data = data.reindex(
        columns=pd.Index(['realgdp', 'infl', 'unemp'], name='item'))
    data.index = periods.to_timestamp(freq='D', how='end')
    ldata = data.stack().reset_index().rename(columns={0: 'value'})
    wdata = ldata.pivot(index='date', columns='item', values='value')
    # In[47]:
    # 2
    ldata[:10]
    # In[48]:
    pivoted = ldata.pivot(index='date', columns='item', values='value')
    pivoted.head()
    # In[49]:
    ldata['value2'] = np.random.randn(len(ldata))
    ldata[:10]
    # In[50]:
    pivoted = ldata.pivot(index='date', columns='item')
    pivoted[:5]
    # In[53]:
    pivoted['value'][:5]
    # In[54]:
    unstacked = ldata.set_index(['date', 'item']).unstack('item')
    unstacked[:7]
else:
    print('Skipping long/wide section: %s not found' % MACRODATA_PATH)

# In[55]:
# ### Removing duplicate rows
data = DataFrame({'k1': ['one'] * 3 + ['two'] * 4,
                  'k2': [1, 1, 2, 3, 3, 4, 4]})
data
# In[56]:
data.duplicated()
# In[57]:
data.drop_duplicates()
# In[58]:
data['v1'] = range(7)  # add a column
# In[59]:
data
# In[60]:
data.drop_duplicates(['k1'])
# In[67]:
data.drop_duplicates(['k1', 'k2'])
# In[102]: