Discrete feature encoding falls into two cases:
1. The values of the discrete feature have no inherent order, e.g. color: [red, blue]. In this case use one-hot encoding.
2. The values of the discrete feature have an inherent order, e.g. size: [X, XL, XXL]. In this case map the values to integers, e.g. {X: 1, XL: 2, XXL: 3}.
In [90]:
# Notebook setup: import the numeric/data-frame libraries used below and
# limit NumPy's printed precision so array outputs stay compact.
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

np.set_printoptions(precision=4)
X ... In [91]:
# Toy data set: one nominal feature (color), one ordinal feature (size),
# one numeric feature (prize) and a class label per row.
df = pd.DataFrame([
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
])
df.columns = ['color', 'size', 'prize', 'class label']
df
XOUT[91]:
|
Color |
size |
Prize |
class Label |
0 |
Green |
M |
10.1 |
Class1 |
1 |
Red |
L |
13.5 |
Class2 |
2 |
Blue |
Xl |
15.3 |
Class1 |
... In [92]:
# 'size' is ordinal (M < L < XL), so encode each label as an integer rank
# instead of one-hot encoding it.
size_mapping = {
    'XL': 3,
    'L': 2,
    'M': 1,
}
# Apply the mapping to the column; any size label missing from the dict
# would become NaN, so the dict must cover every value present.
df['size'] = df['size'].map(size_mapping)
df
XOUT[92]:
|
Color |
size |
Prize |
class Label |
0 |
Green |
1 |
10.1 |
Class1 |
1 |
Red |
2 |
13.5 |
Class2 |
2 |
Blue |
3 |
15.3 |
Class1 |
... In [93]:
# -----------------------------------------------
# Use pd.get_dummies() for processing: it one-hot encodes the string
# columns and leaves the numeric columns (size, prize) untouched.
# dtype=int keeps the indicator columns as 0/1; newer pandas releases
# would otherwise emit True/False.
pd.get_dummies(df, dtype=int)
XOUT[93]:
|
size |
Prize |
Color_blue |
Color_green |
color_red |
class Label_class1 |
class Label_class2 |
0 |
1 |
10.1 |
0 |
1 |
0 |
1 |
0 |
1 |
2 |
13.5 |
0 |
0 |
1 |
0 |
1 |
2 |
3 |
15.3 |
1 |
0 |
0 |
1 |
0 |
... In [94]:
# Display the frame again: pd.get_dummies() returned a new frame, so the
# original df still holds the un-encoded color and class label columns.
df
XOUT[94]:
|
Color |
size |
Prize |
class Label |
0 |
Green |
1 |
10.1 |
Class1 |
1 |
Red |
2 |
13.5 |
Class2 |
2 |
Blue |
3 |
15.3 |
Class1 |
... In [95]:
X
# -----------------------------------------------
# Use sklearn.feature_extraction.DictVectorizer for processing.
# DictVectorizer consumes one {column: value} dict per sample, so split the
# frame into per-row feature dicts (every column except the last) and a
# list of labels (the last column).
# Positional .iloc is used because the old .ix indexer was removed in
# pandas 1.0.
feature_list = df.iloc[:, :-1].to_dict(orient='records')
label_list = df.iloc[:, -1].tolist()
feature_list
XOUT[95]:
[{'color': 'green', 'prize': 10.1, 'size': 1},
 {'color': 'red', 'prize': 13.5, 'size': 2},
 {'color': 'blue', 'prize': 15.300000000000001, 'size': 3}]
... In [96]:
# Display the class labels collected from the last column.
label_list
XOUT[96]:
['class1', 'class2', 'class1']
... In [97]:
from sklearn.feature_extraction import DictVectorizer

vec = DictVectorizer()
# DictVectorizer.fit_transform() accepts a list of dicts (one per sample).
# String-valued entries are one-hot encoded and numeric entries are passed
# through; the result is sparse, so convert it to a dense array to view it.
dummy_x = vec.fit_transform(feature_list).toarray()
dummy_x
XOUT[97]:
array([[  0. ,   1. ,   0. ,  10.1,   1. ],
       [  0. ,   0. ,   1. ,  13.5,   2. ],
       [  1. ,   0. ,   0. ,  15.3,   3. ]])
... In [98]:
from sklearn import preprocessing

label_bin = preprocessing.LabelBinarizer()
# LabelBinarizer.fit_transform() accepts a list of labels and returns one
# row per sample; with only two classes the result is a single 0/1 column.
dummy_y = label_bin.fit_transform(label_list)
dummy_y
XOUT[98]:
array([[0],
       [1],
       [0]])
... In [99]: