Tree regression with the CART algorithm:
Each leaf node ultimately returns a single fixed value: the average of the target values that land in that leaf.
# coding: utf-8
import numpy as np

# Load the file data: a general function to parse tab-delimited floats.
# The last column is assumed to be the target value.
def loadDataSet(fileName):
    dataMat = []
    fr = open(fileName)
    for line in fr.readlines():
        curLine = line.strip().split('\t')
        fltLine = list(map(float, curLine))  # map all elements to float()
        dataMat.append(fltLine)
    return dataMat

# Split dataSet on the chosen feature: rows where the feature exceeds
# value go into mat0, the rest into mat1.
def binSplitDataSet(dataSet, feature, value):
    mat0 = dataSet[np.nonzero(dataSet[:, feature] > value)[0], :]
    mat1 = dataSet[np.nonzero(dataSet[:, feature] <= value)[0], :]
    return mat0, mat1

# Return the average of the last column as the leaf value; the model
# tree later returns a linear model instead.
def regLeaf(dataSet):
    return np.mean(dataSet[:, -1])

# Measure how scattered the target values are. Because the data is
# continuous, we use the variance times the number of samples, i.e.
# the total squared error.
def regErr(dataSet):
    return np.var(dataSet[:, -1]) * np.shape(dataSet)[0]

# Choose the best splitting feature and its split point.
# ops holds two preset thresholds: tolS = 1 means a split whose error
# reduction is below 1 is not made; tolN = 4 means a split leaving
# fewer than 4 samples on either side is not made. This is a form of
# pre-pruning.
def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    tolS = ops[0]; tolN = ops[1]
    # exit cond 1: if all the target values are the same, quit and return the leaf value
    if len(set(dataSet[:, -1].T.tolist()[0])) == 1:
        return None, leafType(dataSet)
    m, n = np.shape(dataSet)
    # the best feature is chosen by the reduction in RSS error from the mean
    S = errType(dataSet)
    bestS = np.inf; bestIndex = 0; bestValue = 0
    for featIndex in range(n - 1):  # loop over all features
        # loop over every value this feature takes
        for splitVal in set(dataSet[:, featIndex].T.tolist()[0]):
            mat0, mat1 = binSplitDataSet(dataSet, featIndex, splitVal)
            # if either side of the split has fewer samples than the
            # preset minimum, the split is too lopsided; skip it
            if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
                continue
            newS = errType(mat0) + errType(mat1)
            if newS < bestS:
                bestIndex = featIndex
                bestValue = splitVal
                bestS = newS
    # exit cond 2: if the decrease (S - bestS) is less than the threshold, don't split
    if (S - bestS) < tolS:
        return None, leafType(dataSet)
    mat0, mat1 = binSplitDataSet(dataSet, bestIndex, bestValue)
    # exit cond 3: the best split still leaves too few samples
    if (np.shape(mat0)[0] < tolN) or (np.shape(mat1)[0] < tolN):
        return None, leafType(dataSet)
    return bestIndex, bestValue

# Create the tree recursively.
def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1, 4)):
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

myDat = loadDataSet('ex0.txt')
myMat = np.mat(myDat)
result = createTree(myMat)
print(result)
Results:
{'spInd': 1, 'spVal': 0.39435, 'right': {'spInd': 1, 'spVal': 0.197834, 'right': -0.023838155555555553, 'left': 1.0289583666666666}, 'left': {'spInd': 1, 'spVal': 0.582002, 'right': 1.980035071428571, 'left': {'spInd': 1, 'spVal': 0.797583, 'right': 2.9836209534883724, 'left': 3.9871631999999999}}}
Reading the result: 'spInd' is the index of the feature each node splits on and 'spVal' is the value it splits at; samples whose feature value is larger go to the left branch, the rest to the right, and the splitting continues recursively until a leaf (a plain mean) is reached.
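To turn the finished tree into predictions, walk it from the root: compare the input's spInd-th feature with spVal and descend until a leaf. The listing above does not include such a helper, so the sketch below (the name treeForecast is my own) is only an illustration:

def treeForecast(tree, inData):
    # a leaf is a plain number (the mean); a subtree is a dict
    if not isinstance(tree, dict):
        return float(tree)
    # rows with a larger feature value went to the left branch
    if inData[tree['spInd']] > tree['spVal']:
        return treeForecast(tree['left'], inData)
    return treeForecast(tree['right'], inData)

# e.g. treeForecast(result, [1.0, 0.5]) descends left at the root,
# because feature 1 (0.5) is greater than 0.39435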
This algorithm works well, but when it splits the data too finely it easily overfits. Pruning techniques are therefore used.
The process of avoiding overfitting by reducing the complexity of the decision tree is called pruning.
# Check whether obj is a subtree (a dict) rather than a leaf.
def isTree(obj):
    return type(obj).__name__ == 'dict'

# Collapse a subtree: when the test data set is empty, replace the
# whole tree by the average of its leaves.
def getMean(tree):
    if isTree(tree['right']):
        tree['right'] = getMean(tree['right'])
    if isTree(tree['left']):
        tree['left'] = getMean(tree['left'])
    return (tree['left'] + tree['right']) / 2.0

# Pruning function.
def prune(tree, testData):
    # if the test data set is empty, collapse the tree
    if np.shape(testData)[0] == 0:
        return getMean(tree)
    # if the left or right branch is a tree, split the test data
    # according to this node's decision rule
    if isTree(tree['right']) or isTree(tree['left']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    # if the left side is a tree, recurse with its share of the data
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    # likewise for the right side
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)
    # if they are now both leaves, see if we can merge them
    if not isTree(tree['left']) and not isTree(tree['right']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
        # error without merging
        errorNoMerge = sum(np.power(lSet[:, -1] - tree['left'], 2)) + \
                       sum(np.power(rSet[:, -1] - tree['right'], 2))
        treeMean = (tree['left'] + tree['right']) / 2.0
        # error after merging the two leaves into their mean
        errorMerge = sum(np.power(testData[:, -1] - treeMean, 2))
        if errorMerge < errorNoMerge:
            print('merging')
            # merging helps: return the mean instead of the subtree
            return treeMean
        else:
            # merging does not help: return the tree unchanged
            return tree
    else:
        return tree
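A typical way to exercise prune is to grow a deliberately oversized tree by setting ops=(0, 1), which effectively switches pre-pruning off, and then cut it back with a held-out test set. A minimal sketch; the file names are assumptions, not files referenced above:

# grow the largest possible tree, then post-prune it with test data
myMat2 = np.mat(loadDataSet('ex2.txt'))          # assumed training file
myTree = createTree(myMat2, ops=(0, 1))
myMat2Test = np.mat(loadDataSet('ex2test.txt'))  # assumed test file
prunedTree = prune(myTree, myMat2Test)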
In general, pre-pruning and post-pruning are used together.
Model Tree
Each leaf node holds a linear model instead of a constant.
The rest is basically the same:
# Run a linear regression on the data set.
def linearSolve(dataSet):
    m, n = np.shape(dataSet)
    X = np.mat(np.ones((m, n)))
    Y = np.mat(np.ones((m, 1)))
    # one column is the constant term, so the features are shifted one column right
    X[:, 1:n] = dataSet[:, 0:n-1]
    Y = dataSet[:, -1]
    xTx = X.T * X
    if np.linalg.det(xTx) == 0.0:
        raise NameError('This matrix is singular, cannot do inverse,\n'
                        'try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

# Produce a linear model for the data set.
# The counterpart of the regLeaf function above.
def modelLeaf(dataSet):
    ws, X, Y = linearSolve(dataSet)
    return ws

# Fit the linear model for the data set and return its squared error,
# used to decide whether a split is worthwhile.
# The counterpart of the regErr function above.
def modelErr(dataSet):
    ws, X, Y = linearSolve(dataSet)
    yHat = X * ws
    return sum(np.power(Y - yHat, 2))
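Building a model tree reuses createTree unchanged; only the leaf and error functions are swapped in. A sketch, where the file name and the ops values are assumptions for illustration:

# leaves now hold regression coefficients instead of plain means
myMat2 = np.mat(loadDataSet('exp2.txt'))  # assumed data file
modelTree = createTree(myMat2, modelLeaf, modelErr, ops=(1, 10))
print(modelTree)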
A model tree fits the data well and can be used directly as a predictor.
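Prediction then works like the regression-tree walk sketched earlier, except that at a leaf the stored coefficient vector ws is multiplied by the input row, with a 1 prepended for the constant term. Again only a sketch (modelTreeForecast is my own name):

def modelTreeForecast(tree, inData):
    # inData: a 1 x (n-1) matrix of input features
    if not isinstance(tree, dict):
        n = np.shape(inData)[1]
        X = np.mat(np.ones((1, n + 1)))
        X[:, 1:n + 1] = inData   # column 0 stays 1 for the constant term
        return (X * tree)[0, 0]  # the leaf holds the ws column vector
    if inData[0, tree['spInd']] > tree['spVal']:
        return modelTreeForecast(tree['left'], inData)
    return modelTreeForecast(tree['right'], inData)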