用C++實現DBSCAN聚類演算法

最後更新：2018-12-08 來源：互聯網

上載者：User

創建阿里雲帳戶，並獲得超過 40 款產品的免費試用版；而企業帳戶則可以享有總值 $1200 的免費試用版。立即註冊！

這幾天由於工作需要，用C++實現了DBSCAN聚類演算法。時間複雜度為O(n^2)，主要花在計算每個點鄰域內的點上。演算法很簡單，現分享給大家參考，也希望有更多交流。
資料點類型描述如下：
程式碼如下：
#include <vector>

using namespace std;

const int DIME_NUM=2; // Number of dimensions per data point (2); global constant

// Data point type: one sample of the data set together with the per-point
// state used by the clustering code (cluster ID, core flag, visited flag and
// the list of neighbouring point IDs).
class DataPoint
{
private:
unsigned long dpID; // data point ID
double dimension[DIME_NUM]; // coordinate value for each of the DIME_NUM dimensions
long clusterId; // ID of the cluster this point belongs to
bool isKey; // whether this point is a core object
bool visited; // whether this point has already been visited
vector<unsigned long> arrivalPoints; // IDs of the data points in this point's neighbourhood
public:
DataPoint(); // default constructor
DataPoint(unsigned long dpID,double* dimension , bool isKey); // constructor taking the ID, the coordinate array and the core-object flag

unsigned long GetDpId(); // returns the data point ID
void SetDpId(unsigned long dpID); // sets the data point ID
double* GetDimension(); // returns a pointer to the internal coordinate array
void SetDimension(double* dimension); // copies DIME_NUM coordinates from the given array
bool IsKey(); // returns the core-object flag
void SetKey(bool isKey); // sets the core-object flag
bool isVisited(); // returns the visited flag
void SetVisited(bool visited); // sets the visited flag
long GetClusterId(); // returns the cluster ID
void SetClusterId(long classId); // sets the cluster ID
vector<unsigned long>& GetArrivalPoints(); // returns a mutable reference to the neighbourhood ID list
};

這是實現，程式碼如下：
#include "DataPoint.h"

// Default constructor.
// Puts every member into a defined state so that a default-constructed point
// can be copied and queried safely: ID 0, all coordinates 0.0, cluster ID -1
// (the same "unassigned" value ClusterAnalysis::Init uses), not a core
// object, not visited. The neighbourhood list starts empty.
DataPoint::DataPoint()
    : dpID(0), clusterId(-1), isKey(false), visited(false)
{
    for (int i = 0; i < DIME_NUM; i++)
    {
        dimension[i] = 0.0;
    }
}

// Constructor.
// Parameters:
//   dpID      - ID to assign to the point
//   dimension - array holding at least DIME_NUM coordinate values; copied
//   isKey     - initial value of the core-object flag
// The cluster ID starts at -1 (unassigned) and the visited flag at false so
// that no member is left indeterminate. Initialisers are listed in member
// declaration order, which is the order in which they actually run.
DataPoint::DataPoint(unsigned long dpID, double* dimension, bool isKey)
    : dpID(dpID), clusterId(-1), isKey(isKey), visited(false)
{
    // Copy the coordinate of every dimension.
    for (int i = 0; i < DIME_NUM; i++)
    {
        this->dimension[i] = dimension[i];
    }
}

// Copies DIME_NUM coordinate values from the caller's array into this point.
void DataPoint::SetDimension(double* dimension)
{
    int axis = 0;
    while (axis < DIME_NUM)
    {
        this->dimension[axis] = dimension[axis];
        ++axis;
    }
}

//擷取維度資料
double* DataPoint::GetDimension()
{
return this->dimension;
}

//擷取是否為核心對象
bool DataPoint::IsKey()
{
return this->isKey;
}

//設定核心對象標誌
void DataPoint::SetKey(bool isKey)
{
this->isKey = isKey;
}

//擷取DpId方法
unsigned long DataPoint::GetDpId()
{
return this->dpID;
}

//設定DpId方法
void DataPoint::SetDpId(unsigned long dpID)
{
this->dpID = dpID;
}

//GetIsVisited方法
bool DataPoint::isVisited()
{
return this->visited;
}

//SetIsVisited方法
void DataPoint::SetVisited( bool visited )
{
this->visited = visited;
}

//GetClusterId方法
long DataPoint::GetClusterId()
{
return this->clusterId;
}

//GetClusterId方法
void DataPoint::SetClusterId( long clusterId )
{
this->clusterId = clusterId;
}

// Returns a mutable reference to the list of neighbouring point IDs, so
// callers can both read it and append to it.
vector<unsigned long>& DataPoint::GetArrivalPoints()
{
    return this->arrivalPoints;
}

DBSCAN演算法類型描述，程式碼如下：
#include <iostream>
#include <cmath>

using namespace std;

// Cluster analysis type: holds the data set and the DBSCAN parameters and
// exposes the load / cluster / save steps.
class ClusterAnalysis
{
private:
    vector<DataPoint> dadaSets; // the data set
    unsigned int dimNum;        // number of dimensions
    double radius;              // neighbourhood radius
    unsigned int dataNum;       // number of data points
    unsigned int minPTs;        // minimum number of points in a neighbourhood

    double GetDistance(DataPoint& dp1, DataPoint& dp2);              // distance function
    void SetArrivalPoints(DataPoint& dp);                            // fills the neighbourhood list of a data point
    void KeyPointCluster(unsigned long i, unsigned long clusterId);  // clusters the points in a data point's neighbourhood

public:
    // Default constructor. Zero-initialises the scalar members so that an
    // object on which Init has not been called yet describes an empty data
    // set (dataNum == 0) instead of holding indeterminate values.
    ClusterAnalysis() : dimNum(0), radius(0.0), dataNum(0), minPTs(0) {}

    bool Init(char* fileName, double radius, int minPTs); // initialisation: stores the parameters and loads the data file
    bool DoDBSCANRecursive();                             // runs the DBSCAN clustering
    bool WriteToFile(char* fileName);                     // writes the clustering result to a file
};

聚類實現，程式碼如下：
#include "ClusterAnalysis.h"
#include <fstream>
#include <iosfwd>
#include <math.h>

/*
Function: initialise the clustering run.
Description: stores the radius and the minimum neighbourhood size, reads the
             data file into the data set (DIME_NUM whitespace-separated numbers
             per point) and computes the neighbourhood list of every point.
             Any points loaded by an earlier call are discarded first.
Parameters:
    char* fileName;  // name of the input file
    double radius;   // neighbourhood radius
    int minPTs;      // minimum number of points in a neighbourhood
Return value: true. If the file cannot be opened, an error message is printed
              and the program exits with status -1.
*/
bool ClusterAnalysis::Init(char* fileName, double radius, int minPTs)
{
    this->radius = radius;   // neighbourhood radius
    this->minPTs = minPTs;   // minimum neighbourhood size
    this->dimNum = DIME_NUM; // data dimensionality

    std::ifstream ifs(fileName);
    if (!ifs.is_open()) // the file could NOT be opened
    {
        std::cout << "Error opening file";
        exit(-1);
    }

    dadaSets.clear(); // do not accumulate points across repeated Init calls

    unsigned long count = 0; // number of points read so far
    for (;;)
    {
        // Read one full record. The result of every extraction is checked:
        // testing eof() before reading would add one bogus point whenever the
        // file ends with a newline, because eof is only set by a failed read.
        double tempDimData[DIME_NUM];
        bool complete = true;
        for (int j = 0; j < DIME_NUM; j++)
        {
            if (!(ifs >> tempDimData[j]))
            {
                complete = false;
                break;
            }
        }
        if (!complete)
            break; // end of input, or a malformed / incomplete trailing record

        //char date[20]="";
        //char time[20]="";
        ////double type; // unused field
        //ifs >> date;
        //ifs >> time; // consume unused fields

        DataPoint tempDP;
        tempDP.SetDimension(tempDimData);
        tempDP.SetDpId(count);    // the ID equals the index in dadaSets
        tempDP.SetVisited(false);
        tempDP.SetKey(false);     // real value is computed by SetArrivalPoints below
        tempDP.SetClusterId(-1);  // -1: not assigned to any cluster
        dadaSets.push_back(tempDP);
        count++;
    }
    ifs.close();

    dataNum = count;
    for (unsigned long k = 0; k < dataNum; k++)
    {
        SetArrivalPoints(dadaSets[k]); // compute the neighbourhood of each point
    }
    return true;
}

/*
Function: write the clustered data set to a file.
Description: writes one line per data point: its DIME_NUM coordinates, each
             followed by a tab, then the ID of the cluster it belongs to.
Parameters:
    char* fileName;  // name of the output file
Return value: true if the file was opened and everything was written;
              false if the file could not be opened or a write failed.
*/
bool ClusterAnalysis::WriteToFile(char* fileName)
{
    std::ofstream of1(fileName);
    if (!of1.is_open())
    {
        return false; // nothing can be written
    }

    for (unsigned long i = 0; i < dataNum; i++)
    {
        double* coords = dadaSets[i].GetDimension();
        for (int d = 0; d < DIME_NUM; d++)
        {
            of1 << coords[d] << '\t';
        }
        // '\n' instead of endl: same output without flushing on every row.
        of1 << dadaSets[i].GetClusterId() << '\n';
    }

    of1.close();
    return !of1.fail(); // failbit is set if any write or the close failed
}

/*
Function: fill the neighbourhood list of a data point.
Description: appends to dp's neighbourhood list the index of every point of the
             data set whose distance to dp is at most the radius, excluding the
             point whose index equals dp's own ID. Afterwards dp is flagged as
             a core object when the list holds at least minPTs entries, and as
             a non-core object otherwise.
Parameters:
    DataPoint& dp;  // the data point to process
Return value: none.
*/
void ClusterAnalysis::SetArrivalPoints(DataPoint& dp)
{
    vector<unsigned long>& neighbours = dp.GetArrivalPoints();

    for (unsigned long idx = 0; idx < dataNum; ++idx)
    {
        const double gap = GetDistance(dadaSets[idx], dp);
        const bool withinRadius = (gap <= radius);
        const bool isOtherPoint = (idx != dp.GetDpId());
        if (withinRadius && isOtherPoint)
        {
            neighbours.push_back(idx);
        }
    }

    const bool isCore = (neighbours.size() >= minPTs);
    dp.SetKey(isCore);
}

/*
函數：執行聚類操作
說明：執行聚類操作
參數：
傳回值： true; */
bool ClusterAnalysis::DoDBSCANRecursive()
{
unsigned long clusterId=0; //聚類id計數，初始化為0
for(unsigned long i=0; i<dataNum;i++) //對每一個資料點執行
{
DataPoint& dp=dadaSets[i]; //取到第i個資料點對象
if(!dp.isVisited() && dp.IsKey()) //若對象沒被訪問過，並且是核心對象執行
{
dp.SetClusterId(clusterId); //設定該對象所屬簇ID為clusterId
dp.SetVisited(true); //設定該對象已被訪問過
KeyPointCluster(i,clusterId); //對該對象領域內點進行聚類
clusterId++; //clusterId自增1
}
//cout << "孤立點\T" << i << endl;
}

cout <<"共聚類" <<clusterId<<"個"<< endl; //演算法完成後，輸出聚類個數
return true; //返回
}

/*
函數：對資料點領域內的點執行聚類操作
說明：採用遞迴的方法，深度優先聚類資料
參數：
unsigned long dpID; //資料點id
unsigned long clusterId; //資料點所屬簇id
傳回值： void; */
void ClusterAnalysis::KeyPointCluster(unsigned long dpID, unsigned long clusterId )
{
DataPoint& srcDp = dadaSets[dpID]; //擷取資料點對象
if(!srcDp.IsKey()) return;
vector<unsigned long>& arrvalPoints = srcDp.GetArrivalPoints(); //擷取對象領域內點ID列表
for(unsigned long i=0; i<arrvalPoints.size(); i++)
{
DataPoint& desDp = dadaSets[arrvalPoints[i]]; //擷取領域內點資料點
if(!desDp.isVisited()) //若該對象沒有被訪問過執行
{
//cout << "資料點\t"<< desDp.GetDpId()<<"聚類ID為\t" <<clusterId << endl;
desDp.SetClusterId(clusterId); //設定該對象所屬簇的ID為clusterId，即將該對象吸入簇中
desDp.SetVisited(true); //設定該對象已被訪問
if(desDp.IsKey()) //若該對象是核心對象
{
KeyPointCluster(desDp.GetDpId(),clusterId); //遞迴地對該領域點資料的領域內的點執行聚類操作，採用深度優先方法
}
}
}
}

/*
Function: distance between two data points.
Description: returns the Euclidean distance between the two points over their
             DIME_NUM coordinates. This runs for every pair of points, so the
             squares are computed by plain multiplication and the root by
             sqrt rather than through the general-purpose pow.
Parameters:
    DataPoint& dp1;  // first data point
    DataPoint& dp2;  // second data point
Return value: double;  // distance between the two points
*/
double ClusterAnalysis::GetDistance(DataPoint& dp1, DataPoint& dp2)
{
    const double* first = dp1.GetDimension();
    const double* second = dp2.GetDimension();

    double sumOfSquares = 0.0;
    for (int i = 0; i < DIME_NUM; i++)
    {
        const double diff = first[i] - second[i];
        sumOfSquares += diff * diff; // add the squared difference of this dimension
    }
    return std::sqrt(sumOfSquares);
}

演算法調用就簡單了，程式碼如下：
#include "ClusterAnalysis.h"
#include <cstdio>

using namespace std;

// Entry point: loads the points, runs the clustering and writes the result.
// Returns 0 when every step reports success and 1 otherwise.
int main()
{
    // Init and WriteToFile take char*; a string literal may not be converted
    // to char* since C++11, so the paths are kept in writable arrays.
    char inputPath[] = "D:\\1108\\XY.txt";
    char outputPath[] = "D:\\1108\\XYResult.txt";

    ClusterAnalysis myClusterAnalysis;

    // Radius 500, at least 9 points per neighbourhood (the data dimension is
    // fixed to 2 in the program).
    bool ok = myClusterAnalysis.Init(inputPath, 500, 9);
    ok = ok && myClusterAnalysis.DoDBSCANRecursive();   // run the clustering
    ok = ok && myClusterAnalysis.WriteToFile(outputPath); // save the result

    system("pause"); // keep the console window open so the output can be read
    return ok ? 0 : 1;
}

本文章原先以中文撰寫並發佈於 aliyun.com，亦設英文版本，僅作資訊用途。本網站不對文章的準確性，完整性或可靠性或其任何翻譯作出任何明示或暗示的陳述或保證。如對該文章有任何疑慮或投訴，請傳送電郵至 info-contact@alibabacloud.com 並提供相關疑慮或投訴的詳細說明。職員會於 5 個工作天內與您聯絡，一經驗證之後，即會刪除該侵權內容。

A Free Trial That Lets You Build Big!

Start building with 50+ products and up to 12 months usage for Elastic Compute Service

Get Started for Free

Sales Support

1 on 1 presale consultation

Chat Contact Sales
After-Sales Support

24/7 Technical Support 6 Free Tickets per Quarter Faster Response

Open a Ticket
Alibaba Cloud offers highly flexible support services tailored to meet your exact needs.

Learn More