Implementing k-means clustering in C++
#include <iostream>
#include <fstream>
#include <string>
#include <stdlib.h>
#include <cmath>
#include <random>
int main(){
std::ifstream my_file;
std::string my_line;
std::string token;
std::string delimiter = ",";
std::string spec_0 = "\"setosa\"";
std::string spec_1 = "\"versicolor\"";
double my_array[150][4];
int spec_vec[150];
int ind;
ind = -1;
my_file.open("/home/jan/iris.txt");
while(my_file.good()){
std::getline(my_file, my_line);
if(ind > -1 & ind < 151){
for(int i = 0; i < 5; ++i){
token = my_line.substr(0, my_line.find(delimiter));
if(i < 4){
double var_val = atof(token.c_str());
my_array[ind][i] = var_val;
} else {
if(token == spec_0){
spec_vec[ind] = int(0);
} else {
if(token == spec_1){
spec_vec[ind] = int(1);
} else {
spec_vec[ind] = int(2);
}
}
}
my_line = my_line.erase(0, my_line.find(delimiter) + delimiter.length());
}
}
ind = ind+1;
}
double my_array_std[150][4];
double data_means[4];
double data_sds[4];
double data_mins[4];
double data_maxs[4];
for(int i = 0; i < 4; ++i){
double my_vec[150];
double my_vec_sum = 0;
double for_sd = 0;
data_mins[i] = my_array[0][i];
data_maxs[i] = my_array[0][i];
for(int j = 0; j < 150; ++j){
my_vec[j] = my_array[j][i];
my_vec_sum += my_vec[j];
data_mins[i] = std::min(data_mins[i], my_vec[j]);
data_maxs[i] = std::max(data_maxs[i], my_vec[j]);
}
data_means[i] = my_vec_sum / 150;
for(int j = 0; j < 150; ++j){
for_sd += pow(my_vec[j] - data_means[i], 2);
}
data_sds[i] = sqrt(for_sd / 150);
for(int j = 0; j < 150; ++j){
my_array_std[j][i] = (my_vec[j] - data_means[i]) / data_sds[i];
}
}
int num_clusts = 10;
double clust_vec[num_clusts][4];
for(int i = 0; i < num_clusts; ++i){
for(int j = 0; j < 4; ++j){
auto distr = std::uniform_real_distribution<double> {(data_mins[j] - data_means[j]) / data_sds[j], (data_maxs[j] - data_means[j]) / data_sds[j]};
auto numgen = std::mt19937 {i};
clust_vec[i][j] = distr(numgen);
}
}
double clust_diffs[150][num_clusts];
int mindiff_idxs[150];
double tot_loss = 0;
for(int i = 0; i < 150; ++i){
for(int j = 0; j < num_clusts; ++j){
clust_diffs[i][j] = 0;
}
}
double best_clust_diff[150];
for(int i = 0; i < 150; ++i){
best_clust_diff[i] = 0;
}
for(int i = 0; i < 150; ++i){
for(int j = 0; j < num_clusts; ++j){
for(int k = 0; k < 4; ++k){
clust_diffs[i][j] += pow(clust_vec[j][k] - my_array_std[i][k], 2);
tot_loss += pow(clust_vec[j][k] - my_array_std[i][k], 2);
}
if(j == 0){
mindiff_idxs[i] = j;
best_clust_diff[i] = clust_diffs[i][j]; // records the best "matching" cluster found - initially assumed to be the first cluster
} else {
if(clust_diffs[i][j] < best_clust_diff[i]){ // compares loss for jth cluster with that of best cluster found up to j
mindiff_idxs[i] = j;
best_clust_diff[i] = clust_diffs[i][j]; // records the best "matching" cluster found if match is better than best previous best match
}
}
}
}
tot_loss = tot_loss / num_clusts;
int num_new_clusts = 0;
for(int i = 0; i < num_clusts; ++i){
for(int j = 0; j < 150; ++j){
if(i == mindiff_idxs[j]){
num_new_clusts += 1;
break;
}
}
}
int unique_new_clusts[num_new_clusts];
int k = 0;
for(int i = 0; i < num_clusts; ++i){
for(int j = 0; j < 150; ++j){
if(i == mindiff_idxs[j]){
unique_new_clusts[k] = mindiff_idxs[j];
k += 1;
break;
}
}
}
float new_clusts_pre[num_clusts][4];
float new_clusts[num_clusts][4];
int clust_counter[num_clusts];
for(int i = 0; i < num_new_clusts; ++i){
clust_counter[i] = 0;
}
for(int i = 0; i < num_new_clusts; ++i){
for(int j = 0; j < num_new_clusts; ++j){
new_clusts_pre[i][j] = 0;
new_clusts[i][j] = 0;
}
}
for(int j = 0; j < num_new_clusts; ++j){
for(int k = 0; k < 4; ++k){
for(int i = 0; i < 150; ++i){
if(unique_new_clusts[j] == mindiff_idxs[i]){
clust_counter[j] += 1;
new_clusts_pre[j][k] += my_array_std[i][k];
}
}
}
}
for(int i = 0; i < num_new_clusts; ++i){
for(int j = 0; j < 4; ++j){
new_clusts[i][j] = new_clusts_pre[i][j] / clust_counter[i];
}
}
std::ofstream my_file_out;
my_file_out.open("iris_clustering_results.txt");
my_file_out << "iteration" << '\t' << "index" << '\t' << "Sepal.Length" << '\t' << "Sepal.Width" << '\t' << "Petal.Length" << '\t' << "Petal.Width" << '\t' << "species" << '\t' << "cluster" << '\t' << "loss" << '\n'; // header line
for(int i = 0; i < 150; ++i){
my_file_out << 0 << '\t' << i << '\t' << my_array[i][0] << '\t' << my_array[i][1] << '\t' << my_array[i][2] << '\t' << my_array[i][3] << '\t' << spec_vec[i] << '\t' << mindiff_idxs[i] << '\t' << tot_loss << '\n';
}
for(int h = 0; h < 20; ++h){
double tot_loss = 0;
for(int i = 0; i < 150; ++i){
for(int j = 0; j < num_new_clusts; ++j){
clust_diffs[i][j] = 999;
}
}
for(int i = 0; i < 150; ++i){
best_clust_diff[i] = 0;
}
for(int i = 0; i < 150; ++i){
for(int j = 0; j < num_new_clusts; ++j){
for(int k = 0; k < 4; ++k){
if(k == 0){
if(clust_diffs[i][j] == 999){
clust_diffs[i][j] = 0;
}
}
clust_diffs[i][j] += pow(new_clusts[j][k] - my_array_std[i][k], 2);
tot_loss += pow(new_clusts[j][k] - my_array_std[i][k], 2);
}
if(j == 0){
mindiff_idxs[i] = j;
best_clust_diff[i] = clust_diffs[i][j];
} else {
if(clust_diffs[i][j] < best_clust_diff[i]){
mindiff_idxs[i] = j;
best_clust_diff[i] = clust_diffs[i][j];
}
}
}
}
tot_loss = tot_loss / num_new_clusts;
for(int i = 0; i < 150; ++i){
my_file_out << h+1 << '\t' << i << '\t' << my_array[i][0] << '\t' << my_array[i][1] << '\t' << my_array[i][2] << '\t' << my_array[i][3] << '\t'<< spec_vec[i] << '\t' << mindiff_idxs[i] << '\t' << tot_loss << '\n';
}
num_clusts = num_new_clusts;
num_new_clusts = 0;
for(int i = 0; i < num_clusts; ++i){
for(int j = 0; j < 150; ++j){
if(i == mindiff_idxs[j]){
num_new_clusts += 1;
break;
}
}
}
int unique_new_clusts[num_new_clusts];
int k = 0;
for(int i = 0; i < num_clusts; ++i){
for(int j = 0; j < 150; ++j){
if(i == mindiff_idxs[j]){
unique_new_clusts[k] = mindiff_idxs[j];
k += 1;
break;
}
}
}
for(int i = 0; i < num_new_clusts; ++i){
clust_counter[i] = 0;
}
for(int i = 0; i < num_new_clusts; ++i){
for(int j = 0; j < num_new_clusts; ++j){
new_clusts_pre[i][j] = 999;
new_clusts[i][j] = 999;
}
}
for(int j = 0; j < num_new_clusts; ++j){
for(int k = 0; k < 4; ++k){
for(int i = 0; i < 150; ++i){
if(unique_new_clusts[j] == mindiff_idxs[i]){
clust_counter[j] += 1;
if(new_clusts_pre[j][k] == 999){
new_clusts_pre[j][k] = 0;
}
new_clusts_pre[j][k] += my_array_std[i][k];
}
}
}
}
for(int i = 0; i < num_new_clusts; ++i){
for(int j = 0; j < 4; ++j){
new_clusts[i][j] = new_clusts_pre[i][j] / clust_counter[i];
}
}
}
my_file_out.close();
return 0;
}