Implementing k-means clustering in C++
#include <iostream> #include <fstream> #include <string> #include <stdlib.h> #include <cmath> #include <random> int main(){ std::ifstream my_file; std::string my_line; std::string token; std::string delimiter = ","; std::string spec_0 = "\"setosa\""; std::string spec_1 = "\"versicolor\""; double my_array[150][4]; int spec_vec[150]; int ind; ind = -1; my_file.open("/home/jan/iris.txt"); while(my_file.good()){ std::getline(my_file, my_line); if(ind > -1 & ind < 151){ for(int i = 0; i < 5; ++i){ token = my_line.substr(0, my_line.find(delimiter)); if(i < 4){ double var_val = atof(token.c_str()); my_array[ind][i] = var_val; } else { if(token == spec_0){ spec_vec[ind] = int(0); } else { if(token == spec_1){ spec_vec[ind] = int(1); } else { spec_vec[ind] = int(2); } } } my_line = my_line.erase(0, my_line.find(delimiter) + delimiter.length()); } } ind = ind+1; } double my_array_std[150][4]; double data_means[4]; double data_sds[4]; double data_mins[4]; double data_maxs[4]; for(int i = 0; i < 4; ++i){ double my_vec[150]; double my_vec_sum = 0; double for_sd = 0; data_mins[i] = my_array[0][i]; data_maxs[i] = my_array[0][i]; for(int j = 0; j < 150; ++j){ my_vec[j] = my_array[j][i]; my_vec_sum += my_vec[j]; data_mins[i] = std::min(data_mins[i], my_vec[j]); data_maxs[i] = std::max(data_maxs[i], my_vec[j]); } data_means[i] = my_vec_sum / 150; for(int j = 0; j < 150; ++j){ for_sd += pow(my_vec[j] - data_means[i], 2); } data_sds[i] = sqrt(for_sd / 150); for(int j = 0; j < 150; ++j){ my_array_std[j][i] = (my_vec[j] - data_means[i]) / data_sds[i]; } } int num_clusts = 10; double clust_vec[num_clusts][4]; for(int i = 0; i < num_clusts; ++i){ for(int j = 0; j < 4; ++j){ auto distr = std::uniform_real_distribution<double> {(data_mins[j] - data_means[j]) / data_sds[j], (data_maxs[j] - data_means[j]) / data_sds[j]}; auto numgen = std::mt19937 {i}; clust_vec[i][j] = distr(numgen); } } double clust_diffs[150][num_clusts]; int mindiff_idxs[150]; double tot_loss = 0; for(int i = 0; i < 150; ++i){ for(int j = 0; j < num_clusts; ++j){ clust_diffs[i][j] = 0; } } double best_clust_diff[150]; for(int i = 0; i < 150; ++i){ best_clust_diff[i] = 0; } for(int i = 0; i < 150; ++i){ for(int j = 0; j < num_clusts; ++j){ for(int k = 0; k < 4; ++k){ clust_diffs[i][j] += pow(clust_vec[j][k] - my_array_std[i][k], 2); tot_loss += pow(clust_vec[j][k] - my_array_std[i][k], 2); } if(j == 0){ mindiff_idxs[i] = j; best_clust_diff[i] = clust_diffs[i][j]; // records the best "matching" cluster found - initially assumed to be the first cluster } else { if(clust_diffs[i][j] < best_clust_diff[i]){ // compares loss for jth cluster with that of best cluster found up to j mindiff_idxs[i] = j; best_clust_diff[i] = clust_diffs[i][j]; // records the best "matching" cluster found if match is better than best previous best match } } } } tot_loss = tot_loss / num_clusts; int num_new_clusts = 0; for(int i = 0; i < num_clusts; ++i){ for(int j = 0; j < 150; ++j){ if(i == mindiff_idxs[j]){ num_new_clusts += 1; break; } } } int unique_new_clusts[num_new_clusts]; int k = 0; for(int i = 0; i < num_clusts; ++i){ for(int j = 0; j < 150; ++j){ if(i == mindiff_idxs[j]){ unique_new_clusts[k] = mindiff_idxs[j]; k += 1; break; } } } float new_clusts_pre[num_clusts][4]; float new_clusts[num_clusts][4]; int clust_counter[num_clusts]; for(int i = 0; i < num_new_clusts; ++i){ clust_counter[i] = 0; } for(int i = 0; i < num_new_clusts; ++i){ for(int j = 0; j < num_new_clusts; ++j){ new_clusts_pre[i][j] = 0; new_clusts[i][j] = 0; } } for(int j = 0; j < num_new_clusts; ++j){ for(int k = 0; k < 4; ++k){ for(int i = 0; i < 150; ++i){ if(unique_new_clusts[j] == mindiff_idxs[i]){ clust_counter[j] += 1; new_clusts_pre[j][k] += my_array_std[i][k]; } } } } for(int i = 0; i < num_new_clusts; ++i){ for(int j = 0; j < 4; ++j){ new_clusts[i][j] = new_clusts_pre[i][j] / clust_counter[i]; } } std::ofstream my_file_out; my_file_out.open("iris_clustering_results.txt"); my_file_out << "iteration" << '\t' << "index" << '\t' << "Sepal.Length" << '\t' << "Sepal.Width" << '\t' << "Petal.Length" << '\t' << "Petal.Width" << '\t' << "species" << '\t' << "cluster" << '\t' << "loss" << '\n'; // header line for(int i = 0; i < 150; ++i){ my_file_out << 0 << '\t' << i << '\t' << my_array[i][0] << '\t' << my_array[i][1] << '\t' << my_array[i][2] << '\t' << my_array[i][3] << '\t' << spec_vec[i] << '\t' << mindiff_idxs[i] << '\t' << tot_loss << '\n'; } for(int h = 0; h < 20; ++h){ double tot_loss = 0; for(int i = 0; i < 150; ++i){ for(int j = 0; j < num_new_clusts; ++j){ clust_diffs[i][j] = 999; } } for(int i = 0; i < 150; ++i){ best_clust_diff[i] = 0; } for(int i = 0; i < 150; ++i){ for(int j = 0; j < num_new_clusts; ++j){ for(int k = 0; k < 4; ++k){ if(k == 0){ if(clust_diffs[i][j] == 999){ clust_diffs[i][j] = 0; } } clust_diffs[i][j] += pow(new_clusts[j][k] - my_array_std[i][k], 2); tot_loss += pow(new_clusts[j][k] - my_array_std[i][k], 2); } if(j == 0){ mindiff_idxs[i] = j; best_clust_diff[i] = clust_diffs[i][j]; } else { if(clust_diffs[i][j] < best_clust_diff[i]){ mindiff_idxs[i] = j; best_clust_diff[i] = clust_diffs[i][j]; } } } } tot_loss = tot_loss / num_new_clusts; for(int i = 0; i < 150; ++i){ my_file_out << h+1 << '\t' << i << '\t' << my_array[i][0] << '\t' << my_array[i][1] << '\t' << my_array[i][2] << '\t' << my_array[i][3] << '\t'<< spec_vec[i] << '\t' << mindiff_idxs[i] << '\t' << tot_loss << '\n'; } num_clusts = num_new_clusts; num_new_clusts = 0; for(int i = 0; i < num_clusts; ++i){ for(int j = 0; j < 150; ++j){ if(i == mindiff_idxs[j]){ num_new_clusts += 1; break; } } } int unique_new_clusts[num_new_clusts]; int k = 0; for(int i = 0; i < num_clusts; ++i){ for(int j = 0; j < 150; ++j){ if(i == mindiff_idxs[j]){ unique_new_clusts[k] = mindiff_idxs[j]; k += 1; break; } } } for(int i = 0; i < num_new_clusts; ++i){ clust_counter[i] = 0; } for(int i = 0; i < num_new_clusts; ++i){ for(int j = 0; j < num_new_clusts; ++j){ new_clusts_pre[i][j] = 999; new_clusts[i][j] = 999; } } for(int j = 0; j < num_new_clusts; ++j){ for(int k = 0; k < 4; ++k){ for(int i = 0; i < 150; ++i){ if(unique_new_clusts[j] == mindiff_idxs[i]){ clust_counter[j] += 1; if(new_clusts_pre[j][k] == 999){ new_clusts_pre[j][k] = 0; } new_clusts_pre[j][k] += my_array_std[i][k]; } } } } for(int i = 0; i < num_new_clusts; ++i){ for(int j = 0; j < 4; ++j){ new_clusts[i][j] = new_clusts_pre[i][j] / clust_counter[i]; } } } my_file_out.close(); return 0; }