Ex Data, Scientia

Home Contact

Implementing k-means clustering in C++

#include <iostream>
#include <fstream>
#include <string>
#include <stdlib.h>
#include <cmath>
#include <random>

int main(){
  std::ifstream my_file;
  std::string my_line;
  std::string token;
  
  std::string delimiter = ",";
  std::string spec_0 = "\"setosa\"";
  std::string spec_1 = "\"versicolor\"";
  
  double my_array[150][4];
  int spec_vec[150];
  
  int ind;
  ind = -1;
  
  my_file.open("/home/jan/iris.txt");
  
  while(my_file.good()){
    std::getline(my_file, my_line);
    
    if(ind > -1 & ind < 151){
      for(int i = 0; i < 5; ++i){
        token = my_line.substr(0, my_line.find(delimiter));
        
        if(i < 4){
          double var_val = atof(token.c_str());
          my_array[ind][i] = var_val;
          
        } else {
          if(token == spec_0){
            spec_vec[ind] = int(0);
          } else {
            if(token == spec_1){
              spec_vec[ind] = int(1);
            } else {
              spec_vec[ind] = int(2);
            }
          }
        }
        
        my_line = my_line.erase(0, my_line.find(delimiter) + delimiter.length());
      }
    }
    
    ind = ind+1;
  }
    
  double my_array_std[150][4];
  double data_means[4];
  double data_sds[4];
  double data_mins[4];
  double data_maxs[4];
  
  for(int i = 0; i < 4; ++i){
    double my_vec[150];
    double my_vec_sum = 0;
    double for_sd = 0;
    
    data_mins[i] = my_array[0][i];
    data_maxs[i] = my_array[0][i];
    
    for(int j = 0; j < 150; ++j){
      my_vec[j] = my_array[j][i];
      my_vec_sum += my_vec[j];
      
      data_mins[i] = std::min(data_mins[i], my_vec[j]);
      data_maxs[i] = std::max(data_maxs[i], my_vec[j]);
    }
    
    data_means[i] = my_vec_sum / 150;
    
    for(int j = 0; j < 150; ++j){
      for_sd += pow(my_vec[j] - data_means[i], 2);
    }
    
    data_sds[i] = sqrt(for_sd / 150);
    
    for(int j = 0; j < 150; ++j){
      my_array_std[j][i] = (my_vec[j] - data_means[i]) / data_sds[i];
    }
  }
  
  int num_clusts = 10;
  double clust_vec[num_clusts][4];
  
  for(int i = 0; i < num_clusts; ++i){
    for(int j = 0; j < 4; ++j){
      auto distr = std::uniform_real_distribution<double> {(data_mins[j] - data_means[j]) / data_sds[j], (data_maxs[j] - data_means[j]) / data_sds[j]};
      auto numgen = std::mt19937 {i};

      clust_vec[i][j] = distr(numgen);
    }
  }
  
  double clust_diffs[150][num_clusts];
  int mindiff_idxs[150];
  
  double tot_loss = 0;
  
  for(int i = 0; i < 150; ++i){
    for(int j = 0; j < num_clusts; ++j){
      clust_diffs[i][j] = 0;
    }
  }
  
  double best_clust_diff[150];
  
  for(int i = 0; i < 150; ++i){
    best_clust_diff[i] = 0;
  }
  
  for(int i = 0; i < 150; ++i){
    for(int j = 0; j < num_clusts; ++j){
      for(int k = 0; k < 4; ++k){
        clust_diffs[i][j] += pow(clust_vec[j][k] - my_array_std[i][k], 2);
        tot_loss += pow(clust_vec[j][k] - my_array_std[i][k], 2);
      }
      if(j == 0){
        mindiff_idxs[i] = j;
        best_clust_diff[i] = clust_diffs[i][j]; // records the best "matching" cluster found - initially assumed to be the first cluster
      } else {
        if(clust_diffs[i][j] < best_clust_diff[i]){ // compares loss for jth cluster with that of best cluster found up to j
          mindiff_idxs[i] = j;
          best_clust_diff[i] = clust_diffs[i][j]; // records the best "matching" cluster found if match is better than best previous best match
        }
      }
    }
  }
  
  tot_loss = tot_loss / num_clusts;
  
  int num_new_clusts = 0;
  
  for(int i = 0; i < num_clusts; ++i){
    for(int j = 0; j < 150; ++j){
      if(i == mindiff_idxs[j]){
        num_new_clusts += 1;
        break;
      }
    }
  }
  
  int unique_new_clusts[num_new_clusts];
  
  int k = 0;
  for(int i = 0; i < num_clusts; ++i){
    for(int j = 0; j < 150; ++j){
      if(i == mindiff_idxs[j]){
        unique_new_clusts[k] = mindiff_idxs[j];
        k += 1;
        break;
      }
    }
  }
  
  float new_clusts_pre[num_clusts][4];
  float new_clusts[num_clusts][4];
  int clust_counter[num_clusts];
  
  for(int i = 0; i < num_new_clusts; ++i){
    clust_counter[i] = 0;
  }
  
  for(int i = 0; i < num_new_clusts; ++i){
    for(int j = 0; j < num_new_clusts; ++j){
      new_clusts_pre[i][j] = 0;
      new_clusts[i][j] = 0;
    }
  }
  
  for(int j = 0; j < num_new_clusts; ++j){
    for(int k = 0; k < 4; ++k){
      for(int i = 0; i < 150; ++i){
        if(unique_new_clusts[j] == mindiff_idxs[i]){
          clust_counter[j] += 1;
          new_clusts_pre[j][k] += my_array_std[i][k];
        }
      }
    }
  }
  
  for(int i = 0; i < num_new_clusts; ++i){
    for(int j = 0; j < 4; ++j){
      new_clusts[i][j] = new_clusts_pre[i][j] / clust_counter[i];
    }
  }
  
  std::ofstream my_file_out;
  my_file_out.open("iris_clustering_results.txt");
  
  my_file_out << "iteration" << '\t' << "index" << '\t' << "Sepal.Length" << '\t' << "Sepal.Width" << '\t' << "Petal.Length" << '\t' << "Petal.Width" << '\t' << "species" << '\t' << "cluster" << '\t' << "loss" << '\n'; // header line
  
  for(int i = 0; i < 150; ++i){
    my_file_out << 0 << '\t' << i << '\t' << my_array[i][0] << '\t' << my_array[i][1] << '\t' << my_array[i][2] << '\t' << my_array[i][3] <<  '\t' << spec_vec[i] << '\t' << mindiff_idxs[i] << '\t' << tot_loss << '\n';
  }
  
  for(int h = 0; h < 20; ++h){
    double tot_loss = 0;
    
    for(int i = 0; i < 150; ++i){
      for(int j = 0; j < num_new_clusts; ++j){
        clust_diffs[i][j] = 999;
      }
    }
    
    for(int i = 0; i < 150; ++i){
      best_clust_diff[i] = 0;
    }
    
    for(int i = 0; i < 150; ++i){
      for(int j = 0; j < num_new_clusts; ++j){
        for(int k = 0; k < 4; ++k){
          if(k == 0){
            if(clust_diffs[i][j] == 999){
              clust_diffs[i][j] = 0;
            }
          }
          clust_diffs[i][j] += pow(new_clusts[j][k] - my_array_std[i][k], 2);
          tot_loss += pow(new_clusts[j][k] - my_array_std[i][k], 2);
        }
        if(j == 0){
          mindiff_idxs[i] = j;
          best_clust_diff[i] = clust_diffs[i][j];
        } else {
          if(clust_diffs[i][j] < best_clust_diff[i]){
            mindiff_idxs[i] = j;
            best_clust_diff[i] = clust_diffs[i][j];
          }
        }
      }
    }
    
    tot_loss = tot_loss / num_new_clusts;
    
    for(int i = 0; i < 150; ++i){
      my_file_out << h+1 << '\t' << i << '\t' << my_array[i][0] << '\t' << my_array[i][1] << '\t' << my_array[i][2] << '\t' << my_array[i][3] << '\t'<< spec_vec[i] << '\t' << mindiff_idxs[i] << '\t' << tot_loss << '\n';
    }
    
    num_clusts = num_new_clusts;
    num_new_clusts = 0;
    
    for(int i = 0; i < num_clusts; ++i){
      for(int j = 0; j < 150; ++j){
        if(i == mindiff_idxs[j]){
          num_new_clusts += 1;
          break;
        }
      }
    }
    
    int unique_new_clusts[num_new_clusts];
    
    int k = 0;
    for(int i = 0; i < num_clusts; ++i){
      for(int j = 0; j < 150; ++j){
        if(i == mindiff_idxs[j]){
          unique_new_clusts[k] = mindiff_idxs[j];
          k += 1;
          break;
        }
      }
    }
    
    for(int i = 0; i < num_new_clusts; ++i){
      clust_counter[i] = 0;
    }
    
    for(int i = 0; i < num_new_clusts; ++i){
      for(int j = 0; j < num_new_clusts; ++j){
        new_clusts_pre[i][j] = 999;
        new_clusts[i][j] = 999;
      }
    }
    
    for(int j = 0; j < num_new_clusts; ++j){
      for(int k = 0; k < 4; ++k){
        for(int i = 0; i < 150; ++i){
          if(unique_new_clusts[j] == mindiff_idxs[i]){
            clust_counter[j] += 1;
            if(new_clusts_pre[j][k] == 999){
              new_clusts_pre[j][k] = 0;
            }
            new_clusts_pre[j][k] += my_array_std[i][k];
          }
        }
      }
    }
    
    for(int i = 0; i < num_new_clusts; ++i){
      for(int j = 0; j < 4; ++j){
        new_clusts[i][j] = new_clusts_pre[i][j] / clust_counter[i];
      }
    }
  }
  
  my_file_out.close();
  
  return 0;
}