/** * 1-D kmeans * * This is a very efficient version of kmeans that is specialized for * 1-dimensional data. We can run very quickly by keeping only the * indices of class boundaries. * * class 1 = 0 * class 2 = n_1 * class 3 = n_2 * . * . * class k = n_k * * The data points which belong to a given class i are [n_(i-1) n_i-1]. * * When the means are updated we can use a binary search to find the new * boundary. By starting the search from the old boundary, we find the * new index *very* quickly. Also, we need only update the means based on * the data points which change clusters. * * Overall, this algorithm runs in time ~O( i k log n ) * * Copyright 2005 * Lucas Scharenbroich */ #include #include /** Pointer to the data array */ static float *data = NULL; /** * This comparison function takes two integer values and compares the float * values stored in the array at the given indices. */ int my_compar( const void *a, const void *b ) { unsigned int i = *( (unsigned int *) a ); unsigned int j = *( (unsigned int *) b ); if ( data[i] < data[j] ) return -1; if ( data[i] > data[j] ) return 1; return 0; } /** * Finds the index of the datapoint in the array which is the midpoint * between the values of x and y. This is done in logarithmic time using * a binary search. * * There are some subtle issues here. First, when moving from a midpoint, * we need to keep the selected midpoint as the upper or lower bound, since * we might otherwise overstep the proper value. At the end, we need to * check whether the upper or lower value is the best. */ unsigned int find_boundary( unsigned int *a, unsigned int n, float x, float y ) { unsigned int lower, upper, mid; float w, z, lower_sum, upper_sum; lower = 0; upper = n - 1; while ( lower < upper - 1 ) { mid = ( upper + lower ) / 2; z = data[a[mid]]; if ( y - z <= z - x ) upper = mid; else lower = mid; } /** Find the best one */ w = data[a[lower]]; z = data[a[upper]]; lower_sum = (w - x) * (w - x) + (y - w) * (y - w); upper_sum = (z - x) * (z - x) + (y - z) * (y - z); return ( lower_sum < upper_sum ) ? lower : upper; } static void swap( unsigned int **a, unsigned int **b ) { unsigned int *c = *a; *a = *b; *b = c; } /** * Compare two partitionings of the data and return true if they differ * from each other or false if not. This is the convergence criteria for * the kmeans clustering. */ static int changed( unsigned int *a, unsigned int *b, unsigned int n ) { unsigned int i; for ( i = 0; i < n; i++ ) if ( a[i] != b[i] ) return 1; return 0; } /** * Perform k-means clustering on a one-dimensional data set. Due to the * ordering imposed by scalar data, we can make the code much more * efficient. Also, some tricks are used in order to keep the results * numerically robust. There are certainly improvements to be made, though. */ float *kmeans( unsigned int n, float *a, unsigned int k ) { unsigned int i, j, *split, *prev, *perm, iteration; float *mean, temp, mean_1, mean_2; int step; /** * Allocate an array in order to hold the sorted permutation of the data * without having to sort the actual data set. Initialize with the * identity permuatation. */ perm = ( unsigned int * ) malloc( sizeof( unsigned int ) * n ); for ( i = 0; i < n; i++ ) perm[i] = i; /** * Set a global pointer to the data */ data = a; /** * Sort the data via the permutation */ qsort( perm, n, sizeof( unsigned int ), my_compar ); /** * Allocate space for the means */ mean = ( float * ) malloc( sizeof( float ) * k ); split = ( unsigned int * ) malloc( sizeof( unsigned int ) * ( k + 1 )); prev = ( unsigned int * ) malloc( sizeof( unsigned int ) * ( k + 1 )); /** * Initialize the split points */ split[0] = prev[0] = 0; split[k] = prev[k] = n; for ( i = 1; i < k; i++ ) { prev[i] = 0; split[i] = ( i * n ) / k; } /** * Compute the initial mean sums */ for ( i = 0; i < k; i++ ) { mean[i] = 0.0; for ( j = split[i]; j < split[i+1]; j++ ) { mean[i] += data[perm[j]]; } } /** * Loop until the intervals do not change */ iteration = 1; while ( changed( split, prev, k )) { /** * Make the current partition the previous */ swap( &prev, &split ); /** * Find the new partitions */ for ( i = 1; i < k; i++ ) { mean_1 = mean[i-1] / ((float) ( prev[i] - prev[i-1] )); mean_2 = mean[i] / ((float) ( prev[i+1] - prev[i] )); split[i] = find_boundary( perm, n, mean_1, mean_2 ); } /** * Update the mean sums. Accumulate the difference in order to * slightly improve the numerical robusteness. It also saves some * work. */ for ( i = 1; i < k; i++ ) { step = ( prev[i] < split[i] ) ? 1 : -1; temp = 0.0; for ( j = prev[i]; j != split[i]; j += step ) { temp += data[perm[j]]; } temp = ((float) step) * temp; mean[i-1] += temp; mean[i] -= temp; } iteration++; } printf( "Converged in %d interations\n", iteration ); /** * Divide the mean sums by the proper denominator */ for ( i = 0; i < k; i++ ) { mean[i] /= (float) (split[i+1] - split[i]); } free( prev ); free( split ); free( perm ); return mean; }