blob: 24506058a0ff1520df66d6442e84bcefb7f66250 [file] [log] [blame]
/*
* Copyright (C) 2012 The Android Open Source Project
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef KMEANS_H
#define KMEANS_H
#include <cstdlib>
#include <math.h>
// Helper functions
template <typename T, typename N>
inline void sum(T values[], int len, int dimension, int stride, N dst[]) {
int x, y;
// zero out dst vector
for (x = 0; x < dimension; x++) {
dst[x] = 0;
}
for (x = 0; x < len; x+= stride) {
for (y = 0; y < dimension; y++) {
dst[y] += values[x + y];
}
}
}
template <typename T, typename N>
inline void set(T val1[], N val2[], int dimension) {
int x;
for (x = 0; x < dimension; x++) {
val1[x] = val2[x];
}
}
template <typename T, typename N>
inline void add(T val[], N dst[], int dimension) {
int x;
for (x = 0; x < dimension; x++) {
dst[x] += val[x];
}
}
template <typename T, typename N>
inline void divide(T dst[], N divisor, int dimension) {
int x;
if (divisor == 0) {
return;
}
for (x = 0; x < dimension; x++) {
dst[x] /= divisor;
}
}
/**
* Calculates euclidean distance.
*/
template <typename T, typename N>
inline N euclideanDist(T val1[], T val2[], int dimension) {
int x;
N sum = 0;
for (x = 0; x < dimension; x++) {
N diff = (N) val1[x] - (N) val2[x];
sum += diff * diff;
}
return sqrt(sum);
}
// K-Means
/**
* Picks k random starting points from the data set.
*/
template <typename T>
void initialPickHeuristicRandom(int k, T values[], int len, int dimension, int stride, T dst[],
unsigned int seed) {
int x, z, num_vals, cntr;
num_vals = len / stride;
cntr = 0;
srand(seed);
unsigned int r_vals[k];
unsigned int r;
for (x = 0; x < k; x++) {
// ensure randomly chosen value is unique
int r_check = 0;
while (r_check == 0) {
r = (unsigned int) rand() % num_vals;
r_check = 1;
for (z = 0; z < x; z++) {
if (r == r_vals[z]) {
r_check = 0;
}
}
}
r_vals[x] = r;
r *= stride;
// set dst to be randomly chosen value
set<T,T>(dst + cntr, values + r, dimension);
cntr += stride;
}
}
/**
* Finds index of closet centroid to a value
*/
template <typename T, typename N>
inline int findClosest(T values[], T oldCenters[], int dimension, int stride, int pop_size) {
int best_ind = 0;
N best_len = euclideanDist <T, N>(values, oldCenters, dimension);
int y;
for (y = stride; y < pop_size; y+=stride) {
N l = euclideanDist <T, N>(values, oldCenters + y, dimension);
if (l < best_len) {
best_len = l;
best_ind = y;
}
}
return best_ind;
}
/**
* Calculates new centroids by averaging value clusters for old centroids.
*/
template <typename T, typename N>
int calculateNewCentroids(int k, T values[], int len, int dimension, int stride, T oldCenters[],
T dst[]) {
int x, pop_size;
pop_size = k * stride;
int popularities[k];
N tmp[pop_size];
//zero popularities
memset(popularities, 0, sizeof(int) * k);
// zero dst, and tmp
for (x = 0; x < pop_size; x++) {
tmp[x] = 0;
}
// put summation for each k in tmp
for (x = 0; x < len; x+=stride) {
int best = findClosest<T, N>(values + x, oldCenters, dimension, stride, pop_size);
add<T, N>(values + x, tmp + best, dimension);
popularities[best / stride]++;
}
int ret = 0;
int y;
// divide to get centroid and set dst to result
for (x = 0; x < pop_size; x+=stride) {
divide<N, int>(tmp + x, popularities[x / stride], dimension);
for (y = 0; y < dimension; y++) {
if ((dst + x)[y] != (T) ((tmp + x)[y])) {
ret = 1;
}
}
set(dst + x, tmp + x, dimension);
}
return ret;
}
template <typename T, typename N>
void runKMeansWithPicks(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
int iterations, T initialPicks[]){
int k_len = k * stride;
int x;
// zero newCenters
for (x = 0; x < k_len; x++) {
finalCentroids[x] = 0;
}
T * c1 = initialPicks;
T * c2 = finalCentroids;
T * temp;
int ret = 1;
for (x = 0; x < iterations; x++) {
ret = calculateNewCentroids<T, N>(k, values, len, dimension, stride, c1, c2);
temp = c1;
c1 = c2;
c2 = temp;
if (ret == 0) {
x = iterations;
}
}
set<T, T>(finalCentroids, c1, dimension);
}
/**
* Runs the k-means algorithm on dataset values with some initial centroids.
*/
template <typename T, typename N>
void runKMeans(int k, T finalCentroids[], T values[], int len, int dimension, int stride,
int iterations, unsigned int seed){
int k_len = k * stride;
T initialPicks [k_len];
initialPickHeuristicRandom<T>(k, values, len, dimension, stride, initialPicks, seed);
runKMeansWithPicks<T, N>(k, finalCentroids, values, len, dimension, stride,
iterations, initialPicks);
}
/**
* Sets each value in values to the closest centroid.
*/
template <typename T, typename N>
void applyCentroids(int k, T centroids[], T values[], int len, int dimension, int stride) {
int x, pop_size;
pop_size = k * stride;
for (x = 0; x < len; x+= stride) {
int best = findClosest<T, N>(values + x, centroids, dimension, stride, pop_size);
set<T, T>(values + x, centroids + best, dimension);
}
}
#endif // KMEANS_H