$d) { $data[$key] = normaliseValue($d, sqrt($d[0]*$d[0] + $d[1] * $d[1])); } var_dump(kMeans($data, 3)); function initialiseCentroids(array $data, $k) { $dimensions = count($data[0]); $centroids = array(); $dimmax = array(); $dimmin = array(); foreach($data as $document) { foreach($document as $dim => $val) { if(!isset($dimmax[$dim]) || $val > $dimmax[$dim]) { $dimmax[$dim] = $val; } if(!isset($dimmin[$dim]) || $val < $dimmin[$dim]) { $dimmin[$dim] = $val; } } } for($i = 0; $i < $k; $i++) { $centroids[$i] = initialiseCentroid($dimensions, $dimmax, $dimmin); } return $centroids; } function initialiseCentroid($dimensions, $dimmax, $dimmin) { $total = 0; $centroid = array(); for($j = 0; $j < $dimensions; $j++) { $centroid[$j] = (rand($dimmin[$j] * 1000, $dimmax[$j] * 1000)); $total += $centroid[$j] * $centroid[$j]; } $centroid = normaliseValue($centroid, sqrt($total)); return $centroid; } function kMeans($data, $k) { $centroids = initialiseCentroids($data, $k); $mapping = array(); while(true) { $new_mapping = assignCentroids($data, $centroids); $changed = false; foreach($new_mapping as $documentID => $centroidID) { if(!isset($mapping[$documentID]) || $centroidID != $mapping[$documentID]) { $mapping = $new_mapping; $changed = true; break; } } if(!$changed){ return formatResults($mapping, $data, $centroids); } $centroids = updateCentroids($mapping, $data, $k); } } function formatResults($mapping, $data, $centroids) { $result = array(); $result['centroids'] = $centroids; foreach($mapping as $documentID => $centroidID) { $result[$centroidID][] = implode(',', $data[$documentID]); } return $result; } function assignCentroids($data, $centroids) { $mapping = array(); foreach($data as $documentID => $document) { $minDist = 100; $minCentroid = null; foreach($centroids as $centroidID => $centroid) { $dist = 0; foreach($centroid as $dim => $value) { $dist += abs($value - $document[$dim]); } if($dist < $minDist) { $minDist = $dist; $minCentroid = $centroidID; } } $mapping[$documentID] = $minCentroid; } return $mapping; } function updateCentroids($mapping, $data, $k) { $centroids = array(); $counts = array_count_values($mapping); foreach($mapping as $documentID => $centroidID) { foreach($data[$documentID] as $dim => $value) { if(!isset($cenntroids[$centroidID][$dim])) { $centroids[$centroidID][$dim] = 0; } $centroids[$centroidID][$dim] += ($value/$counts[$centroidID]); } } if(count($centroids) < $k) { $centroids = array_merge($centroids, initialiseCentroids($data, $k - count($centroids))); } return $centroids; } function normaliseValue(array $vector, $total) { foreach($vector as &$value) { $value = $value/$total; } return $vector; }