package t_kmeans;
import java.io.*;
import java.util.*;
import java.text.DecimalFormat;
import java.util.List;
public class BasicKMeans {
static private String fname= "palsy.dat";
static List attributes = null;
static int maxNum = 0;
static DecimalFormat d2 = new DecimalFormat("##.####");
/**
* @param args the command line arguments
*/
public static void main(String[] args) {
// TODO code application logic here
// Get the data file name
// Read SVMlight format data (*.dat)
attributes = readSVMLightFile(fname);
if(attributes!=null){
System.out.println("Num of attributes : " + maxNum);
System.out.println("Num of instances : " + attributes.size());
Kmeans_cluster(attributes);
}
}// end of mains
// Implementing K-means
static void Kmeans_cluster(List attributes)
{
// Algorithm
// 1.Randomly generate k centroid points: C={C1,…,Ck}
// 2.Partition objects into k nonempty subsets (S1,…,Sk) by assigning each object xito the nearest centroid point by calculating distance metrics d(i,j) between the object xiand the centroid points Cj.
// 3.Stop if no more new assignment: output C, M(C), (S1,…,Sk)
// 4.Update k centroid points C1,…,Ck using the k subsets (S1,…,Sk)
// 5.Go back to Step 2
int k = 2;
double[][] centroids = new double[k][maxNum];
double[][] temporycentroids = new double[k][maxNum];
Vector[] cluster_Array = new Vector[k]; // creating k no of cluster
boolean flag = false;
// implementation
if (k > attributes.size())
System.exit(0);
System.out.println("The Total cluster number: " + k);
//1.Randomly generate k centroid points
centroids = randomCentroids(k);
// Step 2 and 3
// Calculate distance matric between each data point with centroids
do{
cluster_Array = cluster_Array(centroids,k);
//testing the number of k clusters
for(int idx = 0; idx < k; idx++)
{
System.out.println(cluster_Array[idx].size()+ " record " + " in Cluster "+ idx );
} // end of step 2 and 3
// searching new k number centroids points
int attribute_id = 1;
for (int idx = 0; idx < k; idx++)
{
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
temporycentroids [idx][idx1] = calculate_centroids(cluster_Array[idx],attribute_id);
attribute_id++;
}
attribute_id = 1;
}
for (int idx = 0; idx < k; idx++)
{
System.out.println("centroid point " + idx);
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
System.out.print(idx1+". ");
System.out.println(temporycentroids[idx][idx1]);
System.out.print(" ");
}
System.out.println();
} // end of step 4
// Step 5
if (compare_centroids(centroids,temporycentroids,k,maxNum) == true)
{
flag = true;
System.out.println("Centroid points are same.");
for(int idx = 0; idx < k; idx++)
System.out.println(cluster_Array[idx].size()+ " record " + " in Cluster "+ idx );
for (int idx = 0; idx < k; idx++)
{
System.out.println("centroid point " + idx);
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
System.out.print(idx1+". ");
System.out.println(temporycentroids[idx][idx1]);
System.out.print(" ");
}
System.out.println();
}
}
else // update centroids
{
System.out.println("centroid points are different.");
centroids = temporycentroids;
for (int idx = 0; idx < k; idx++)
{
System.out.println("updated centroid point " + idx);
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
System.out.print(idx1+". ");
System.out.println(centroids[idx][idx1]);
System.out.print(" ");
}
System.out.println();
}
}
}while(flag == false);
}
// KmeanCluster
// calculate_centroids start
static double calculate_centroids(List attributes, int attribute_id){
int N = attributes.size();
double centroid = 0.00;
if(N > 0)
{
for(int idx = 0; idx < N; idx++)
centroid = centroid + Double.valueOf(d2.format(getAttributeValue(attributes,idx,attribute_id)));
centroid = Double.valueOf(d2.format(centroid / N));
}
return centroid;
} // end of calculate_centroids function
// start randomCentroids
static double[][] randomCentroids(int k){
int[] all = new int[k];
for (int idx=0; idx<k; idx++){
all[idx] = 0; // initialization
}
Random rd = new Random();
int i = 0;
while (i < k){
int selectedNumber = rd.nextInt(attributes.size());
if (adding(all,i,i+1))
i++;
}
System.out.println("The data point which is done random : " );
for( int idx1=0; idx1<k ; idx1++)
System.out.println(all[idx1]+1 + " ");
double [][] temp = new double [k][maxNum];
for(int row = 0; row< k; row++)
{ int attrib_id = 1;
for(int col = 0; col < maxNum; col++)
{
temp[row][col] = Double.valueOf(d2.format(getAttributeValue(attributes,all[row],attrib_id)));
attrib_id++;
}
}
for(int row = 0; row< k; row++)
{
System.out.print("centroid " + row + ": ");
for(int col = 0; col < maxNum; col++)
{
System.out.println(temp[row][col] + " ");
}
System.out.println();
}
return temp;
}
// end of randomCentroids function
// add function start
static boolean adding(int[] list, int size, int value){
for ( int idx=0; idx<size; idx++){
if (list[idx] == value) //Does random value exist or not in list[]?
return false; //if same
}
list[size] = value;
return true;
}
/*
* end of add function
*/
// start cluster_array function
static Vector[] cluster_Array(double [][] centroids, int k){
//create given number of cluster
Vector[] cluster_Array = new Vector[k]; // create k number of cluster array
int closet_cluster = 0; // to keep nearest centroid
double closet_distance = 0.0; // to keep nearest distance
for(int idx=0; idx < k; idx++)
cluster_Array[idx] = new Vector(); // insert each List into each cluster
for (int idx = 0; idx < attributes.size(); idx++)
{
List fv1 = (List)attributes.get(idx);
for (int idx1 = 0; idx1 < k; idx1++)
{
double[] fv2 = centroids[idx1];
double distance = Double.valueOf(d2.format(distanceMetric(fv1, fv2))); // round to 2 decimal double value
if (idx1==0)
{
closet_distance = distance;
closet_cluster = idx1;
}
else if (idx1 > 0)
{
if(distance <= closet_distance)
{
closet_distance = distance;
closet_cluster = idx1;
}
}
}
cluster_Array[closet_cluster].add((List)attributes.get(idx));
}
return cluster_Array;
}
/*
* Comparing clusters ... old cluster and new one
* implemented by Thu Thu Tun
*/
static boolean compare_centroids(double [][] c1, double[][] c2, int k, int maxNum){
for (int idx = 0; idx < k; idx++)
for (int idx1 = 0; idx1 < maxNum; idx1++)
if (c1[idx][idx1] != c2[idx][idx1])
return false;
System.out.println("temp centroid");
for (int idx = 0; idx < k; idx++)
{
System.out.println("temp centroid point " + idx);
for (int idx1 = 0; idx1 < maxNum; idx1++)
{
System.out.print(idx1+". ");
System.out.println(c1[idx][idx1]);
System.out.print(" ");
}
System.out.println();
}
return true;
}
/**
* Read SVMLight data file into Vector data structure
*
* @param fname
* @return List = a list of feature vectors = [ [ label, [id, value], [id, value],...], .... ]
* a feature vector = [ label, [id, value], [id, value],...]
* label = int
* id = double
* value = double
*/
private static List readSVMLightFile(String fname) {
try{
List featureV = new Vector();
// Reading input by lines:
BufferedReader in = new BufferedReader( new FileReader(fname) );
String str = new String("");
while ((str = in.readLine()) != null){
str = str.trim();
if(str.length() > 0){
String line[];
String fs[];
line = str.split("\\s",2);
String clabel = line[0].trim().replace("+", "");
fs = line[1].trim().split("\\s"); // feature list #:v ...
List fl = new Vector();
fl.add(Integer.parseInt(clabel));
for(int idx = 0; idx < fs.length; idx++)
{
double f[] = {0,0};
String attr[] = fs[idx].split(":");
int aid = Integer.valueOf(attr[0].trim());
if( aid > maxNum) maxNum = aid;
f[0] = aid; // attribute id
f[1] = Double.valueOf(attr[1].trim()); // attribute value
fl.add(f);
}
featureV.add(fl);
}
}
in.close();
return featureV;
}
catch(IOException e){
System.out.println("Error reading file: " + e);
return null;
}
}//end of read file
/**
* Calculate Euclidean distance metric between two sparse feature vectors
* @param i feature vector = [ label, [id, value], [id, value],...]
* @param j feature vector = [ label, [id, value], [id, value],...]
* modified by Thu Thu Tun
* @return
*/
static double distanceMetric(List fv1, double[] fv2){
double[] fv = new double[maxNum];
for(int idx = 0; idx < maxNum; idx++){
fv[idx] = 0;
}
Iterator itr = fv1.iterator();
int k = 0;
while(itr.hasNext())
{
if( k > 0){
double f[] = (double[])itr.next();
fv[(int)f[0]-1] = f[1];
}
else
itr.next();
k++;
}
for (int idx = 0; idx < maxNum; idx++)
{
fv[idx] -= fv2[idx];
}
double distance = 0;
for(int idx = 0; idx < maxNum; idx++)
{
distance += Double.valueOf(d2.format(fv[idx]*fv[idx]));
}
return Double.valueOf(d2.format(Math.sqrt(distance)));
}
/**
* Retrive the attribute value
* @param attributes
* @param data_index 0 based: 0,...,dataSize-1
* @param attribute_id 1 based: 1,...,dimension
* @return
*/
static double getAttributeValue(List attributes, int data_index, int attribute_id)
{
List fv = (List)attributes.get(data_index); // feature vector
Iterator itr = fv.iterator();
int k = 0;
while(itr.hasNext())
{
if( k > 0)
{
double f[] = (double[])itr.next();
if( attribute_id == (int)f[0] )
{
return f[1];
}
}
else
itr.next();
k++;
}
return 0;
}
/**
* Retrive the class label
* @param attributes
* @param data_index 0 based: 0,...,dataSize-1
* @return
*/
static int getClassLabel(List attributes, int data_index)
{
List fv = (List)attributes.get(data_index); // feature vector
return (Integer)fv.get(0);
}
}
Stuck with KMeans Clustering Algorithm DESPERATE HELP NEEDED
Started by Suzanne M, Dec 17 2011 09:30 PM
3 replies to this topic
#1
Posted 17 December 2011 - 09:30 PM
Hi, I am supposed to implement the KMeans clustering algo for numerical and textual data. This is what I have done for data that's been read from a binary file. This code uses packages, I want to know a better way to code KMeans without involvong packages. I would also like some help on how to convert textual data to data that can have KMeans applied directly on them. Finding working codes online is a struggle. Right now m at my wits end.
|
|
|
#2
Posted 18 December 2011 - 06:02 AM
Why do you want to avoid using packages?
What kind of metrics on textual data are you interested in for clustering purposes?
What kind of metrics on textual data are you interested in for clustering purposes?
#3
Posted 18 December 2011 - 06:47 AM
I want to first change the way I have done the clustering of data. The code I have shown clusters data that was read from a dat file. I don't want to use packages since it may seem that I have stolen someone's code. Can someone first tell me how I can improve my existing code? For another algo only I need help with clustering textual data for example data containing a list of words. As of now please suggest how existing code can be improved. Thanks a lot. I want to revamp the code basically.
#4
Posted 23 December 2011 - 07:03 AM
How do I convert the lists in the existing code to maps? I understand the concepts of get and put in maps but I don't know how to implement them for my code. Please help!!!
1 user(s) are reading this topic
0 members, 1 guests, 0 anonymous users


Sign In
Create Account

Back to top









