Jump to content

Stuck with KMeans Clustering Algorithm DESPERATE HELP NEEDED

- - - - -

  • Please log in to reply
3 replies to this topic

#1
Suzanne M

Suzanne M

    Newbie

  • Members
  • Pip
  • 3 posts
Hi, I am supposed to implement the KMeans clustering algo for numerical and textual data. This is what I have done for data that's been read from a binary file. This code uses packages, I want to know a better way to code KMeans without involvong packages. I would also like some help on how to convert textual data to data that can have KMeans applied directly on them. Finding working codes online is a struggle. Right now m at my wits end.



package t_kmeans;

 

import java.io.*;

import java.util.*;

import java.text.DecimalFormat;

import java.util.List;

 

public class BasicKMeans {

    static private String fname= "palsy.dat";

    static List attributes = null;

    static int maxNum = 0;

    static DecimalFormat d2 = new DecimalFormat("##.####");

    /**

     * @param args the command line arguments

     */

    public static void main(String[] args) {

 

        // TODO code application logic here

        // Get the data file name

 

        // Read SVMlight format data (*.dat)

        attributes =  readSVMLightFile(fname);

        if(attributes!=null){

            System.out.println("Num of attributes : " + maxNum);

            System.out.println("Num of instances : " + attributes.size());

            Kmeans_cluster(attributes);

        }

    }// end of mains

 

 

      // Implementing K-means

  static void Kmeans_cluster(List attributes)

  {

    // Algorithm

    // 1.Randomly generate k centroid points: C={C1,…,Ck}

    // 2.Partition objects into k nonempty subsets (S1,…,Sk) by assigning each object xito the nearest centroid point by calculating distance metrics d(i,j) between the object xiand the centroid points Cj.

    // 3.Stop if no more new assignment: output C, M(C), (S1,…,Sk)

    // 4.Update k centroid points C1,…,Ck using the k subsets (S1,…,Sk)

    // 5.Go back to Step 2

 

    int k = 2;

    double[][] centroids = new double[k][maxNum];

    double[][] temporycentroids = new double[k][maxNum];

    Vector[] cluster_Array = new Vector[k]; // creating k no of cluster

    boolean flag = false;

 

    // implementation

    if (k > attributes.size())

        System.exit(0);

 

    System.out.println("The Total cluster number: " + k);

 

    //1.Randomly generate k centroid points

 

    centroids = randomCentroids(k);

 

 

        // Step 2 and 3

        // Calculate distance matric between each data point with centroids

    do{

 

         cluster_Array = cluster_Array(centroids,k);

 

        //testing the number of k clusters

        for(int idx = 0; idx < k; idx++)

        {

            System.out.println(cluster_Array[idx].size()+ " record "  + " in Cluster "+ idx );

        } // end of step 2 and 3

 

 

        // searching new k number centroids points

        int attribute_id = 1;

        for (int idx = 0; idx < k; idx++)

        {

            for (int idx1 = 0; idx1 < maxNum; idx1++)

            {

                temporycentroids [idx][idx1] = calculate_centroids(cluster_Array[idx],attribute_id);

                attribute_id++;

            }

            attribute_id = 1;

        }

 

        for (int idx = 0; idx < k; idx++)

        {

            System.out.println("centroid point " + idx);

            for (int idx1 = 0; idx1 < maxNum; idx1++)

            {

                System.out.print(idx1+". ");

                System.out.println(temporycentroids[idx][idx1]);

                System.out.print(" ");

            }

            System.out.println();

        }    // end of step 4

 

        // Step 5

        if (compare_centroids(centroids,temporycentroids,k,maxNum) == true)

        {

            flag = true;

            System.out.println("Centroid points are same.");

            for(int idx = 0; idx < k; idx++)

                System.out.println(cluster_Array[idx].size()+ " record "  + " in Cluster "+ idx );

 

            for (int idx = 0; idx < k; idx++)

            {

                System.out.println("centroid point " + idx);

                for (int idx1 = 0; idx1 < maxNum; idx1++)

                {

                    System.out.print(idx1+". ");

                    System.out.println(temporycentroids[idx][idx1]);

                    System.out.print(" ");

 

                }

                System.out.println();

            }

        }

        else    // update centroids

        {

                System.out.println("centroid points are different.");

                centroids = temporycentroids;

                    for (int idx = 0; idx < k; idx++)

                    {

                        System.out.println("updated centroid point " + idx);

                        for (int idx1 = 0; idx1 < maxNum; idx1++)

                        {

                            System.out.print(idx1+". ");

                            System.out.println(centroids[idx][idx1]);

                            System.out.print(" ");

 

                        }

                        System.out.println();

                    }

        }

}while(flag == false);

  }

	// KmeanCluster

 

 

 

 

	// calculate_centroids start

 

  static double calculate_centroids(List attributes, int attribute_id){

     int N = attributes.size();

     double centroid = 0.00;

     if(N > 0)

     {

          for(int idx = 0; idx < N; idx++)

          centroid = centroid + Double.valueOf(d2.format(getAttributeValue(attributes,idx,attribute_id)));

          centroid = Double.valueOf(d2.format(centroid / N));

      }

       return centroid;

  }  // end of calculate_centroids function

 

 

	// start randomCentroids

   static double[][] randomCentroids(int k){

        int[] all = new int[k];

        for (int idx=0; idx<k; idx++){

            all[idx] = 0;  // initialization

        }

        Random rd = new Random();

        int i = 0;

        while (i < k){

            int selectedNumber = rd.nextInt(attributes.size());

            if (adding(all,i,i+1))

                i++;

        }

 

        System.out.println("The data point which is done random : " );

        for( int idx1=0; idx1<k ; idx1++)

            System.out.println(all[idx1]+1 + " ");

 

 

         double [][] temp = new double [k][maxNum];

 

         for(int row = 0;  row< k; row++)

         {   int attrib_id = 1;

             for(int col = 0; col < maxNum; col++)

             {

                 temp[row][col] = Double.valueOf(d2.format(getAttributeValue(attributes,all[row],attrib_id)));

                  attrib_id++;

              }

         }

         for(int row = 0;  row< k; row++)

         {

            System.out.print("centroid " + row + ": ");

            for(int col = 0; col < maxNum; col++)

            {

                 System.out.println(temp[row][col] + " ");

             }

            System.out.println();

        }

        return temp;

    }

    // end of randomCentroids function

 

 

// add function start

   static boolean adding(int[] list, int size, int value){

 

       for ( int idx=0; idx<size; idx++){

           if (list[idx] == value) //Does random value exist or not in list[]?

                   return false; //if same

       }

       list[size] = value;

       return true;

    }

   /*

    * end of add function

    */

 

 

// start cluster_array function

    static Vector[] cluster_Array(double [][] centroids, int k){

        //create given number of cluster

        Vector[] cluster_Array = new Vector[k];  // create k number of cluster array

        int closet_cluster = 0;       // to keep nearest centroid

        double closet_distance = 0.0;  // to keep nearest distance

        for(int idx=0; idx < k; idx++)

            cluster_Array[idx] = new Vector(); // insert each List into each cluster

 

        for (int idx = 0; idx < attributes.size(); idx++)

        {

             List fv1 = (List)attributes.get(idx);

             for (int idx1 = 0; idx1 < k; idx1++)

             {

                double[] fv2 = centroids[idx1];

                double distance = Double.valueOf(d2.format(distanceMetric(fv1, fv2))); // round to 2 decimal double value

                if (idx1==0)

                {

                    closet_distance = distance;

                    closet_cluster = idx1;

                }

                else if (idx1 > 0)

                {

                   if(distance <= closet_distance)

                    {

                        closet_distance = distance;

                        closet_cluster = idx1;

                    }

                }

 

             }

             cluster_Array[closet_cluster].add((List)attributes.get(idx));

        }

 

        return cluster_Array;

 }

 

 

    /*

     * Comparing clusters ... old cluster and new one

     * implemented by Thu Thu Tun

     */

 

     static boolean compare_centroids(double [][] c1, double[][] c2, int k, int maxNum){

     for (int idx = 0; idx < k; idx++)

         for (int idx1 = 0; idx1 < maxNum; idx1++)

            if (c1[idx][idx1] != c2[idx][idx1])

                return false;

 

       System.out.println("temp centroid");

       for (int idx = 0; idx < k; idx++)

        {

            System.out.println("temp centroid point " + idx);

            for (int idx1 = 0; idx1 < maxNum; idx1++)

            {

                System.out.print(idx1+". ");

                System.out.println(c1[idx][idx1]);

                System.out.print(" ");

 

            }

            System.out.println();

        }

 

     return true;

 }

 

 /**

   * Read SVMLight data file into Vector data structure

   *

   * @param fname

   * @return List = a list of feature vectors = [ [ label, [id, value], [id, value],...], .... ]

   *     a feature vector = [ label, [id, value], [id, value],...]

   *     label = int

   *     id = double

   *     value = double

   */

    private static List readSVMLightFile(String fname) {

		try{

			List featureV = new Vector();

			// Reading input by lines:

			BufferedReader in = new BufferedReader( new FileReader(fname) );

			String str = new String("");

			while ((str = in.readLine()) != null){

			str = str.trim();

			if(str.length() > 0){

				String line[];

				String fs[];

				line = str.split("\\s",2);

				String clabel = line[0].trim().replace("+", "");

				fs = line[1].trim().split("\\s");   // feature list #:v ...

				List fl = new Vector();

				fl.add(Integer.parseInt(clabel));

			for(int idx = 0; idx < fs.length; idx++)

			{

				double f[] = {0,0};

				String attr[] = fs[idx].split(":");

				int aid = Integer.valueOf(attr[0].trim());

				if( aid > maxNum) maxNum = aid;

				f[0] = aid;                             // attribute id

				f[1] = Double.valueOf(attr[1].trim());    // attribute value

				fl.add(f);

			}

			featureV.add(fl);

        }

      }

      in.close();

      return featureV;

    }

    catch(IOException e){

        System.out.println("Error reading file: " + e);

        return null;

    }

    }//end of read file

 

   /**

   * Calculate Euclidean distance metric between two sparse feature vectors

   * @param i   feature vector = [ label, [id, value], [id, value],...]

   * @param j   feature vector = [ label, [id, value], [id, value],...]

   * modified by Thu Thu Tun

   * @return

   */

  static double distanceMetric(List fv1, double[] fv2){

		double[] fv = new double[maxNum];

		for(int idx = 0; idx < maxNum; idx++){

			fv[idx] = 0;

		}

		Iterator itr = fv1.iterator();

		int k = 0;

		while(itr.hasNext())

		{

		if( k > 0){

			double f[] = (double[])itr.next();

			fv[(int)f[0]-1] = f[1];

		}

		else

			itr.next();

		k++;

    }

 

 

    for (int idx = 0; idx < maxNum; idx++)

    {

      fv[idx] -= fv2[idx];

    }

    double distance = 0;

    for(int idx = 0; idx < maxNum; idx++)

    {

      distance += Double.valueOf(d2.format(fv[idx]*fv[idx]));

    }

 

    return Double.valueOf(d2.format(Math.sqrt(distance)));

    }

 

 

   /**

   * Retrive the attribute value

   * @param attributes

   * @param data_index   0 based: 0,...,dataSize-1

   * @param attribute_id 1 based: 1,...,dimension

   * @return

   */

  static double getAttributeValue(List attributes, int data_index, int attribute_id)

  {

    List fv = (List)attributes.get(data_index);  // feature vector

 

    Iterator itr = fv.iterator();

    int k = 0;

    while(itr.hasNext())

    {

      if( k > 0)

      {

        double f[] = (double[])itr.next();

        if( attribute_id == (int)f[0] )

        {

          return f[1];

        }

      }

      else

        itr.next();

      k++;

    }

 

    return 0;

  }

 

   /**

   * Retrive the class label

   * @param attributes

   * @param data_index   0 based: 0,...,dataSize-1

   * @return

   */

  static int getClassLabel(List attributes, int data_index)

  {

    List fv = (List)attributes.get(data_index);  // feature vector

    return (Integer)fv.get(0);

  }

}


#2
WingedPanther

WingedPanther

    A spammer's worst nightmare

  • Moderators
  • 16,831 posts
  • Location:Upstate, South Carolina
  • Programming Language:C, C++, PL/SQL, Delphi/Object Pascal, Pascal, Transact-SQL, Others
  • Learning:Java, C#, PHP, JavaScript, Lisp, Fortran, Haskell, Others
Why do you want to avoid using packages?

What kind of metrics on textual data are you interested in for clustering purposes?
Programming is a branch of mathematics.
My CodeCall Blog | My Personal Blog

#3
Suzanne M

Suzanne M

    Newbie

  • Members
  • Pip
  • 3 posts
I want to first change the way I have done the clustering of data. The code I have shown clusters data that was read from a dat file. I don't want to use packages since it may seem that I have stolen someone's code. Can someone first tell me how I can improve my existing code? For another algo only I need help with clustering textual data for example data containing a list of words. As of now please suggest how existing code can be improved. Thanks a lot. I want to revamp the code basically.

#4
Suzanne M

Suzanne M

    Newbie

  • Members
  • Pip
  • 3 posts
How do I convert the lists in the existing code to maps? I understand the concepts of get and put in maps but I don't know how to implement them for my code. Please help!!!




1 user(s) are reading this topic

0 members, 1 guests, 0 anonymous users