Clustering update

2011-05-04 12:17:50 +00:00
parent c79eb33bca
commit 21dd2fe5c8
3 changed files with 161 additions and 38 deletions
--- a/src/eva2/server/go/operators/cluster/ClusterAll.java
+++ b/src/eva2/server/go/operators/cluster/ClusterAll.java
@@ -0,0 +1,74 @@
+package eva2.server.go.operators.cluster;
+
+import java.io.Serializable;
+import java.util.Arrays;
+
+import eva2.server.go.populations.Population;
+
+/**
+ * Dummy class which assigns all individuals to a single cluster only.
+ * 
+ * @author mkron
+ *
+ */
+public class ClusterAll implements InterfaceClustering, Serializable {
+	
+	private boolean assignLoners = false; // should loners be assigned?
+	
+	public Object clone() {
+		return new ClusterAll();
+	}
+
+    /**
+     * Try to associate a set of loners with a given set of species. Return a list
+     * of indices assigning loner i with species j for all loners. If no species can
+     * be associated, -1 is returned as individual entry.
+     * Note that the last cluster threshold is used which may have depended on the last
+     * generation.
+     * If the clustering depends on population measures, a reference set may be given
+     * which is the reference population to consider the measures of. This is for cases
+     * where, e.g., subsets of a Population are to be clustered using measures of the
+     * original population.
+     * 
+     * @param loners
+     * @param species
+     * @param referenceSet a reference population for dynamic measures
+     * @return associative list matching loners to species.
+     */
+	@Override
+	public int[] associateLoners(Population loners, Population[] species,
+			Population referenceSet) {
+		if (loners!=null && (loners.size()>0)) {
+			int[] indices = new int[loners.size()]; 
+			if (assignLoners) Arrays.fill(indices, 0);
+			else Arrays.fill(indices, -1);
+			return indices;
+		} else return null;
+	}
+
+	@Override
+	public Population[] cluster(Population pop, Population referenceSet) {
+		// first pop is empty (there are no loners), second pop is complete
+		return new Population[]{pop.cloneWithoutInds(), pop.cloneShallowInds()};
+	}
+
+	@Override
+	public String initClustering(Population pop) {
+		return null;
+	}
+
+	@Override
+	public boolean mergingSpecies(Population species1, Population species2,
+			Population referenceSet) {
+		return true;
+	}
+	
+	public static String globalInfo() {
+		return "A dummy clustering implementation which assigns all elements to a single cluster.";
+	}
+	
+	public String getName() {
+		return "Cluster-all";
+	}
+
+}
--- a/src/eva2/server/go/operators/cluster/ClusteringKMeans.java
+++ b/src/eva2/server/go/operators/cluster/ClusteringKMeans.java
@@ -7,6 +7,8 @@ import eva2.gui.Plot;
 import eva2.server.go.individuals.AbstractEAIndividual;
 import eva2.server.go.individuals.ESIndividualDoubleData;
 import eva2.server.go.individuals.InterfaceDataTypeDouble;
+import eva2.server.go.operators.distancemetric.EuclideanMetric;
+import eva2.server.go.operators.distancemetric.InterfaceDistanceMetric;
 import eva2.server.go.populations.Population;
 import eva2.server.go.problems.F1Problem;
 import eva2.tools.chart2d.Chart2DDPointIconCircle;
@@ -24,11 +26,15 @@ import eva2.tools.math.RNG;
 */
 public class ClusteringKMeans implements InterfaceClustering, java.io.Serializable {

-    public int                         m_K                 = 5;
-    public double[][]                  m_C                 = null;
-    public boolean                     m_UseSearchSpace    = true;
-    public boolean                     m_ReuseC            = false;
-    public boolean                     m_Debug             = false;
+    private int                         m_K                 = 5;
+    private double[][]                  m_C                 = null;
+	private	double 						mergeDist = 0.001;
+	private boolean                     m_UseSearchSpace    = true;
+    private boolean                     m_ReuseC            = false;
+    private boolean                     m_Debug             = false;
+    private int 						minClustSize = 1;
+    InterfaceDistanceMetric metric = new EuclideanMetric();
+    AbstractEAIndividual tmpIndy = null;

    public ClusteringKMeans() {

@@ -38,6 +44,9 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
        this.m_Debug            = a.m_Debug;
        this.m_K                = a.m_K;
        this.m_UseSearchSpace   = a.m_UseSearchSpace;
+        this.metric 			= a.metric;
+        this.minClustSize 		= a.minClustSize;
+        this.mergeDist			= a.mergeDist;
        if (a.m_C != null) {
            this.m_C = new double[a.m_C.length][a.m_C[0].length];
            for (int i = 0; i < this.m_C.length; i++) {
@@ -62,12 +71,23 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
     * @return Population[]
     */
    public Population[] cluster(Population pop, Population referencePop) {
-        double[][] data     = this.extractClusterDataFrom(pop);
+    	if (pop.size()<m_K) {
+    		// in this case, there arent enough indies to do anything, so we just return them as "unclustered"
+    		Population[] res = new Population[1];
+    		res[0]=pop.cloneShallowInds();
+    		return res;
+    	}
+    	tmpIndy = (AbstractEAIndividual)pop.getEAIndividual(0).clone();
+//        double[][] data     = this.extractClusterDataFrom(pop);
        if (!(this.m_ReuseC) || (this.m_C == null)) {
            this.m_C            = new double[this.m_K][];
            // now choose random initial Cs
-            for (int i = 0; i < this.m_C.length; i++) {
-                this.m_C[i] = data[RNG.randomInt(0, data.length-1)];
+            Population initialSeeds = pop.getRandNIndividuals(this.m_K);
+            for (int i = 0; i < this.m_K; i++) {
+            	if (m_UseSearchSpace) this.m_C[i] = initialSeeds.getEAIndividual(i).getDoublePosition().clone();
+            	else this.m_C[i] = initialSeeds.getEAIndividual(i).getFitness().clone();
+            	
+//                this.m_C[i] = data[RNG.randomInt(0, data.length-1)];
                //this.m_C[i] = data[i];     // This works!!
                // we won't check for double instances assuming that double instances
                // will be ironed out during clustering and to prevent infinite loops
@@ -79,22 +99,23 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
        boolean     finished = false;
        double[][]  newC;
        int[]       numbOfAssigned;
-        int[]       assignment = new int[data.length];
+        int[]       assignment = new int[pop.size()];
        int         assign;
        while (!finished) {
            // first assign the data to the closes C
-            for (int i = 0; i < data.length; i++) {
+            for (int i = 0; i < pop.size(); i++) {
                // check which C is closest
                assign = 0;
                for (int j = 1; j < this.m_C.length; j++) {
-                    if (this.distance(this.m_C[assign], data[i]) > this.distance(this.m_C[j], data[i]))
+                    if (this.distance(pop.getEAIndividual(i), this.m_C[assign]) > this.distance(pop.getEAIndividual(i), this.m_C[j]))
+//                    if (this.distance(this.m_C[assign], data[i]) > this.distance(this.m_C[j], data[i]))
                        assign = j;
                }
                assignment[i] = assign;
            }

            // now calcuate the mean of each cluster and calculate new C
-            newC            = new double[this.m_K][data[0].length];
+            newC            = new double[this.m_K][m_C[0].length];
            numbOfAssigned  = new int[this.m_K];
            for (int i = 0; i < newC.length; i++) {
                numbOfAssigned[i] = 1;
@@ -103,7 +124,8 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
            for (int i = 0; i < assignment.length; i++) {
                numbOfAssigned[assignment[i]]++;
                for (int j = 0; j < newC[assignment[i]].length; j++) {
-                    newC[assignment[i]][j] += data[i][j];
+                	if (m_UseSearchSpace) newC[assignment[i]][j] += pop.getEAIndividual(i).getDoublePosition()[j];
+                	else newC[assignment[i]][j] += pop.getEAIndividual(i).getFitness(j);
                }
            }
            for (int i = 0; i < newC.length; i++) {
@@ -153,10 +175,10 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
                GraphPointSet           mySet;
                DPoint                  myPoint;
                Chart2DDPointIconText   tmp;
-                for (int i = 0; i < data.length; i++) {
+                for (int i = 0; i < pop.size(); i++) {
                    mySet = new GraphPointSet(10+1, plot.getFunctionArea());
                    mySet.setConnectedMode(false);
-                    double[] x  = data[i];
+                    double[] x  = pop.getEAIndividual(i).getDoublePosition();
                    myPoint = new DPoint(x[0], x[1]);
                    tmp = new Chart2DDPointIconText(""+assignment[i]);
                    if (assignment[i] % 2 == 0) tmp.setIcon(new Chart2DDPointIconCircle());
@@ -167,7 +189,7 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
            // finally let's check whether or not the C changed and if i can terminate k_Means
            finished = true;
            for (int i = 0; i < this.m_C.length; i++) {
-                if (this.distance(this.m_C[i], newC[i]) > 0.0001) finished = false;
+                if (EuclideanMetric.euclideanDistance(this.m_C[i], newC[i]) > 0.0001) finished = false;
                this.m_C[i] = newC[i];
            }
        } // gosh now i'm done
@@ -200,18 +222,35 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
                }
            }
        }
-
-        return result;
+        
+        // now expand to the expected format (unclustered indies at pop of index 0)
+        int largeEnough = 0;
+        // count clusters that are large enough
+        for (int i=0; i<result.length; i++) if (result[i].size()>=getMinClustSize()) largeEnough++;
+        Population[] resExpanded = new Population[largeEnough+1];
+        resExpanded[0]=pop.cloneWithoutInds();
+        int lastIndex = 1;
+        for (int i=0; i<result.length; i++) {
+        	if (result[i].size()>=getMinClustSize()) {
+        		resExpanded[lastIndex]=result[i];
+        		lastIndex++;
+        	} else resExpanded[0].addPopulation(result[i]);
+        }
+        tmpIndy=null;
+        return resExpanded;
    }

-    /** This method allows you to cluster a population using m_C
+    /** 
+     * This method allows you to cluster a population using m_C. The minimal cluster
+     * size is _not_ regarded here.
     * @param pop   The population
     * @param c     The centroids
     * @return The clusters as populations
     */
    public Population[] cluster(Population pop, double[][] c) {
+    	if (tmpIndy==null) tmpIndy=(AbstractEAIndividual)pop.getEAIndividual(0).clone(); // nec. only because the method is public...
        Population[]    result  = new Population[c.length];
-        double[][]      data    = this.extractClusterDataFrom(pop);
+//        double[][]      data    = this.extractClusterDataFrom(pop);
        int             clusterAssigned;

        try {
@@ -224,16 +263,15 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
        	e.printStackTrace();
        }
        // let's assign the elements of the population to a c
-        for (int i = 0; i < data.length; i++) {
+        for (int i = 0; i < pop.size(); i++) {
            // find the closest c
            clusterAssigned = 0;
            for (int j = 1; j < c.length; j++) {
-                if (this.distance(data[i], c[clusterAssigned]) > this.distance(data[i], c[j]))
+                if (this.distance(pop.getEAIndividual(i), c[clusterAssigned]) > this.distance(pop.getEAIndividual(i), c[j]))
                    clusterAssigned = j;
            }
            result[clusterAssigned].add(pop.get(i));
        }
-
        return result;
    }

@@ -242,16 +280,13 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
     * @param d2
     * @return The scalar distances between d1 and d2
     */
-    private double distance(double[] d1, double[] d2) {
-        double result = 0;
-
-        for (int i = 0; i < d1.length; i++) {
-            result += Math.pow(d1[i] - d2[i], 2);
-        }
-        result = Math.sqrt(result);
-        return result;
+    private double distance(AbstractEAIndividual indy, double[] p) {
+        if (m_UseSearchSpace) ((InterfaceDataTypeDouble)tmpIndy).SetDoubleGenotype(p);
+        else tmpIndy.SetFitness(p);
+        
+        return metric.distance(indy, tmpIndy);
    }
-
+    
    /** This method extracts the double data to cluster from the
     * population
     * @param pop   The population
@@ -281,8 +316,9 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
     * @return True if species converge, else False.
     */
    public boolean mergingSpecies(Population species1, Population species2, Population referencePop) {
-        // @todo i could use the BIC metric from X-means to calculate this
-        return false;
+        // TODO i could use the BIC metric from X-means to calculate this
+    	if (metric.distance(species1.getBestEAIndividual(), species2.getBestEAIndividual())<mergeDist) return true;
+    	else return false;
    }

 //    /** This method decides if a unclustered individual belongs to an already established species.
@@ -296,6 +332,7 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
 //    }
    
 	public int[] associateLoners(Population loners, Population[] species, Population referencePop) {
+//		tmpIndy = (AbstractEAIndividual)loners.getEAIndividual(0).clone();
 		int[] res=new int[loners.size()];
 		System.err.println("Warning, associateLoners not implemented for " + this.getClass());
 		Arrays.fill(res, -1);
@@ -368,7 +405,7 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
        this.m_UseSearchSpace = m;
    }
    public String useSearchSpaceTipText() {
-        return "Toggle between search/objective space distance.";
+        return "Toggel between search/objective space distance.";
    }

    /** This method allows you to toggle reuse of c.
@@ -381,10 +418,20 @@ public class ClusteringKMeans implements InterfaceClustering, java.io.Serializab
        this.m_ReuseC = m;
    }
    public String reuseCTipText() {
-        return "Toggle reuse of previously found cluster centroids.";
+        return "Toggel reuse of previously found cluster centroids.";
    }

 	public String initClustering(Population pop) {
 		return null;
 	}
+
+	public void setMinClustSize(int minClustSize) {
+		this.minClustSize = minClustSize;
+	}
+	public int getMinClustSize() {
+		return minClustSize;
+	}
+	public String minClustSizeTipText() {
+		return "Require a cluster to be at least of this size. Smaller ones are assigned to the unclustered set.";
+	}
 }
--- a/src/eva2/server/go/operators/cluster/InterfaceClustering.java
+++ b/src/eva2/server/go/operators/cluster/InterfaceClustering.java
@@ -1,6 +1,5 @@
 package eva2.server.go.operators.cluster;

-import eva2.server.go.individuals.AbstractEAIndividual;
 import eva2.server.go.populations.Population;

 /** 
@@ -32,7 +31,10 @@ public interface InterfaceClustering {
     * If the clustering depends on population measures, a reference set may be given
     * which is the reference population to consider the measures of. This is for cases
     * where, e.g., subsets of a Population are to be clustered using measures of the
-     * original population. 
+     * original population.
+     * Note that the clustered individuals should only be shallow instances of the members
+     * of the given population pop. The sum of sizes of all returned individuals must be
+     * equal to pop.size().
     * 
     * @param pop       The population of individuals that is to be clustered.
     * @param referenceSet a reference population for dynamic measures