User:Chase-san/StatBufferSet
Jump to navigation
Jump to search
This is a set of stat buffer configuration to make all those nightmares of dealing with and updating multiple multidimensional VCS bins, go away.
Contents
chase.s2.stat.StatBuffer
package chase.s2.stat;
import org.csdgn.utils.VDA;
/**
* This is a visit count stat buffer. However, it is such a buffer where it may
* have any number of segmentation or dimensionality.
*
* For those uninformed, this is part of a statistical data mining system.
*
* @author Chase
*/
public class StatBuffer {
protected static final int scanWidth = 1;
/** Handing the VDA manually is dangerous, and could wipe out all your data */
private VDA<double[]> _binSet;
public int _binTotal;
public int[] _dataIndex;
/** This is fine, since java can handle jagged arrays */
public double[][] _splits;
/**
* Constructs a new buffer
* @param splits The data defining the data splits, usually a jagged array
* @param dataIndex the index of the raw stats each split is on
* @param binTotal total number of indexes for this buffer
*/
public StatBuffer(double[][] splits, int[] dataIndex, int binTotal) {
/* TODO: splits and dataIndex lengths don't equal throw exception */
_binTotal = binTotal;
_dataIndex = dataIndex;
_splits = splits;
int[] dimensions = new int[Math.max(1, splits.length)];
dimensions[0] = 1;
for(int i=0; i<splits.length; ++i)
dimensions[i] = splits[i].length + 1;
_binSet = new VDA<double[]>(dimensions);
Object[] obj = _binSet.getBackingArray();
for(int i=0; i<obj.length; ++i) {
double[] bins = new double[binTotal];
for(int j=0; j<binTotal; ++j)
bins[j] = 0;
obj[i] = bins;
}
}
/**
* Constructs a new stat buffer, using the lazy SplitSet
* @param ss SplitSet to use
* @param binTotal total number of indexes for this buffer
*/
public StatBuffer(SplitSet ss, int binTotal) {
this(ss.getSplitSet(),ss.getIndexSet(),binTotal);
}
/**
* This creates a flat, single dimension stat buffer.
* @param binTotal total number of indexes for this buffer
*/
public StatBuffer(int binTotal) {
this(new double[0][0], new int[0], binTotal);
}
/**
* This method splits the raw data as defined by the data index
* and split information. It outputs a data array which can be
* fed to the VDA to get the desired bin set.
* @param data raw data
* @return data indexes
*/
public int[] convertData(StatData data) {
int[] indexes = new int[Math.max(1, _dataIndex.length)];
if(_splits.length == 0) return indexes;
for(int i = 0; i < indexes.length; ++i) {
int index = 0;
for(;index < _splits[i].length; ++index) {
if(data._data[_dataIndex[i]] < _splits[i][index]) break;
}
indexes[i] = index;
}
return indexes;
}
/**
* Gets the individual stat buffer for the give data.
*/
public double[] getStatBin(StatData data) {
return _binSet.get(convertData(data));
}
/**
* Updates this stat buffer with the given raw data
*/
public void update(StatData data, double weight) {
double[] bins = getStatBin(data);
double index = GFToIndex(data._gf, _binTotal);
updateBin(bins, index, weight);
}
/**
* Gets the best bin from the raw data
*/
public int getBestIndex(StatData data) {
double[] bins = getStatBin(data);
return getBestIndex(bins, scanWidth);
}
/**
* Gets the best guessfactor from the raw data
*/
public double getBestGF(StatData data) {
return indexToGF(getBestIndex(data),_binTotal);
}
/**
* TODO: Replace this with some kind of externalized update method.
* That way it is easier to replace with rolling and different distributions
*/
public static final void updateBin(double[] bin, double index, double weight) {
for(int i=0; i<bin.length; ++i) {
double amount = Math.abs(i - index);
if(amount == 0) bin[i] += weight;
if(amount > 0 && amount < 1) {
bin[i] += amount * weight;
}
}
}
/**
* Static method used to find the best index. Width is number
* of extra bins to each side to test.
*/
public static final int getBestIndex(double[] bin, int width) {
double[] score = getBinScores(bin,width);
int bestIndex = 0; /* TODO: This could be problematic */
for(int i=0; i<bin.length; ++i)
if(score[i] > score[bestIndex])
bestIndex = i;
return bestIndex;
}
/**
* Gets the scores for each bin in the given stat bin.
*/
protected static final double[] getBinScores(double[] bin, int width) {
double[] scores = new double[bin.length];
for(int i=0; i<bin.length; ++i) {
double score = 0;
//It is a little complicated in there, but things will
// never go out of bounds
for(int j=-width; j<width+1; ++j) {
int abs = Math.abs(j) + 1;
if(i+j < 0 || i+j >= bin.length) {
if(i == 0 || i == bin.length - 1) {
score += bin[i - j] / abs;
} else {
int offset = (j < 0) ? -3 : 3;
score += bin[i - j + offset] / abs;
}
} else {
score += bin[i + j] / abs;
}
}
scores[i] = score;
}
return scores;
}
/**
* Converts a bin index to a guessfactor based on total number of bins.
*/
public static final double indexToGF(int index, int bins) {
index = Math.min(Math.max(0,index),bins - 1);
double half = (bins-1.0)/2.0;
return (index / half) - 1.0;
}
/**
* Converts a guessfactor to a bin index, based on number of bins. This
* returns a double, incase there is any need to know how much it over
* hangs each bin. Cast to an int for the raw information.
*/
public static final double GFToIndex(double gf, int bins) {
gf = Math.min(Math.max(-1.0,gf),1.0) + 1.0;
double half = (bins-1.0)/2.0;
return gf * half;
}
}
chase.s2.stat.StatBufferSet
package chase.s2.stat;
import java.util.ArrayList;
import java.util.List;
/**
* This class handles multiple stat buffers, as if they were one.
*/
public final class StatBufferSet {
private static final class StatItem {
StatBuffer buffer;
double weight;
}
private List<StatItem> autobin = new ArrayList<StatItem>();
/**
* Add a stat buffer to this buffer set.
*/
public void add(StatBuffer bin, double weight) {
StatItem si = new StatItem();
si.buffer = bin;
si.weight = weight;
autobin.add(si);
}
/**
* Gets the single best GF from each StatBuffer.
*/
public double[] getBestGFList(StatData data) {
double[] output = new double[autobin.size()];
int i = 0;
for(StatItem s : autobin) {
output[i++] = s.buffer.getBestGF(data);
}
return output;
}
/**
* Gets a list of all the bins for the raw data.
*/
private double[][] getBinList(StatData data) {
double[][] output = new double[autobin.size()][];
int i = 0;
for(StatItem s : autobin) {
output[i++] = s.buffer.getStatBin(data);
}
return output;
}
/**
* Gets the list of weights.
*/
private double[] getWeightList() {
double[] output = new double[autobin.size()];
int i = 0;
for(StatItem s : autobin) {
output[i++] = s.weight;
}
return output;
}
public double[] getCombinedScoreData(StatData sd) {
double[][] data = getBinList(sd);
double[] weights = getWeightList();
return getCombinedScoreData(data,weights);
}
/**
* This combines the data over all the bins in such
* a way that it can be read easily as if it were a
* single stat bin.
*/
private double[] getCombinedScoreData(double[][] data, double[] weights) {
if(weights.length != data.length)
throw new IllegalArgumentException("Data length and weights length must match.");
/* Determine the longest and shortest bin set */
int shortest = Integer.MAX_VALUE;
int longest = Integer.MIN_VALUE;
for(double[] d : data) {
if(d.length > longest) longest = d.length;
if(d.length < shortest) shortest = d.length;
}
/* Get each of their scores, and record the largest of each. */
double[][] score = new double[data.length][];
double[] largest = new double[data.length];
for(int i=0; i<data.length; ++i) {
score[i] = StatBuffer.getBinScores(data[i], StatBuffer.scanWidth);
for(int j=0;j<score[i].length;++j)
if(score[i][j] > largest[i])
largest[i] = score[i][j];
}
/* Equalize each of the stat bins */
for(int i=0; i<data.length; ++i)
for(int j=0;j<score[i].length;++j)
score[i][j] /= largest[i];
/* If they are equal, take the easy route */
if(shortest == longest) {
largest = new double[shortest];
for(int i=0; i<data.length; ++i)
for(int j=0;j<shortest;++j)
largest[j] = data[i][j] * weights[i];
return largest;
}
/* Otherwise we have to do things the 'hard' way! */
/* We use the longest and combine the sums from the others, based on gf */
largest = new double[longest];
for(int i=0; i<data.length; ++i) {
double[] tmpSum = new double[longest];
/* Get the gf for the new length and multiply it by the old weight */
for(int j=0; j<score[i].length; ++j) {
double index = StatBuffer.GFToIndex(StatBuffer.indexToGF(j, score[i].length), longest);
StatBuffer.updateBin(tmpSum, index, score[i][j]);
}
/* Add to output buffer */
for(int j=0; j<longest; ++j) {
largest[j] += tmpSum[j] * weights[i];
}
}
return largest;
}
/**
* Gets the best GF based on weights.
*/
public double getBestGF(StatData data) {
double[] score = getCombinedScoreData(data);
int bestIndex = 0; /* TODO: This could be problematic */
for(int i=0; i<score.length; ++i)
if(score[i] > score[bestIndex])
bestIndex = i;
return StatBuffer.indexToGF(bestIndex, score.length);
}
/**
* Updates this stat buffer handler with the given raw data
*/
public void update(StatData data, double weight) {
for(StatItem s : autobin) {
s.buffer.update(data, weight);
}
}
chase.s2.stat.SplitSet
package chase.s2.stat;
import java.util.ArrayDeque;
import java.util.Deque;
/**
* For the really lazy!
*
* @author Chase
*/
public class SplitSet {
public Deque<double[]> split = new ArrayDeque<double[]>();
public Deque<Integer> indexes = new ArrayDeque<Integer>();
public void add(double[] d, int index) {
split.add(d);
indexes.add(index);
}
public double[][] getSplitSet() {
Object[] obj = split.toArray();
double[][] output = new double[obj.length][];
for(int i=0; i<obj.length; ++i) {
output[i] = (double[])obj[i];
}
return output;
}
public int[] getIndexSet() {
Object[] obj = indexes.toArray();
int[] output = new int[obj.length];
for(int i=0; i<obj.length; ++i) {
output[i] = (Integer)obj[i];
}
return output;
}
}
chase.s2.stat.StatData
package chase.s2.stat;
/**
* This holds data.
*
* @author Chase
*/
public class StatData {
public double _data[], _gf;
public StatData(double[] data, double gf) {
_data = data;
_gf = gf;
}
}