Pseudocode for bootstrapping
Input: flat file of size MxN, containing “real” data. Assume that M is not too large.
Output: user-specified number of files of size MxN which contain “fake” data based on the real input data.
High-level pseudocode:
If the input data is numeric,
generate output data based on the mean and stdev of the input data;
else
generate output data based on the similarity measure.
Pseudocode draft 1:
For each column:
If the input data is numeric,
//generate output data based on the mean and stdev of the input data:
calculate mean and stdev
classify input data based on mean and stdev
calculate probabilities of input data classification for each class
generate fake data
else
//generate output data based on the similarity measure
find out which data values are present
calculate the probability (relative frequency) of each distinct value present in the input data
generate fake data
Pseudocode draft 2:
Assume that the input data is called s[i][j], i=1, …,M, j = 1, .., N.
Assume that the fake data are kept in file fakedata[i][j][filenumber], i=1, …,M, j = 1, .., N, filenumber = 1, .., Z, and Z is assigned by the user.
Assume numBounds = 4 (i.e. there are 4 bounds: min, mean-stdev, mean+stdev, max; and thus 3 bins between consecutive bounds)
// Generate Z fake data files of size MxN, one column at a time.
// The column loop is outermost so that the statistics of a column, which depend
// only on the real data s[][], are computed once and then reused for all Z files
// instead of being recomputed for every output file.
for (j = 1; j <= N; j++) { // go through each column
    if (numeric(j)) {
        // numeric column: bin the real values around mean +/- stdev
        calculate_mean_and_stdev(j, mean, stdev)
        classify_numeric_data(j, bounds[])
        calculate_numeric_probabilities(j, mean, stdev, bounds[], prob[])
        for (file = 1; file <= Z; file++) {
            generate_fake_numeric_data(j, bounds[], prob[], file, M, N, fakedata[][][file])
        }
    }
    else {
        // non-numeric column: use the relative frequency of each distinct value
        find_values_and_calculate_probabilities(j, vals[], prob[], numPosVals)
        for (file = 1; file <= Z; file++) {
            // pass the same vals[] array that was filled in just above
            generate_fake_nonnumeric_data(j, vals[], prob[], numPosVals, file, M, N, fakedata[][][file])
        }
    }
}
// Calculate the mean, standard deviation, minimum and maximum of column j.
// j: column number
// mean, stdev: outputs. They are written as mean[j] and stdev[j] because
//   classify_numeric_data reads them in that per-column form.
// Side effect: min[j] and max[j] receive the smallest and largest value of the
//   column; classify_numeric_data and generate_fake_numeric_data read them.
// Assumes M >= 1 (the input file has at least one row).
calculate_mean_and_stdev(j, mean, stdev) {
    sum = 0
    // Seed the extremes with the first row instead of with sentinel constants,
    // so the result is correct whatever the magnitude of the data.
    max[j] = s[1][j]
    min[j] = s[1][j]
    for (i = 1; i <= M; i++) { // go through each row
        sum = sum + s[i][j]
        if (s[i][j] > max[j]) {
            max[j] = s[i][j]
        }
        if (s[i][j] < min[j]) {
            min[j] = s[i][j]
        }
    }
    // Divide by M (the number of rows), not N (the number of columns):
    // the statistics are taken over all rows of one column.
    mean[j] = sum / M
    sumSq = 0
    for (i = 1; i <= M; i++) { // go through each row
        sumSq = sumSq + (s[i][j] - mean[j]) * (s[i][j] - mean[j])
    }
    // Population standard deviation; divide by M-1 instead if the sample
    // standard deviation is wanted (requires M >= 2).
    stdev[j] = sqrt(sumSq / M)
}
// Define the bin boundaries for column j from its mean and standard deviation:
// the first class is from min[j] to mean[j] - stdev[j];
// the second class is from mean[j] - stdev[j] to mean[j] + stdev[j];
// the third class is from mean[j] + stdev[j] to max[j].
// j: column number
// bounds: output array of the 4 boundaries, in non-decreasing order
// The two inner boundaries are clamped to [min[j], max[j]]. For skewed data
// mean[j] - stdev[j] can lie below min[j] (or mean[j] + stdev[j] above max[j]);
// without the clamp, fake values could be generated outside the observed range.
classify_numeric_data(j, bounds) {
    lower = mean[j] - stdev[j]
    if (lower < min[j]) {
        lower = min[j]
    }
    upper = mean[j] + stdev[j]
    if (upper > max[j]) {
        upper = max[j]
    }
    bounds[0] = min[j]
    bounds[1] = lower
    bounds[2] = upper
    bounds[3] = max[j]
}
// Calculate, for column j, the fraction of input rows that falls into each bin.
// j: column number
// mean, stdev: not used here; kept so that existing calls remain valid
// bounds[]: the numBounds bin boundaries (see classify_numeric_data)
// prob[]: output; prob[k] = (rows in bin k) / M, for k = 0 .. numBounds-2
calculate_numeric_probabilities (j, mean, stdev, bounds[], prob[]) {
    numBins = numBounds - 1
    int bin[numBins]; // one counter per bin (3 bins for 4 bounds)
    for (k = 0; k < numBins; k++) {
        bin[k] = 0;
    }
    for (i = 1; i <= M; i++) { // go through each row
        // Pick the first bin whose upper bound is not exceeded. A value that
        // sits exactly on a boundary is therefore counted once, in the lower
        // bin, and every row lands in exactly one bin (the last bin is the
        // fallback), so the probabilities sum to 1.
        k = 0
        while (k < numBins - 1 && s[i][j] > bounds[k+1]) {
            k++
        }
        bin[k]++
    }
    for (k = 0; k < numBins; k++) {
        // real-valued division; integer division would truncate to 0
        prob[k] = (float) bin[k] / M;
    }
}
// Fill column j of one fake data file with numeric values that follow the
// binned distribution of the real data.
// j: column number
// bounds[]: the array with bounds between the bins (see classify_numeric_data
//   for definitions)
// prob[]: prob[k] = probability that a value falls into bin k
// file: filenumber
// M: number of input samples
// N: number of columns
// fakedata: file with fake data
// Assumes rand() returns a uniformly distributed real number in [0, 1].
generate_fake_numeric_data(j, bounds[], prob[], file, M, N, fakedata[][][file]) {
    numBins = numBounds - 1
    for (i = 1; i <= M; i++) {
        r = rand()
        if (r == 0) { // take care of the borderline cases
            fakedata[i][j][file] = bounds[0] // i.e. min[j]
        }
        else if (r == 1) {
            fakedata[i][j][file] = bounds[numBounds-1] // i.e. max[j]
        }
        else {
            // Find which bin r selects: walk the cumulative probabilities until
            // r no longer exceeds the running total. Stopping at the last bin
            // keeps k in range and absorbs rounding error when the
            // probabilities sum to slightly less than 1.
            k = 0
            end = prob[0]
            while (k < numBins - 1 && r > end) {
                k++
                end = end + prob[k]
            }
            // Generate fakedata[i][j] in between bounds[k] and bounds[k+1].
            // A second, separate random number is used so that the position
            // inside the bin is independent of the bin choice.
            u = rand()
            fakedata[i][j][file] = (bounds[k+1] - bounds[k])*u + bounds[k]
        }
        //OR: findValueSlot(r, 0, prob[0], fakedata[][j][file], bound[])
    }
}
// Alternative form of the bin search: find the bin whose cumulative-probability
// interval (beg, end] contains r and write a random value from that bin.
// r: random number that selects the bin. It is now an explicit first parameter,
//   matching the call findValueSlot(r, 0, prob[0], ...); the earlier draft read
//   r without receiving it.
// beg, end: cumulative-probability interval of the first bin (0 and prob[0])
// fakedata: file with fake data
// bound[]: the array with bounds between the bins
// NOTE(review): i, j, file and prob[] are still not parameters; as in the
// earlier draft they are assumed to be visible from the caller's scope.
// Confirm, or pass them explicitly.
// If r lies in no interval (e.g. r == 0), nothing is written.
findValueSlot(r, beg, end, fakedata[][][], bound[]) {
    numBins = numBounds - 1
    for (k = 0; k <= numBins - 1; k++) { // one iteration per bin
        if (beg < r && r <= end) {
            // generate fakedata[i][j] value in between
            // bound[k] and bound[k+1]; use a fresh random number so that the
            // selector r is not overwritten
            u = rand()
            fakedata[i][j][file] = (bound[k+1] - bound[k])*u + bound[k]
            return
        }
        beg = end
        if (k + 1 <= numBins - 1) { // do not read prob[] past the last bin
            end = end + prob[k+1]
        }
    }
}
// Collect the distinct values of column j and the relative frequency of each.
// Assume that the input file has a small number of possible values for this feature.
// j: column number
// vals[]: output; the array with possible values; vals[i] is the ith value,
//   kept in ascending order so that the binary Search can be used on it
// prob[]: output; probability of each possible value; prob[i] = probability of ith value
// numPosVals: output; size of vals[]
// Assumes NIL is distinguishable from every valid index (0 is a valid index).
find_values_and_calculate_probabilities(j, vals[], prob[], numPosVals) {
    vals[] = NIL;
    k = 0; // number of distinct values found so far
    for (i = 1; i <= M; i++) {
        v = s[i][j]
        // Search only the k entries filled in so far; numPosVals is not set
        // until the end of this routine.
        if ((z = Search(v, vals[], 0, k-1)) == NIL) {
            // value not in vals[]: insert it at its sorted position, shifting
            // larger values (and their counts) one slot up
            pos = k
            while (pos > 0 && vals[pos-1] > v) {
                vals[pos] = vals[pos-1]
                prob[pos] = prob[pos-1]
                pos--
            }
            vals[pos] = v
            prob[pos] = 1 // first occurrence; also initialises the counter
            k++
        }
        else {
            prob[z]++
        }
    }
    for (t = 0; t <= k-1; t++) {
        prob[t] = prob[t]/M // turn counts into probabilities (real division)
    }
    numPosVals = k
}
// Fill column j of one fake data file with non-numeric values drawn according
// to the observed probability of each value.
// j: column number
// values[]: the array with possible values; values[i] is the ith value
// prob[]: array with probability of each value; prob[i] = probability of ith value
// numPosVals: size of values
// file: filenumber
// M: number of input samples
// N: number of columns
// fakedata: file with fake data
// Assumes rand() returns a uniformly distributed real number in [0, 1] and
// numPosVals >= 1.
generate_fake_nonnumeric_data(j, values[], prob[], numPosVals, file, M, N, fakedata[][][file]) {
    for (i = 1; i <= M; i++) {
        r = rand()
        // Walk the cumulative probabilities, starting with the interval
        // [0, prob[0]) of the first value, until r falls below the running
        // total. r == 0 selects values[0]; r == 1, or an r left over because
        // of rounding, selects the last value, so no separate border cases are
        // needed and prob[] is never read past its end.
        k = 0
        end = prob[0]
        while (k < numPosVals - 1 && r >= end) {
            k++
            end = end + prob[k]
        }
        fakedata[i][j][file] = values[k]
    }
}
// Binary search that looks for value v in array A[].
// A[]: the array to search; A[first..last] must be sorted in ascending order
// first: the index of the element from which to begin searching
// last: index of the element at which to stop (inclusive); pass last < first
//   for an empty range
// returns NIL if the value is not found, or the index if found
Search(v, A[], first, last) {
    while (first <= last) {
        mid = floor((first+last)/2)
        if (v == A[mid]) {
            return mid
        }
        else if (v > A[mid]) {
            first = mid+1 // v can only be in the upper half
        }
        else {
            last = mid-1 // v can only be in the lower half
        }
    }
    return NIL // range exhausted without finding v
}