Pseudocode for bootstrapping

Input: a flat file of size M×N (M rows, i.e. input samples, and N columns) containing “real” data. Assume that M is not too large.

Output: a user-specified number of files, each of size M×N, containing “fake” data based on the real input data.

High-level pseudocode:

If the input data is numeric,

generate output data based on the mean and stdev of the input data;

else

generate output data based on the similarity measure.

Pseudocode draft 1:

For each column:

If the input data is numeric,

//generate output data based on the mean and stdev of the input data:

calculate mean and stdev

classify input data based on mean and stdev

calculate probabilities of input data classification for each class

generate fake data

else

//generate output data based on the similarity measure

find out which data values are present

calculate probabilities of input data classification for each class

generate fake data

Pseudocode draft 2:

Assume that the input data is called s[i][j], i = 1, …, M, j = 1, …, N.

Assume that the fake data are kept in fakedata[i][j][filenumber], i = 1, …, M, j = 1, …, N, filenumber = 1, …, Z, where Z (the number of output files) is assigned by the user.

Assume numBounds = 4, i.e. there are 4 bounds (min, mean-stdev, mean+stdev, max) and thus 3 bins between consecutive bounds.

// Driver: generate Z fake data files of size MxN from the input data s[][].
//
// The per-column statistics (mean/stdev/bounds/probabilities, or the list of
// distinct values and their probabilities) depend only on the input data, not
// on the output file, so they are computed once per column and then reused
// for all Z files.
//
// NOTE(review): numeric(j) is not defined in this document; it is assumed to
// report whether column j of s[][] holds numeric data.
for (j = 1; j <= N; j++) { // go through each column
    if (numeric(j)) {
        calculate_mean_and_stdev(j, mean, stdev)
        classify_numeric_data(j, bounds[])
        calculate_numeric_probabilities(j, mean, stdev, bounds[], prob[])

        for (file = 1; file <= Z; file++) { // fill column j of every fake file
            generate_fake_numeric_data(j, bounds[], prob[], file, M, N, fakedata[][][file])
        }
    }
    else {
        find_values_and_calculate_probabilities(j, vals[], prob[], numPosVals)

        // pass the same vals[] array that was filled in just above
        for (file = 1; file <= Z; file++) { // fill column j of every fake file
            generate_fake_nonnumeric_data(j, vals[], prob[], numPosVals, file, M, N, fakedata[][][file])
        }
    }
}

// Calculate the mean, standard deviation, minimum and maximum of column j.
//
// j: column number
// mean, stdev: per-column result arrays; mean[j] and stdev[j] are written
//
// Also records min[j] and max[j], because classify_numeric_data reads
// min[j], mean[j], stdev[j] and max[j] to build the bin bounds.
// Reads the input data s[i][j], i = 1, ..., M.
calculate_mean_and_stdev(j, mean, stdev) {
    if (M == 0) {
        return // no rows: nothing to compute, and avoids dividing by zero
    }

    // Seed min/max from the first row instead of magic constants, so the
    // result is right for any value range.
    sum = 0
    min[j] = s[1][j]
    max[j] = s[1][j]

    for (i = 1; i <= M; i++) { // go through each row
        sum = sum + s[i][j]
        if (s[i][j] > max[j]) {
            max[j] = s[i][j]
        }
        if (s[i][j] < min[j]) {
            min[j] = s[i][j]
        }
    }

    // Divide by M (not N): the sum runs over the M rows of one column.
    mean[j] = sum / M

    sumSq = 0
    for (i = 1; i <= M; i++) { // go through each row
        sumSq = sumSq + (s[i][j] - mean[j]) * (s[i][j] - mean[j])
    }

    // Population standard deviation of the input column.
    // NOTE(review): use sumSq / (M - 1) if the sample standard deviation is wanted.
    stdev[j] = sqrt(sumSq / M)
}

// Classify input data based on mean and standard deviation such that:
//   the first class is from min[j] to mean[j] - stdev[j];
//   the second class is from mean[j] - stdev[j] to mean[j] + stdev[j];
//   the third class is from mean[j] + stdev[j] to max[j].
//
// j: column number
// bounds[]: output; the numBounds (= 4) bounds between the 3 bins, ascending
//
// Reads min[j], max[j], mean[j] and stdev[j] for column j.
//
// For skewed data mean[j] - stdev[j] can be below min[j], or mean[j] + stdev[j]
// above max[j]. The inner bounds are therefore clamped to [min[j], max[j]] so
// that bounds[] is never decreasing and no bin reaches outside the observed
// data range (an outer bin may then be empty).
classify_numeric_data(j, bounds) {
    lower = mean[j] - stdev[j]
    if (lower < min[j]) {
        lower = min[j]
    }

    upper = mean[j] + stdev[j]
    if (upper > max[j]) {
        upper = max[j]
    }

    bounds[0] = min[j]
    bounds[1] = lower
    bounds[2] = upper
    bounds[3] = max[j]
}

// Calculate the probability of each bin, i.e. the fraction of the rows of
// column j whose value falls into that bin.
//
// j: column number
// mean, stdev: not used here; kept so that existing calls stay valid
// bounds[]: the numBounds bounds between the bins (see classify_numeric_data)
// prob[]: output; prob[k] = fraction of rows in bin k, k = 0, ..., numBounds-2
//
// Bin k covers [bounds[k], bounds[k+1]); the last bin also includes its upper
// bound, so the maximum value is counted. Every row is counted in exactly one
// bin, so the probabilities add up to 1.
calculate_numeric_probabilities (j, mean, stdev, bounds[], prob[]) {
    numBins = numBounds - 1

    int bin[numBins] // one counter per bin
    for (k = 0; k < numBins; k++) {
        bin[k] = 0
    }

    for (i = 1; i <= M; i++) { // go through each row
        // Move right while the value is at or above the upper bound of bin k.
        // Stopping at the last bin guarantees termination and puts
        // s[i][j] == bounds[numBounds-1] into the last bin.
        k = 0
        while (k < numBins - 1 && s[i][j] >= bounds[k+1]) {
            k++
        }
        bin[k]++
    }

    for (k = 0; k < numBins; k++) {
        if (M == 0) {
            prob[k] = 0 // no rows: avoid dividing by zero
        }
        else {
            prob[k] = bin[k] / (real) M // real division, not integer division
        }
    }
}

// Generate fake data for numeric column j of one output file.
//
// j: column number
// bounds[]: the array with bounds between the bins (see classify_numeric_data
//           for definitions); numBounds entries
// prob[]: prob[k] = probability of bin k, k = 0, ..., numBounds-2
// file: filenumber
// M: number of input samples
// N: number of columns
// fakedata: file with fake data; fakedata[i][j][file] is written for i = 1, ..., M
//
// For every row a bin is drawn according to prob[], and then a value is drawn
// uniformly between the two bounds of that bin.
//
// NOTE(review): rand() is assumed to return a uniform real number in [0, 1].
generate_fake_numeric_data(j, bounds[], prob[], file, M, N, fakedata[][][file]) {
    numBins = numBounds - 1

    for (i = 1; i <= M; i++) {
        r = rand()

        // Find which bin r belongs to: bin k owns the slice [beg, end) of
        // [0, 1], where end is the running sum prob[0] + ... + prob[k].
        // Stopping at the last bin keeps k, bounds[k+1] and prob[k] in range
        // and also covers r == 1 and round-off in the running sum.
        // r == 0 needs no special case: it lands in the first bin whose
        // probability is not zero.
        k = 0
        end = prob[0]
        while (k < numBins - 1 && r >= end) {
            k++
            end = end + prob[k]
        }

        // Generate the fakedata[i][j] value in between bounds[k] and
        // bounds[k+1]; a separate random number is used so that the bin
        // choice above is not disturbed.
        u = rand()
        fakedata[i][j][file] = (bounds[k+1] - bounds[k]) * u + bounds[k]

        //OR: findValueSlot(r, 0, prob[0], fakedata[][j][file], bound[])
    }
}

// Alternative helper for generate_fake_numeric_data: find the bin whose slice
// [beg, end) of [0, 1] contains r, and write a value drawn uniformly from that
// bin into fakedata[i][j][file].
//
// beg, end: slice of the first bin on entry (0 and prob[0])
// fakedata[][][]: file with fake data
// bound[]: the array with bounds between the bins; numBounds entries
//
// NOTE(review): r, prob[], i, j and file are not parameters; they are assumed
// to be visible from the caller's scope. The only call shown in this document
// passes r as an extra first argument, so the parameter list and that call
// still need to be reconciled.
findValueSlot(beg, end, fakedata[][][], bound[]) {
    // There are numBounds - 1 bins, so k stops at numBounds - 2 and
    // bound[k+1] stays inside the array.
    for (k = 0; k < numBounds - 1; k++) {
        lastBin = (k == numBounds - 2)

        // The last bin is taken unconditionally: it covers r == 1 and any
        // round-off in the running sum of probabilities.
        if ((beg <= r && r < end) || lastBin) {
            // generate fakedata[i][j] value in between bound[k] and
            // bound[k+1], using a separate random number so r is kept intact
            u = rand()
            fakedata[i][j][file] = (bound[k+1] - bound[k]) * u + bound[k]
            return
        }

        // only reached when k < numBounds - 2, so prob[k+1] is a valid bin
        beg = end
        end = end + prob[k+1]
    }
}

// Find the distinct values of non-numeric column j and the probability
// (relative frequency) of each of them.
//
// Assume that the input file has a small number of possible values for this feature.
//
// j: column number
// vals[]: output; the array with possible values, kept in ascending order;
//         vals[i] is the ith value
// prob[]: output; probability of each possible value; prob[i] = probability of ith value
// numPosVals: output; size of vals[]
//
// Search() is a binary search and needs a sorted array, so every new value is
// inserted at its sorted position (its counter moves with it).
// NOTE(review): this needs an ordering on the values (e.g. lexicographic for
// strings); any consistent ordering works.
find_values_and_calculate_probabilities(j, vals[], prob[], numPosVals) {
    vals[] = NIL
    k = 0 // number of distinct values found so far

    for (i = 1; i <= M; i++) {
        // search only the k entries filled so far
        z = Search(s[i][j], vals[], 0, k-1)

        if (z == NIL) {
            // value not in vals[]: shift larger entries one slot to the
            // right and insert the new value with a count of 1
            p = k
            while (p > 0 && vals[p-1] > s[i][j]) {
                vals[p] = vals[p-1]
                prob[p] = prob[p-1]
                p--
            }
            vals[p] = s[i][j]
            prob[p] = 1
            k++
        }
        else {
            prob[z]++
        }
    }

    // turn the counts into probabilities (k > 0 implies M > 0)
    for (i = 0; i <= k-1; i++) {
        prob[i] = prob[i] / (real) M
    }

    numPosVals = k
}

// Generate fake data for non-numeric column j of one output file.
//
// j: column number
// values[]: the array with possible values; values[i] is the ith value
// prob[]: array with probability of each value; prob[i] = probability of ith value
// numPosVals: size of values
// file: filenumber
// M: number of input samples
// N: number of columns
// fakedata: file with fake data; fakedata[i][j][file] is written for i = 1, ..., M
//
// For every row one of the possible values is drawn according to prob[].
//
// NOTE(review): rand() is assumed to return a uniform real number in [0, 1].
generate_fake_nonnumeric_data(j, values[], prob[], numPosVals, file, M, N, fakedata[][][file]) {
    if (numPosVals == 0) {
        return // no possible values: nothing can be generated
    }

    for (i = 1; i <= M; i++) {
        r = rand()

        // Value k owns the slice [beg, end) of [0, 1], where end is the
        // running sum prob[0] + ... + prob[k]. Stopping at the last value
        // keeps k and prob[k] in range and also covers r == 1 and round-off
        // in the running sum. r == 0 needs no special case: it lands on the
        // first value whose probability is not zero.
        k = 0
        end = prob[0]
        while (k < numPosVals - 1 && r >= end) {
            k++
            end = end + prob[k]
        }

        fakedata[i][j][file] = values[k]
    }
}

// Binary search function that looks for value v in array A[].
//
// v: the value to look for
// A[]: the array to search; A[first..last] must be sorted in ascending order
// first: the index of the element from which to begin searching
// last: index of the element at which to stop (inclusive)
//
// Returns NIL if the value is not found, or the index if found.
// An empty range (first > last) returns NIL.
Search(v, A[], first, last) {
    while (first <= last) {
        mid = floor((first+last)/2)

        if (v == A[mid]) {
            return mid
        }
        else if (v > A[mid]) {
            first = mid+1 // v can only be in the upper half
        }
        else {
            last = mid-1 // v can only be in the lower half
        }
    }

    return NIL // range exhausted: v is not in A[first..last]
}