from multiprocessing import Process
import math
import sys, numpy, random, itertools
import scipy.stats as stats


'''
this script will work only with python 2.6+

input data is comma separated values (csv).
for example, here is a csv file of gene expression where
each column is a condition and each row is a gene:

sampleA,sampleB,sampleC,sampleD
12,43,412.123,24.12
54,21,541.23,123
54.321,43.12,231,53

to use the script:
python multiThreadBootstrap.py [file.csv] [number of threads] [number of randomly chosen pairs] [number of bootstrap rounds]
'''

if len(sys.argv) == 1:
    print
    print 'usage:'
    print 'python multiThreadBootstrap.py [file.csv] [number of threads] [number of randomly chosen pairs] [number of bootstrap rounds]'
    print
    sys.exit()

dataFile = open(sys.argv[1],'r')
libs = dataFile.next().strip().split(',')

data = {}
for lib in libs:
    data[lib] = []

for line in dataFile:
    values = line.strip().split(',')
    for i in range(len(libs)):
        data[libs[i]].append(float(values[i]))

#get a list of comparisons to perform the bootstrap.
comps = list(itertools.combinations(libs,2))

'''
the bootstrap function to calculate correlation.
samples - number of random samples to take from the data.
rounds - the number of times to take random samples.
'''
def bootstrapCorrelation(comps, d, samples, rounds):
    for comp in comps:
        data = [d[comp[0]],d[comp[1]]]
        output = []
        dataLen = len(data[0])
        for r in range(rounds):
            samplesA = []
            samplesB = []
            for s in range(samples):
                index = random.randrange(0,dataLen - 1)

                samplesA.append(data[0][index])
                samplesB.append(data[1][index])

            #this function returns a tuple of correlation value and p-value.
            #we are just taking the correlation value.
            output.append(stats.spearmanr(numpy.array(samplesA),numpy.array(samplesB))[0])

        output.sort()
        outFile = open('bootstrap.' + comp[0] + "." + comp[1],'w')
        outFile.write('\n'.join([str(x) for x in output]))

threads = int(sys.argv[2])
numSamples = int(sys.argv[3])
numRounds = int(sys.argv[4])

bin = int(math.ceil(len(comps) / threads))

for i in range(0, len(comps), bin):

    p = Process(target=bootstrapCorrelation, args=(comps[i:i+bin],data,numSamples,numRounds))
    p.start()