from multiprocessing import Process import time import sys, numpy, random, itertools import scipy.stats as stats startTime = time.time() ''' this script will work only with python 2.6+ input data is comma separated values (csv). for example, here is a csv file of gene expression where each column is a condition and each row is a gene: sampleA,sampleB,sampleC,sampleD 12,43,412.123,24.12 54,21,541.23,123 54.321,43.12,231,53 to use the script: python multiThreadBootstrap.py [file.csv] [number of threads] [number of randomly chosen pairs] [number of bootstrap rounds] ''' if len(sys.argv) == 1: print print 'usage:' print 'python multiThreadBootstrap.py [file.csv] [number of threads] [number of randomly chosen pairs] [number of bootstrap rounds]' print sys.exit() dataFile = open(sys.argv[1],'r') libs = dataFile.next().strip().split(',') data = {} for lib in libs: data[lib] = [] for line in dataFile: values = line.strip().split(',') for i in range(len(libs)): data[libs[i]].append(float(values[i])) #get a list of comparisons to perform the bootstrap. comps = list(itertools.combinations(libs,2)) ''' the bootstrap function to calculate correlation. samples - number of random samples to take from the data. rounds - the number of times to take random samples. ''' def bootstrapCorrelation(comp, data, samples, rounds): output = [] dataLen = len(data[0]) for r in range(rounds): samplesA = [] samplesB = [] for s in range(samples): index = random.randrange(0,dataLen - 1) samplesA.append(data[0][index]) samplesB.append(data[1][index]) #this function returns a tuple of correlation value and p-value. #we are just taking the correlation value. output.append(stats.spearmanr(numpy.array(samplesA),numpy.array(samplesB))[0]) output.sort() outFile = open('bootstrap.' + comp[0] + "." + comp[1],'w') outFile.write('\n'.join([str(x) for x in output])) threads = int(sys.argv[2]) numSamples = int(sys.argv[3]) numRounds = int(sys.argv[4]) for i in range(0, len(comps), threads): for comp in comps[i:i+threads]: vecs = [data[comp[0]],data[comp[1]]] p = Process(target=bootstrapCorrelation, args=(comp,vecs,numSamples,numRounds)) p.start() print 'bootstrap finished: ', time.time()-startTime