from multiprocessing import Process import math import sys, numpy, random, itertools import scipy.stats as stats ''' this script will work only with python 2.6+ input data is comma separated values (csv). for example, here is a csv file of gene expression where each column is a condition and each row is a gene: sampleA,sampleB,sampleC,sampleD 12,43,412.123,24.12 54,21,541.23,123 54.321,43.12,231,53 to use the script: python multiThreadBootstrap.py [file.csv] [number of threads] [number of randomly chosen pairs] [number of bootstrap rounds] ''' if len(sys.argv) == 1: print print 'usage:' print 'python multiThreadBootstrap.py [file.csv] [number of threads] [number of randomly chosen pairs] [number of bootstrap rounds]' print sys.exit() dataFile = open(sys.argv[1],'r') libs = dataFile.next().strip().split(',') data = {} for lib in libs: data[lib] = [] for line in dataFile: values = line.strip().split(',') for i in range(len(libs)): data[libs[i]].append(float(values[i])) #get a list of comparisons to perform the bootstrap. comps = list(itertools.combinations(libs,2)) ''' the bootstrap function to calculate correlation. samples - number of random samples to take from the data. rounds - the number of times to take random samples. ''' def bootstrapCorrelation(comps, d, samples, rounds): for comp in comps: data = [d[comp[0]],d[comp[1]]] output = [] dataLen = len(data[0]) for r in range(rounds): samplesA = [] samplesB = [] for s in range(samples): index = random.randrange(0,dataLen - 1) samplesA.append(data[0][index]) samplesB.append(data[1][index]) #this function returns a tuple of correlation value and p-value. #we are just taking the correlation value. output.append(stats.spearmanr(numpy.array(samplesA),numpy.array(samplesB))[0]) output.sort() outFile = open('bootstrap.' + comp[0] + "." + comp[1],'w') outFile.write('\n'.join([str(x) for x in output])) threads = int(sys.argv[2]) numSamples = int(sys.argv[3]) numRounds = int(sys.argv[4]) bin = int(math.ceil(len(comps) / threads)) for i in range(0, len(comps), bin): p = Process(target=bootstrapCorrelation, args=(comps[i:i+bin],data,numSamples,numRounds)) p.start()