-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgetProbabilitiesFromFit.m
59 lines (40 loc) · 2.09 KB
/
getProbabilitiesFromFit.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
function [ probSequenced observedGenes totalGenesExpressed m v] = getProbabilitiesFromFit( data, totalReads )
%{
This function returns the probability of genes being captured per read
` sequenced as well as the total number expressed.
The probablility that each gene will be expressed is estimated as
the number of reads found for the gene divided by the total number
of genes.
However, not all expressed genes will be observed if the pilot data
is not sequenced deeply enough. The probability for these sequences
are filled in by fitting the data to a zero truncated Poisson-
Lognormal distribution using maximum likelihood estimation.
data = read counts per gene for the pilot data data
totalReads = the number of reads in the pilot data
%}
%seed the random number generators so we aren't all over the place,
%people
s = RandStream('mt19937ar','seed',4503);
RandStream.setDefaultStream(s);
poissLogNormal_cdf = @(x,mu, sigma) zeroTruncatedPoissonLognormalCDF(x,mu, sigma);
dataX=data(data>0);
[params,lambdaCI] = mle(dataX, 'cdf',poissLogNormal_cdf, 'start',[0 var(data)], 'lower',[0 0]);
[mu sigma]=getMuSigmaLognormal( params(1), params(2)^2);
%how many zeros are there? estimate fraction by bootstrapping - this
%workds better than the cdf
observedGenes=sum(data>0);
x=poissrnd(random('lognormal', mu, sigma, 100000,1));
unobservedGenePortion=sum(x==0)/100000;
totalGenesExpressed=round(observedGenes/(1-unobservedGenePortion));
modelDist=lognrnd(mu, sigma, totalGenesExpressed,1);
if length(data)>length(modelDist)
modelDist=[zeros(length(data)-length(modelDist), 1); modelDist];
end
modelDist=lognrnd(mu, sigma, size(data,1),1);
modelProb=modelDist/totalReads;
modelProb=sort(modelProb, 'descend');
modelProb=modelProb(1:size(data,1));
modelProb=sort(modelProb, 'ascend');
probSequenced=sort(data/totalReads, 'ascend');
probSequenced(1:sum(data<=1))=modelProb(1:sum(data<=1));
end