function [TPM,A,s_2,S,warnings] = BOOTPM(D, states, N, M, per1, per2, subsample)
%
%BOOTPM estimates transition probabilities and their standard deviations both theoretically and using the bootstrap technique.
%
%It creates N samples sampling -with replacement- M observations and then computes the relative N TPM's.
%Then, it simply computes the standard deviation of each transition probability using the distribution across all the N TPM's.
%It also compares the boostrapped results to results estimated thoeretically, to detect "strange" results.
%It allows the user to choose a subsample of the data as well as an alternative specification of the states.
%Editing the script, the user can activate different sections (to plot or save A and S, to change this function
%into an interactive script, to test the script, to compute other statistics, to customize it...)
%
%[TPM,A,S,warnings] = BOOTPM(D, states, N, M, per1, per2, subsample)
% D = original data. They must contain columns for the state at period 1, columns for the state at period 2.
% They can contain columns with dummy 0/1 where 1 means "belongs to the subsample". Each row must correspond to an observation.
% There is no need for a column with the identifier and the order of the columns and of the rows is not important.
%
% states = number of states: they have to be named as subsequent integers starting from 1
% (e.g. for 4 states, they must be named as 1,2,3,4 and states=4)
%
% N = number of "bootstrapped" samples.
%
% M = number of observations to be "bootstrapped" for each one of the N samples.
%
% per1 = the column of the data containing the observations relative to the first period of the TPM.
%
% per2 = the column of the data containing the observations relative to the second period of the TPM.
%
% subsample = the column of the data containing the dummy 0/1 where 1 means "belongs to the subsample"
% (set it to 0 if you process the whole dataset)
%
% TPM = the estimated transition matrix
%
% A = matrix containing the average of each transition probability computed from the distribution across all the TPM's
% obtained from the N samples.
%
% s_2 = matrix containing the standard deviation of each transition probability computed as (Bode, 1998):
% s_pij=sqrt[pij*(1-pij)/ni] pij=estimated probability for the cell (i,j); ni=numbe rof observations for state i.
% NOTICE: if the state i is absorbing, this formula reports stddev of i=0
%
% S = matrix containing the standard deviation of each transition probability computed from the distribution across all
% the TPM's obtained from the N samples.
% NOTICE: the standard deviation S is going to be computed as a SAMPLE STD DEV (i.e. dividing by n-1).
% To change this, just edit the script.
% Moreoveor, S is also computed with respect to the average "bootstrapped" probability (matrix A)
% and NOT with respecxt to the estimated probability (matrix TPM).
%
% warnings = matrix of the warnings concerning the process. The typology of the warning is reported in the first column,
% while the others have different meanings according to the typology.
% WARNINGS LEGEND:
% TYPE 1: there is at least one sample where row j1 was empty. 2nd column=row, 3rd=just a 0, 4th=sample.
% TYPE 2: there is at least one sample where row j1 has j2 as an absorbing state. 2nd column=row, 3rd=abs. state, 4th=sample.
% TYPE 3: this indicates |TPM-A|>0.05 at row j1 and column j2. 2nd column = row, 3rd = column, 4th = the difference.
% TYPE 4: this indicates |S-s_2|>0.02 at row j1 and column j2. 2nd column = row, 3rd = column, 4th = the difference.
% I know everything here can be probably written in a better way, more elegantly and leading to faster computations.
% However, I wrote this code just because I could not find on the internet any other MatLab script performing these computations and at the same time being
% very easy to customize even for "unskilled developers" like me. I'm throwing this in the internet just for those who could have the same need.
% Of course, I assume NO responsibility for errors in the code or other damages resulting from its usage.
% You are very welcome to modify it. If you do so or simply have feedbacks, please, let me know.
% If you use it, I would really appreciate a reference.
% 04/04/2007 - Francesco Rullani
% IVS, Dept. of Industrial Economics and Strategy
% Copenhagen Business School
% Kilevej 14A, 2000 Frederiksberg, Denmark.
% Version: 2 - May 13, 2007
% REFERENCES
% Anderson, T. W. and L. A. Goodman (1957), Statistical inference about Markov chains, Annals of Mathematical Statistics, 28(1), 89-110.
% Bickenbach, F. and E. Bode (2001), Markov or Not Markov - This Should Be a Question, Kiel Working Paper No. 1086, Kiel Institute of World Economics, December.
% Bode, E. (1998), Lokale Wissensdiffusion und regionale Divergenz in Deutschland, Kieler Studien 293, Tübingen: Mohr.
% %%%%%%%%%%%%%%%%%%%%TO TEST THE SCRIPT (to test the script: uncomment here and the last sections, comment the "function" line (i.e. the first line))
% vec=sprintf(' ');disp(vec)
% vec=sprintf(' ');disp(vec)
% vec=sprintf('%%%%%%%%%%%%%%%%%%%%Bootstrapping the TMP%%%%%%%%%%%%%%%%%%%%');disp(vec)
% vec=sprintf(' ');disp(vec)
% clear all;
% N=500;
% M=1000;
% OriginalNobs=1000;
% states=3;
% per1=1;
% per2=2;
% subsample=0;
% D(:,1)=(fix(rand(1,OriginalNobs)*states)+1)';
% D(:,2)=(fix(rand(1,OriginalNobs)*states)+1)';
% %to test the subsample process
% % subsample=1;
% % D(:,3)=(round(rand(1,OriginalNobs)))';
% vec=sprintf(' ');disp(vec)
% if OriginalNobs<200
% vec=sprintf('Here are the data for the test:');disp(vec)
% disp(D)
% vec=sprintf(' ');disp(vec)
% else
% vec=sprintf('To see the data for the test type "D".');disp(vec)
% vec=sprintf(' ');disp(vec)
% end
% %%%%%%%%%%%%%%%%%%%INTERACTIVE SCRIPT (to change this function into an interactive script: uncomment here and the last sections, comment the "function" line (i.e. the first line))
% vec=sprintf(' ');disp(vec)
% vec=sprintf(' ');disp(vec)
% vec=sprintf('%%%%%%%%%%%%%%%%%%%%Bootstrapping the TMP%%%%%%%%%%%%%%%%%%%%');disp(vec)
% vec=sprintf(' ');disp(vec)
% clear all;
% N=input('How many samples you want to "bootstrap"? ');
% M=input('How many observations for each one of the N "bootstraped" samples? ');
% states=input('How many states (remember: they have to be named as subsequent integers starting from 1)? ');
% per1=input('Which column of the original data contains the states at peirod 1? ');
% per2=input('Which column of the original data contains the states at peirod 2? ');
% vec=sprintf('Do you want to run the analysis on a subsample?"');disp(vec)
% subsample=input('--> (0=NO, otherwise type the number of the column of the dummy = 1 if "belongs to the subsample"):');
% %loads data
% load data.txt
% D=data(:,:);
% clear data
%%%%%%%%%%%%%%%%%%%%%%BASIC MATRIX
%if there it is for subsample, cuts the excluded observations
if subsample==0
%creates the identifier
id = (1:length(D(:,1)))';
%extracts from D the needed data
X=[id,D(:,per1), D(:,per2)];
else
indexXsub=find(D(:,subsample)==1);
%extracts from D the needed data
Xsub=D(indexXsub,:);
%creates the identifier
id = (1:length(Xsub(:,1)))';
X=[id,Xsub(:,per1), Xsub(:,per2)];
clear Xsub
end
%store the total number of observations into "nobs", and clears D
nobs=length(X(:,1));
clear D
w=1; %this is just the index variable to manage the warnings
%%%%%%%%%%%%%%%%%%%%%%ESTIMATING THE "BASIC" TRANSITION PROBABILITIES MATRIX (Anderson and Goodman's -1957- maximum likelihood estimator)
%for each combination of states
for j1=1:states
for j2=1:states
%computes the number of observations for that combination and writes it to the temporary vector TPMj1
statej12=find(X(:,2)==j1 & X(:,3)==j2);
TPMj1(1,j2)=length(statej12);
end
%transfers to TMP the results stored in the temporary vector TPMj1
%if there are no observations for row j1 then transfers to TPM just a row of zeros, then clears TPMj1
if sum(TPMj1(1,:))==0
TPM(j1,:)=TPMj1(1,:);
else
%transfers to TPM the results stored in the temporary vector TPMj1 dividing them for the sum of the observations in the j1-th row, then clears TPMj1
TPM(j1,:)=TPMj1(1,:)/sum(TPMj1(1,:));
end
%stores the number of observations per row
ni(j1,1)=sum(TPMj1(1,:));
clear TPMj1
end
clear j1 j2 statej12
%%%%%%%%%%%%%%%%%%%%%%STANDARD DEV as s_pij=sqrt[pij*(1-pij)/ni] (Bode, 1998; as in reported in Bickenbach and Bode, 2001)
% NOTICE: if the state i is absorbing, this formula reports stddev of i=0
%for each row of the matrix compute the standard deviation
for t=1:length(TPM)
s_2(t,:)=sqrt(TPM(t,:).*(1-TPM(t,:))/ni(t));
end
%%%%%%%%%%%%%%%%%%%%%%BOOTSTRAPING OBSERVATIONS
%for everyone of the N samples
for s=1:N
%uncomment this if you need to track the state of the process
vec=sprintf('Bootstrapping the %3.2f per cent of the N samples', (s/N*100));disp(vec)
%for everyone of the M observations
for o=1:M
%choose a random number between 0 and 1
rm=rand;
%"fix" rounds the number to the integer part. The "+1" is needed because it transofrms a 0 into a 1 and a nobs-1 into a nobs,
%but what if we get exactly rm=1? in this case, with a neglible bias, we can fix it equal to nobs
if rm==1
obs_i=nobs;
else
obs_i=fix(nobs*rm)+1;
end
%stores the random row into a matrix relative to Nth sample, and then clears obs_i and rm
X_s(o,:)=X(obs_i,:);
clear obs_i rm
end
%%%%%%%%%%%%%%%%%%%%%%ESTIMATING THE "BOOTSTRAPPED" TRANSITION PROBABILITIES (Anderson and Goodman's -1957- maximum likelihood estimator)
%for each combination of states
for j1=1:states
for j2=1:states
%computes the number of observations for that combination and writes it to the temporary vector Pj1
statej12=find(X_s(:,2)==j1 & X_s(:,3)==j2);
Pj1(1,j2)=length(statej12);
end
%transfers to P the results stored in the temporary vector Pj1
%if there are no observations for row j1 then transfers to P just a row of zeros and set the warning to 1 reporting also j1 and s
if sum(Pj1(1,:))==0
P(j1,:,s)=Pj1(1,:);
warn(w,:)=[1,j1,0,s];
w=w+1;
clear Pj1
else
%transfers to P the results stored in the temporary vector Pj1 dividing them for the sum of the observations in the j1-th row, then clears Pj1
P(j1,:,s)=Pj1(1,:)/sum(Pj1(1,:));
%if there is an absorbing state (Prob==1) in row j1 then transfers to P just a row of zeros and set the warning to 2 reporting also j1 and s
absor=find(P(j1,:,s)==1);
if absor>0
warn(w,:)=[2,j1,absor,s];
w=w+1;
else
end
clear Pj1
end
end
%clears X_s
clear X_s
end
%%%%%%%%%%%%%%%%%%%%%%COMPUTES THE MAIN STATISTICS ON THE BOOTSTRAPPED SAMPLES
%Standard deviation is going to be computed as a SAMPLE STD DEV (i.e. dividing by n-1)
%To change this, just comment here and uncomment below.
S=std(P,0,3);
% %standard deviation computed as dividing by n. To change this, just comment here and uncomment above.
% S=std(P,1,3)
%NOTICE: The standard deviation is also computed with respect to the average "bootstrapped" probability (matrix A)
%and NOT with respect to the estimated probability (matrix TPM).
A=mean(P,3);
% if you want to include other statistics, just write the command here remembering that:
% 1) they have to be computed along the 3rd dimension of the matrix P, i.e. P(:,:,here), because P(:,:,s) is the matrix for the s-th sample
% 2) you have to add them to the definition of the function in the first line of the script after TPM, A and S
%%%%%%%%%%%%%%%%%%%%%%COMPUTES THE DIFFERENCES BETWEEN ESTIMATED AND BOOTSTRAPPED VALUES
% computes the difference between TPM and A to detect "strange" situations and to store them in the warnings
diffTA=TPM-A;
maxdiff=0.05;
% detects the rows and columns of the "strange" situations
[diffj1,diffj2]=find(diffTA>maxdiff | diffTA<-maxdiff);
% detects the position of the "strange" situations
diffpos=find(diffTA>maxdiff | diffTA<-maxdiff);
difference=diffTA(diffpos);
% stores them in the warnings
if length(diffj1)>0
warn(w:w+length(diffj1)-1,:)=[3*ones(length(diffj1),1),diffj1,diffj2,difference];
else
end
clear diffTA maxdiff diffj1 diffj2 diffpos difference
% computes the difference between S and s_2 to detect "strange" situations and to store them in the warnings
% NOTICE: if the state i is absorbing, this formula reports stddev of i=0
diffSs=S-s_2;
maxdiff=0.02;
% detects the rows and columns of the "strange" situations
[diffj1,diffj2]=find(diffSs>maxdiff | diffSs<-maxdiff);
% detects the position of the "strange" situations
diffpos=find(diffSs>maxdiff | diffSs<-maxdiff);
difference=diffSs(diffpos);
% stores them in the warnings
if length(diffj1)>0
warn(w:w+length(diffj1)-1,:)=[4*ones(length(diffj1),1),diffj1,diffj2,difference];
else
end
clear diffSs maxdiff diffj1 diffj2 diffpos difference
%%%%%%%%%%%%%%%%%%%%%%ORDERS THE WARNINGS
if w>1
warnings=sortrows(warn);
else
warnings=0;
end
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% ------------TO TEST THE SCRIPT OR MAKE IT INTERACTIVE, UNCOMMENT FROM HERE ON ---------------%
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% %%%%%%%%%%%%%%%%%%%%%%DISPLAY RESULTS
% vec=sprintf(' ');disp(vec)
% vec=sprintf('Here are the final matrixes TPM and s_2 (estimated), A and S (bootstrapped):', w-1);disp(vec)
%
% vec=sprintf(' ');disp(vec)
% TPM
% vec=sprintf(' ');disp(vec)
% A
% vec=sprintf(' ');disp(vec)
% s_2
% vec=sprintf(' ');disp(vec)
% S
%
% vec=sprintf(' ');disp(vec)
% vec=sprintf('%NOTICE: if the state i is absorbing, s_2 reports stddev of i=0');disp(vec)
%
%
% %%%%%%%%%%%%%%%%%%%%%%WARNINGS MANAGEMENT
% if w>1
% vec=sprintf(' ');disp(vec)
% vec=sprintf('There are %8.0f warnings concerning the process. To see them type "warnings".', w-1);disp(vec)
% vec=sprintf('The typology of the warning is reported in the first column, while the others have different meanings according to the typology.');disp(vec)
% vec=sprintf(' ');disp(vec)
% vec=sprintf('WARNINGS LEGEND:');disp(vec)
% vec=sprintf('TYPE 1: this indicates the process has generated at least one sample where row j1 was empty. 2nd column = row, 3rd = just a 0, 4th = sample');disp(vec)
% vec=sprintf('TYPE 2: this indicates the process has generated at least one sample where row j1 has j2 as an absorbing state. 2nd column = row, 3rd = abs. state, 4th = sample');disp(vec)
% vec=sprintf('TYPE 3: this indicates |TPM-A|>0.05 at row j1 and column j2. 2nd column = row, 3rd = column, 4th = the difference');disp(vec)
% vec=sprintf('TYPE 4: this indicates |S-s_2|>0.02 at row j1 and column j2. 2nd column = row, 3rd = column, 4th = the difference');disp(vec)
% vec=sprintf(' ');disp(vec)
% else
% end
%
% %%%%%%%%%%%%%%%%%%%%%%FIGURES
% figure(1)
% mesh(TPM(:,:))
% title('The TPM')
% xlabel('final states');
% ylabel('initial states');
% zlabel('probability');
%
% figure(2)
% mesh(A(:,:))
% title('Average of the probabilities in TPM')
% xlabel('final states');
% ylabel('initial states');
% zlabel('average probability');
%
% figure(3)
% mesh(s_2(:,:))
% title('Standard Deviation of the probabilities in TPM (estimated)')
% xlabel('final states');
% ylabel('initial states');
% zlabel('std. dev.');
%
% figure(4)
% mesh(S(:,:))
% title('Standard Deviation of the probabilities in A (bootstrapped)')
% xlabel('final states');
% ylabel('initial states');
% zlabel('std. dev.');
%
%
% %%%%%%%%%%%%%%%%%%%%SAVE RESULTS
% dlmwrite('TPM.txt',TPM, '\t'); %save "TPM"
% dlmwrite('AvgProb.txt',A, '\t'); %save "A"
% dlmwrite('StdevEst.txt',s_2, '\t'); %save "s_2"
% dlmwrite('StdevBoot.txt',S, '\t'); %save "S"