# S-Plus script developed by Professor Alexander McNeil, A.J.McNeil@hw.ac.uk
# R-version adapted by Scott Ulman (scottulman@hotmail.com)
# This free script using QRMLib is distributed in the hope that it will be useful, 
# but WITHOUT ANY WARRANTY; without even the implied warranty of 
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the 
# GNU General Public License for more details. 

######Load the QRMlib and sp500 data set##################
#QRMlib.pdf is a help file for the functions used by QRMlib.  It is available at
#...\Program Files\R\R-2.6.0\library\QRMlib\Docs
#If you have created the QRMBook workspace and .Rprofile  as described in QRMlib.pdf
#topics 'QRMBook-workspace' and 'profileLoadLibrary', then you may comment out the
#following line:
library(QRMlib);
#if you have previously opened the sp500 timeSeries AND 
#saved the workspace, you may comment out the following line:
data(sp500);
#Alternatively, if you want to load the dataframe instead of timeSeries,
#activate the following line:
#data(sp500.df);
#################################################


#SU: This exercise refers to the stylized facts in section 4.1.1 of QRM, p. 117.
################# Get some index data
#Select only data after 01/01/1998 and on or before 12/31/2003:
# In R, you must have the year first in an ISO-format: 
#Through R-2.5.1, timeSeries class originally belong in package fCalendar. 
#Version 221.10065 used cutSeries()method to select data only between the 'to' and 'from' dates. 
#Version 240.10068 used cut(). Both required the day PRIOR to desired start in "from".
#R-2.6.0. RMetrics 260.72 moved timeSeries to fSeries from fCalendar. Used window() in place of cut().
#No longer need prior date:
index <- window(sp500,"1998-01-01", "2003-12-31");

#The mk.returns() function in R differs from its S-Plus counterpart.  You MUST pass a timeSeries
#R-Metrics type object as the argument to the function or you will get an error returned.
rseries <- mk.returns(index);
#You must cast the timeSeries to a list to report the summary values (Min, 1st Quantile, Median,
#Mean, 3rd Quantile, and Max):
summary(as.list(rseries));

#Extract the data field only (not the positions which are the dates)
data <- seriesData(rseries);

#Show the number of rows.  The S-Plus numRows()must be replaced with the R-function nrow()which uses 
#only the Data values from the timeSeries object:
#numRows(rseries) is the S-Plus version:
nrow(data); # is the R-version


#Fit extracted data to student-t distribution using the function fit.st()in functionsUnivariate.R from the library.
#fit.st() returns a list telling whether convergence occurred, the parameter estimates, the std error of
#parameter estimates, the asymptotic covariance matrix, and the value of the maximize loglikelihood as 
#shown in the following list() command:
#list(converged=converged,par.ests=par.ests, par.ses=par.ses, asymp.cov=asymp.cov, ll.max=loglh.max)
tmp <- fit.st(data);
#Test if convergence occurred:
tmp$converged;
#Observe parameter estimates
tmp$par.ests;


#simulate a set of random variables from the student-t distribution using parameters estimated from the data:
set.seed(19);
tdata <- -(tmp$par.ests[2]+tmp$par.ests[3]*rt(length(data),df=tmp$par.ests[1]));
#simulate a set of random variable from a normal distribution using sample moments from data as inputs:
ndata <- rnorm(length(data),mean(data),sqrt(var(data)));


#Contrast simulated data with iid data from SP500 time series
par(mfrow=c(3,1));
#S-Plus code used the ts.plot() method and must be modified for R: use the plot() function instead.  
#Use the type="h" with plot to get histogram thin lines in R-language:
plot(data,ylim=range(ndata,tdata,data),main="Real", type="h");
plot(ndata,ylim=range(data,ndata,tdata),ylab="",main="Normal",type="h");
plot(tdata,ylim=range(data,ndata,tdata),ylab="",main="Student t",type="h");
#return to one graph per sheet:
par(mfrow=c(1,1));

par(mfcol=c(3,2));
#S-Plus uses ACF plots (acf.plot()) which do not seem to exist in R 
#(although ACF does). ACF stands for Auto Correlation Function.  We will merely use the R-language acf()
#function with plot=TRUE while passing only data from time series via timeSeries@Data.
#ACF plots under S-Plus
#out <- acf(rseries,plot=FALSE)
#acf.plot(out,main="Real")
#Replaced by these under the R-language
acf(data,plot=TRUE, type="correlation");
acf(ndata,plot=TRUE, type="correlation");
acf(tdata,plot=TRUE, type="correlation");
acf(abs(data),plot=TRUE, type="correlation");
acf(abs(ndata),plot=TRUE, type="correlation");
acf(abs(tdata),plot=TRUE, type="correlation");
par(mfcol=c(1,1));


#Heavy tails as usual
qqnorm(data,ylab="Real");

#Clustered extreme values
#First examine iid data generated by random simulations for the t distribution: tdata.
kval=50; #set threshold so we have 50 observations above threshold
#We will build a QQplot with theoretical exp distribution; its rate will be: 
rate <- kval/length(tdata); #use in the QQplot below
#find threshold using function in functionsEVT.R
u <- findthreshold(tdata,kval);

#In S-Plus, exceedance times vector is filled with index values (from 1 to 1505) from tdata vector
#where values tdata[index] exceed u, the threshold.  
# We rename it to exceedance.indices for clarity since it represents a vector position (not a time):
exceedance.indices <- (1:length(tdata))[tdata>u];
#Fill exceedance.marks vector with all data items from tdata which exceed the threshold value of u:
exceedance.marks <- tdata[tdata>u];
#Plot the exceedances (marks on the Y, indices on the X-axis)
ylabtxt <- paste("Exceedances above threshold ",format(u,digits=4));
plot(x=exceedance.indices, y=exceedance.marks, type="h",main="Clustering Evidence-Simulated t data",
    ylab=ylabtxt,xlab="Time: 1998-2004");
#The following is the unworkable S-Plus call to ts.plot()which we have just replaced 
#ts.plot(its(exceedance.marks,exceedance.times),type="h");

#Find the number of days between successive exceedances:
gaps <- diff(exceedance.indices);
#Use the QQplot function from functionsUtility.R; be sure to use 'exp' and not 'pexp' for the 
#exponential distribution. The rate is the inverse of mean for exponential 
QQplot(gaps,reference="exp",rate=rate);
#Add a straight diagonal line to the plot to help assess the fit of the QQplot:
abline(1,1);

#SU: this is a single-sample Kolmogorov-Smirnoff goodness-of-fit test.  The S-Plus
#function call
#     ks.gof(gaps,dist="exp",rate=rate)
#  must be replaced by the R-function ks.test() contained in the stats package which will
# normally be loaded:
#In R, we must use the following where we must precede the distribution name with 'p' for probability
# distribution.  Also help("pexp") shows the parameter name is 'rate')
ks.test(gaps, "pexp", rate=rate);

#Now repeat the analysis with real data from sp500 rather than the simulated data. 
#Note the fit is much worse.
#First approach: use the timeSeries dates on the horizontal axis; this is easier to interpret
rate2 <- kval/length(rseries@Data); #should be identical to rate
u2 <- findthreshold(as.vector(data),ne=kval);
rseriesTail <- rseries[rseries@Data>u2];
plot.timeSeries(rseriesTail, type="h", main="Clustering Evidence-Real Data", ylab="Excess Returns");
#Calculate number of days between successive exceedances in a vector
realgaps <- diff(as.POSIXct(rseriesTail@positions));
QQplot(realgaps,position=0.5, reference="exp",rate=rate2);
#Add a straight diagonal line to the plot to help assess the fit of the QQplot:
abline(1,1);
#Kolgomorov Smirnov test on real data.
#In R, we must use the following where we must precede the distribution name with 'p' for probability
# distribution.  Also help("pexp") shows the parameter name is 'rate')
ks.test(realgaps, "pexp", rate=rate2);


#Second approach: build exceedance indices and marks as in the non-dated cases:
exceedance.indices2 <- (1:length(data))[data>u2];
exceedance.marks2 <- data[data>u2];
#The following R-language plot
plot(x=exceedance.indices2, y=exceedance.marks2,type="h", main="Clustering Evidence-Real Data",ylab="Large Returns",xlab="Time: 1998-2004");
#replaces this S-Plus plot which allows numbers rather than dates to be passed.
#ts.plot(its(exceedance.marks,exceedance.times),type="h")
realgaps2 <- diff(exceedance.indices2);
QQplot(realgaps2,re="exp",rate=rate2);
abline(1,1);
#Kolgomorov Smirnov test on real data.
#In R, we must use the following where we must precede the distribution name with 'p' for probability
# distribution.  Also help("pexp") shows the parameter name is 'rate')
ks.test(realgaps2, "pexp", rate=rate2);




