User:Wugapodes/Correlations between time series
Appearance
# The following data are eyeballed from [[:File:Figure 2 Monthly Edits on Portuguese Wikipedia (5 years).png]]
pt.data = c(200, 190, 170, 180, 210, 200, 240, 245, 200, 180, 215, 190, 220, 200, 180, 175, 205, 190, 230, 190, 230, 220, 180, 190, 240, 230, 220, 270, 250, 220, 225, 170, 230, 200, 195, 210, 185, 175, 170, 155, 170, 175, 190, 170, 180, 175, 210, 200, 220, 195, 190, 180, 180, 170, 180, 175, 175, 175, 180, 190, 220, 250, 200, 180, 170, 180, 180, 165)
# Compute first difference of the above data
pt.data.diff = diff(pt.data)
# Compute the standard deviation for use in the random walk. It doesn't actually matter what
# value is chosen, but using the value from the data gives us a realistic simulation
pt.data.sd = sd(pt.data)
# Set up vectors that will be manipulated in the for loop
pt.rand = c(200)
pt.cors = c()
pt.cors.p = c()
pt.diff.cors = c()
pt.diff.p = c()
# Perform 1000 iterations of the random walk to show the distribution of p-values expected when comparing time series data
for (j in 1:1000) {
# Compute a random walk based on the data start with rate of change determined by the standard deviation of the data
for (i in 2:length(pt.data)) {
pt.rand[i] = pt.rand[i-1] + rnorm(1,0,pt.data.sd)
}
# Compute first difference for the random walk
pt.diff.rand = diff(pt.rand)
# Store the correlation results for later analysis
## Time series
pt.cors[j] = cor(pt.data,pt.rand)
pt.cors.p[j] = cor.test(pt.data,pt.rand)$p.val
## First difference
pt.diff.cors[j] = cor(pt.data.diff,pt.diff.rand)
pt.diff.p[j] = cor.test(pt.data.diff,pt.diff.rand)$p.val
}
# The distribution of p-values clusters around zero, showing that a random walk will almost always show up as significantly
# correlated to time series data leading to a much higher rate of Type I and Type II errors than expected given our significance level.
hist(pt.cors.p,breaks=40)
# However when the first difference is used, we see that p-values are uniformly distributed as expected, and we will have an
# appropriate error rate for our significance level.
hist(pt.diff.p,breaks=40)
## The above code may be copied, modified, or reused under any of the following licenses: CC0, GPLv2 or later, CC By-SA 3.0, or the GFDL