User:Wugapodes/Correlations between time series

From mediawiki.org
# The following data are eyeballed from [[:File:Figure 2 Monthly Edits on Portuguese Wikipedia (5 years).png]]
pt.data = c(200, 190, 170, 180, 210, 200, 240, 245, 200, 180, 215, 190, 220, 200, 180, 175, 205, 190, 230, 190, 230, 220, 180, 190, 240, 230, 220, 270, 250, 220, 225, 170, 230, 200, 195, 210, 185, 175, 170, 155, 170, 175, 190, 170, 180, 175, 210, 200, 220, 195, 190, 180, 180, 170, 180, 175, 175, 175, 180, 190, 220, 250, 200, 180, 170, 180, 180, 165)

# Compute first difference of the above data
pt.data.diff = diff(pt.data)

# Compute the standard deviation for use in the random walk. It doesn't actually matter what
#  value is chosen, but using the value from the data gives us a realistic simulation
pt.data.sd = sd(pt.data)

# Set up vectors that will be manipulated in the for loop
pt.rand = c(200)
pt.cors = c()
pt.cors.p = c()
pt.diff.cors = c()
pt.diff.p = c()

# Perform 1000 iterations of the random walk to show the distribution of p-values expected when comparing time series data
for (j in 1:1000) {
  # Compute a random walk based on the data start with rate of change determined by the standard deviation of the data
  for (i in 2:length(pt.data)) {
    pt.rand[i] = pt.rand[i-1] + rnorm(1,0,pt.data.sd)
  }

  # Compute first difference for the random walk
  pt.diff.rand = diff(pt.rand)
  
  # Store the correlation results for later analysis
  ## Time series
  pt.cors[j] = cor(pt.data,pt.rand)
  pt.cors.p[j] = cor.test(pt.data,pt.rand)$p.val
  ## First difference
  pt.diff.cors[j] = cor(pt.data.diff,pt.diff.rand)
  pt.diff.p[j] = cor.test(pt.data.diff,pt.diff.rand)$p.val
}

# The distribution of p-values clusters around zero, showing that a random walk will almost always show up as significantly 
#  correlated to time series data leading to a much higher rate of Type I and Type II errors than expected given our significance level.
hist(pt.cors.p,breaks=40)
# However when the first difference is used, we see that p-values are uniformly distributed as expected, and we will have an
#  appropriate error rate for our significance level.
hist(pt.diff.p,breaks=40)

## The above code may be copied, modified, or reused under any of the following licenses: CC0, GPLv2 or later, CC By-SA 3.0, or the GFDL