############################################################################# ### Some routines to compute a naive entropy estimate from a binary sequence ### Vincent Q. Vu ### April 27, 2005 ### ############################################################################# ### Generates a sequence of cointosses ### p can be either a scalar or vector of length n rbinomproc <- function(n=1, p=.5) { as.numeric(runif(n=n) < p) } rmarkovproc <- function(n, p=.5, q=p, initial=0) { coin <- rbind(rbinomproc(n, p=p), rbinomproc(n, p=1-q)) x <- rep(initial, n+1) for(i in 1:n) { x[i+1] <- coin[x[i]+1, i] } x[-1] } markoventropy <- function(p,q) { x <- p * (q*log2(q)+(1-q)*log2(1-q)) + q * (p*log2(p)+(1-p)*log2(1-p)) -x/(p+q) } ### Converts a sequence of 0-1s into a sequence of (base 10) words built ### from a sliding window of length wordlen as.wordseq <- function(x, wordlen=1, overlap=F) { pow2 <- 2^(0:(wordlen-1)) # Powers of 2 if(overlap == T) { nwords <- length(x)-wordlen+1 word <- rep(0, nwords) for(i in 1:nwords) { ## Converts a LSB binary sequence into a base 10 integer word[i] <- sum(pow2*x[i:(i+wordlen-1)]) } } else { nwords <- floor(length(x) / wordlen) x <- matrix(x[1:(nwords*wordlen)], nrow=nwords, ncol=wordlen, byrow=T) word <- c(apply(x, 1, function(r) sum(pow2*r))) } word } ### Convert the binary sequence into a sequence of words ### then compute the empirical probabilities worddist <- function(x, wordlen=1, overlap=F) { p <- table(as.wordseq(x, wordlen=wordlen, overlap=overlap)) p / sum(p) } ### Compute entropy (in bits) of a probability vector entropy <- function(p) { -sum(p*log2(p)) } ### Computes the entropy rate (entropy/wordlen) of a binary sequence as ### a function of window size (word length in my jargon) entropyrate <- function(x, maxwordlen=floor(length(x)/2), overlap=F) { e <- rep(0,maxwordlen) for(i in 1:maxwordlen) { e[i] <- entropy(worddist(x, wordlen=i, overlap=overlap)) / i } e } ### Plots empirical entropy rate of a fixed sequence as a function of ### wordlength plotentropy <- function(x, useinv=F, maxwordlen=2*floor(log2(length(x))), overlap=F) { e <- entropyrate(x, maxwordlen=maxwordlen, overlap=overlap) if(useinv == F) { plot(e, xlab="Word length (bin)", ylab="Empirical Entropy Rate (bit/bin)", main=paste("Sample size:", length(x), "Max wordlen:", maxwordlen)) } else { t <- 1:length(e) t <- 1/t plot(t, e, xlab="Inverse word length (1/bin)", ylab="Empirical Entropy Rate (bit/bin)", main=paste("Sample size:", length(x), "Max wordlen:", maxwordlen)) } } ### exploreentropy <- function(overlap=F) { x11(w=16,h=12) par(mfrow=c(4,4)) for(n in c(32,64,128,256,512,1024,2048,4096)) { print(n) x <- rbinomproc(n=n, p=.5) plotentropy(x, useinv=F, overlap=overlap) plotentropy(x, useinv=T, overlap=overlap) } } plotrankfreq <- function(x, wordlen) { plot(sort(table(as.wordseq(x, wordlen=wordlen))), xlab="Rank", ylab="Frequency", main=paste("sample size:", length(x), "wordlen:", wordlen)) } ### ### Computes entropy of a discrete distribution ### hhat <- function(x) { x <- x[x > 0] x <- x/sum(x) sum(-x * log(x)) / log(2) } ### ### ### test <- function(n=1, nbins=1, wordlen=1) { x <- rmultinom(n=n, size=floor(nbins/wordlen), rep(1/(2^wordlen), 2^wordlen)) ### estimate entropy rate apply(x, 2, hhat) / wordlen } ### explorehhatdist <- function(nbins, n=10) { h <- sapply(1:floor(log2(nbins)), function(wordlen) test(n=n, nbins=nbins, wordlen=wordlen)) t(h) } ### Regress the entropy rates on 1/T with weights 2^(1-T) teststrong <- function(x, maxwordlen=2*floor(log2(length(x)))) { h <- entropyrate(x, maxwordlen=maxwordlen) t <- 1:length(h) plot(1/t, h, xlab="1/T", ylab="entropy rate") abline(l <- lm(h~I(1/t), weights=2^(1-t))) print(coef(l)) } testmarkov <- function(n, p, q, replicates=1, maxwordlen=2*floor(log2(n)), ...) { m <- markoventropy(p,q) plot(0, m, xlim=c(0,1), ylim=c(max(.5*m,0), min(2*m,1)), xlab=paste("1/T, min(1/T)= 1/", maxwordlen, sep=""), ylab="entropy rate", main=paste("n=", n, "p=",p,"q=",q), type="n") abline(h=markoventropy(p,q), lty="dashed", col="red") for(i in 1:replicates) { x <- rmarkovproc(n, p, q) h <- entropyrate(x, maxwordlen=maxwordlen) t <- 1:length(h) points(1/t, h, pch=19) abline(l <- lm(h~I(1/t), weights=2^(1-t))) } } makeindplots <- function() { x11(w=12, h=6) par(mfrow=c(2,5)) for(i in 1:10) { testmarkov(4096, p=2^(-i), q=2^(-i), replicates=8) } } makecorplots <- function() { x11(w=12, h=6) par(mfrow=c(2,5)) for(i in 1:10) { testmarkov(4096, p=2^(-i), q=1-2^(-i), replicates=8) } }