Cross-Entropy and Clustering

July 26, 2019

Using minimum cross-entropy (MinXEnt) to determine the number of genetic clusters (K) is not always straightforward.

Sometimes, the difference in cross-entropy from K = n to K = n + 1 is negligible. Thus, selecting the K at which cross-entropy is at its mininimum may lead to an overestimation of K. To fix this, I use minimization of “slope.” Slope = e^-difference - 1 (where difference = XEnt_{n + 1} - XEnt_n). In other words, the smallest K is selected where the cross-entropy difference starts to plateau and slope approaches zero. This is achieved by selecting K based on the lowest slope value in the upper quartile.

#determining best K and picking best replicate for best K
ce <- list()
for(k in 1:maxK) ce[[k]] <- cross.entropy(snmf.obj, K=k)
ce.K <- c()
for(k in 1:maxK) ce.K[k] <- min(ce[[k]])
diff <- ce.K[-1] - ce.K[-maxK]
slope <- exp(-diff) - 1
#K is selected based on the smallest slope value in the upper quartile
best.K <- min(which(slope <= quantile(slope)[4]))
best.run <- which.min(ce[[best.K]])

Number of genetic clusters based on cross-entropy slope approaching zero

While I am highlighting here one approach to select the optimal value of K, it should be noted that the NALgen method is not sensitive to K (the response variable is a gene flow metric).

library(adegenet)
library(ggsci)
library(LEA)
library(mapplots)
library(maps)
library(changepoint)

futr <- pal_futurama()
schwifty <- pal_rickandmorty()
futr.cols <- colorRampPalette(futr(12)[c(12,11,3,7,8,6,1,2)])
schwifty.cols <- colorRampPalette(schwifty(12)[c(3,12,4,6,1,9,8,2)])
futrschwift.cols <- colorRampPalette(c("gray15", schwifty(12)[3], futr(12)[11], schwifty(12)[c(1,9)], futr(12)[c(8,6,2)]))
cols <- futrschwift.cols(12)


#plot parameters
par(mfrow=c(2,2), fg="gray50", pty='m', bty='o', mar=c(4,4,4,4), cex.main=1.3, cex.axis=1.1, cex.lab=1.2)


genind_names <- c("full_m0.1_lclp_n.s", 
         "full_m0.1_lchp_n.s", 
         "full_m0.1_hclp_n.s", 
         "full_m0.1_hchp_n.s", 
         "full_m0.5_lclp_n.s", 
         "full_m0.5_lchp_n.s", 
         "full_m0.5_hclp_n.s", 
         "full_m0.5_hchp_n.s")

ls_names <- c("LCLP landscape (m = 0.1)", 
          "LCHP landscape (m = 0.1)", 
          "HCLP landscape (m = 0.1)", 
          "HCHP landscape (m = 0.1)", 
          "LCLP landscape (m = 0.5)", 
          "LCHP landscape (m = 0.5)", 
          "HCLP landscape (m = 0.5)", 
          "HCHP landscape (m = 0.5)")

lea.dir <- c(
       "C:/Users/chazh/Documents/Research Projects/Reticulitermes/Simulations/popRange/m_0_1/Simulated_Landscapes/500neutral+50selectedSNPs/Low_Complexity_Landscapes/landscapeR/Low_Permeability/",
       "C:/Users/chazh/Documents/Research Projects/Reticulitermes/Simulations/popRange/m_0_1/Simulated_Landscapes/500neutral+50selectedSNPs/Low_Complexity_Landscapes/landscapeR/High_Permeability/",
       "C:/Users/chazh/Documents/Research Projects/Reticulitermes/Simulations/popRange/m_0_1/Simulated_Landscapes/500neutral+50selectedSNPs/High_Complexity_Landscapes/virtualspecies/Low_Permeability/",
           "C:/Users/chazh/Documents/Research Projects/Reticulitermes/Simulations/popRange/m_0_1/Simulated_Landscapes/500neutral+50selectedSNPs/High_Complexity_Landscapes/virtualspecies/High_Permeability/",
       "C:/Users/chazh/Documents/Research Projects/Reticulitermes/Simulations/popRange/m_0_5/Simulated_Landscapes/500neutral+50selectedSNPs/Low_Complexity_Landscapes/landscapeR/Low_Permeability/",
       "C:/Users/chazh/Documents/Research Projects/Reticulitermes/Simulations/popRange/m_0_5/Simulated_Landscapes/500neutral+50selectedSNPs/Low_Complexity_Landscapes/landscapeR/High_Permeability/",
       "C:/Users/chazh/Documents/Research Projects/Reticulitermes/Simulations/popRange/m_0_5/Simulated_Landscapes/500neutral+50selectedSNPs/High_Complexity_Landscapes/virtualspecies/Low_Permeability/",
       "C:/Users/chazh/Documents/Research Projects/Reticulitermes/Simulations/popRange/m_0_5/Simulated_Landscapes/500neutral+50selectedSNPs/High_Complexity_Landscapes/virtualspecies/High_Permeability/"
           )

bestK <- c()

for (i in 1:8){
     setwd(lea.dir[i])

     reps <- 5
     maxK <- 15

     #loading saved files from previous LEA run
     snmf.obj <- load.snmfProject(paste0(genind_names[i],".snmfProject"))
     
     #determining best K and picking best replicate for best K
     ce <- list()
     for(k in 1:maxK) ce[[k]] <- cross.entropy(snmf.obj, K=k)
     ce.K <- c()
     for(k in 1:maxK) ce.K[k] <- min(ce[[k]])
     diff <- ce.K[-1] - ce.K[-maxK]
     slope <- exp(-diff) - 1
     #K is selected based on the smallest slope value in the upper quartile
     best.K <- min(which(slope <= quantile(slope)[4]))
     best.run <- which.min(ce[[best.K]])
     bestK[i] <- best.K
     
     plot(ce.K, pch=21, cex=1.1, bg="red3", ylab="Cross-entropy", xlab="K")
     plot(diff, pch=21, cex=1.1, bg="red3", ylab="Difference", xlab="K")
     plot(slope, pch=21, cex=1.1, bg="red3", ylab="Slope", xlab="K")
     
     barchart(snmf.obj, K=best.K, run=best.run,
             border=NA, space=0, col=futrschwift.cols(best.K),
               xlab="Individuals", ylab="Ancestry proportions",
               main=paste("Ancestry bar chart for", ls_names[i]))
}

names(bestK) <- ls_names
print(bestK)

## LCLP landscape (m = 0.1) LCHP landscape (m = 0.1) HCLP landscape (m = 0.1) 
##                        5                        5                        5 
## HCHP landscape (m = 0.1) LCLP landscape (m = 0.5) LCHP landscape (m = 0.5) 
##                        5                        5                        5 
## HCLP landscape (m = 0.5) HCHP landscape (m = 0.5) 
##                        5                        5