Chapter 3.1 Multivariate case

3.1.1 Fonctions

library(sm)
library(plotly)

Cross validation error computation

Compute_CV=function(x,y,method,h=NULL,weights=NA)
{
  hopt=h
  if (is.null(h)) {hopt=sm.regression(x,y,method=method,eval.points=x,eval.grid=F,display='none',nbins=0,poly.index=0)$h}
  n=nrow(x)
  yest=rep(NA,n)
  if (is.na(weights)) weights=rep(1,n)
  CV=0
  for (i in 1:n)
  {
    yest[i]=(sm.regression(x[-i,],y[-i],h=hopt,eval.points=x,eval.grid=F,display='none',nbins=0,poly.index=0)$estimate)[i]
     CV=CV+(y[i]-yest[i])^2*weights[i] 
  }
  return(list('CV'=CV,'y_est'=yest))
}

ROC Curve

ROC=function(z,p1,plot=FALSE)
{
  p=c(-0.0001,sort(unique(p1)),1.0001)#seq(-0.01,1.01,0.01)c(-0.0001,sort(unique(p1))-0.00000000001,1.0001)
  lp=length(p)
  ROC=matrix(NA,lp,2)
  for (s in 1:lp)
  {
    ROC[s,1]=mean(p1[z==0]>=p[s])
    ROC[s,2]=mean(p1[z==1]>=p[s])  
  }
  #AUC=sum((ROC[-(lp),2])*(ROC[-(lp),1]-ROC[-(1),1]))
  AUC=sum((ROC[-(lp),2]+ROC[-1,2])*(ROC[-(lp),1]-ROC[-(1),1])/2)
  
  colnames(ROC)=c("FPR","TPR")
  if (plot)  
  {
    plot(ROC,type='l',main=paste('AUC =', AUC))
    abline(0,1,col='orange')
  }
  
  return(list(ROC=ROC,AUC=AUC))
}

Non parametric estimation of Bayes classifier

Classif_NP=function(X,Y,X0=NA,plot=FALSE,me='cv',h=NULL)
{
  X=as.matrix(X)
  n=length(Y)
  V=sort(unique(Y))
  n_V=length(V)
  Prob=matrix(NA,n,n_V)
  colnames(Prob)=V
  Class=rep(NA,n)
  if (!is.na(max(X0)))
  {
    X0=as.matrix(X0)
    P0=matrix(NA,nrow(X0),n_V)
    Class0=rep(NA,nrow(X0))
  }
  for (v in 1:n_V)
  {
    z=as.numeric(Y==V[v])
    for (i in 1:n )
    {
      if (is.null(h)) {value=sm.regression(X[-i,],z[-i], eval.points=X,eval.grid=FALSE,display='none',method=me,poly.index=0)$estimate[i]}
      else {value=sm.regression(X[-i,],z[-i], eval.points=X,eval.grid=FALSE,display='none',h=h,poly.index=0)$estimate[i]}
      if (!is.na(value)) {Prob[i,v]=value}
      if (is.na(value)) {Prob[i,v]=0#mean(z[-i])
      }
    }
    if (!is.na(max(X0))) {if (is.null(h)) {P0[,v]=sm.regression(X,z,eval.points=X0,eval.grid=FALSE,display='none',method=me,poly.index=0)$estimate}
      else {P0[,v]=sm.regression(X,z,eval.points=X0,eval.grid=FALSE,display='none',h=h,poly.index=0)$estimate}
    P0[is.na(P0[,v]),v]=mean(z)}
  }
  if (n_V==2) {Roc=ROC(Y==V[2],Prob[,2],plot)}
  Class=V[apply(Prob,1,which.max)]
  V_est=sort(unique(Class))
  if (length(V_est)==n_V){M_table=table(Y,Class)}
  else {
    M_table=matrix(0,n_V,n_V)
    M_table0=table(Y,Class)
    for (j in 1:length(V_est)) {M_table[,which(V==V_est[j])]=M_table0[,j]}
  }
  Err=1-(sum(diag(M_table))/sum(M_table))
  if (!is.na(max(X0))) {Class0=V[apply(P0,1,which.max)]}
  if (!is.na(max(X0))) {return(list(Class=Class, Prob=Prob, M_table=M_table, Err=Err, Class0=Class0,Prob0=P0,Auc=ifelse(n_V==2,Roc$AUC,NA)))}
  else {return(list(Class=Class, Prob=Prob, M_table=M_table, Err=Err,Auc=ifelse(n_V==2,Roc$AUC,NA)))}
}

3.1.2 Density

Simulation

library(sm)
#library(plotly)
library(plot3D)
#library(rpanel)

n=1000
alpha=0.4
mu11=7
mu12=0
sig11=1
sig12=3
mu21=0
mu22=3
sig21=2
sig22=2
x1=matrix(c(rnorm(n,mu11,sig11),rnorm(n,mu12,sig12)),ncol=2)
x2=matrix(c(rnorm(n,mu21,sig21),rnorm(n,mu22,sig22)),ncol=2)
z=rbinom(n,1,alpha)


x=x1
x[z==1,]=x2[z==1,]
M=300
r1=seq(-10,10,len=M)
r2=seq(-10,10,len=M)
x_1=x[,1]
x_2=x[,2]

Estimation de la densité

par(mfrow=c(1,3))
re_co=sm.density(x,method='cv',structure='common',nbins=0,poly.index=0)$estimate
re_sc=sm.density(x,method='cv',structure="scaled",nbins=0,poly.index=0)$estimate
re_se=sm.density(x,method='cv',structure='separate',nbins=0,poly.index=0)$estimate

par(mfrow=c(1,2))
rx=sm.density(x,method='cv',eval.points=x
             ,eval.grid=F,poly.index=0,
             nbins=0)
r=sm.density(x,method='cv',eval.points=cbind(r1,r2)
             ,eval.grid=T,poly.index=0,
             nbins=0)
f_k=r$estimate
plot_ly(z=~f_k,type='surface')

Comparaison avec la vraie densité

f=matrix(NA,M,M)
for (j in 1:M)
{
  for (k in 1:M)
  {
    f[j,k]=(1-alpha)*dnorm(r1[j],mu11,sig11)*dnorm(r2[k],mu12,sig12)+(1-alpha)*dnorm(r1[j],mu21,sig21)*dnorm(r2[k],mu22,sig22)
  }
}

v1=rep(r1,M)
v2=rep(r2,rep(M,M))
fv=as.vector(f)
fra=data.frame(v1,v2,fv)
par(mfrow=c(1,3))
image(f,zlim=range(c(f,r$estimate)))
image(r$estimate,zlim=range(c(f,r$estimate)))
image(abs(f-r$estimate),zlim=range(c(f,r$estimate)))

surf3D(matrix(rep(r1,M),M,M),matrix(rep(r2,rep(M,M)),M,M),f,clim=range(c(f,f_k)))
surf3D(matrix(rep(r1,M),M,M),matrix(rep(r2,rep(M,M)),M,M),f_k,clim=range(c(f,f_k)))
surf3D(matrix(rep(r1,M),M,M),matrix(rep(r2,rep(M,M)),M,M),abs(f-f_k),clim=range(c(f,f_k)))

dif=abs(f-f_k)
plot_ly(z=~f,type='surface')
plot_ly(z=~f_k,type='surface')
plot_ly(z=~dif,type='surface')

Estimation du mode

p_mo=which(f==max(f),arr.ind=T)
(mode=c(r1[p_mo[1]],r2[p_mo[2]]))
[1] 6.98996656 0.03344482
p_mok=which(f_k==max(f_k),arr.ind=T)
(modek=c(r1[p_mok[1]],r2[p_mok[2]]))
[1]  7.056856 -0.367893
p_mo1=which(f[1:(M/2),]==max(f[1:(M/2),]),arr.ind=T)
(modek1=c(r1[p_mo1[1]],r2[p_mo1[2]]))
[1] -0.03344482  2.97658863
p_mok1=which(f_k[1:(M/2),]==max(f_k[1:(M/2),]),arr.ind=T)
(modek1=c(r1[p_mok1[1]],r2[p_mok1[2]]))
[1] -0.5685619  2.9096990
#plot_ly(fra,x=~v1,y=~v2,z = ~fv, type = "surface")

Real world data

Estimation de la densité

data(faithful)
attach(faithful)
par(mfrow=c(1,3))
sm.density(faithful,method='cv',structure='common',nbins=0,poly.index=0)
sm.density(faithful,method='cv',structure="scaled",nbins=0,poly.index=0)
sm.density(faithful,method='cv',structure='separate',nbins=0,poly.index=0)

Estimation du mode

x=eruptions
y=waiting
x0=seq(min(x),max(x),len=100)
y0=seq(min(y),max(y),len=100)

re=sm.density(cbind(x,y),method='cv',eval.points=cbind(x0,y0),
              eval.grid=T,structure='separate',nbins=0,poly.index=0)

plot_ly(z=~re$estimate,type='surface')
p_mo=which(re$estimate==max(re$estimate),arr.ind=T)
(mode=c(x0[p_mo[1]],y0[p_mo[2]]))
[1]  4.428283 81.010101
p_mo1=which(re$estimate[1:30,1:30]==max(re$estimate[1:30,1:30]),arr.ind=T)
(mode1=c(x0[p_mo1[1]],y0[p_mo1[2]]))
[1]  1.918182 53.171717

3.1.2 Regression

Real world data

load('Randonnee.RData')
x=long
y=lat
z=alti # variable of interest 
x0=seq(min(x),max(x),len=100)
y0=seq(min(y),max(y),len=100)
re0=sm.regression(cbind(x,y),z,method='cv',eval.points=cbind(x0,y0),
              eval.grid=T,structure='common',nbins=0,poly.index=0)

plot_ly(z=~re0$estimate,type='surface')
alti_profile=sm.regression(cbind(x,y),z,method='cv',eval.points=Position,
              eval.grid=F,structure='common',nbins=0,poly.index=0)$estimate
km=rep(0,nrow(Position))
for (j in 1:(nrow(Position)-1)) {km[j+1]=km[j]+sqrt((Position[j+1,1]-Position[j,1])^2+(Position[j+1,2]-Position[j,2])^2)}
#km=cumsum(km)/1000
plot(km/10,alti_profile,type='l')

Qualité

CV with Kernel Estimator

RE=Compute_CV(cbind(x,y),z,method='cv')
RE$CV/length(alti)
[1] 143.2316
plot(RE$y_est,z)
abline(0,1)

CV avec moyenne Z

n=length(z)
CV_z=0
for (j in 1:n) 
  {
  CV_z=CV_z+(z[j]-mean(z[-j]))^2
} 
CV_z/n
[1] 1582.402

3.1.3 Classification

data(iris)
v=combn(1:4,2)
for(j in 1:ncol(v)) 
  { 
  print(colnames(iris)[v[,j]])
  C_NP=Classif_NP(iris[,v[,j]],iris[,5])
  print(C_NP$Err)
  print(C_NP$M_table)
}
[1] "Sepal.Length" "Sepal.Width" 
[1] 0.2133333
            Class
Y            setosa versicolor virginica
  setosa         49          1         0
  versicolor      0         35        15
  virginica       0         16        34
[1] "Sepal.Length" "Petal.Length"
[1] 0.06
            Class
Y            setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         46         4
  virginica       0          5        45
[1] "Sepal.Length" "Petal.Width" 
[1] 0.04
            Class
Y            setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         48         2
  virginica       0          4        46
[1] "Sepal.Width"  "Petal.Length"
[1] 0.08
            Class
Y            setosa versicolor virginica
  setosa         49          1         0
  versicolor      0         45         5
  virginica       0          6        44
[1] "Sepal.Width" "Petal.Width"
[1] 0.06
            Class
Y            setosa versicolor virginica
  setosa         49          1         0
  versicolor      0         46         4
  virginica       0          4        46
[1] "Petal.Length" "Petal.Width" 
[1] 0.03333333
            Class
Y            setosa versicolor virginica
  setosa         50          0         0
  versicolor      0         48         2
  virginica       0          3        47