---
output: html_document
---

```{r, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, comment = '', message = FALSE,
                      class.source = "bg-info", class.output = "bg-success",
                      fig.align = 'center', out.width = '40%', fig.asp = 1)
```

# Chapter 2. Kernel estimation of the regression function {.tabset .tabset-fade .tabset-pills}

## 2.1 Fonctions

```{r}
library(sm)
```

### Cross validation error computation

```{r}
# Leave-one-out cross-validation error of a kernel regression
# (Nadaraya-Watson, poly.index = 0) of y on x.
#   x, y    : numeric vectors of the same length
#   method  : bandwidth selector passed to sm.regression ('cv', 'df', 'aicc');
#             only used when h is NULL
#   h       : optional fixed bandwidth; overrides method when supplied
#   weights : optional per-observation weights (default: all equal to 1;
#             a scalar NA is accepted for backward compatibility)
# Returns the weighted sum of squared leave-one-out prediction errors.
Compute_CV <- function(x, y, method, h = NULL, weights = NULL) {
  # Select the bandwidth once on the full sample, then reuse it for every
  # leave-one-out fit.
  hopt <- h
  if (is.null(h)) {
    hopt <- sm.regression(x, y, method = method, display = 'none',
                          poly.index = 0)$h
  }
  n <- length(x)
  # NOTE(review): the original tested `is.na(weights)`, which errors whenever
  # a weight vector of length > 1 is supplied; NULL (or scalar NA) now means
  # "equal weights" and a user-supplied vector is used as given.
  if (is.null(weights) || (length(weights) == 1 && is.na(weights))) {
    weights <- rep(1, n)
  }
  CV <- 0
  for (i in 1:n) {
    pred_i <- sm.regression(x[-i], y[-i], h = hopt, eval.points = x[i],
                            display = 'none', poly.index = 0)$estimate
    CV <- CV + (y[i] - pred_i)^2 * weights[i]
  }
  return(CV)
}
```

### ROC Curve

```{r}
# Empirical ROC curve and AUC for scores p1 against binary labels z
# (z == 1 is the positive class).
#   z    : vector of 0/1 (or logical) labels
#   p1   : estimated probabilities / scores for the positive class
#   plot : if TRUE, draw the curve with the AUC in the title
# Returns list(ROC = two-column matrix (FPR, TPR), AUC = trapezoidal AUC).
ROC <- function(z, p1, plot = FALSE) {
  # Thresholds: every distinct score, padded below 0 and above 1 so the
  # curve always reaches (1, 1) and (0, 0).
  p <- c(-0.0001, sort(unique(p1)), 1.0001)
  lp <- length(p)
  ROC <- matrix(NA, lp, 2)
  for (s in 1:lp) {
    ROC[s, 1] <- mean(p1[z == 0] >= p[s])  # false positive rate
    ROC[s, 2] <- mean(p1[z == 1] >= p[s])  # true positive rate
  }
  # Trapezoidal rule; FPR decreases with s, so every difference is >= 0.
  AUC <- sum((ROC[-lp, 2] + ROC[-1, 2]) * (ROC[-lp, 1] - ROC[-1, 1]) / 2)
  colnames(ROC) <- c("FPR", "TPR")
  if (plot) {
    plot(ROC, type = 'l', main = paste('AUC =', AUC))
    abline(0, 1, col = 'orange')  # chance diagonal
  }
  return(list(ROC = ROC, AUC = AUC))
}
```

### Non parametric estimation of Bayes classifier

```{r}
# Nonparametric (kernel, poly.index = 0) estimate of the Bayes classifier.
# For each class v, P(Y = v | X = x) is estimated by a leave-one-out kernel
# regression of the indicator 1{Y == v} on X; each observation is assigned
# to the class with the highest estimated posterior probability.
#   X    : covariates (vector or matrix, one row per observation)
#   Y    : class labels
#   X0   : optional new covariate values to classify (default: scalar NA,
#          meaning "no new points")
#   plot : passed to ROC() in the two-class case
#   me   : bandwidth-selection method for sm.regression when h is NULL
#   h    : optional fixed bandwidth
# Returns in-sample classes and posterior probabilities, the confusion
# matrix and error rate, the AUC (two-class case only, NA otherwise) and,
# when X0 is supplied, predicted classes/probabilities for X0.
Classif_NP <- function(X, Y, X0 = NA, plot = FALSE, me = 'cv', h = NULL) {
  X <- as.matrix(X)
  n <- length(Y)
  V <- sort(unique(Y))          # class labels, sorted
  n_V <- length(V)
  Prob <- matrix(NA, n, n_V)    # leave-one-out posterior estimates
  colnames(Prob) <- V
  Class <- rep(NA, n)
  # X0 defaults to scalar NA; a non-NA max() means new points were supplied.
  predict_new <- !is.na(max(X0))
  if (predict_new) {
    X0 <- as.matrix(X0)
    P0 <- matrix(NA, nrow(X0), n_V)
    Class0 <- rep(NA, nrow(X0))
  }
  for (v in 1:n_V) {
    z <- as.numeric(Y == V[v])  # indicator of class v
    # Leave-one-out estimate of P(Y = v | X = X[i, ]) for every i.
    for (i in 1:n) {
      if (is.null(h)) {
        value <- sm.regression(X[-i, ], z[-i], eval.points = X,
                               eval.grid = FALSE, display = 'none',
                               method = me, poly.index = 0)$estimate[i]
      } else {
        value <- sm.regression(X[-i, ], z[-i], eval.points = X,
                               eval.grid = FALSE, display = 'none',
                               h = h, poly.index = 0)$estimate[i]
      }
      # The kernel estimate is NA when X[i, ] is too far from every other
      # point; fall back to probability 0 for that class.
      Prob[i, v] <- if (is.na(value)) 0 else value
    }
    if (predict_new) {
      if (is.null(h)) {
        P0[, v] <- sm.regression(X, z, eval.points = X0, eval.grid = FALSE,
                                 display = 'none', method = me,
                                 poly.index = 0)$estimate
      } else {
        P0[, v] <- sm.regression(X, z, eval.points = X0, eval.grid = FALSE,
                                 display = 'none', h = h,
                                 poly.index = 0)$estimate
      }
      # Unreachable new points get the marginal class frequency instead.
      P0[is.na(P0[, v]), v] <- mean(z)
    }
  }
  if (n_V == 2) {
    Roc <- ROC(Y == V[2], Prob[, 2], plot)
  }
  Class <- V[apply(Prob, 1, which.max)]
  # Confusion matrix; pad with zero columns when some class is never
  # predicted, so the matrix stays n_V x n_V.
  V_est <- sort(unique(Class))
  if (length(V_est) == n_V) {
    M_table <- table(Y, Class)
  } else {
    M_table <- matrix(0, n_V, n_V)
    M_table0 <- table(Y, Class)
    for (j in 1:length(V_est)) {
      M_table[, which(V == V_est[j])] <- M_table0[, j]
    }
  }
  Err <- 1 - (sum(diag(M_table)) / sum(M_table))
  # Plain if/else instead of scalar ifelse(): Roc only exists when n_V == 2.
  auc <- if (n_V == 2) Roc$AUC else NA
  if (predict_new) {
    Class0 <- V[apply(P0, 1, which.max)]
    return(list(Class = Class, Prob = Prob, M_table = M_table, Err = Err,
                Class0 = Class0, Prob0 = P0, Auc = auc))
  } else {
    return(list(Class = Class, Prob = Prob, M_table = M_table, Err = Err,
                Auc = auc))
  }
}
```

## 2.2 Regression

### 2.2.1 Simulation

#### Regression 1.a

\[Y=10e^{-3X}+e^{-\frac{X}{2}}\epsilon, \]

avec $X \sim \mathcal{U}([0;3])$ et $\epsilon \sim \mathcal{N}(0,1)$.
```{r}
# Simulate regression 1.a: Y = 10 exp(-3X) + exp(-X/2) * eps,
# X ~ U([0, 3]), eps ~ N(0, 1) (heteroscedastic noise).
n <- 100
x <- runif(n, 0, 3)
e <- rnorm(n, 0, exp(-x / 2))
y <- 10 * exp(-3 * x) + e

# True regression function on a fine grid, plus kernel fits for three
# fixed bandwidths.
M <- 1000
gr <- seq(min(x), max(x), len = M)
plot(gr, 10 * exp(-3 * gr), ylab = 'y', type = 'l', ylim = c(-1, 10))
lines(x, y, col = "red", type = "p")
sm.regression(x, y, h = 0.01, col = "blue", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
sm.regression(x, y, h = 0.1, col = "orange", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
sm.regression(x, y, h = 1, col = "green", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
legend(x = "topright", leg = c('h=0.01', 'h=0.1', 'h=1'),
       col = c("blue", "orange", "green"), lty = 1)

# Same data with data-driven bandwidth selection (cv, df, aicc); the reg*
# objects keep the fitted values at the observed x for the error
# computations below.
plot(gr, 10 * exp(-3 * gr), ylab = 'y', type = 'l', ylim = c(-1, 10))
lines(x, y, col = "red", type = "p")
sm.regression(x, y, method = "cv", col = "blue", eval.points = gr,
              add = TRUE, nbins = 0, poly.index = 0)
reg3 <- sm.regression(x, y, method = "cv", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
sm.regression(x, y, method = "df", col = "orange", add = TRUE,
              eval.points = gr, nbins = 0, poly.index = 0)
reg1 <- sm.regression(x, y, method = "df", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
sm.regression(x, y, method = "aicc", col = "green", add = TRUE,
              eval.points = gr, nbins = 0, poly.index = 0)
reg2 <- sm.regression(x, y, method = "aicc", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
legend(x = "topright", leg = c('cv', "df", 'aicc'),
       col = c("blue", "orange", "green"), lty = 1)

# Fitted values against observations (closer to the diagonal is better).
plot(reg1$estimate, y, col = "orange")
lines(reg2$estimate, y, col = "green", type = 'p')
lines(reg3$estimate, y, col = "blue", type = 'p')
abline(0, 1)
```

##### Erreur quadratique et Validation croisée

```{r}
# In-sample mean squared error of each bandwidth-selection method.
mean((reg1$estimate - y)^2)
mean((reg2$estimate - y)^2)
mean((reg3$estimate - y)^2)

# Leave-one-out cross-validation error, per observation.
Compute_CV(x, y, 'cv') / n
Compute_CV(x, y, 'df') / n
Compute_CV(x, y, 'aicc') / n

# Baseline: leave-one-out CV of the constant predictor (mean of y).
n <- length(x)
CV_y <- 0
for (j in 1:n) {
  CV_y <- CV_y + (y[j] - mean(y[-j]))^2
}
CV_y / n

# Uncorrected (biased) variance of the noise, for comparison.
var(e) * (n - 1) / n
```

#### Regression 1.b

\[Y=10e^{-3X}+e^{-\frac{X}{2}}\epsilon, \]

avec $X \sim \mathcal{E}(1)$ et $\epsilon \sim \mathcal{N}(0,1)$.
```{r}
# Simulate regression 1.b: same model as 1.a but with X ~ Exp(1),
# so the design is dense near 0 and sparse in the right tail.
n <- 100
x <- rexp(n, 1)
e <- rnorm(n, 0, exp(-x / 2))
y <- 10 * exp(-3 * x) + e

# True regression function plus kernel fits for three fixed bandwidths.
M <- 1000
gr <- seq(min(x), max(x), len = M)
plot(gr, 10 * exp(-3 * gr), ylab = 'y', type = 'l', ylim = c(-1, 10))
lines(x, y, col = "red", type = "p")
sm.regression(x, y, h = 0.01, col = "blue", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
sm.regression(x, y, h = 0.1, col = "orange", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
sm.regression(x, y, h = 1, col = "green", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
legend(x = "topright", leg = c('h=0.01', 'h=0.1', 'h=1'),
       col = c("blue", "orange", "green"), lty = 1)

# Data-driven bandwidth selection (cv, df, aicc); reg* keep the fits at the
# observed x for the error computations below.
plot(gr, 10 * exp(-3 * gr), ylab = 'y', type = 'l', ylim = c(-1, 10))
lines(x, y, col = "red", type = "p")
sm.regression(x, y, method = "cv", col = "blue", eval.points = gr,
              add = TRUE, nbins = 0, poly.index = 0)
reg3 <- sm.regression(x, y, method = "cv", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
sm.regression(x, y, method = "df", col = "orange", add = TRUE,
              eval.points = gr, nbins = 0, poly.index = 0)
reg1 <- sm.regression(x, y, method = "df", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
sm.regression(x, y, method = "aicc", col = "green", add = TRUE,
              eval.points = gr, nbins = 0, poly.index = 0)
reg2 <- sm.regression(x, y, method = "aicc", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
legend(x = "topright", leg = c('cv', "df", 'aicc'),
       col = c("blue", "orange", "green"), lty = 1)

# Fitted values against observations (closer to the diagonal is better).
plot(reg1$estimate, y, col = "orange")
lines(reg2$estimate, y, col = "green", type = 'p')
lines(reg3$estimate, y, col = "blue", type = 'p')
abline(0, 1)
```

##### Erreur quadratique et Validation croisée

```{r}
# In-sample mean squared error of each bandwidth-selection method.
mean((reg1$estimate - y)^2)
mean((reg2$estimate - y)^2)
mean((reg3$estimate - y)^2)

# Leave-one-out cross-validation error, per observation.
Compute_CV(x, y, 'cv') / n
Compute_CV(x, y, 'df') / n
Compute_CV(x, y, 'aicc') / n

# Baseline: leave-one-out CV of the constant predictor (mean of y).
n <- length(x)
CV_y <- 0
for (j in 1:n) {
  CV_y <- CV_y + (y[j] - mean(y[-j]))^2
}
CV_y / n

# Uncorrected (biased) variance of the noise, for comparison.
var(e) * (n - 1) / n
```

#### Regression 2.a

\[Y=7cos(7X)+10e^{-3X}+e^{-\frac{X}{2}}\epsilon, \]

avec $X \sim \mathcal{U}([0;3])$ et $\epsilon \sim \mathcal{N}(0,1)$.
```{r}
# Simulate regression 2.a: Y = 10 exp(-3X) + 7 cos(7X) + exp(-X/2) * eps,
# X ~ U([0, 3]), eps ~ N(0, 1). NOTE(review): the section heading shows a
# noise factor 3*exp(-X/2) but the code (here and in 2.b) uses exp(-X/2) —
# confirm which is intended.
n <- 1000
x <- runif(n, 0, 3)
e <- rnorm(n, 0, exp(-x / 2))
y <- 10 * exp(-3 * x) + 7 * cos(7 * x) + e

# True regression function plus kernel fits for three fixed bandwidths.
M <- 1000
gr <- seq(min(x), max(x), len = M)
plot(gr, 10 * exp(-3 * gr) + 7 * cos(7 * gr), ylab = 'y', type = 'l')
lines(x, y, col = "red", type = "p")
sm.regression(x, y, h = 0.01, col = "blue", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
sm.regression(x, y, h = 0.1, col = "orange", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
sm.regression(x, y, h = 1, col = "green", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
legend(x = "topright", leg = c('h=0.01', 'h=0.1', 'h=1'),
       col = c("blue", "orange", "green"), lty = 1)

# Data-driven bandwidth selection (cv, df, aicc); reg* keep the fits at the
# observed x for the error computations below.
plot(gr, 10 * exp(-3 * gr) + 7 * cos(7 * gr), ylab = 'y', type = 'l')
lines(x, y, col = "red", type = "p")
sm.regression(x, y, method = "cv", col = "blue", eval.points = gr,
              add = TRUE, nbins = 0, poly.index = 0)
reg3 <- sm.regression(x, y, method = "cv", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
sm.regression(x, y, method = "df", col = "orange", add = TRUE,
              eval.points = gr, nbins = 0, poly.index = 0)
reg1 <- sm.regression(x, y, method = "df", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
sm.regression(x, y, method = "aicc", col = "green", add = TRUE,
              eval.points = gr, nbins = 0, poly.index = 0)
reg2 <- sm.regression(x, y, method = "aicc", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
legend(x = "topright", leg = c('cv', "df", 'aicc'),
       col = c("blue", "orange", "green"), lty = 1)

# Fitted values against observations (closer to the diagonal is better).
plot(reg1$estimate, y, col = "orange")
lines(reg2$estimate, y, col = "green", type = 'p')
lines(reg3$estimate, y, col = "blue", type = 'p')
abline(0, 1)
```

##### Erreur quadratique et Validation croisée

```{r}
# In-sample mean squared error of each bandwidth-selection method.
mean((reg1$estimate - y)^2)
mean((reg2$estimate - y)^2)
mean((reg3$estimate - y)^2)

# Leave-one-out cross-validation error, per observation.
Compute_CV(x, y, 'cv') / n
Compute_CV(x, y, 'df') / n
Compute_CV(x, y, 'aicc') / n

# Baseline: leave-one-out CV of the constant predictor (mean of y).
n <- length(x)
CV_y <- 0
for (j in 1:n) {
  CV_y <- CV_y + (y[j] - mean(y[-j]))^2
}
CV_y / n

# Uncorrected (biased) variance of the noise, for comparison.
var(e) * (n - 1) / n
```

#### Regression 2.b

\[Y=7cos(7X)+10e^{-3X}+e^{-\frac{X}{2}}\epsilon, \]

avec $X \sim \mathcal{E}(1)$ et $\epsilon \sim \mathcal{N}(0,1)$.
```{r}
# Simulate regression 2.b: same model as 2.a but with X ~ Exp(1).
n <- 1000
x <- rexp(n, 1)
e <- rnorm(n, 0, exp(-x / 2))
y <- 10 * exp(-3 * x) + 7 * cos(7 * x) + e

# True regression function plus kernel fits for three fixed bandwidths.
M <- 1000
gr <- seq(min(x), max(x), len = M)
plot(gr, 10 * exp(-3 * gr) + 7 * cos(7 * gr), ylab = 'y', type = 'l')
lines(x, y, col = "red", type = "p")
sm.regression(x, y, h = 0.01, col = "blue", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
sm.regression(x, y, h = 0.1, col = "orange", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
sm.regression(x, y, h = 1, col = "green", eval.points = gr, add = TRUE,
              nbins = 0, poly.index = 0)
legend(x = "topright", leg = c('h=0.01', 'h=0.1', 'h=1'),
       col = c("blue", "orange", "green"), lty = 1)

# Data-driven bandwidth selection (cv, df, aicc); reg* keep the fits at the
# observed x for the error computations below.
plot(gr, 10 * exp(-3 * gr) + 7 * cos(7 * gr), ylab = 'y', type = 'l')
lines(x, y, col = "red", type = "p")
sm.regression(x, y, method = "cv", col = "blue", eval.points = gr,
              add = TRUE, nbins = 0, poly.index = 0)
reg3 <- sm.regression(x, y, method = "cv", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
sm.regression(x, y, method = "df", col = "orange", add = TRUE,
              eval.points = gr, nbins = 0, poly.index = 0)
reg1 <- sm.regression(x, y, method = "df", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
sm.regression(x, y, method = "aicc", col = "green", add = TRUE,
              eval.points = gr, nbins = 0, poly.index = 0)
reg2 <- sm.regression(x, y, method = "aicc", eval.points = x, nbins = 0,
                      poly.index = 0, display = 'none')
legend(x = "topright", leg = c('cv', "df", 'aicc'),
       col = c("blue", "orange", "green"), lty = 1)

# Fitted values against observations (closer to the diagonal is better).
plot(reg1$estimate, y, col = "orange")
lines(reg2$estimate, y, col = "green", type = 'p')
lines(reg3$estimate, y, col = "blue", type = 'p')
abline(0, 1)
```

##### Erreur quadratique et Validation croisée

```{r}
# In-sample mean squared error of each bandwidth-selection method.
mean((reg1$estimate - y)^2)
mean((reg2$estimate - y)^2)
mean((reg3$estimate - y)^2)

# Leave-one-out cross-validation error, per observation.
Compute_CV(x, y, 'cv') / n
Compute_CV(x, y, 'df') / n
Compute_CV(x, y, 'aicc') / n

# Baseline: leave-one-out CV of the constant predictor (mean of y).
n <- length(x)
CV_y <- 0
for (j in 1:n) {
  CV_y <- CV_y + (y[j] - mean(y[-j]))^2
}
CV_y / n

# Uncorrected (biased) variance of the noise, for comparison.
var(e) * (n - 1) / n
```

### 2.2.2 Real world data

```{r}
# Motorcycle crash-test data: head acceleration versus time after impact.
library(MASS)
data(mcycle)
head(mcycle)
x <- mcycle$times
y <- mcycle$accel
n <- length(x)
```

```{r}
# Kernel fits with three fixed bandwidths, evaluated at the sorted times.
plot(x, y, xlab = "Time (ms)", ylab = "Acceleration (g)")
sm.regression(x, y, h = 0.1, col = "blue", eval.points = sort(x),
              add = TRUE, nbins = 0, poly.index = 0)
sm.regression(x, y, h = 1, col = "orange", eval.points = sort(x),
              add = TRUE, nbins = 0, poly.index = 0)
sm.regression(x, y, h = 10, col = "green", eval.points = sort(x),
              add = TRUE, nbins = 0, poly.index = 0)
legend(x = "topright", leg = c('h=0.1', 'h=1', 'h=10'),
       col = c("blue", "orange", "green"), lty = 1)

# Data-driven bandwidth selection; estimates are at sort(x), so the
# comparisons below use y[order(x)].
plot(x, y, col = "red", type = "p")
reg3 <- sm.regression(x, y, method = "cv", col = "blue",
                      eval.points = sort(x), add = TRUE, nbins = 0,
                      poly.index = 0)
legend(x = "topright", leg = c('cv', "df", 'aicc'),
       col = c("blue", "orange", "green"), lty = 1)
reg1 <- sm.regression(x, y, method = "df", col = "orange",
                      eval.points = sort(x), add = TRUE, nbins = 0,
                      poly.index = 0)
reg2 <- sm.regression(x, y, method = "aicc", col = "green",
                      eval.points = sort(x), add = TRUE, nbins = 0,
                      poly.index = 0)

# Fitted values against observations (closer to the diagonal is better).
plot(reg1$estimate, y[order(x)], col = "orange")
lines(reg2$estimate, y[order(x)], col = "green", type = 'p')
lines(reg3$estimate, y[order(x)], col = "blue", type = 'p')
abline(0, 1)
```

##### Erreur quadratique et Validation croisée

```{r}
# In-sample mean squared error of each bandwidth-selection method.
mean((reg1$estimate - y[order(x)])^2)
mean((reg2$estimate - y[order(x)])^2)
mean((reg3$estimate - y[order(x)])^2)

# Leave-one-out cross-validation error, per observation.
Compute_CV(x, y, 'cv') / n
Compute_CV(x, y, 'df') / n
Compute_CV(x, y, 'aicc') / n

# Baseline: leave-one-out CV of the constant predictor (mean of y).
n <- length(x)
CV_y <- 0
for (j in 1:n) {
  CV_y <- CV_y + (y[j] - mean(y[-j]))^2
}
CV_y / n
```

## 2.3 Supervised classification (Discrimination)

### 2.3.1 Simulation

\[f_X(x)=\alpha f_{\mathcal{N}(0,2)}(x)+(1-\alpha)f_{\mathcal{N}(4,1)}(x)\]

```{r}
# Generate n draws from the two-component mixture: with probability alpha
# from distribution l1 (parameter vector p1), otherwise from l0 (p0).
# l0/l1 are base distribution names such as 'norm' (=> rnorm/dnorm).
rmixing2 <- function(n, alpha, l0, l1, p0, p1) {
  z <- rbinom(n, 1, alpha)  # component indicator (1 = component l1)
  # do.call on the r<name> generator replaces the original
  # eval(parse(text = ...)) construction; same call, no string evaluation.
  f1 <- do.call(paste0('r', l1), as.list(c(n, p1)))
  f0 <- do.call(paste0('r', l0), as.list(c(n, p0)))
  x <- z * f1 + (1 - z) * f0
  return(list(x = x, z = z))
}

# Density of the mixture at t, together with the true posterior
# probabilities P(Y = 1 | X = t) and P(Y = 0 | X = t) (Bayes formula).
dmixing2 <- function(t, alpha, l0, l1, p0, p1) {
  # Each component density is computed once and reused (the original
  # evaluated the parsed expression twice per component).
  d1 <- do.call(paste0('d', l1), c(list(t), as.list(p1)))
  d0 <- do.call(paste0('d', l0), c(list(t), as.list(p0)))
  mix <- alpha * d1 + (1 - alpha) * d0
  p1_t <- alpha * d1 / mix
  p0_t <- (1 - alpha) * d0 / mix
  return(list(mix = mix, p0_t = p0_t, p1_t = p1_t))
}

# Example: 0.3 * N(0, 2) + 0.7 * N(4, 1)
n <- 300
alpha <- 0.3
l0 <- 'norm'
p0 <- c(4, 1)
l1 <- 'norm'
p1 <- c(0, 2)
s <- seq(-10, 10, 0.001)

# Generate data
r <- rmixing2(n, alpha, l0, l1, p0, p1)
x <- r$x
z <- r$z
```

### Compute true P(Y=0|X) and P(Y=1|X)

\[P(Y=1|x)=\frac{\alpha f_{\mathcal{N}(0,2)}(x)}{f_X(x)}\]

\[P(Y=0|x)=\frac{(1-\alpha) f_{\mathcal{N}(4,1)}(x)}{f_X(x)}\]

```{r}
# True posteriors at the observed points, and kernel estimates for the
# three bandwidth-selection methods.
p0_x <- dmixing2(x, alpha, l0, l1, p0, p1)$p0_t
p1_x <- dmixing2(x, alpha, l0, l1, p0, p1)$p1_t
u <- sort(x)
Re_cv <- Classif_NP(x, z, me = 'cv', plot = FALSE)
Re_aicc <- Classif_NP(x, z, me = 'aicc', plot = FALSE)
Re_df <- Classif_NP(x, z, me = 'df', plot = FALSE)

# True versus estimated P(Y = 0 | x).
plot(u, p0_x[order(x)], ylab = 'P(Y=0|x)', type = 'l')
lines(u, Re_cv$Prob[order(x), 1], col = "blue")
lines(u, Re_df$Prob[order(x), 1], col = "orange")
lines(u, Re_aicc$Prob[order(x), 1], col = "green")
legend(x = "topleft", leg = c('true', 'cv', "df", 'aicc'),
       col = c('black', "blue", "orange", "green"), lty = 1)

# True versus estimated P(Y = 1 | x).
plot(u, p1_x[order(x)], ylab = 'P(Y=1|x)', type = 'l')
lines(u, Re_cv$Prob[order(x), 2], col = "blue")
lines(u, Re_df$Prob[order(x), 2], col = "orange")
lines(u, Re_aicc$Prob[order(x), 2], col = "green")
legend(x = "topright", leg = c('true', 'cv', "df", 'aicc'),
       col = c('black', "blue", "orange", "green"), lty = 1)
```

### Compare to Bayes' Classifier

#### ROC curves and AUC

```{r}
# ROC of the true posterior (Bayes) versus the kernel estimates.
ROC_t <- ROC(z, p1_x, plot = TRUE)
ROC_cv <- ROC(z, Re_cv$Prob[, 2], plot = FALSE)$ROC
ROC_df <- ROC(z, Re_df$Prob[, 2], plot = FALSE)$ROC
ROC_aicc <- ROC(z, Re_aicc$Prob[, 2], plot = FALSE)$ROC
lines(ROC_cv[, 1], ROC_cv[, 2], col = "blue")
lines(ROC_df[, 1], ROC_df[, 2], col = "orange")
lines(ROC_aicc[, 1], ROC_aicc[, 2], col = "green")
legend(x = "topright", leg = c('true', 'cv', "df", 'aicc'),
       col = c('black', "blue", "orange", "green"), lty = 1)

# AUC of the Bayes classifier and of each estimate.
ROC_t$AUC
Re_cv$Auc
Re_df$Auc
Re_aicc$Auc
```

#### Confusion matrices and errors

```{r}
# Bayes classifier: assign class 1 when the true posterior exceeds 1/2.
Class_Bayes <- as.numeric(p1_x > 0.5)
(M_table <- table(z, Class_Bayes))
(err <- 1 - sum(diag(M_table)) / n)

# Confusion matrices and error rates of the kernel estimates.
Re_cv$M_table
Re_cv$Err
Re_df$M_table
Re_df$Err
Re_aicc$M_table
Re_aicc$Err
```

### 2.3.2 Real world data

#### Dopage

```{r}
# hema (hematocrit) and test (doping test result) are loaded from the
# .RData file into the global environment.
load('Dopage.RData')
x <- hema
z <- test
Classif_NP(x, z)
```

```{r}
load('Dopage.RData')
x <- hema
z <- test
# Classify 7 new hematocrit values drawn uniformly over the observed range.
(x0 <- runif(7, min(x), max(x)))
Classif_NP(x, z, x0)
```

#### Iris

```{r}
data('iris')
# For each of the four numeric covariates, classify Species from that
# single covariate and also predict 7 new points drawn uniformly in its
# observed range. Direct column access replaces the original
# attach(iris)/get(j) pattern (same values, no search-path side effect).
for (j in colnames(iris)[1:4]) {
  print(j)
  x <- iris[[j]]
  z <- iris$Species
  x0 <- runif(7, min(x), max(x))
  r <- Classif_NP(x, z, x0)
  cat(' \n')
  cat('Qualité de la classification \n')
  cat('matrice de confusion: \n')
  print(r$M_table)
  cat('Erreur:\n')
  print(r$Err)
  cat(' \n')
  cat('Estimation \n')
  cat('x0:\n')
  print(x0)
  cat('Class0:\n ')
  print(r$Class0)
  cat('Prob0:\n')
  print(r$Prob0)
  cat(' \n')
  cat(' \n')
}
```