作者

zsc

发布日期

2017年11月13日

## 1、标准化——数据处理

preProcess(x, method = c("center", "scale"),  
       thresh = 0.95,  pcaComp = NULL,  na.remove = TRUE,  k = 5,  
       knnSummary = mean,  outcome = NULL,  fudge = 0.2,  numUnique = 3,  
       verbose = FALSE,  freqCut = 95/5,  uniqueCut = 10,  cutoff = 0.9, ...)   predict(object, newdata, ...)  
       
# 解释
1. x: 为一个矩阵或数据框,对于非数值型变量将被忽略  

2. method: 指定数据标准化的方法,默认为"center""scale"。(必须同时使用这两个,若选一个只能对应中心化或均值化)  
   - 其中center表示预测变量值减去均值;scale表示预测变量值除以标准差,故默认标准化方法就是$(x-mu)/std$
   - 如果使用range方法,则数据标准为[0,1]的范围,即$(x-min)/(max-min)$
ppMethods <- c("BoxCox", "YeoJohnson", "expoTrans", "invHyperbolicSine",
               "center", "scale", "range", 
               "knnImpute", "bagImpute", "medianImpute", 
               "pca", "ica", 
               "spatialSign", 
               "ignore", "keep", 
               "remove", 
               "zv", "nzv", "conditionalX",
               "corr")

2. iris实例 ———— scale(均值方差标准化)

Show the code
library(caret)
library(dplyr)
head(scale(iris[,1:4]))
     Sepal.Length Sepal.Width Petal.Length Petal.Width
[1,]   -0.8976739  1.01560199    -1.335752   -1.311052
[2,]   -1.1392005 -0.13153881    -1.335752   -1.311052
[3,]   -1.3807271  0.32731751    -1.392399   -1.311052
[4,]   -1.5014904  0.09788935    -1.279104   -1.311052
[5,]   -1.0184372  1.24503015    -1.335752   -1.311052
[6,]   -0.5353840  1.93331463    -1.165809   -1.048667
Show the code
stand_scale <- preProcess(iris)  #采用(x-mu)/std的标准化方法,与scale()函数效果一样
head(predict(stand_scale ,iris[,1:4]))
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   -0.8976739  1.01560199    -1.335752   -1.311052
2   -1.1392005 -0.13153881    -1.335752   -1.311052
3   -1.3807271  0.32731751    -1.392399   -1.311052
4   -1.5014904  0.09788935    -1.279104   -1.311052
5   -1.0184372  1.24503015    -1.335752   -1.311052
6   -0.5353840  1.93331463    -1.165809   -1.048667
Show the code
all(scale(iris[,1:4])==predict(stand_scale ,iris[,1:4])) #中间存在误差,但是大多数以及相等了
[1] FALSE

3. iris实例————max-min(标准化)

Show the code
normalize <- function(x) {
  return ((x - min(x)) / (max(x) - min(x)))
}
iris_max=as.data.frame(lapply(iris[1:4],normalize))
head(iris_max)
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   0.22222222   0.6250000   0.06779661  0.04166667
2   0.16666667   0.4166667   0.06779661  0.04166667
3   0.11111111   0.5000000   0.05084746  0.04166667
4   0.08333333   0.4583333   0.08474576  0.04166667
5   0.19444444   0.6666667   0.06779661  0.04166667
6   0.30555556   0.7916667   0.11864407  0.12500000
Show the code
stand_range <- preProcess(iris[1:4],method = "range")
iris_max_prePro=predict(stand_range ,iris[1:4])
head(iris_max_prePro)
  Sepal.Length Sepal.Width Petal.Length Petal.Width
1   0.22222222   0.6250000   0.06779661  0.04166667
2   0.16666667   0.4166667   0.06779661  0.04166667
3   0.11111111   0.5000000   0.05084746  0.04166667
4   0.08333333   0.4583333   0.08474576  0.04166667
5   0.19444444   0.6666667   0.06779661  0.04166667
6   0.30555556   0.7916667   0.11864407  0.12500000
Show the code
all(iris_max==iris_max_prePro)# 正常,和我们预期的一样,两个数据相等
[1] TRUE

4. 还原标准化数据—–salce(均值-方差标准化)

比如: 我们经常对train数据进行标准化,并且用train数据的参数去标准化test数据,R中没有内置的函数。
还有:有些模型和数据 存在量纲差异,需要标准化后建模,并预测,预测出来的值是标准化后的数据,此时需要把预测的数据还原成真实数据,此时就需要原始数据的标准化参数了

Show the code
stand_scale <- preProcess(iris)  #采用(x-mu)/std的标准化方法,与scale()函数效果一样
iris_scale=predict(stand_scale,iris[,1:4])


iris_new <- t(apply(iris_scale,1,function(x){x*stand_scale$std+stand_scale$mean}))
iris_new %>% head(.,10)
      Sepal.Length Sepal.Width Petal.Length Petal.Width
 [1,]          5.1         3.5          1.4         0.2
 [2,]          4.9         3.0          1.4         0.2
 [3,]          4.7         3.2          1.3         0.2
 [4,]          4.6         3.1          1.5         0.2
 [5,]          5.0         3.6          1.4         0.2
 [6,]          5.4         3.9          1.7         0.4
 [7,]          4.6         3.4          1.4         0.3
 [8,]          5.0         3.4          1.5         0.2
 [9,]          4.4         2.9          1.4         0.2
[10,]          4.9         3.1          1.5         0.1
Show the code
head(iris,10)
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1           5.1         3.5          1.4         0.2  setosa
2           4.9         3.0          1.4         0.2  setosa
3           4.7         3.2          1.3         0.2  setosa
4           4.6         3.1          1.5         0.2  setosa
5           5.0         3.6          1.4         0.2  setosa
6           5.4         3.9          1.7         0.4  setosa
7           4.6         3.4          1.4         0.3  setosa
8           5.0         3.4          1.5         0.2  setosa
9           4.4         2.9          1.4         0.2  setosa
10          4.9         3.1          1.5         0.1  setosa

5. 还原标准化数据—–max-min

Show the code
stand_range <- preProcess(iris[1:4],method = "range")
iris_max_prePro=predict(stand_range ,iris[1:4])

t(apply(iris_max_prePro,1,function(x){x*(stand_range$ranges[2,]-stand_range$ranges[1,])+stand_range$ranges[1,] })) %>% head(.,10)
      Sepal.Length Sepal.Width Petal.Length Petal.Width
 [1,]          5.1         3.5          1.4         0.2
 [2,]          4.9         3.0          1.4         0.2
 [3,]          4.7         3.2          1.3         0.2
 [4,]          4.6         3.1          1.5         0.2
 [5,]          5.0         3.6          1.4         0.2
 [6,]          5.4         3.9          1.7         0.4
 [7,]          4.6         3.4          1.4         0.3
 [8,]          5.0         3.4          1.5         0.2
 [9,]          4.4         2.9          1.4         0.2
[10,]          4.9         3.1          1.5         0.1
Show the code
head(iris,10)
   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
1           5.1         3.5          1.4         0.2  setosa
2           4.9         3.0          1.4         0.2  setosa
3           4.7         3.2          1.3         0.2  setosa
4           4.6         3.1          1.5         0.2  setosa
5           5.0         3.6          1.4         0.2  setosa
6           5.4         3.9          1.7         0.4  setosa
7           4.6         3.4          1.4         0.3  setosa
8           5.0         3.4          1.5         0.2  setosa
9           4.4         2.9          1.4         0.2  setosa
10          4.9         3.1          1.5         0.1  setosa

6. 还原标准化数据—DMwR::unscale()函数

unscale(vals, norm.data, col.ids)

    vals    : 要还原标准化的数值型矩阵,或者数值型数据框  
    norm.data    : 以及标准化后的数据,必须是用scale()函数标准化后的数据  
    col.ids : 那些列可以不用标准化(默认全部列都有进行)  
Show the code
############ 新版本的的R好像没有这个包了
# 我把iris数据集分为7:3
library(caret)
library(dplyr)
library(DMwR)
library(e1071)



#############################################################
########## 标准化数据
# 
data(algae)
algae[,4:12] %>% head() # 可以发现数据存在量纲的差异,我们要预测a1(因变量) ,其他为自变量
normData <- scale(algae[,4:12]) # 把train数据进行标准化
t <- svm(a1 ~ .,normData[1:100,] %>% as.data.frame() ) #决策树模型
normPs <- predict(t,as.data.frame(normData[101:nrow(normData),])) # 可以发现预测的数据不是最终的数据,需要进行标准化还原
normPs %>% head() # 预测值 ,预测数据,都在0--1范围
algae[101:nrow(normData),"a1"] %>% head()# 真实值
unscale(normPs,normData) %>% head()#于是还原预测数据
Show the code
sessionInfo()
R version 4.2.1 (2022-06-23)
Platform: aarch64-apple-darwin20 (64-bit)
Running under: macOS Monterey 12.5.1

Matrix products: default
BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib

locale:
[1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
[1] dplyr_1.0.9     caret_6.0-93    lattice_0.20-45 ggplot2_3.3.6  

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.9           lubridate_1.8.0      listenv_0.8.0       
 [4] class_7.3-20         assertthat_0.2.1     digest_0.6.29       
 [7] ipred_0.9-13         foreach_1.5.2        utf8_1.2.2          
[10] parallelly_1.32.1    R6_2.5.1             plyr_1.8.7          
[13] stats4_4.2.1         hardhat_1.2.0        evaluate_0.16       
[16] pillar_1.8.1         rlang_1.0.4          data.table_1.14.2   
[19] rstudioapi_0.14      rpart_4.1.16         Matrix_1.4-1        
[22] rmarkdown_2.16.1     splines_4.2.1        gower_1.0.0         
[25] stringr_1.4.1        htmlwidgets_1.5.4    munsell_0.5.0       
[28] compiler_4.2.1       xfun_0.32            pkgconfig_2.0.3     
[31] globals_0.16.0       htmltools_0.5.3      nnet_7.3-17         
[34] tidyselect_1.1.2     tibble_3.1.8         prodlim_2019.11.13  
[37] codetools_0.2-18     fansi_1.0.3          future_1.27.0       
[40] withr_2.5.0          ModelMetrics_1.2.2.2 MASS_7.3-58.1       
[43] recipes_1.0.1        grid_4.2.1           nlme_3.1-159        
[46] jsonlite_1.8.0       gtable_0.3.0         lifecycle_1.0.1     
[49] DBI_1.1.3            magrittr_2.0.3       pROC_1.18.0         
[52] scales_1.2.1         future.apply_1.9.0   cli_3.3.0           
[55] stringi_1.7.8        reshape2_1.4.4       timeDate_4021.104   
[58] generics_0.1.3       vctrs_0.4.1          lava_1.6.10         
[61] iterators_1.0.14     tools_4.2.1          glue_1.6.2          
[64] purrr_0.3.4          parallel_4.2.1       fastmap_1.1.0       
[67] survival_3.4-0       yaml_2.3.5           colorspace_2.0-3    
[70] knitr_1.40