作者

zsc

发布日期

2018年8月31日

独热编码–哑变量

独热编码: n种状态 转变为n列

哑变量: n种状态转变为n-1列(去掉一个基准水平,目的:为了防止共线性的问题)

Show the code
# Demo data frame with 20 rows: three integer columns drawn with sample(),
# a factor (Fourth), an ordered factor (Fifth), a plain character column
# (Sixth) and another factor (Seventh). All level strings start with "=".
# stringsAsFactors = FALSE keeps Sixth as character instead of a factor.
# The reserved word FALSE is used rather than `F`, which is an ordinary
# variable that can be reassigned.
# NOTE: sample() is called without a seed, so the integer columns differ
# between runs.
testFrame <- data.frame(
  First = sample(1:10, 20, replace = TRUE),
  Second = sample(1:20, 20, replace = TRUE),
  Third = sample(1:10, 20, replace = TRUE),
  Fourth = factor(rep(c("=A", "=B", "=C", "=D"), 5)),
  Fifth = ordered(rep(c("=E", "=F", "=G", "=H", "=I"), 4)),
  Sixth = rep(c("=a", "=b"), 10),
  Seventh = factor(c(rep(c("=J", "=K", "=L"), 6), "=J", "=K")),
  stringsAsFactors = FALSE
)

head(testFrame)
#>   First Second Third Fourth Fifth Sixth Seventh
#> 1     8     14    10     =A    =E    =a      =J
#> 2     2     14     4     =B    =F    =b      =K
#> 3     8      5     7     =C    =G    =a      =L
#> 4     4     14     4     =D    =H    =b      =J
#> 5     2      6     3     =A    =I    =a      =K
#> 6    10     14     9     =B    =E    =b      =L
# Show each column's type: integer, factor, ordered factor and character.
str(testFrame)
#> 'data.frame':    20 obs. of  7 variables:
#>  $ First  : int  8 2 8 4 2 10 8 2 4 3 ...
#>  $ Second : int  14 14 5 14 6 14 16 6 4 4 ...
#>  $ Third  : int  10 4 7 4 3 9 7 3 3 10 ...
#>  $ Fourth : Factor w/ 4 levels "=A","=B","=C",..: 1 2 3 4 1 2 3 4 1 2 ...
#>  $ Fifth  : Ord.factor w/ 5 levels "=E"<"=F"<"=G"<..: 1 2 3 4 5 1 2 3 4 5 ...
#>  $ Sixth  : chr  "=a" "=b" "=a" "=b" ...
#>  $ Seventh: Factor w/ 3 levels "=J","=K","=L": 1 2 3 1 2 3 1 2 3 1 ...

独热编码1

Show the code
###########################################################################
#### Formula conventions used in the examples below:
####   * right-hand side: `-1` drops the intercept term and at the same time
####     produces a full one-hot encoding (only when there is a single
####     factor variable)
####   * left-hand side: the variable to exclude from the encoding
###########################################################################


###### One-hot encoding 1: mltools::one_hot()
library(data.table)
library(magrittr)
library(mltools) # this encoder depends on data.table
iris_dt <- data.table(iris)
head(one_hot(iris_dt))
#>    Sepal.Length Sepal.Width Petal.Length Petal.Width Species_setosa
#> 1:          5.1         3.5          1.4         0.2              1
#> 2:          4.9         3.0          1.4         0.2              1
#> 3:          4.7         3.2          1.3         0.2              1
#> 4:          4.6         3.1          1.5         0.2              1
#> 5:          5.0         3.6          1.4         0.2              1
#> 6:          5.4         3.9          1.7         0.4              1
#>    Species_versicolor Species_virginica
#> 1:                  0                 0
#> 2:                  0                 0
#> 3:                  0                 0
#> 4:                  0                 0
#> 5:                  0                 0
#> 6:                  0                 0

独热编码2

Show the code
###### One-hot encoding 2: onehot package
# Two steps: fit the encoder first, then predict() to get the encoded matrix.
library(onehot)
encoder <- onehot(iris)
x <- predict(encoder, iris)
head(x)
#>      Sepal.Length Sepal.Width Petal.Length Petal.Width Species=setosa
#> [1,]          5.1         3.5          1.4         0.2              1
#> [2,]          4.9         3.0          1.4         0.2              1
#> [3,]          4.7         3.2          1.3         0.2              1
#> [4,]          4.6         3.1          1.5         0.2              1
#> [5,]          5.0         3.6          1.4         0.2              1
#> [6,]          5.4         3.9          1.7         0.4              1
#>      Species=versicolor Species=virginica
#> [1,]                  0                 0
#> [2,]                  0                 0
#> [3,]                  0                 0
#> [4,]                  0                 0
#> [5,]                  0                 0
#> [6,]                  0                 0

## caret::dummyVars() follows a similar fit-then-predict pattern.
## With the intercept kept and fullRank = TRUE, the printed result has only
## two Species columns (versicolor, virginica), i.e. n - 1 dummy columns.
library(caret)
dummy <- dummyVars(~ ., data = iris, fullRank = TRUE)
head(predict(dummy, iris))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species.versicolor
#> 1          5.1         3.5          1.4         0.2                  0
#> 2          4.9         3.0          1.4         0.2                  0
#> 3          4.7         3.2          1.3         0.2                  0
#> 4          4.6         3.1          1.5         0.2                  0
#> 5          5.0         3.6          1.4         0.2                  0
#> 6          5.4         3.9          1.7         0.4                  0
#>   Species.virginica
#> 1                 0
#> 2                 0
#> 3                 0
#> 4                 0
#> 5                 0
#> 6                 0

## Dropping the intercept (`- 1`) makes all three Species columns appear in
## the printed result, even though fullRank = TRUE is still set.
dummy <- dummyVars(~ . - 1, data = iris, fullRank = TRUE)
head(predict(dummy, iris))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Species.setosa
#> 1          5.1         3.5          1.4         0.2              1
#> 2          4.9         3.0          1.4         0.2              1
#> 3          4.7         3.2          1.3         0.2              1
#> 4          4.6         3.1          1.5         0.2              1
#> 5          5.0         3.6          1.4         0.2              1
#> 6          5.4         3.9          1.7         0.4              1
#>   Species.versicolor Species.virginica
#> 1                  0                 0
#> 2                  0                 0
#> 3                  0                 0
#> 4                  0                 0
#> 5                  0                 0
#> 6                  0                 0

独热编码3

Show the code
###### One-hot encoding 3: base R model.matrix() without an intercept
head(model.matrix(~ . - 1, iris))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1          5.1         3.5          1.4         0.2             1
#> 2          4.9         3.0          1.4         0.2             1
#> 3          4.7         3.2          1.3         0.2             1
#> 4          4.6         3.1          1.5         0.2             1
#> 5          5.0         3.6          1.4         0.2             1
#> 6          5.4         3.9          1.7         0.4             1
#>   Speciesversicolor Speciesvirginica
#> 1                 0                0
#> 2                 0                0
#> 3                 0                0
#> 4                 0                0
#> 5                 0                0
#> 6                 0                0
#### The Matrix package works the same way, except that it returns a sparse
#### matrix, which is particularly friendly to large data sets
library(Matrix)
head(sparse.model.matrix(~ . - 1, data = iris))
#> 6 x 7 sparse Matrix of class "dgCMatrix"
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1          5.1         3.5          1.4         0.2             1
#> 2          4.9         3.0          1.4         0.2             1
#> 3          4.7         3.2          1.3         0.2             1
#> 4          4.6         3.1          1.5         0.2             1
#> 5          5.0         3.6          1.4         0.2             1
#> 6          5.4         3.9          1.7         0.4             1
#>   Speciesversicolor Speciesvirginica
#> 1                 .                .
#> 2                 .                .
#> 3                 .                .
#> 4                 .                .
#> 5                 .                .
#> 6                 .                .

独热编码4

Show the code

###### One-hot encoding 4: qdapTools::mtabulate()
library(qdapTools)
# Applied here to a single factor column; the resulting column names are the
# factor levels. The author notes it should be given one column rather than
# a whole data frame. NOTE(review): confirm against the mtabulate() docs.
head(mtabulate(iris$Species))
#>   setosa versicolor virginica
#> 1      1          0         0
#> 2      1          0         0
#> 3      1          0         0
#> 4      1          0         0
#> 5      1          0         0
#> 6      1          0         0
# Likewise, nnet::class.ind() is applied to a single factor vector (not a
# data frame); the resulting column names are the factor levels.
library(nnet)
head(class.ind(iris$Species))
#>      setosa versicolor virginica
#> [1,]      1          0         0
#> [2,]      1          0         0
#> [3,]      1          0         0
#> [4,]      1          0         0
#> [5,]      1          0         0
#> [6,]      1          0         0

独热编码5

Show the code
###### One-hot encoding 5: ade4::acm.disjonctif()
## Only works on a data frame of factors; it cannot take a bare vector or
## numeric columns. drop = FALSE keeps Species as a one-column data frame
## (the reserved word FALSE is used instead of the reassignable `F`).
library(ade4)
head(acm.disjonctif(iris[, 5, drop = FALSE]))
#>   Species.setosa Species.versicolor Species.virginica
#> 1              1                  0                 0
#> 2              1                  0                 0
#> 3              1                  0                 0
#> 4              1                  0                 0
#> 5              1                  0                 0
#> 6              1                  0                 0

独热编码6

  • dummies 这个包好像已经无法从 CRAN 安装了,所以下面的代码仅作参考,没有运行输出
Show the code
###### One-hot encoding 6: dummies::dummy.data.frame()
library(dummies)
# Encode only the named column. Per the author's note, all = FALSE would
# return just the converted factor columns; all = TRUE (the reserved word,
# not the reassignable `T`) is used here.
alldata <- dummy.data.frame(iris, names = c("Species"), sep = "_", all = TRUE)
head(alldata)

# Turn every column of class "factor" into one-hot columns.
head(dummy.data.frame(iris, dummy.class = "factor"))
head(dummy.data.frame(testFrame, dummy.class = "factor"))

独热编码7

Show the code
###### One-hot encoding 7: useful::build.x() / build.y()
library(useful)
# With the intercept kept, the printed matrix has an (Intercept) column and
# only two Species columns.
head(build.x(~ ., iris))
#>   (Intercept) Sepal.Length Sepal.Width Petal.Length Petal.Width
#> 1           1          5.1         3.5          1.4         0.2
#> 2           1          4.9         3.0          1.4         0.2
#> 3           1          4.7         3.2          1.3         0.2
#> 4           1          4.6         3.1          1.5         0.2
#> 5           1          5.0         3.6          1.4         0.2
#> 6           1          5.4         3.9          1.7         0.4
#>   Speciesversicolor Speciesvirginica
#> 1                 0                0
#> 2                 0                0
#> 3                 0                0
#> 4                 0                0
#> 5                 0                0
#> 6                 0                0
# Without the intercept (`- 1`), all three Species columns are printed.
head(build.x(~ . - 1, iris))
#>   Sepal.Length Sepal.Width Petal.Length Petal.Width Speciessetosa
#> 1          5.1         3.5          1.4         0.2             1
#> 2          4.9         3.0          1.4         0.2             1
#> 3          4.7         3.2          1.3         0.2             1
#> 4          4.6         3.1          1.5         0.2             1
#> 5          5.0         3.6          1.4         0.2             1
#> 6          5.4         3.9          1.7         0.4             1
#>   Speciesversicolor Speciesvirginica
#> 1                 0                0
#> 2                 0                0
#> 3                 0                0
#> 4                 0                0
#> 5                 0                0
#> 6                 0                0
# build.y() simply keeps the left-hand side of the formula, unmodified.
build.y(Species ~ ., data = iris)
#>   [1] setosa     setosa     setosa     setosa     setosa     setosa    
#>   [7] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [13] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [19] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [25] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [31] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [37] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [43] setosa     setosa     setosa     setosa     setosa     setosa    
#>  [49] setosa     setosa     versicolor versicolor versicolor versicolor
#>  [55] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [61] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [67] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [73] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [79] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [85] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [91] versicolor versicolor versicolor versicolor versicolor versicolor
#>  [97] versicolor versicolor versicolor versicolor virginica  virginica 
#> [103] virginica  virginica  virginica  virginica  virginica  virginica 
#> [109] virginica  virginica  virginica  virginica  virginica  virginica 
#> [115] virginica  virginica  virginica  virginica  virginica  virginica 
#> [121] virginica  virginica  virginica  virginica  virginica  virginica 
#> [127] virginica  virginica  virginica  virginica  virginica  virginica 
#> [133] virginica  virginica  virginica  virginica  virginica  virginica 
#> [139] virginica  virginica  virginica  virginica  virginica  virginica 
#> [145] virginica  virginica  virginica  virginica  virginica  virginica 
#> Levels: setosa versicolor virginica

独热编码8

Show the code
###### One-hot encoding 8: model.matrix() on the mixed-type testFrame
# First is the left-hand side and is therefore not part of the matrix.
# The ordered factor Fifth is printed as contrast columns (Fifth.L, Fifth.Q,
# Fifth.C, Fifth^4) rather than as 0/1 indicator columns.
head(model.matrix(First ~ Second + Fourth + Fifth, data = testFrame))
#>   (Intercept) Second Fourth=B Fourth=C Fourth=D       Fifth.L    Fifth.Q
#> 1           1     14        0        0        0 -6.324555e-01  0.5345225
#> 2           1     14        1        0        0 -3.162278e-01 -0.2672612
#> 3           1      5        0        1        0 -1.481950e-18 -0.5345225
#> 4           1     14        0        0        1  3.162278e-01 -0.2672612
#> 5           1      6        0        0        0  6.324555e-01  0.5345225
#> 6           1     14        1        0        0 -6.324555e-01  0.5345225
#>         Fifth.C    Fifth^4
#> 1 -3.162278e-01  0.1195229
#> 2  6.324555e-01 -0.4780914
#> 3 -3.893692e-16  0.7171372
#> 4 -6.324555e-01 -0.4780914
#> 5  3.162278e-01  0.1195229
#> 6 -3.162278e-01  0.1195229
# Without the intercept, only the first factor (Fourth) is printed with all
# of its levels; Sixth and Seventh still lose their first level.
head(model.matrix(First ~ . - 1, data = testFrame))
#>   Second Third Fourth=A Fourth=B Fourth=C Fourth=D       Fifth.L    Fifth.Q
#> 1     14    10        1        0        0        0 -6.324555e-01  0.5345225
#> 2     14     4        0        1        0        0 -3.162278e-01 -0.2672612
#> 3      5     7        0        0        1        0 -1.481950e-18 -0.5345225
#> 4     14     4        0        0        0        1  3.162278e-01 -0.2672612
#> 5      6     3        1        0        0        0  6.324555e-01  0.5345225
#> 6     14     9        0        1        0        0 -6.324555e-01  0.5345225
#>         Fifth.C    Fifth^4 Sixth=b Seventh=K Seventh=L
#> 1 -3.162278e-01  0.1195229       0         0         0
#> 2  6.324555e-01 -0.4780914       1         1         0
#> 3 -3.893692e-16  0.7171372       0         0         1
#> 4 -6.324555e-01 -0.4780914       1         0         0
#> 5  3.162278e-01  0.1195229       0         1         0
#> 6 -3.162278e-01  0.1195229       1         0         1
# With the intercept kept, every factor loses its first level in the output.
head(model.matrix(First ~ ., data = testFrame))
#>   (Intercept) Second Third Fourth=B Fourth=C Fourth=D       Fifth.L    Fifth.Q
#> 1           1     14    10        0        0        0 -6.324555e-01  0.5345225
#> 2           1     14     4        1        0        0 -3.162278e-01 -0.2672612
#> 3           1      5     7        0        1        0 -1.481950e-18 -0.5345225
#> 4           1     14     4        0        0        1  3.162278e-01 -0.2672612
#> 5           1      6     3        0        0        0  6.324555e-01  0.5345225
#> 6           1     14     9        1        0        0 -6.324555e-01  0.5345225
#>         Fifth.C    Fifth^4 Sixth=b Seventh=K Seventh=L
#> 1 -3.162278e-01  0.1195229       0         0         0
#> 2  6.324555e-01 -0.4780914       1         1         0
#> 3 -3.893692e-16  0.7171372       0         0         1
#> 4 -6.324555e-01 -0.4780914       1         0         0
#> 5  3.162278e-01  0.1195229       0         1         0
#> 6 -3.162278e-01  0.1195229       1         0         1

独热编码 转变为原始变量 (即逆运算)

Show the code
## Inverse operation: turn one-hot columns back into one label per row.

# Build the indicator columns for Species (no intercept), then convert the
# matrix to a data frame so that names() yields the column labels.
d <- model.matrix(~ Species - 1, iris)
d <- data.frame(d)
# For each row take the column holding the maximum (the 1); rows that have
# no 1 at all become NA. ties.method = "first" replaces max.col()'s default
# random tie-breaking, so a row with several 1s resolves reproducibly.
# The labels keep the model.matrix prefix, e.g. "Speciessetosa".
ifelse(rowSums(d) >= 1, names(d)[max.col(d, ties.method = "first")], NA)
#>                   1                   2                   3                   4 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                   5                   6                   7                   8 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                   9                  10                  11                  12 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  13                  14                  15                  16 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  17                  18                  19                  20 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  21                  22                  23                  24 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  25                  26                  27                  28 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  29                  30                  31                  32 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  33                  34                  35                  36 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  37                  38                  39                  40 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  41                  42                  43                  44 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  45                  46                  47                  48 
#>     "Speciessetosa"     "Speciessetosa"     "Speciessetosa"     "Speciessetosa" 
#>                  49                  50                  51                  52 
#>     "Speciessetosa"     "Speciessetosa" "Speciesversicolor" "Speciesversicolor" 
#>                  53                  54                  55                  56 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  57                  58                  59                  60 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  61                  62                  63                  64 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  65                  66                  67                  68 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  69                  70                  71                  72 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  73                  74                  75                  76 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  77                  78                  79                  80 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  81                  82                  83                  84 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  85                  86                  87                  88 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  89                  90                  91                  92 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  93                  94                  95                  96 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                  97                  98                  99                 100 
#> "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" "Speciesversicolor" 
#>                 101                 102                 103                 104 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 105                 106                 107                 108 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 109                 110                 111                 112 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 113                 114                 115                 116 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 117                 118                 119                 120 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 121                 122                 123                 124 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 125                 126                 127                 128 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 129                 130                 131                 132 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 133                 134                 135                 136 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 137                 138                 139                 140 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 141                 142                 143                 144 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 145                 146                 147                 148 
#>  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica"  "Speciesvirginica" 
#>                 149                 150 
#>  "Speciesvirginica"  "Speciesvirginica"
Show the code
# Print the R version, platform and package versions of this session.
sessionInfo()
#> R version 4.2.1 (2022-06-23)
#> Platform: aarch64-apple-darwin20 (64-bit)
#> Running under: macOS Monterey 12.5.1
#> 
#> Matrix products: default
#> BLAS:   /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRblas.0.dylib
#> LAPACK: /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/lib/libRlapack.dylib
#> 
#> locale:
#> [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] useful_1.2.6      ade4_1.7-19       nnet_7.3-17       qdapTools_1.3.5  
#>  [5] Matrix_1.4-1      caret_6.0-93      lattice_0.20-45   ggplot2_3.3.6    
#>  [9] onehot_0.1.1      mltools_0.3.5     magrittr_2.0.3    data.table_1.14.2
#> 
#> loaded via a namespace (and not attached):
#>  [1] Rcpp_1.0.9           lubridate_1.8.0      listenv_0.8.0       
#>  [4] class_7.3-20         assertthat_0.2.1     digest_0.6.29       
#>  [7] ipred_0.9-13         foreach_1.5.2        utf8_1.2.2          
#> [10] parallelly_1.32.1    chron_2.3-57         R6_2.5.1            
#> [13] plyr_1.8.7           stats4_4.2.1         hardhat_1.2.0       
#> [16] evaluate_0.16        pillar_1.8.1         rlang_1.0.4         
#> [19] rstudioapi_0.14      rpart_4.1.16         rmarkdown_2.16.1    
#> [22] splines_4.2.1        gower_1.0.0          stringr_1.4.1       
#> [25] htmlwidgets_1.5.4    RCurl_1.98-1.8       munsell_0.5.0       
#> [28] compiler_4.2.1       xfun_0.32            pkgconfig_2.0.3     
#> [31] globals_0.16.0       htmltools_0.5.3      tidyselect_1.1.2    
#> [34] tibble_3.1.8         prodlim_2019.11.13   codetools_0.2-18    
#> [37] fansi_1.0.3          future_1.27.0        dplyr_1.0.9         
#> [40] withr_2.5.0          bitops_1.0-7         ModelMetrics_1.2.2.2
#> [43] MASS_7.3-58.1        recipes_1.0.1        grid_4.2.1          
#> [46] nlme_3.1-159         jsonlite_1.8.0       gtable_0.3.0        
#> [49] lifecycle_1.0.1      DBI_1.1.3            pROC_1.18.0         
#> [52] scales_1.2.1         future.apply_1.9.0   cli_3.3.0           
#> [55] stringi_1.7.8        reshape2_1.4.4       timeDate_4021.104   
#> [58] generics_0.1.3       vctrs_0.4.1          lava_1.6.10         
#> [61] iterators_1.0.14     tools_4.2.1          glue_1.6.2          
#> [64] purrr_0.3.4          parallel_4.2.1       fastmap_1.1.0       
#> [67] survival_3.4-0       yaml_2.3.5           colorspace_2.0-3    
#> [70] knitr_1.40