作业25答案
用到的包(如提示包不存在可以用install.packages("包名")的方式安装):
require('readr')
require('ggplot2')
require('dplyr')
require('tidyr')
require('caret')
require('corrplot')
require('Hmisc')
require('parallel')
require('doParallel')
require('ggthemes')
require('e1071')
练习1 数据导入
- 将数据集的csv文件导入
# Load the voice dataset from "voice.csv" (path is relative to the working
# directory); the first row of the file supplies the column names.
voice_Original <- read.csv(file = "voice.csv", header = TRUE)
练习2 查看数据
- 用Hmisc包中的describe函数对数据集进行概述
# Print a per-variable summary of the data set (counts, missing values,
# distinct values, mean, quantiles, lowest/highest values).
Hmisc::describe(voice_Original)
voice_Original
21 Variables 3168 Observations
-------------------------------------------------------------------
meanfreq
n missing unique Info Mean .05 .10 .25
3168 0 3166 1 0.1809 0.1260 0.1411 0.1637
.50 .75 .90 .95
0.1848 0.1991 0.2177 0.2291
lowest : 0.03936 0.04825 0.05965 0.05978 0.06218
highest: 0.24353 0.24436 0.24704 0.24964 0.25112
-------------------------------------------------------------------
sd
n missing unique Info Mean .05 .10 .25
3168 0 3166 1 0.05713 0.03162 0.03396 0.04195
.50 .75 .90 .95
0.05916 0.06702 0.07966 0.08549
lowest : 0.01836 0.02178 0.02400 0.02427 0.02456
highest: 0.11126 0.11126 0.11265 0.11451 0.11527
-------------------------------------------------------------------
median
n missing unique Info Mean .05 .10 .25
3168 0 3077 1 0.1856 0.1164 0.1340 0.1696
.50 .75 .90 .95
0.1900 0.2106 0.2274 0.2358
lowest : 0.01097 0.01359 0.01579 0.02699 0.02936
highest: 0.25663 0.25698 0.25742 0.26054 0.26122
-------------------------------------------------------------------
Q25
n missing unique Info Mean .05 .10 .25
3168 0 3103 1 0.1405 0.04358 0.07509 0.11109
.50 .75 .90 .95
0.14029 0.17594 0.20063 0.21524
lowest : 0.0002288 0.0002355 0.0002395 0.0002502 0.0002669
highest: 0.2394595 0.2405416 0.2407352 0.2421235 0.2473469
-------------------------------------------------------------------
Q75
n missing unique Info Mean .05 .10 .25
3168 0 3034 1 0.2248 0.1874 0.1963 0.2087
.50 .75 .90 .95
0.2257 0.2437 0.2536 0.2577
lowest : 0.04295 0.05827 0.07596 0.09019 0.09267
highest: 0.26879 0.26892 0.26894 0.26985 0.27347
-------------------------------------------------------------------
IQR
n missing unique Info Mean .05 .10 .25
3168 0 3073 1 0.08431 0.02549 0.02931 0.04256
.50 .75 .90 .95
0.09428 0.11418 0.13284 0.15632
lowest : 0.01456 0.01492 0.01511 0.01549 0.01659
highest: 0.24530 0.24597 0.24819 0.24877 0.25223
-------------------------------------------------------------------
skew
n missing unique Info Mean .05 .10 .25
3168 0 3166 1 3.14 1.123 1.299 1.650
.50 .75 .90 .95
2.197 2.932 3.916 6.918
lowest : 0.1417 0.2850 0.3260 0.5296 0.5487
highest: 32.3507 33.1673 33.5663 34.5375 34.7255
-------------------------------------------------------------------
kurt
n missing unique Info Mean .05 .10 .25
3168 0 3166 1 36.57 3.755 4.293 5.670
.50 .75 .90 .95
8.318 13.649 27.294 75.169
lowest : 2.068 2.210 2.269 2.293 2.463
highest: 1128.535 1193.434 1202.685 1271.354 1309.613
-------------------------------------------------------------------
sp.ent
n missing unique Info Mean .05 .10 .25
3168 0 3166 1 0.8951 0.8168 0.8322 0.8618
.50 .75 .90 .95
0.9018 0.9287 0.9513 0.9630
lowest : 0.7387 0.7476 0.7477 0.7485 0.7487
highest: 0.9764 0.9765 0.9765 0.9785 0.9820
-------------------------------------------------------------------
sfm
n missing unique Info Mean .05 .10 .25
3168 0 3166 1 0.4082 0.1584 0.1883 0.2580
.50 .75 .90 .95
0.3963 0.5337 0.6713 0.7328
lowest : 0.03688 0.08024 0.08096 0.08220 0.08266
highest: 0.82259 0.82267 0.82610 0.83135 0.84294
-------------------------------------------------------------------
mode
n missing unique Info Mean .05 .10 .25
3168 0 2825 1 0.1653 0.00000 0.01629 0.11802
.50 .75 .90 .95
0.18660 0.22110 0.24901 0.26081
lowest : 0.0000000 0.0007279 0.0007749 0.0008008 0.0008427
highest: 0.2791181 0.2795230 0.2795852 0.2797034 0.2800000
-------------------------------------------------------------------
centroid
n missing unique Info Mean .05 .10 .25
3168 0 3166 1 0.1809 0.1260 0.1411 0.1637
.50 .75 .90 .95
0.1848 0.1991 0.2177 0.2291
lowest : 0.03936 0.04825 0.05965 0.05978 0.06218
highest: 0.24353 0.24436 0.24704 0.24964 0.25112
-------------------------------------------------------------------
meanfun
n missing unique Info Mean .05 .10 .25
3168 0 3166 1 0.1428 0.09363 0.10160 0.11700
.50 .75 .90 .95
0.14052 0.16958 0.18519 0.19343
lowest : 0.05557 0.05705 0.06097 0.06254 0.06348
highest: 0.22342 0.22576 0.22915 0.23114 0.23764
-------------------------------------------------------------------
minfun
n missing unique Info Mean .05 .10 .25
3168 0 913 1 0.0368 0.01579 0.01613 0.01822
.50 .75 .90 .95
0.04611 0.04790 0.05054 0.05644
lowest : 0.009775 0.009785 0.009901 0.009911 0.010163
highest: 0.168421 0.178571 0.185185 0.200000 0.204082
-------------------------------------------------------------------
maxfun
n missing unique Info Mean .05 .10 .25
3168 0 123 0.99 0.2588 0.1925 0.2192 0.2540
.50 .75 .90 .95
0.2712 0.2775 0.2791 0.2791
lowest : 0.1031 0.1053 0.1087 0.1111 0.1124
highest: 0.2774 0.2775 0.2778 0.2791 0.2791
-------------------------------------------------------------------
meandom
n missing unique Info Mean .05 .10 .25
3168 0 2999 1 0.8292 0.1045 0.1888 0.4198
.50 .75 .90 .95
0.7658 1.1772 1.5602 1.8004
lowest : 0.007812 0.007979 0.007990 0.008185 0.008247
highest: 2.544271 2.591580 2.676989 2.805246 2.957682
-------------------------------------------------------------------
mindom
n missing unique Info Mean .05 .10
3168 0 77 0.92 0.05265 0.007812 0.007812
.25 .50 .75 .90 .95
0.007812 0.023438 0.070312 0.164062 0.187500
lowest : 0.004883 0.007812 0.014648 0.015625 0.019531
highest: 0.343750 0.351562 0.400391 0.449219 0.458984
-------------------------------------------------------------------
maxdom
n missing unique Info Mean .05 .10 .25
3168 0 1054 1 5.047 0.3125 0.6094 2.0703
.50 .75 .90 .95
4.9922 7.0078 9.4219 10.6406
lowest : 0.007812 0.015625 0.023438 0.054688 0.070312
highest: 21.515625 21.562500 21.796875 21.843750 21.867188
-------------------------------------------------------------------
dfrange
n missing unique Info Mean .05 .10 .25
3168 0 1091 1 4.995 0.2656 0.5607 2.0449
.50 .75 .90 .95
4.9453 6.9922 9.3750 10.6090
lowest : 0.000000 0.007812 0.015625 0.019531 0.024414
highest: 21.492188 21.539062 21.773438 21.820312 21.843750
-------------------------------------------------------------------
modindx
n missing unique Info Mean .05 .10 .25
3168 0 3079 1 0.1738 0.05775 0.07365 0.09977
.50 .75 .90 .95
0.13936 0.20918 0.32436 0.40552
lowest : 0.00000 0.01988 0.02165 0.02194 0.02217
highest: 0.84448 0.85470 0.85776 0.87950 0.93237
-------------------------------------------------------------------
label
n missing unique
3168 0 2
female (1584, 50%), male (1584, 50%)
-------------------------------------------------------------------
练习3 数据操作
- 以0.9为阈值,将数据集中的sp.ent属性分为两类(大于0.9记为"High",否则记为"Low")
### recode sp.ent as a categorical variable
# The numeric sp.ent column is overwritten in place: values greater than 0.9
# become "High", all other values become "Low".
voice_Original <- mutate(
  voice_Original,
  sp.ent = ifelse(sp.ent > 0.9, "High", "Low")
)
练习4 图形绘制操作
- 用ggplot画出meanfreq(x轴)与dfrange(y轴)的散点图,并按label属性对点着色
### visual exploration of the dataset
# Scatter plot of meanfreq (x axis) against dfrange (y axis); points are
# coloured by label and the plot uses the ggthemes WSJ theme.
require(ggplot2)
ggplot(data = voice_Original, mapping = aes(x = meanfreq, y = dfrange)) +
  geom_point(mapping = aes(color = label)) +
  theme_wsj()