Numeric univariate and multivariate analysis in R
Check html version
Importing the Data
#install.packages(c("FactoMineR", "factoextra"))
``` r
# Attach the analysis packages, load the decathlon2 data set and preview
# its first six rows.
library(FactoMineR)
library(factoextra)
data("decathlon2")
head(decathlon2, n = 6L)
## X100m Long.jump Shot.put High.jump X400m X110m.hurdle Discus
## SEBRLE 11.04 7.58 14.83 2.07 49.81 14.69 43.75
## CLAY 10.76 7.40 14.26 1.86 49.37 14.05 50.72
## BERNARD 11.02 7.23 14.25 1.92 48.93 14.99 40.87
## YURKOV 11.34 7.09 15.19 2.10 50.42 15.31 46.26
## ZSIVOCZKY 11.13 7.30 13.48 2.01 48.62 14.17 45.67
## McMULLEN 10.83 7.31 13.76 2.13 49.91 14.38 44.41
## Pole.vault Javeline X1500m Rank Points Competition
## SEBRLE 5.02 63.19 291.7 1 8217 Decastar
## CLAY 4.92 60.15 301.5 2 8122 Decastar
## BERNARD 5.32 62.77 280.1 4 8067 Decastar
## YURKOV 4.72 63.44 276.4 5 8036 Decastar
## ZSIVOCZKY 4.42 55.37 268.0 7 8004 Decastar
## McMULLEN 4.42 56.37 285.1 8 7995 Decastar
Inspecting the Data
– Number of rows, Columns
– Variables - type, Values
# glimpse() prints the row and column counts followed by one line per
# column showing its type and first few values.
library(tibble)
glimpse(decathlon2)
## Rows: 27
## Columns: 13
## $ X100m <dbl> 11.04, 10.76, 11.02, 11.34, 11.13, 10.83, 11.64, 11.37, 1…
## $ Long.jump <dbl> 7.58, 7.40, 7.23, 7.09, 7.30, 7.31, 6.81, 7.56, 6.97, 7.2…
## $ Shot.put <dbl> 14.83, 14.26, 14.25, 15.19, 13.48, 13.76, 14.57, 14.41, 1…
## $ High.jump <dbl> 2.07, 1.86, 1.92, 2.10, 2.01, 2.13, 1.95, 1.86, 1.95, 1.9…
## $ X400m <dbl> 49.81, 49.37, 48.93, 50.42, 48.62, 49.91, 50.14, 51.10, 4…
## $ X110m.hurdle <dbl> 14.69, 14.05, 14.99, 15.31, 14.17, 14.38, 14.93, 15.06, 1…
## $ Discus <dbl> 43.75, 50.72, 40.87, 46.26, 45.67, 44.41, 47.60, 44.99, 4…
## $ Pole.vault <dbl> 5.02, 4.92, 5.32, 4.72, 4.42, 4.42, 4.92, 4.82, 4.72, 4.6…
## $ Javeline <dbl> 63.19, 60.15, 62.77, 63.44, 55.37, 56.37, 52.33, 57.19, 5…
## $ X1500m <dbl> 291.70, 301.50, 280.10, 276.40, 268.00, 285.10, 262.10, 2…
## $ Rank <int> 1, 2, 4, 5, 7, 8, 9, 10, 11, 12, 13, 1, 2, 3, 4, 5, 6, 7,…
## $ Points <int> 8217, 8122, 8067, 8036, 8004, 7995, 7802, 7733, 7708, 765…
## $ Competition <fct> Decastar, Decastar, Decastar, Decastar, Decastar, Decasta…
Randomly shuffling the column order of the dataframe (all rows are kept)
# Shuffle the column order of decathlon2 and print the result. Calling
# sample() on a data frame permutes its columns (the data frame is treated
# as a list of columns) and keeps every row; the indexing below spells
# that out. To draw n random rows instead, use
# decathlon2[sample(nrow(decathlon2), n), ].
decathlon2[sample.int(ncol(decathlon2))]
## X110m.hurdle X100m Pole.vault Points High.jump Long.jump Shot.put
## SEBRLE 14.69 11.04 5.02 8217 2.07 7.58 14.83
## CLAY 14.05 10.76 4.92 8122 1.86 7.40 14.26
## BERNARD 14.99 11.02 5.32 8067 1.92 7.23 14.25
## YURKOV 15.31 11.34 4.72 8036 2.10 7.09 15.19
## ZSIVOCZKY 14.17 11.13 4.42 8004 2.01 7.30 13.48
## McMULLEN 14.38 10.83 4.42 7995 2.13 7.31 13.76
## MARTINEAU 14.93 11.64 4.92 7802 1.95 6.81 14.57
## HERNU 15.06 11.37 4.82 7733 1.86 7.56 14.41
## BARRAS 14.48 11.33 4.72 7708 1.95 6.97 14.09
## NOOL 15.29 11.33 4.62 7651 1.98 7.27 12.68
## BOURGUIGNON 15.67 11.36 5.02 7313 1.86 6.80 13.46
## Sebrle 14.05 10.85 5.00 8893 2.12 7.84 16.36
## Clay 14.13 10.44 4.90 8820 2.06 7.96 15.23
## Karpov 13.97 10.50 4.60 8725 2.09 7.81 15.93
## Macey 14.56 10.89 4.40 8414 2.15 7.47 15.73
## Warners 14.01 10.62 4.90 8343 1.97 7.74 14.48
## Zsivoczky 14.95 10.91 4.70 8287 2.12 7.14 15.31
## Hernu 14.25 10.97 4.80 8237 2.03 7.19 14.65
## Bernard 14.17 10.69 4.40 8225 2.12 7.48 14.80
## Schwarzl 14.25 10.98 5.10 8102 1.94 7.49 14.01
## Pogorelov 14.21 10.95 5.00 8084 2.06 7.31 15.10
## Schoenbeck 14.34 10.90 5.00 8077 1.88 7.30 14.77
## Barras 14.37 11.14 4.60 8067 1.94 6.99 14.91
## KARPOV 14.09 11.02 4.92 8099 2.04 7.30 14.77
## WARNERS 14.23 11.11 4.92 8030 1.98 7.60 14.31
## Nool 14.80 10.80 5.40 8235 1.88 7.53 14.26
## Drews 14.01 10.87 5.00 7926 1.88 7.38 13.07
## Competition Javeline X1500m X400m Rank Discus
## SEBRLE Decastar 63.19 291.70 49.81 1 43.75
## CLAY Decastar 60.15 301.50 49.37 2 50.72
## BERNARD Decastar 62.77 280.10 48.93 4 40.87
## YURKOV Decastar 63.44 276.40 50.42 5 46.26
## ZSIVOCZKY Decastar 55.37 268.00 48.62 7 45.67
## McMULLEN Decastar 56.37 285.10 49.91 8 44.41
## MARTINEAU Decastar 52.33 262.10 50.14 9 47.60
## HERNU Decastar 57.19 285.10 51.10 10 44.99
## BARRAS Decastar 55.40 282.00 49.48 11 42.10
## NOOL Decastar 57.44 266.60 49.20 12 37.92
## BOURGUIGNON Decastar 54.68 291.70 51.16 13 40.49
## Sebrle OlympicG 70.52 280.01 48.36 1 48.72
## Clay OlympicG 69.71 282.00 49.19 2 50.11
## Karpov OlympicG 55.54 278.11 46.81 3 51.65
## Macey OlympicG 58.46 265.42 48.97 4 48.34
## Warners OlympicG 55.39 278.05 47.97 5 43.73
## Zsivoczky OlympicG 63.45 269.54 49.40 6 45.62
## Hernu OlympicG 57.76 264.35 48.73 7 44.72
## Bernard OlympicG 55.27 276.31 49.13 9 44.75
## Schwarzl OlympicG 56.32 273.56 49.76 10 42.43
## Pogorelov OlympicG 53.45 287.63 50.79 11 44.60
## Schoenbeck OlympicG 60.89 278.82 50.30 12 44.41
## Barras OlympicG 64.55 267.09 49.41 13 44.83
## KARPOV Decastar 50.31 300.20 48.37 3 48.95
## WARNERS Decastar 51.77 278.10 48.68 6 41.10
## Nool OlympicG 61.33 276.33 48.81 8 42.05
## Drews OlympicG 51.53 274.21 48.51 19 40.11
Summary of all the variables of the dataframe
# For each numeric column: minimum, quartiles, median, mean and maximum;
# for the factor column: the number of rows per level.
summary(decathlon2)
## X100m Long.jump Shot.put High.jump
## Min. :10.44 Min. :6.800 Min. :12.68 Min. :1.860
## 1st Qu.:10.84 1st Qu.:7.210 1st Qu.:14.17 1st Qu.:1.930
## Median :10.97 Median :7.310 Median :14.57 Median :1.980
## Mean :10.99 Mean :7.365 Mean :14.54 Mean :1.998
## 3rd Qu.:11.13 3rd Qu.:7.545 3rd Qu.:15.01 3rd Qu.:2.080
## Max. :11.64 Max. :7.960 Max. :16.36 Max. :2.150
## X400m X110m.hurdle Discus Pole.vault
## Min. :46.81 Min. :13.97 Min. :37.92 Min. :4.400
## 1st Qu.:48.70 1st Qu.:14.15 1st Qu.:42.27 1st Qu.:4.660
## Median :49.20 Median :14.34 Median :44.72 Median :4.900
## Mean :49.31 Mean :14.50 Mean :44.85 Mean :4.836
## 3rd Qu.:49.86 3rd Qu.:14.87 3rd Qu.:46.93 3rd Qu.:5.000
## Max. :51.16 Max. :15.67 Max. :51.65 Max. :5.400
## Javeline X1500m Rank Points Competition
## Min. :50.31 Min. :262.1 Min. : 1.000 Min. :7313 Decastar:13
## 1st Qu.:55.32 1st Qu.:271.6 1st Qu.: 4.000 1st Qu.:8000 OlympicG:14
## Median :57.19 Median :278.1 Median : 7.000 Median :8084
## Mean :58.32 Mean :278.5 Mean : 7.444 Mean :8119
## 3rd Qu.:62.05 3rd Qu.:283.6 3rd Qu.:10.500 3rd Qu.:8236
## Max. :70.52 Max. :301.5 Max. :19.000 Max. :8893
#https://stackoverflow.com/questions/50848273/call-many-variables-in-a-for-loop-with-dplyr-ggplot-function
# Bar chart of the relative frequency of each distinct value in one column.
#
# Args:
#   df: data frame containing the column.
#   x:  name of the column, given as a character string.
#
# Returns:
#   A ggplot object with one bar per distinct non-missing value of the
#   column; bar height is that value's share of the non-missing rows.
plotUniCat <- function(df, x) {
  col <- sym(x)

  # Drop missing values first so the proportions sum to 1 over observed rows.
  observed <- filter(df, !is.na(!!col))
  freq <- count(observed, !!col)
  freq <- mutate(freq, prop = prop.table(n))

  ggplot(freq, aes(y = prop, x = !!col)) +
    geom_bar(stat = "identity")
}
Checking the column names of the dataframe
# Print the column (variable) names of the data frame.
names(decathlon2)
## [1] "X100m" "Long.jump" "Shot.put" "High.jump" "X400m"
## [6] "X110m.hurdle" "Discus" "Pole.vault" "Javeline" "X1500m"
## [11] "Rank" "Points" "Competition"
Inspecting the structure of the dataframe
# Compact overview: object class, dimensions, and each column's type with
# its first values.
str(decathlon2)
## 'data.frame': 27 obs. of 13 variables:
## $ X100m : num 11 10.8 11 11.3 11.1 ...
## $ Long.jump : num 7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
## $ Shot.put : num 14.8 14.3 14.2 15.2 13.5 ...
## $ High.jump : num 2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
## $ X400m : num 49.8 49.4 48.9 50.4 48.6 ...
## $ X110m.hurdle: num 14.7 14.1 15 15.3 14.2 ...
## $ Discus : num 43.8 50.7 40.9 46.3 45.7 ...
## $ Pole.vault : num 5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
## $ Javeline : num 63.2 60.1 62.8 63.4 55.4 ...
## $ X1500m : num 292 302 280 276 268 ...
## $ Rank : int 1 2 4 5 7 8 9 10 11 12 ...
## $ Points : int 8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
## $ Competition : Factor w/ 2 levels "Decastar","OlympicG": 1 1 1 1 1 1 1 1 1 1 ...
Preparing the data: selecting the numeric variables for the univariate distribution plots
# Keep only the numeric (integer and double) columns for the univariate
# plots, then check the structure of the result. The factor column
# Competition is dropped, leaving 12 variables.
# select(where(...)) is used instead of the superseded select_if().
library(dplyr)
data_num <- decathlon2 %>% select(where(is.numeric))
str(data_num)
## 'data.frame': 27 obs. of 12 variables:
## $ X100m : num 11 10.8 11 11.3 11.1 ...
## $ Long.jump : num 7.58 7.4 7.23 7.09 7.3 7.31 6.81 7.56 6.97 7.27 ...
## $ Shot.put : num 14.8 14.3 14.2 15.2 13.5 ...
## $ High.jump : num 2.07 1.86 1.92 2.1 2.01 2.13 1.95 1.86 1.95 1.98 ...
## $ X400m : num 49.8 49.4 48.9 50.4 48.6 ...
## $ X110m.hurdle: num 14.7 14.1 15 15.3 14.2 ...
## $ Discus : num 43.8 50.7 40.9 46.3 45.7 ...
## $ Pole.vault : num 5.02 4.92 5.32 4.72 4.42 4.42 4.92 4.82 4.72 4.62 ...
## $ Javeline : num 63.2 60.1 62.8 63.4 55.4 ...
## $ X1500m : num 292 302 280 276 268 ...
## $ Rank : int 1 2 4 5 7 8 9 10 11 12 ...
## $ Points : int 8217 8122 8067 8036 8004 7995 7802 7733 7708 7651 ...
# Build one plotUniCat() bar chart per numeric column of decathlon2.
# The plots are only collected in the list `out`; nothing is drawn here.
variables <- names(data_num)
out <- lapply(variables, plotUniCat, df = decathlon2)
Creating histograms for the numeric columns in the dataframe
The `hist()` function
The `main` argument of `hist()` sets the main title of the plot.
#https://stackoverflow.com/questions/17963962/plot-size-and-resolution-with-r-markdown-knitr-pandoc-beamer
# Draw one histogram per numeric column on a 4 x 3 grid of panels.
# `main` is not specified, so every panel gets hist()'s default title,
# which is derived from the expression passed as `x`.
par(mfrow = c(4, 3))
for (i in colnames(data_num)) {
  hist(data_num[, i], xlab = i)
}

Creating frequency histograms for all the numeric columns in the dataframe, with a common y-axis range (0 to 20)
# Frequency histograms on a 4 x 3 grid, one per numeric column.
# The main title IS specified here: each panel is titled and x-labelled
# with its column name. All panels share the same count axis (0 to 20).
par(mfrow = c(4, 3))
for (i in colnames(data_num)) {
  hist(
    data_num[[i]],
    main = i,
    xlab = i,
    ylab = "frequency",
    ylim = c(0, 20),
    freq = TRUE
  )
}

Creating frequency histograms for all the numeric columns in the dataframe, with the default y-axis range
# Frequency histograms on a 4 x 3 grid, one per numeric column.
# The main title is specified: each panel is titled and x-labelled with
# its column name; the y-axis range is left at hist()'s default.
par(mfrow = c(4, 3))
for (i in colnames(data_num)) {
  hist(
    data_num[[i]],
    main = i,
    xlab = i,
    freq = TRUE
  )
}

Creating density plots for all the numeric columns in the dataframe
# Kernel density estimate of every numeric column, drawn on a 4 x 3 grid.
# Each panel is titled and x-labelled with its column name.
par(mfrow = c(4, 3))
for (i in colnames(data_num)) {
  plot(
    density(data_num[[i]]),
    main = i,
    xlab = i
  )
}

Bivariate Relationships and Correlation plots
# Pairwise overview of all numeric variables with psych::pairs.panels().
# NOTE(review): `col` is not a named pairs.panels() argument shown here; it
# is presumably forwarded to the underlying plotting calls - confirm
# against the psych documentation.
library(psych)
pairs.panels(
  data_num,
  col = "red"
)

# List the methods available for objects of class "factor".
# The class could also be looked up from the data instead of hard-coded:
# methods(class = class(decathlon2[["Competition"]]))
methods(class = "factor")
## [1] [ [[ [[<- [<- all.equal
## [6] as.character as.data.frame as.Date as.list as.logical
## [11] as.POSIXlt as.vector c coerce droplevels
## [16] format initialize is.na<- length<- levels<-
## [21] Math Ops plot print recode
## [26] relevel relist rep scale_type show
## [31] slotsFromS3 summary Summary type_sum xtfrm
## see '?methods' for accessing help and source code
# Level labels of the Competition factor.
levels(decathlon2[["Competition"]])
## [1] "Decastar" "OlympicG"
# Number of distinct levels of the Competition factor.
nlevels(decathlon2[["Competition"]])
## [1] 2
# Number of rows (athlete results) per Competition level.
summary(decathlon2[["Competition"]])
## Decastar OlympicG
## 13 14
Correlation Matrix with GGally
# Scatterplot matrix of the numeric variables: pairwise scatterplots,
# per-variable distributions and the printed correlation coefficients.
library(GGally)
ggpairs(
  data_num,
  title = "correlogram with ggpairs()"
)

# Compact correlation-matrix visualisation of the numeric variables.
# NOTE(review): `method` is a two-element vector; per the GGally docs the
# first element is the missing-value handling and the second the type of
# correlation coefficient - confirm against ?ggcorr.
library(GGally)
ggcorr(
  data_num,
  method = c("everything", "pearson")
)

# Source: https://www.r-graph-gallery.com/199-correlation-matrix-with-ggally.html
# Quick display of two capabilities of GGally: assessing the distribution
# and the correlation of variables.
library(GGally)
# Example data set taken from the help page; preview its first six rows.
data("flea")
head(flea, n = 6L)
## species tars1 tars2 head aede1 aede2 aede3
## 1 Concinna 191 131 53 150 15 104
## 2 Concinna 185 134 50 147 13 105
## 3 Concinna 200 137 52 144 14 102
## 4 Concinna 173 127 50 144 16 97
## 5 Concinna 171 118 49 153 13 106
## 6 Concinna 160 118 47 140 15 99
# Scatterplot matrix of flea columns 2 to 4, coloured by species.
ggpairs(
  flea,
  mapping = ggplot2::aes(colour = species),
  columns = 2:4
)

# Scatterplot matrix of the 12 numeric decathlon2 columns, coloured by
# the Competition factor.
ggpairs(
  decathlon2,
  mapping = ggplot2::aes(colour = Competition),
  columns = 1:12
)
