Download the script for this session at the link below (Click Link –> Right Click on Page –> Save Page As):
R can be used for basic arithmetic and it can also be used as a calculator. We start the session with doing basic computations.
1+3
## [1] 4
1+(4*2)
## [1] 9
1+(4^2) ##^: exponent
## [1] 17
Trigonometric functions
sin(30)
## [1] -0.9880316
log10 (10)
## [1] 1
log10 (20)
## [1] 1.30103
Comparisons
2 == 2
## [1] TRUE
2 == 4
## [1] FALSE
1 < 2
## [1] TRUE
R can also be used for more complex computations which involve statistical tests. These are used to analyse our data after cleaning and visualising our data. In this section, we will continue to work with the penguins data from the data visualisation section. These data have already been cleaned so we will go straight into analysis. load the processed data
#### Extra packages
install.packages('broom')
install.packages('stargazer')
install.packages('effects')
#### Load libraries
library(readxl)
library(tidyverse)
library(janitor)
library(dplyr)
library(palmerpenguins)
library(broom)
library(stargazer)
library(effects)
library(ggplot2)
#### Load data ----
<- palmerpenguins::penguins
penguins str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
## $ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
## $ year : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
View(penguins)
#### Selecting and filtering data
<- select (penguins, species, island, bill_length_mm, bill_depth_mm)
penguins_bill <- select (penguins, species, island, flipper_length_mm)
penguins_flipper <- select(penguins, -flipper_length_mm, -bill_length_mm, -bill_depth_mm)
penguins_bodymass <- filter(penguins, year == 2009)
penguins_09 <- filter(penguins, year < 2008)
penguins2
#### say for instance you want the penguin bill information for 2007 and 2008 only
##### option 1
<- select (penguins, species, island, bill_length_mm, bill_depth_mm, year)
penguins_bill2 <- filter(penguins_bill2, year < 2009)
penguins_bill2 penguins_bill2
## # A tibble: 224 × 5
## species island bill_length_mm bill_depth_mm year
## <fct> <fct> <dbl> <dbl> <int>
## 1 Adelie Torgersen 39.1 18.7 2007
## 2 Adelie Torgersen 39.5 17.4 2007
## 3 Adelie Torgersen 40.3 18 2007
## 4 Adelie Torgersen NA NA 2007
## 5 Adelie Torgersen 36.7 19.3 2007
## 6 Adelie Torgersen 39.3 20.6 2007
## 7 Adelie Torgersen 38.9 17.8 2007
## 8 Adelie Torgersen 39.2 19.6 2007
## 9 Adelie Torgersen 34.1 18.1 2007
## 10 Adelie Torgersen 42 20.2 2007
## # … with 214 more rows
<- na.omit(penguins_bill2)
penguins_bill2 penguins_bill2
## # A tibble: 223 × 5
## species island bill_length_mm bill_depth_mm year
## <fct> <fct> <dbl> <dbl> <int>
## 1 Adelie Torgersen 39.1 18.7 2007
## 2 Adelie Torgersen 39.5 17.4 2007
## 3 Adelie Torgersen 40.3 18 2007
## 4 Adelie Torgersen 36.7 19.3 2007
## 5 Adelie Torgersen 39.3 20.6 2007
## 6 Adelie Torgersen 38.9 17.8 2007
## 7 Adelie Torgersen 39.2 19.6 2007
## 8 Adelie Torgersen 34.1 18.1 2007
## 9 Adelie Torgersen 42 20.2 2007
## 10 Adelie Torgersen 37.8 17.1 2007
## # … with 213 more rows
##### option 2
<- select(filter(penguins, year < 2009), species, island, bill_length_mm, bill_depth_mm, year)
penguins_bill2_2 <- na.omit(penguins_bill2_2)
penguins_bill2_2 penguins_bill2_2
## # A tibble: 223 × 5
## species island bill_length_mm bill_depth_mm year
## <fct> <fct> <dbl> <dbl> <int>
## 1 Adelie Torgersen 39.1 18.7 2007
## 2 Adelie Torgersen 39.5 17.4 2007
## 3 Adelie Torgersen 40.3 18 2007
## 4 Adelie Torgersen 36.7 19.3 2007
## 5 Adelie Torgersen 39.3 20.6 2007
## 6 Adelie Torgersen 38.9 17.8 2007
## 7 Adelie Torgersen 39.2 19.6 2007
## 8 Adelie Torgersen 34.1 18.1 2007
## 9 Adelie Torgersen 42 20.2 2007
## 10 Adelie Torgersen 37.8 17.1 2007
## # … with 213 more rows
##### option 3
## you can use the pipe function
<-penguins %>%
Penguins_billfilter(year < 2009) %>%
select(species, island, bill_length_mm, bill_depth_mm, year) %>%
na.omit()
Penguins_bill
## # A tibble: 223 × 5
## species island bill_length_mm bill_depth_mm year
## <fct> <fct> <dbl> <dbl> <int>
## 1 Adelie Torgersen 39.1 18.7 2007
## 2 Adelie Torgersen 39.5 17.4 2007
## 3 Adelie Torgersen 40.3 18 2007
## 4 Adelie Torgersen 36.7 19.3 2007
## 5 Adelie Torgersen 39.3 20.6 2007
## 6 Adelie Torgersen 38.9 17.8 2007
## 7 Adelie Torgersen 39.2 19.6 2007
## 8 Adelie Torgersen 34.1 18.1 2007
## 9 Adelie Torgersen 42 20.2 2007
## 10 Adelie Torgersen 37.8 17.1 2007
## # … with 213 more rows
##### mutate
<-Penguins_bill %>% mutate(bill_length_m = bill_length_mm/1000) #added a new column with new units for bill_length
Penguins_bill Penguins_bill
## # A tibble: 223 × 6
## species island bill_length_mm bill_depth_mm year bill_length_m
## <fct> <fct> <dbl> <dbl> <int> <dbl>
## 1 Adelie Torgersen 39.1 18.7 2007 0.0391
## 2 Adelie Torgersen 39.5 17.4 2007 0.0395
## 3 Adelie Torgersen 40.3 18 2007 0.0403
## 4 Adelie Torgersen 36.7 19.3 2007 0.0367
## 5 Adelie Torgersen 39.3 20.6 2007 0.0393
## 6 Adelie Torgersen 38.9 17.8 2007 0.0389
## 7 Adelie Torgersen 39.2 19.6 2007 0.0392
## 8 Adelie Torgersen 34.1 18.1 2007 0.0341
## 9 Adelie Torgersen 42 20.2 2007 0.042
## 10 Adelie Torgersen 37.8 17.1 2007 0.0378
## # … with 213 more rows
<-Penguins_bill %>% mutate(bill_depth_m = bill_depth_mm/1000)
Penguins_bill
##### group_by
<-Penguins_bill %>%
Penguins_bill_meangroup_by(species, year) %>%
summarise(mean_bill_length_mm = mean(bill_length_mm))
Penguins_bill_mean
## # A tibble: 6 × 3
## # Groups: species [3]
## species year mean_bill_length_mm
## <fct> <int> <dbl>
## 1 Adelie 2007 38.8
## 2 Adelie 2008 38.6
## 3 Chinstrap 2007 48.7
## 4 Chinstrap 2008 48.7
## 5 Gentoo 2007 47.0
## 6 Gentoo 2008 46.9
<-Penguins_bill %>%
Penguins_bill_meangroup_by(species, year) %>%
summarise(mean_bill_length_mm = mean(bill_length_mm)) %>%
arrange(desc(year))
Penguins_bill_mean
## # A tibble: 6 × 3
## # Groups: species [3]
## species year mean_bill_length_mm
## <fct> <int> <dbl>
## 1 Adelie 2008 38.6
## 2 Chinstrap 2008 48.7
## 3 Gentoo 2008 46.9
## 4 Adelie 2007 38.8
## 5 Chinstrap 2007 48.7
## 6 Gentoo 2007 47.0
<-Penguins_bill %>%
Penguins_bill_meangroup_by(species, year) %>%
summarise(mean_bill_length_mm = mean(bill_length_mm))
Penguins_bill_mean
## # A tibble: 6 × 3
## # Groups: species [3]
## species year mean_bill_length_mm
## <fct> <int> <dbl>
## 1 Adelie 2007 38.8
## 2 Adelie 2008 38.6
## 3 Chinstrap 2007 48.7
## 4 Chinstrap 2008 48.7
## 5 Gentoo 2007 47.0
## 6 Gentoo 2008 46.9
=0.05
alpha<- Penguins_bill %>%
Penguins_bill_Stats group_by(species, island, year) %>%
summarise(
n=n(),
mean_bill_length_mm=mean(na.omit(bill_length_mm)),
bill_length_sd=sd(na.omit(bill_length_mm)),
mean_bill_depth_mm=mean(na.omit(bill_length_mm)),
bill_depth_sd=sd(na.omit(bill_length_mm)))
Penguins_bill_Stats
## # A tibble: 10 × 8
## # Groups: species, island [5]
## species island year n mean_bill_lengt… bill_length_sd mean_bill_depth…
## <fct> <fct> <int> <int> <dbl> <dbl> <dbl>
## 1 Adelie Biscoe 2007 10 38.3 1.85 38.3
## 2 Adelie Biscoe 2008 18 38.7 2.59 38.7
## 3 Adelie Dream 2007 20 39.1 2.32 39.1
## 4 Adelie Dream 2008 16 38.2 2.79 38.2
## 5 Adelie Torge… 2007 19 38.8 2.93 38.8
## 6 Adelie Torge… 2008 16 38.8 3.65 38.8
## 7 Chinstrap Dream 2007 26 48.7 3.47 48.7
## 8 Chinstrap Dream 2008 18 48.7 3.62 48.7
## 9 Gentoo Biscoe 2007 34 47.0 3.27 47.0
## 10 Gentoo Biscoe 2008 46 46.9 2.64 46.9
## # … with 1 more variable: bill_depth_sd <dbl>
##### Plot stats
names(Penguins_bill_Stats)
## [1] "species" "island" "year"
## [4] "n" "mean_bill_length_mm" "bill_length_sd"
## [7] "mean_bill_depth_mm" "bill_depth_sd"
<-ggplot(Penguins_bill_Stats, aes(x = species, y = mean_bill_length_mm))+
StatsPlotgeom_line(size = 1, colour = "red")+
geom_point(shape = 19, size = 1, colour = "red")+
geom_errorbar(aes(ymin=mean_bill_length_mm-bill_length_sd, ymax=mean_bill_length_mm+bill_length_sd),width=.5, colour ="black", linetype = "solid")
<-ggplot(Penguins_bill_Stats, aes(x = species, y = mean_bill_length_mm))+
StatsPlotgeom_line(size = 1, colour = "red")+
geom_point(shape = 19, size = 1, colour = "red")+
geom_errorbar(aes(ymin=mean_bill_length_mm-bill_length_sd, ymax=mean_bill_length_mm+bill_length_sd),width=.5, colour ="black", linetype = "solid")+
theme_classic()+
xlab("Species_Names")+
ylab("Bill_length (mm)")
<-ggplot(Penguins_bill_Stats, aes(x = species, y = mean_bill_length_mm))+
StatsPlotgeom_line(size = 1, colour = "red")+
geom_point(shape = 19, size = 1, colour = "red")+
geom_errorbar(aes(ymin=mean_bill_length_mm-bill_length_sd, ymax=mean_bill_length_mm+bill_length_sd),width=.5, colour ="black", linetype = "solid")+
theme_classic()+
xlab("Species_Names")+
ylab("Bill_length (mm)")+facet_grid(rows=vars(year))
<-ggplot(Penguins_bill_Stats, aes(x = species, y = mean_bill_length_mm))+
StatsPlotgeom_line(size = 8, colour = "red")+
geom_point(shape = 19, size = 4, colour = "red")+
geom_errorbar(aes(ymin=mean_bill_length_mm-bill_length_sd, ymax=mean_bill_length_mm+bill_length_sd),width=.1, colour ="black", linetype = "solid")+
theme_classic()+
xlab("Species names")+
ylab("Bill length (mm)")+facet_grid(rows=vars(year, island))
StatsPlot
<-ggplot(Penguins_bill_Stats, aes(x = species, y = mean_bill_length_mm, col=island))+
StatsPlot2geom_line(size = 8)+
geom_point(shape = 19, size = 4)+
geom_errorbar(aes(ymin=mean_bill_length_mm-bill_length_sd, ymax=mean_bill_length_mm+bill_length_sd),width=.1, colour ="black", linetype = "solid")+
theme_classic()+
xlab("Species names")+
ylab("Bill length (mm)")+facet_grid(rows=vars(year))
StatsPlot2
#### Categorical linear model ----
<- lm(bill_length_mm ~ island, data = penguins)
lm
summary(lm)
##
## Call:
## lm(formula = bill_length_mm ~ island, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.0677 -3.8559 0.2958 3.8175 14.3425
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.2575 0.3897 116.127 < 2e-16 ***
## islandDream -1.0897 0.5970 -1.825 0.0688 .
## islandTorgersen -6.3065 0.8057 -7.827 6.44e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.036 on 339 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.154, Adjusted R-squared: 0.149
## F-statistic: 30.86 on 2 and 339 DF, p-value: 4.86e-13
#### Analysis of Variance ----
<- aov(bill_length_mm ~ island, data = penguins)
aov aov
## Call:
## aov(formula = bill_length_mm ~ island, data = penguins)
##
## Terms:
## island Residuals
## Sum of Squares 1565.599 8598.607
## Deg. of Freedom 2 339
##
## Residual standard error: 5.03633
## Estimated effects may be unbalanced
## 2 observations deleted due to missingness
summary(aov)
## Df Sum Sq Mean Sq F value Pr(>F)
## island 2 1566 782.8 30.86 4.86e-13 ***
## Residuals 339 8599 25.4
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 2 observations deleted due to missingness
##### Post-hoc test - if there is a significant difference
<- TukeyHSD(aov)
Tukey.aov Tukey.aov
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = bill_length_mm ~ island, data = penguins)
##
## $island
## diff lwr upr p adj
## Dream-Biscoe -1.089743 -2.495170 0.3156833 0.1628734
## Torgersen-Biscoe -6.306505 -8.203279 -4.4097305 0.0000000
## Torgersen-Dream -5.216762 -7.188974 -3.2445487 0.0000000
#### Continuous linear model ----
<- lm(bill_length_mm ~ body_mass_g, data = penguins)
lm2
summary(lm2)
##
## Call:
## lm(formula = bill_length_mm ~ body_mass_g, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.1251 -3.0434 -0.8089 2.0711 16.1109
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.690e+01 1.269e+00 21.19 <2e-16 ***
## body_mass_g 4.051e-03 2.967e-04 13.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.394 on 340 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.3542, Adjusted R-squared: 0.3523
## F-statistic: 186.4 on 1 and 340 DF, p-value: < 2.2e-16
#### Categorical and Continuous model ----
<- lm(bill_length_mm ~ body_mass_g + island, data = penguins)
lm3
summary(lm3)
##
## Call:
## lm(formula = bill_length_mm ~ body_mass_g + island, data = penguins)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.4676 -2.3806 0.0498 2.4179 13.8966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 21.7428965 1.6468756 13.203 < 2e-16 ***
## body_mass_g 0.0049861 0.0003431 14.532 < 2e-16 ***
## islandDream 3.9118982 0.5818015 6.724 7.56e-11 ***
## islandTorgersen -1.2723006 0.7216507 -1.763 0.0788 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.957 on 338 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.4793, Adjusted R-squared: 0.4747
## F-statistic: 103.7 on 3 and 338 DF, p-value: < 2.2e-16
#### Summarise results ----
<- tidy(lm)
lm_coefficients <- tidy(lm2)
lm2_coefficients <- tidy(lm3) lm3_coefficients
#### Plot effects ----
<- allEffects(lm)
lm_effects <- allEffects(lm2)
lm2_effects <- allEffects(lm3)
lm3_effects <- allEffects(aov)
aov_effects
plot(lm_effects)
plot(lm2_effects)
plot(lm3_effects)
plot(aov_effects)
#### Stargazer summary table ----
# export as a text; copy and paste
::stargazer(lm, lm2, lm3, type = 'text')### you can do this for the aov too stargazer
##
## =============================================================================================
## Dependent variable:
## -------------------------------------------------------------------------
## bill_length_mm
## (1) (2) (3)
## ---------------------------------------------------------------------------------------------
## islandDream -1.090* 3.912***
## (0.597) (0.582)
##
## islandTorgersen -6.307*** -1.272*
## (0.806) (0.722)
##
## body_mass_g 0.004*** 0.005***
## (0.0003) (0.0003)
##
## Constant 45.257*** 26.899*** 21.743***
## (0.390) (1.269) (1.647)
##
## ---------------------------------------------------------------------------------------------
## Observations 342 342 342
## R2 0.154 0.354 0.479
## Adjusted R2 0.149 0.352 0.475
## Residual Std. Error 5.036 (df = 339) 4.394 (df = 340) 3.957 (df = 338)
## F Statistic 30.862*** (df = 2; 339) 186.443*** (df = 1; 340) 103.720*** (df = 3; 338)
## =============================================================================================
## Note: *p<0.1; **p<0.05; ***p<0.01