library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gapminder)
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ purrr 0.3.5
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
# Summary statistics for every column of the gapminder dataset
summary(gapminder)
## country continent year lifeExp
## Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
## Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
## Algeria : 12 Asia :396 Median :1980 Median :60.71
## Angola : 12 Europe :360 Mean :1980 Mean :59.47
## Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
## Australia : 12 Max. :2007 Max. :82.60
## (Other) :1632
## pop gdpPercap
## Min. :6.001e+04 Min. : 241.2
## 1st Qu.:2.794e+06 1st Qu.: 1202.1
## Median :7.024e+06 Median : 3531.8
## Mean :2.960e+07 Mean : 7215.3
## 3rd Qu.:1.959e+07 3rd Qu.: 9325.5
## Max. :1.319e+09 Max. :113523.1
##
# First six rows of the dataset
head(gapminder)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
# Last six rows of the dataset
tail(gapminder)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Zimbabwe Africa 1982 60.4 7636524 789.
## 2 Zimbabwe Africa 1987 62.4 9216418 706.
## 3 Zimbabwe Africa 1992 60.4 10704340 693.
## 4 Zimbabwe Africa 1997 46.8 11404948 792.
## 5 Zimbabwe Africa 2002 40.0 11926563 672.
## 6 Zimbabwe Africa 2007 43.5 12311143 470.
# Spread lifeExp and gdpPercap into one column per continent
# (e.g. lifeExp_Asia, gdpPercap_Europe); rows get NA in the columns
# belonging to the other continents.
gapminder %>%
  pivot_wider(
    names_from = continent,
    values_from = c(lifeExp, gdpPercap)
  )
## # A tibble: 1,704 × 13
## country year pop lifeE…¹ lifeE…² lifeE…³ lifeE…⁴ lifeE…⁵ gdpPe…⁶ gdpPe…⁷
## <fct> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Afghani… 1952 8.43e6 28.8 NA NA NA NA 779. NA
## 2 Afghani… 1957 9.24e6 30.3 NA NA NA NA 821. NA
## 3 Afghani… 1962 1.03e7 32.0 NA NA NA NA 853. NA
## 4 Afghani… 1967 1.15e7 34.0 NA NA NA NA 836. NA
## 5 Afghani… 1972 1.31e7 36.1 NA NA NA NA 740. NA
## 6 Afghani… 1977 1.49e7 38.4 NA NA NA NA 786. NA
## 7 Afghani… 1982 1.29e7 39.9 NA NA NA NA 978. NA
## 8 Afghani… 1987 1.39e7 40.8 NA NA NA NA 852. NA
## 9 Afghani… 1992 1.63e7 41.7 NA NA NA NA 649. NA
## 10 Afghani… 1997 2.22e7 41.8 NA NA NA NA 635. NA
## # … with 1,694 more rows, 3 more variables: gdpPercap_Africa <dbl>,
## # gdpPercap_Americas <dbl>, gdpPercap_Oceania <dbl>, and abbreviated variable
## # names ¹lifeExp_Asia, ²lifeExp_Europe, ³lifeExp_Africa, ⁴lifeExp_Americas,
## # ⁵lifeExp_Oceania, ⁶gdpPercap_Asia, ⁷gdpPercap_Europe
# Find the data for Canada
# Keep only the Canadian observations, then preview the first rows
canada <- gapminder %>%
  filter(country == "Canada")
head(canada)
## # A tibble: 6 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Canada Americas 1952 68.8 14785584 11367.
## 2 Canada Americas 1957 70.0 17010154 12490.
## 3 Canada Americas 1962 71.3 18985849 13462.
## 4 Canada Americas 1967 72.1 20819767 16077.
## 5 Canada Americas 1972 72.9 22284500 18971.
## 6 Canada Americas 1977 74.2 23796400 22091.
## # A tibble: 12 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Canada Americas 1952 68.8 14785584 11367.
## 2 Canada Americas 1957 70.0 17010154 12490.
## 3 Canada Americas 1962 71.3 18985849 13462.
## 4 Canada Americas 1967 72.1 20819767 16077.
## 5 Canada Americas 1972 72.9 22284500 18971.
## 6 Canada Americas 1977 74.2 23796400 22091.
## 7 Canada Americas 1982 75.8 25201900 22899.
## 8 Canada Americas 1987 76.9 26549700 26627.
## 9 Canada Americas 1992 78.0 28523502 26343.
## 10 Canada Americas 1997 78.6 30305843 28955.
## 11 Canada Americas 2002 79.8 31902268 33329.
## 12 Canada Americas 2007 80.7 33390141 36319.
## # A tibble: 12 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Canada Americas 1952 68.8 14785584 11367.
## 2 Canada Americas 1957 70.0 17010154 12490.
## 3 Canada Americas 1962 71.3 18985849 13462.
## 4 Canada Americas 1967 72.1 20819767 16077.
## 5 Canada Americas 1972 72.9 22284500 18971.
## 6 Canada Americas 1977 74.2 23796400 22091.
## 7 Canada Americas 1982 75.8 25201900 22899.
## 8 Canada Americas 1987 76.9 26549700 26627.
## 9 Canada Americas 1992 78.0 28523502 26343.
## 10 Canada Americas 1997 78.6 30305843 28955.
## 11 Canada Americas 2002 79.8 31902268 33329.
## 12 Canada Americas 2007 80.7 33390141 36319.
# `x` and `y` were used here without being defined anywhere earlier in the
# script. The recorded output below (every column repeated with a ".1"
# suffix) shows that both held the Canada subset, so define them explicitly.
x <- canada
y <- canada
# Column-bind the two copies; data.frame() makes the repeated column names
# unique by appending ".1" to the second copy.
mynewdat <- data.frame(x, y)
print(mynewdat)
## country continent year lifeExp pop gdpPercap country.1 continent.1
## 1 Canada Americas 1952 68.750 14785584 11367.16 Canada Americas
## 2 Canada Americas 1957 69.960 17010154 12489.95 Canada Americas
## 3 Canada Americas 1962 71.300 18985849 13462.49 Canada Americas
## 4 Canada Americas 1967 72.130 20819767 16076.59 Canada Americas
## 5 Canada Americas 1972 72.880 22284500 18970.57 Canada Americas
## 6 Canada Americas 1977 74.210 23796400 22090.88 Canada Americas
## 7 Canada Americas 1982 75.760 25201900 22898.79 Canada Americas
## 8 Canada Americas 1987 76.860 26549700 26626.52 Canada Americas
## 9 Canada Americas 1992 77.950 28523502 26342.88 Canada Americas
## 10 Canada Americas 1997 78.610 30305843 28954.93 Canada Americas
## 11 Canada Americas 2002 79.770 31902268 33328.97 Canada Americas
## 12 Canada Americas 2007 80.653 33390141 36319.24 Canada Americas
## year.1 lifeExp.1 pop.1 gdpPercap.1
## 1 1952 68.750 14785584 11367.16
## 2 1957 69.960 17010154 12489.95
## 3 1962 71.300 18985849 13462.49
## 4 1967 72.130 20819767 16076.59
## 5 1972 72.880 22284500 18970.57
## 6 1977 74.210 23796400 22090.88
## 7 1982 75.760 25201900 22898.79
## 8 1987 76.860 26549700 26626.52
## 9 1992 77.950 28523502 26342.88
## 10 1997 78.610 30305843 28954.93
## 11 2002 79.770 31902268 33328.97
## 12 2007 80.653 33390141 36319.24
# Preview the two columns used in the analysis (the result is only printed here)
select(mynewdat, lifeExp, gdpPercap)
## lifeExp gdpPercap
## 1 68.750 11367.16
## 2 69.960 12489.95
## 3 71.300 13462.49
## 4 72.130 16076.59
## 5 72.880 18970.57
## 6 74.210 22090.88
## 7 75.760 22898.79
## 8 76.860 26626.52
## 9 77.950 26342.88
## 10 78.610 28954.93
## 11 79.770 33328.97
## 12 80.653 36319.24
# Working dataset: life expectancy and GDP per capita for Canada
workdata <- mynewdat %>%
  select(lifeExp, gdpPercap)
workdata
## lifeExp gdpPercap
## 1 68.750 11367.16
## 2 69.960 12489.95
## 3 71.300 13462.49
## 4 72.130 16076.59
## 5 72.880 18970.57
## 6 74.210 22090.88
## 7 75.760 22898.79
## 8 76.860 26626.52
## 9 77.950 26342.88
## 10 78.610 28954.93
## 11 79.770 33328.97
## 12 80.653 36319.24
# Open the working data in the data viewer (presumably tibble::view(), attached
# via tidyverse -- TODO confirm; only useful in an interactive session)
view(workdata)
# Minimum, quartiles, median, mean and maximum of both variables
summary(workdata)
## lifeExp gdpPercap
## Min. :68.75 Min. :11367
## 1st Qu.:71.92 1st Qu.:15423
## Median :74.98 Median :22495
## Mean :74.90 Mean :22411
## 3rd Qu.:78.11 3rd Qu.:27209
## Max. :80.65 Max. :36319
# Standard deviation of each variable across the 12 Canadian observations
sd(workdata$lifeExp)
## [1] 3.952871
sd(workdata$gdpPercap)
## [1] 8210.113
# Histograms to inspect the distribution of each variable
hist(workdata$lifeExp)
hist(workdata$gdpPercap)
# From the commands above, using my specific sample from Canada and comparing the variables lifeExp (greater than 60) and gdpPercap (less than 40000):
# in this sample the minimum lifeExp is 68.75, the 1st quartile is 71.92, the median is 74.98, the mean is 74.90, the 3rd quartile is 78.11 and the maximum is 80.65;
# the minimum gdpPercap is 11367, the 1st quartile is 15423, the median is 22495, the mean is 22411, the 3rd quartile is 27209 and the maximum value is 36319.
# The standard deviation of life expectancy is 3.952871, while that of gdpPercap is 8210.113.
# Scatter plot of GDP per capita against life expectancy for Canada,
# overlaid with a fitted linear regression line
ggplot(data = workdata, mapping = aes(x = lifeExp, y = gdpPercap)) +
  geom_point(shape = 16, size = 4) +
  geom_smooth(method = "lm") +
  labs(title = "Relationship between lifeExp and gdpPercap")
## `geom_smooth()` using formula 'y ~ x'
# There appears to be a positive linear correlation between life expectancy and GDP per capita.
# Stating the alternative hypothesis:
# HA: There is a relationship between life expectancy and gdpPercap.
# Take the two variables straight from the working data instead of retyping
# the printed (rounded) values by hand, so the correlation test always uses
# the exact figures and stays in sync with `workdata`.
x <- workdata$lifeExp
x
## [1] 68.750 69.960 71.300 72.130 72.880 74.210 75.760 76.860 77.950 78.610
## [11] 79.770 80.653
y <- workdata$gdpPercap
y
## [1] 11367.16 12489.95 13462.49 16076.59 18970.57 22090.88 22898.79 26626.52
## [9] 26342.88 28954.93 33328.97 36319.24
# To run the correlation test on my sample data, I use the commands below.
# Pearson correlation coefficient of x and y, rounded to two decimals
corr.1 <- round(cor(x, y), digits = 2)
corr.1
## [1] 0.99
# Inspect the rounded coefficient in the viewer
# (interactive sessions only -- TODO confirm which view() is in scope)
view(corr.1)
# Pearson product-moment correlation test of H0: the true correlation equals 0
cor.test(x,y)
##
## Pearson's product-moment correlation
##
## data: x and y
## t = 18.712, df = 10, p-value = 4.109e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.9493136 0.9961957
## sample estimates:
## cor
## 0.9860194
# The correlation coefficient is 0.99 (0.9860194 before rounding).
## Answer
# gapminder is a dataset with six variables (columns) -- “country”, “continent”, “year”, “life expectancy”, “population” and “gdpPercap” -- and 1704 observations (rows). The data are available for one hundred and forty-two (142) countries, every five years from 1952 to 2007.
# Descriptive statistics of the gapminder dataset:
# to list the variable names: names(gapminder)
# for descriptive statistics of the dataset: summary(gapminder)
# Life expectancy against GDP per capita for all countries, keeping only
# observations with gdpPercap below 40000
gapminder %>%
  filter(gdpPercap < 40000) %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp)) +
  geom_point()
# Same scatter plot, with the points coloured by continent
gapminder %>%
  filter(gdpPercap < 40000) %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) +
  geom_point()
# Life expectancy against log(GDP per capita), coloured by continent, with
# semi-transparent points and one fitted linear trend line per continent
gapminder %>%
  filter(gdpPercap < 40000) %>%
  ggplot(mapping = aes(x = log(gdpPercap), y = lifeExp, colour = continent)) +
  geom_point(alpha = 0.45) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
# Life expectancy against GDP per capita with a fitted linear trend line,
# drawn in a separate panel for each continent
gapminder %>%
  filter(gdpPercap < 40000) %>%
  ggplot(mapping = aes(x = gdpPercap, y = lifeExp, colour = continent)) +
  geom_point(alpha = 0.45) +
  geom_smooth(method = "lm") +
  facet_wrap(~ continent)
## `geom_smooth()` using formula 'y ~ x'
# Show the dataset as a tibble (tibble is attached through library(tidyverse))
as_tibble(gapminder)
## # A tibble: 1,704 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
# tibble is already attached by library(tidyverse) near the top of the script,
# so this call changes nothing; it is kept to make the dependency explicit.
library(tibble)
# Show dataset information (dimensions, column types and the first ten rows)
as_tibble(gapminder)
## # A tibble: 1,704 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Afghanistan Asia 1952 28.8 8425333 779.
## 2 Afghanistan Asia 1957 30.3 9240934 821.
## 3 Afghanistan Asia 1962 32.0 10267083 853.
## 4 Afghanistan Asia 1967 34.0 11537966 836.
## 5 Afghanistan Asia 1972 36.1 13079460 740.
## 6 Afghanistan Asia 1977 38.4 14880372 786.
## 7 Afghanistan Asia 1982 39.9 12881816 978.
## 8 Afghanistan Asia 1987 40.8 13867957 852.
## 9 Afghanistan Asia 1992 41.7 16317921 649.
## 10 Afghanistan Asia 1997 41.8 22227415 635.
## # … with 1,694 more rows
# Keep only the continent, life expectancy and GDP per capita columns
select(gapminder, continent, lifeExp, gdpPercap)
## # A tibble: 1,704 × 3
## continent lifeExp gdpPercap
## <fct> <dbl> <dbl>
## 1 Asia 28.8 779.
## 2 Asia 30.3 821.
## 3 Asia 32.0 853.
## 4 Asia 34.0 836.
## 5 Asia 36.1 740.
## 6 Asia 38.4 786.
## 7 Asia 39.9 978.
## 8 Asia 40.8 852.
## 9 Asia 41.7 649.
## 10 Asia 41.8 635.
## # … with 1,694 more rows
#Questions for EDA
# What is the type of each variable?
# There are two major types of variables in the gapminder dataset:
# categorical (factor) variables -- country, with 142 levels of 12
# observations each, and continent, with 5 levels -- and continuous
# (numeric) variables -- year, population, life expectancy and gdpPercap --
# for 1704 observations in total. Below are the levels of the factor
# variables and the summary statistics of the continuous variables:
#country continent year lifeExp
#Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
#Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
#Algeria : 12 Asia :396 Median :1980 Median :60.71
#Angola : 12 Europe :360 Mean :1980 Mean :59.47
#Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
#Australia : 12 Max. :2007 Max. :82.60
#(Other) :1632 #pop gdpPercap
#Min. :6.001e+04 Min. : 241.2
#1st Qu.:2.794e+06 1st Qu.: 1202.1
#Median :7.024e+06 Median : 3531.8
#Mean :2.960e+07 Mean : 7215.3
#3rd Qu.:1.959e+07 3rd Qu.: 9325.5
#Max. :1.319e+09 Max. :113523.1
# In total there are six variables: country, continent, year, lifeExp, pop and gdpPercap.
# What is the range of years?
# Answer: the years range from 1952 to 2007.
# Is there data for every year over this period?
# Answer: no -- observations are recorded every five years (1952, 1957, ..., 2007), giving 12 time points per country.
# What is the average life expectancy across time and countries?
# Answer: the average life expectancy across time and these countries is 59.47.
#Description of data cleaning and transformation
# Came up with my research question: is there a relationship between life
# expectancy and gdpPercap in Canada?
# First I had to download the gapminder dataset into my R workspace.
# Installed and loaded the packages: library(dplyr), library(gapminder), library(ggplot2), library(tidyverse), library(corrplot)
# Initially ran a general plot across all the continents to see whether there
# was any relationship between life expectancy and gdpPercap (for gdpPercap
# less than 40000).
# Used the select function to select the variables I worked with, which are
# life expectancy and gdpPercap in Canada (I had to take a sample to run my
# tests, which is Canada).
# Created a new dataset from the main dataset and called it workdata.
# Ran the summary command to summarise my new data.
# Ran the standard deviation for both variables.
# Assigned the two columns (lifeExp greater than 60 and gdpPercap less than
# 40000) to new variables, x and y, in my R workspace.
# Plotted a scatter plot; the points were clustered around the straight
# line, which indicated a strong positive correlation between the two
# variables x and y.
# I also plotted a histogram for each of the two variables but could not draw
# a meaningful interpretation from them; the scatter plot gave a better
# description of the correlation.
# Description of correlation analysis (steps for visualisation, checking
# assumptions for correlation analysis, performing correlation analysis)
# REPORT: from my result
# Stating the null hypothesis, H0:
# H0: There is NO relationship between life expectancy and gdpPercap.
# Stating the alternative hypothesis, HA:
# HA: There is a relationship between life expectancy and gdpPercap.
# Since the p-value is far below 0.05, I reject the null hypothesis.
# Pearson's product-moment correlation
# data: x and y
# t = 18.712, df = 10 (the degrees of freedom, given by n - 2 where n is the number of observations), p-value = 4.109e-09 (the p-value of the t-test, which is far below the significance level alpha = 0.05)
# Therefore I reject the null hypothesis.
# alternative hypothesis: true correlation is not equal to 0
# 95 percent confidence interval: 0.9493136 0.9961957
# sample estimates: cor 0.9860194
# Conclusion
# I can conclude that there is a significant, strong, positive correlation between life expectancy and gdpPercap in Canada in this dataset (Pearson correlation test).
# From the results of the sample test, the p-value is far below the 0.05 significance level, hence I support the alternative hypothesis.
# Selected significance level: alpha = 0.05; observed p = 4.109e-09 (0.000000004109), so p < 5%.