ASSIGNMENT 7

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gapminder)
library(ggplot2)
library(tidyverse)
## ── Attaching packages
## ───────────────────────────────────────
## tidyverse 1.3.2 ──
## ✔ tibble  3.1.8     ✔ purrr   0.3.5
## ✔ tidyr   1.2.1     ✔ stringr 1.4.1
## ✔ readr   2.1.3     ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(corrplot)
## corrplot 0.92 loaded
# Per-column overview of gapminder: level counts for the factors and
# min / quartiles / mean / max for the numeric columns.
gapminder %>% summary()
##         country        continent        year         lifeExp     
##  Afghanistan:  12   Africa  :624   Min.   :1952   Min.   :23.60  
##  Albania    :  12   Americas:300   1st Qu.:1966   1st Qu.:48.20  
##  Algeria    :  12   Asia    :396   Median :1980   Median :60.71  
##  Angola     :  12   Europe  :360   Mean   :1980   Mean   :59.47  
##  Argentina  :  12   Oceania : 24   3rd Qu.:1993   3rd Qu.:70.85  
##  Australia  :  12                  Max.   :2007   Max.   :82.60  
##  (Other)    :1632                                                
##       pop              gdpPercap       
##  Min.   :6.001e+04   Min.   :   241.2  
##  1st Qu.:2.794e+06   1st Qu.:  1202.1  
##  Median :7.024e+06   Median :  3531.8  
##  Mean   :2.960e+07   Mean   :  7215.3  
##  3rd Qu.:1.959e+07   3rd Qu.:  9325.5  
##  Max.   :1.319e+09   Max.   :113523.1  
## 
# Preview the first six rows of the dataset.
gapminder %>% head()
## # A tibble: 6 × 6
##   country     continent  year lifeExp      pop gdpPercap
##   <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Afghanistan Asia       1952    28.8  8425333      779.
## 2 Afghanistan Asia       1957    30.3  9240934      821.
## 3 Afghanistan Asia       1962    32.0 10267083      853.
## 4 Afghanistan Asia       1967    34.0 11537966      836.
## 5 Afghanistan Asia       1972    36.1 13079460      740.
## 6 Afghanistan Asia       1977    38.4 14880372      786.
# Preview the last six rows of the dataset.
gapminder %>% tail()
## # A tibble: 6 × 6
##   country  continent  year lifeExp      pop gdpPercap
##   <fct>    <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Zimbabwe Africa     1982    60.4  7636524      789.
## 2 Zimbabwe Africa     1987    62.4  9216418      706.
## 3 Zimbabwe Africa     1992    60.4 10704340      693.
## 4 Zimbabwe Africa     1997    46.8 11404948      792.
## 5 Zimbabwe Africa     2002    40.0 11926563      672.
## 6 Zimbabwe Africa     2007    43.5 12311143      470.
 # Reshape to wide form: one lifeExp_* and one gdpPercap_* column per
 # continent. Each row belongs to a single continent, so the columns of the
 # other continents are NA. The result is printed only, not stored.
 gapminder %>%
   pivot_wider(
     names_from = "continent",
     values_from = c("lifeExp", "gdpPercap")
   )
## # A tibble: 1,704 × 13
##    country   year    pop lifeE…¹ lifeE…² lifeE…³ lifeE…⁴ lifeE…⁵ gdpPe…⁶ gdpPe…⁷
##    <fct>    <int>  <int>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>   <dbl>
##  1 Afghani…  1952 8.43e6    28.8      NA      NA      NA      NA    779.      NA
##  2 Afghani…  1957 9.24e6    30.3      NA      NA      NA      NA    821.      NA
##  3 Afghani…  1962 1.03e7    32.0      NA      NA      NA      NA    853.      NA
##  4 Afghani…  1967 1.15e7    34.0      NA      NA      NA      NA    836.      NA
##  5 Afghani…  1972 1.31e7    36.1      NA      NA      NA      NA    740.      NA
##  6 Afghani…  1977 1.49e7    38.4      NA      NA      NA      NA    786.      NA
##  7 Afghani…  1982 1.29e7    39.9      NA      NA      NA      NA    978.      NA
##  8 Afghani…  1987 1.39e7    40.8      NA      NA      NA      NA    852.      NA
##  9 Afghani…  1992 1.63e7    41.7      NA      NA      NA      NA    649.      NA
## 10 Afghani…  1997 2.22e7    41.8      NA      NA      NA      NA    635.      NA
## # … with 1,694 more rows, 3 more variables: gdpPercap_Africa <dbl>,
## #   gdpPercap_Americas <dbl>, gdpPercap_Oceania <dbl>, and abbreviated variable
## #   names ¹​lifeExp_Asia, ²​lifeExp_Europe, ³​lifeExp_Africa, ⁴​lifeExp_Americas,
## #   ⁵​lifeExp_Oceania, ⁶​gdpPercap_Asia, ⁷​gdpPercap_Europe

To find the data for Canada

# Keep only the rows whose country is Canada, then preview the first rows.
canada <- gapminder %>%
  filter(country == "Canada")
head(canada)
## # A tibble: 6 × 6
##   country continent  year lifeExp      pop gdpPercap
##   <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
## 1 Canada  Americas   1952    68.8 14785584    11367.
## 2 Canada  Americas   1957    70.0 17010154    12490.
## 3 Canada  Americas   1962    71.3 18985849    13462.
## 4 Canada  Americas   1967    72.1 20819767    16077.
## 5 Canada  Americas   1972    72.9 22284500    18971.
## 6 Canada  Americas   1977    74.2 23796400    22091.
## # A tibble: 12 × 6
##    country continent  year lifeExp      pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Canada  Americas   1952    68.8 14785584    11367.
##  2 Canada  Americas   1957    70.0 17010154    12490.
##  3 Canada  Americas   1962    71.3 18985849    13462.
##  4 Canada  Americas   1967    72.1 20819767    16077.
##  5 Canada  Americas   1972    72.9 22284500    18971.
##  6 Canada  Americas   1977    74.2 23796400    22091.
##  7 Canada  Americas   1982    75.8 25201900    22899.
##  8 Canada  Americas   1987    76.9 26549700    26627.
##  9 Canada  Americas   1992    78.0 28523502    26343.
## 10 Canada  Americas   1997    78.6 30305843    28955.
## 11 Canada  Americas   2002    79.8 31902268    33329.
## 12 Canada  Americas   2007    80.7 33390141    36319.
## # A tibble: 12 × 6
##    country continent  year lifeExp      pop gdpPercap
##    <fct>   <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Canada  Americas   1952    68.8 14785584    11367.
##  2 Canada  Americas   1957    70.0 17010154    12490.
##  3 Canada  Americas   1962    71.3 18985849    13462.
##  4 Canada  Americas   1967    72.1 20819767    16077.
##  5 Canada  Americas   1972    72.9 22284500    18971.
##  6 Canada  Americas   1977    74.2 23796400    22091.
##  7 Canada  Americas   1982    75.8 25201900    22899.
##  8 Canada  Americas   1987    76.9 26549700    26627.
##  9 Canada  Americas   1992    78.0 28523502    26343.
## 10 Canada  Americas   1997    78.6 30305843    28955.
## 11 Canada  Americas   2002    79.8 31902268    33329.
## 12 Canada  Americas   2007    80.7 33390141    36319.
# Working copy of the Canada subset as a plain data frame.
# It is built directly from `canada`: x and y are only created further down
# (for the correlation test), so they must not be used here, and binding the
# Canada table to itself would duplicate every column.
mynewdat <- as.data.frame(canada)
print(mynewdat)
##    country continent year lifeExp      pop gdpPercap
## 1   Canada  Americas 1952  68.750 14785584  11367.16
## 2   Canada  Americas 1957  69.960 17010154  12489.95
## 3   Canada  Americas 1962  71.300 18985849  13462.49
## 4   Canada  Americas 1967  72.130 20819767  16076.59
## 5   Canada  Americas 1972  72.880 22284500  18970.57
## 6   Canada  Americas 1977  74.210 23796400  22090.88
## 7   Canada  Americas 1982  75.760 25201900  22898.79
## 8   Canada  Americas 1987  76.860 26549700  26626.52
## 9   Canada  Americas 1992  77.950 28523502  26342.88
## 10  Canada  Americas 1997  78.610 30305843  28954.93
## 11  Canada  Americas 2002  79.770 31902268  33328.97
## 12  Canada  Americas 2007  80.653 33390141  36319.24
# Keep only the two variables under study (life expectancy and GDP per
# capita). The selection is stored once and printed once.
workdata <- select(mynewdat, lifeExp, gdpPercap)
workdata
##    lifeExp gdpPercap
## 1   68.750  11367.16
## 2   69.960  12489.95
## 3   71.300  13462.49
## 4   72.130  16076.59
## 5   72.880  18970.57
## 6   74.210  22090.88
## 7   75.760  22898.79
## 8   76.860  26626.52
## 9   77.950  26342.88
## 10  78.610  28954.93
## 11  79.770  33328.97
## 12  80.653  36319.24
# Open the working data in the data viewer.
# NOTE(review): only meaningful in an interactive session — confirm it is
# harmless when the report is knitted.
view(workdata)
# Min, quartiles, mean and max of lifeExp and gdpPercap in workdata.
summary(workdata) 
##     lifeExp        gdpPercap    
##  Min.   :68.75   Min.   :11367  
##  1st Qu.:71.92   1st Qu.:15423  
##  Median :74.98   Median :22495  
##  Mean   :74.90   Mean   :22411  
##  3rd Qu.:78.11   3rd Qu.:27209  
##  Max.   :80.65   Max.   :36319
# Standard deviation of each of the two variables in workdata.
sd(workdata[["lifeExp"]])
## [1] 3.952871
sd(workdata[["gdpPercap"]])
## [1] 8210.113
# Histogram of life expectancy in workdata (the 12 Canada observations).
hist(workdata$lifeExp)

# Histogram of GDP per capita for the same observations.
hist(workdata$gdpPercap)

#From the commands above, using my specific sample from Canada and comparing the two variables: every lifeExp value is greater than 60 and every gdpPercap value is less than 40000. #In this sample the minimum lifeExp is 68.75, the 1st quartile is 71.92, the median is 74.98, the mean is 74.90, the 3rd quartile is 78.11 and the maximum is 80.65. #In this sample the minimum gdpPercap is 11367, the 1st quartile is 15423, the median is 22495, the mean is 22411, the 3rd quartile is 27209 and the maximum value is 36319. #The standard deviation of life expectancy is 3.952871, while that of gdpPercap is 8210.113.

# Scatter plot of the two variables in workdata with a fitted straight line
# (linear model) and its confidence band.
ggplot(data = workdata, mapping = aes(x = lifeExp, y = gdpPercap)) +
  geom_point(shape = 16, size = 4) +
  geom_smooth(method = "lm") +
  ggtitle("Relationship between lifeExp and gdpPercap")
## `geom_smooth()` using formula 'y ~ x'

#It looks like there is a positive linear correlation between the variables life expectancy and GDP per capita (gdpPercap).

Stating the null hypothesis (H0) below:

H0 - There is NO relationship between life expectancy and gdpPercap.

#Stating the alternative hypothesis: # HA - There is a relationship between life expectancy and gdpPercap.

# Variables for the correlation test, taken straight from the Canada subset
# instead of being retyped by hand, so they cannot drift from the data and
# keep its full precision.
# x: life expectancy, y: GDP per capita (one value per observed year).
x <- canada$lifeExp
x
##  [1] 68.750 69.960 71.300 72.130 72.880 74.210 75.760 76.860 77.950 78.610
## [11] 79.770 80.653
y <- canada$gdpPercap
y
##  [1] 11367.16 12489.95 13462.49 16076.59 18970.57 22090.88 22898.79 26626.52
##  [9] 26342.88 28954.93 33328.97 36319.24

#To check the correlation test with my sample data i would run the argument below

# Pearson correlation between x and y, rounded to two decimals for reporting.
corr.1 <- round(cor(x, y, method = "pearson"), digits = 2)
corr.1
## [1] 0.99
view(corr.1)
# Significance test of the same correlation (H0: the true correlation is 0).
cor.test(x, y, method = "pearson")
## 
##  Pearson's product-moment correlation
## 
## data:  x and y
## t = 18.712, df = 10, p-value = 4.109e-09
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.9493136 0.9961957
## sample estimates:
##       cor 
## 0.9860194
#cor value is 0.99

##Answer #gapminder is a dataset that has six variables (columns) — “country”, “continent”, “year”, “life expectancy”, “population” and “gdpPercapita” — and 1704 observations (rows). The data are available for one hundred and forty-two (142) countries, every five years from 1952 to 2007. #Descriptive statistics of the gapminder dataset

#To list the variable names: names(gapminder) #For descriptive statistics of the dataset: summary(gapminder)

# Life expectancy against GDP per capita for every observation with
# gdpPercap below 40000.
ggplot(
  filter(gapminder, gdpPercap < 40000),
  aes(x = gdpPercap, y = lifeExp)
) +
  geom_point()

# Same scatter, with the points coloured by continent.
ggplot(
  filter(gapminder, gdpPercap < 40000),
  aes(x = gdpPercap, y = lifeExp, colour = continent)
) +
  geom_point()

# Log of GDP per capita on the x axis, semi-transparent points and a
# linear-model fit line.
ggplot(
  filter(gapminder, gdpPercap < 40000),
  aes(x = log(gdpPercap), y = lifeExp, colour = continent)
) +
  geom_point(alpha = 0.45) +
  geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'

# Untransformed GDP per capita again, split into one panel per continent.
ggplot(
  filter(gapminder, gdpPercap < 40000),
  aes(x = gdpPercap, y = lifeExp, colour = continent)
) +
  geom_point(alpha = 0.45) +
  geom_smooth(method = "lm") +
  facet_wrap(~ continent)
## `geom_smooth()` using formula 'y ~ x'

# Load the tibble package before it is used. library(tidyverse) above already
# attaches it; the explicit call keeps this section runnable on its own.
library(tibble)
# Show dataset information. head(gapminder) above already prints as a tibble,
# so as_tibble() is only used here to print the compact overview once.
as_tibble(gapminder)
## # A tibble: 1,704 × 6
##    country     continent  year lifeExp      pop gdpPercap
##    <fct>       <fct>     <int>   <dbl>    <int>     <dbl>
##  1 Afghanistan Asia       1952    28.8  8425333      779.
##  2 Afghanistan Asia       1957    30.3  9240934      821.
##  3 Afghanistan Asia       1962    32.0 10267083      853.
##  4 Afghanistan Asia       1967    34.0 11537966      836.
##  5 Afghanistan Asia       1972    36.1 13079460      740.
##  6 Afghanistan Asia       1977    38.4 14880372      786.
##  7 Afghanistan Asia       1982    39.9 12881816      978.
##  8 Afghanistan Asia       1987    40.8 13867957      852.
##  9 Afghanistan Asia       1992    41.7 16317921      649.
## 10 Afghanistan Asia       1997    41.8 22227415      635.
## # … with 1,694 more rows
# Narrow gapminder to the continent, life-expectancy and GDP-per-capita
# columns (printed only, not stored).
select(gapminder, continent, lifeExp, gdpPercap)
## # A tibble: 1,704 × 3
##    continent lifeExp gdpPercap
##    <fct>       <dbl>     <dbl>
##  1 Asia         28.8      779.
##  2 Asia         30.3      821.
##  3 Asia         32.0      853.
##  4 Asia         34.0      836.
##  5 Asia         36.1      740.
##  6 Asia         38.4      786.
##  7 Asia         39.9      978.
##  8 Asia         40.8      852.
##  9 Asia         41.7      649.
## 10 Asia         41.8      635.
## # … with 1,694 more rows

#Questions for EDA
#What is the type of each variable? #There are two major types of variables in the gapminder dataset. The categorical (factor) variables are country, with 142 levels of 12 observations each, and continent, with 5 levels; together they cover 1704 observations in total. The continuous (numeric) variables are year, population, life expectancy and gdpPercap. Below are the levels of the factor variables and the summary statistics of the numeric variables:
#country continent year lifeExp
#Afghanistan: 12 Africa :624 Min. :1952 Min. :23.60
#Albania : 12 Americas:300 1st Qu.:1966 1st Qu.:48.20
#Algeria : 12 Asia :396 Median :1980 Median :60.71
#Angola : 12 Europe :360 Mean :1980 Mean :59.47
#Argentina : 12 Oceania : 24 3rd Qu.:1993 3rd Qu.:70.85
#Australia : 12 Max. :2007 Max. :82.60
#(Other) :1632 #pop gdpPercap
#Min. :6.001e+04 Min. : 241.2
#1st Qu.:2.794e+06 1st Qu.: 1202.1
#Median :7.024e+06 Median : 3531.8
#Mean :2.960e+07 Mean : 7215.3
#3rd Qu.:1.959e+07 3rd Qu.: 9325.5
#Max. :1.319e+09 Max. :113523.1
#In total there are six variables: country, continent, year, lifeExp (life expectancy), pop (population) and gdpPercap (GDP per capita)

#What is the range of years? #Answer: the years range from 1952 to 2007. #Is there data for every year over this period? #Answer: no — the data are recorded every five years (1952, 1957, ..., 2007), which gives 12 observations per country. #What is the average life expectancy across time and countries? #Answer: the average life expectancy across time and countries is 59.47.

#Description of data cleaning and transformation
#I came up with my research question: is there a relationship between life expectancy and gdpPercap in Canada? #First I had to load the gapminder dataset into my R workspace. #I installed the required packages and loaded them with library(dplyr), library(gapminder), library(ggplot2), library(tidyverse) and library(corrplot). #I initially ran a general plot on all the continents to see whether there was any relationship between life expectancy and gdpPercap for gdpPercap values less than 40000. #I filtered the data to Canada (I needed a sample to run my tests on) and used the select function to keep the variables I worked with, which are life expectancy and gdpPercap. #I created a new dataset from the main dataset and called it workdata. #I ran the summary command to describe my new data. #I computed the standard deviation of both variables. #I assigned the two columns, lifeExp (all values greater than 60) and gdpPercap (all values less than 40000), to the variables x and y in my R workspace. #I plotted a scatter plot; in the graph the points were clustered around the straight line, which indicates a strong positive correlation between the two variables x and y. #I also plotted a histogram for each of the two variables but could not get a meaningful interpretation from them; the scatter plot gave a better description of the correlation. #Description of correlation analysis (steps for visualisation, checking assumptions for correlation analysis, performing correlation analysis)

#REPORT: from my result # Stating the null hypothesis (H0) below: # H0 - There is NO relationship between life expectancy and gdpPercap. #Stating the alternative hypothesis: # HA - There is a relationship between life expectancy and gdpPercap. #Since the p-value is far below 0.05, I choose to reject the null hypothesis. #Pearson’s product-moment correlation #data: x and y #t = 18.712, df = 10 (the degrees of freedom, given by the formula n - 2, where n is the number of observations), p-value = 4.109e-09 (the p-value of the t-test, which is far below the significance level alpha = 0.05) #Therefore I choose to reject the null hypothesis. #alternative hypothesis: true correlation is not equal to 0 #95 percent confidence interval: 0.9493136 0.9961957 #sample estimates: cor 0.9860194

#Conclusion #I can conclude that there is a significant, strong, positive correlation between # life expectancy and gdpPercap in Canada in this dataset (Pearson correlation test). #From the results obtained from the sample test, the p-value is far below 0.05, which is the significance level; hence I support the alternative hypothesis. #Significance level alpha = 0.05; observed p-value = 4.109e-09 (0.000000004109), so p < 5%.