Data handling

Steps in data handling

  • Save the raw data - never modify it manually
  • Reshape the data (use a script file…)
  • Clean the data (data errors, outliers, missing values, imputations, etc.)
  • Save the cleaned data
  • Perform the exploratory analysis
  • Conduct the final analysis
  • Save outputs (figures, tables, documents)
  • Reproduce the analysis

Cleaning the data

  • Missing values - do not underestimate
    • Code missing values explicitly (never use -1, 999, or .)
    • Check all variables for missing values
    • Check 0’s and missing values
    • Treat missing values explicitly
    • Interpolate if necessary
  • Check basic statistics for all variables (min, max, mean, median, etc)
    • Use univariate plots (density plots, bar plots for categorical variables)
    • Check for outliers
    • Check for skewed distributions (use log transformation, if necessary)
    • Use bivariate plots (and comparisons (scatter plots…)
      • Check logical inconsistency (age less than tenure?)

Working directory

  • Organize your project in a neat directory structure
  • Create a directory structure as follows _root.directory__ ./data -/figures
  • Set the working directory
getwd()                       # List the working directory
setwd("/directoryAddress")    # Set the working directory
list.files()                        # List files in the working directory

Data structures

Data types

  • Vectors: A single column of data (even a column of length 1)
  • Matrices: Two-dimensiona data (all rows and columns of all the same type)
  • Arrays: Multi-dimensional data (all rows and columns of all the same type)
  • Dataframes: Similar to matrics, but the columns can have different data types
    • Dataframes are the most common data structure in R
  • Datatables: Similar to dataframes but uses a different syntax
  • Lists: are arbitrary combinations of disparate object types in R

Indexing in dataframes

  • Rows, columns, and cells in a dataframe is accessed through indexing
df <- data.frame(a = c(1,2,3,4,5), b=c(4, 3, 6, 1, 1), c=c("a", "a", "b", "c", "c"))
# A column
df[, 3]
## [1] a a b c c
## Levels: a b c
df[,"c"]
## [1] a a b c c
## Levels: a b c
df$c
## [1] a a b c c
## Levels: a b c
# Single cell
df[2,3]
## [1] a
## Levels: a b c
df$c[2]
## [1] a
## Levels: a b c
# A row
df[2,]
##   a b c
## 2 2 3 a
# A range
df[3:5, 2:3]
##   b c
## 3 6 b
## 4 1 c
## 5 1 c
# By name
df[3:4, "c"]
## [1] b c
## Levels: a b c
# Combining two dataframes
dc <- data.frame(d = c(10,20,30,40,50), e=c("a1", "a2", "b3", "d3", "e2"))
dr <- c(100,200,300,400, 500)
# Column append
df
##   a b c
## 1 1 4 a
## 2 2 3 a
## 3 3 6 b
## 4 4 1 c
## 5 5 1 c
df <- cbind(df, dc)
df
##   a b c  d  e
## 1 1 4 a 10 a1
## 2 2 3 a 20 a2
## 3 3 6 b 30 b3
## 4 4 1 c 40 d3
## 5 5 1 c 50 e2
# Row append
df <- rbind(df, dr)
## Warning in `[<-.factor`(`*tmp*`, ri, value = 300): invalid factor level, NA
## generated

## Warning in `[<-.factor`(`*tmp*`, ri, value = 300): invalid factor level, NA
## generated
df
##     a   b    c   d    e
## 1   1   4    a  10   a1
## 2   2   3    a  20   a2
## 3   3   6    b  30   b3
## 4   4   1    c  40   d3
## 5   5   1    c  50   e2
## 6 100 200 <NA> 400 <NA>
# Notice the missing values!!

Data entry

Manual data entry

dt <- data.frame(a = c(1:10), b = rnorm(10, 0, 0.5), c = c(1:2))
dt$d <- dt$a + dt$b + 2*dt$c
dt <- within(dt, e <- a + b + 2*c)
dt
##     a           b c         d         e
## 1   1  0.14893311 1  3.148933  3.148933
## 2   2  0.74409573 2  6.744096  6.744096
## 3   3 -0.50156182 1  4.498438  4.498438
## 4   4  0.09676564 2  8.096766  8.096766
## 5   5 -0.07089712 1  6.929103  6.929103
## 6   6  0.01252595 2 10.012526 10.012526
## 7   7 -0.34263153 1  8.657368  8.657368
## 8   8 -0.70059708 2 11.299403 11.299403
## 9   9 -0.39497329 1 10.605027 10.605027
## 10 10 -0.25895121 2 13.741049 13.741049
a <- c(1, 2, 4, 2, 6, 3, 5, 8, 1, 10)
b <- rep(c(2:3), times=5)
c <- rep(c(2:3), each=5)
d <- rep(c(2:3), length.out=10)
e <- seq(0, 180, length.out = 10)
f <- seq(0, 45, by=5)
dd <- data.frame(a, b, c, d, e, f)
dd
##     a b c d   e  f
## 1   1 2 2 2   0  0
## 2   2 3 2 3  20  5
## 3   4 2 2 2  40 10
## 4   2 3 2 3  60 15
## 5   6 2 2 2  80 20
## 6   3 3 3 3 100 25
## 7   5 2 3 2 120 30
## 8   8 3 3 3 140 35
## 9   1 2 3 2 160 40
## 10 10 3 3 3 180 45

Empty data.frame

dt <- data.frame(a = character(), b = integer(), c = double())
dt
## [1] a b c
## <0 rows> (or 0-length row.names)
summary(dt)
##     a          b             c      
##  NULL:   Min.   : NA   Min.   : NA  
##          1st Qu.: NA   1st Qu.: NA  
##          Median : NA   Median : NA  
##          Mean   :NaN   Mean   :NaN  
##          3rd Qu.: NA   3rd Qu.: NA  
##          Max.   : NA   Max.   : NA

Read from the clipboard

  • This is not a good practice! Don’t do it.
dt <- read.table(file = "clipboard", header = TRUE)
  • Copy data to clipboard
dt <- data.frame(a = c(1:10), b = rnorm(10, 0, 0.5), c = c(1:2))
write.table(dt, file="clipboard")
write.csv(dt, file="dtdata.csv")

Read from a csv file

data.csv <- read.csv("filename", header = TRUE)
data.csv <- read.csv("URLaddress", header = TRUE)
* Decimal point
* Seperator
* Quotes for strings
* Dates
* Strings as factors (stringAsFactors = FALSE)

Read other formats

library(foreign)
data.spss <- read.spss("filename")
data.spss <- read.spss("filename")
data.stata <- read.stata("filename")
dat.xls <- read.xlsx(f, sheetIndex=1)

Saving the dataset

# Save csv data
write.csv(aa, "filename.csv")
# R data, rda is not necessary but useful, much smaller than csv data
save(aa, x1, x2, file="_filename_.rda")
# Load R data
load("_filename_.rda")

Optimizing the dataset

  • Which one is smaller, a1 or a2
a1 <- rep(123456, each=100000)
a2 <- rep(123456L, each=100000) 
object.size(a1)
## 800040 bytes
object.size(a2)
## 400040 bytes
  • … av or af?
av <- c("Textile clothing", "Food industry", "Chemicals")[sample(3, 100000, replace=TRUE)]
af <- factor(av)
object.size(av)
## 800224 bytes
object.size(af)
## 400616 bytes

View the data

  • RStudio “Environment” pane
  • Structure of the data
dt <- data.frame(a = c(1:1000), b = rnorm(1000, 0, 1), 
    c = c(1:2), d = sample(5, 1000, replace = TRUE))
dt$b[runif(100)<0.1] <- NA
dt$d[runif(100)<0.1] <- NA
dt <- within(dt, e <- c + (c-1)*d  + (b * (c-1) * (d/5)) + rnorm(1000, 0, 1))
head(dt)
##   a            b c d         e
## 1 1 -0.008867292 1 5 0.2861593
## 2 2  0.432745087 2 2 4.0176431
## 3 3  0.133195235 1 3 1.4048631
## 4 4  1.144456906 2 1 1.1988143
## 5 5 -0.353134877 1 3 3.5409434
## 6 6  0.635261659 2 3 4.6232731
tail(dt)
##         a          b c  d         e
## 995   995  1.4706336 1  3 1.5229362
## 996   996 -1.2153300 2  3 3.3321125
## 997   997 -0.5636842 1  3 0.2954061
## 998   998 -1.3725112 2 NA        NA
## 999   999 -1.7518523 1  1 1.9696645
## 1000 1000 -1.2328778 2  2 4.4393176
colnames(dt)
## [1] "a" "b" "c" "d" "e"
dim(dt)
## [1] 1000    5
str(dt)
## 'data.frame':    1000 obs. of  5 variables:
##  $ a: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ b: num  -0.00887 0.43275 0.1332 1.14446 -0.35313 ...
##  $ c: int  1 2 1 2 1 2 1 2 1 2 ...
##  $ d: int  5 2 3 1 3 3 4 4 2 5 ...
##  $ e: num  0.286 4.018 1.405 1.199 3.541 ...
table(dt$d)
## 
##   1   2   3   4   5 
## 151 182 197 187 163
table(dt$c, dt$d)
##    
##       1   2   3   4   5
##   1  78  90  92  99  81
##   2  73  92 105  88  82
xtabs( ~ c + d, data = dt)
##    d
## c     1   2   3   4   5
##   1  78  90  92  99  81
##   2  73  92 105  88  82

Data preparation

Missing values

library(mice)
## Loading required package: Rcpp
## mice 2.25 2015-11-09
# Check the pattern of missing data
md.pattern(dt)
##     a c   d   b   e    
## 750 1 1   1   1   1   0
## 130 1 1   1   0   0   2
## 100 1 1   0   1   0   2
##  20 1 1   0   0   0   3
##     0 0 120 150 250 520

Descriptive statistics

  • Summary
summary(dt)
##        a                b                  c             d        
##  Min.   :   1.0   Min.   :-3.06067   Min.   :1.0   Min.   :1.000  
##  1st Qu.: 250.8   1st Qu.:-0.68503   1st Qu.:1.0   1st Qu.:2.000  
##  Median : 500.5   Median : 0.00023   Median :1.5   Median :3.000  
##  Mean   : 500.5   Mean   : 0.00629   Mean   :1.5   Mean   :3.033  
##  3rd Qu.: 750.2   3rd Qu.: 0.68432   3rd Qu.:2.0   3rd Qu.:4.000  
##  Max.   :1000.0   Max.   : 2.83409   Max.   :2.0   Max.   :5.000  
##                   NA's   :150                      NA's   :120    
##        e         
##  Min.   :-1.958  
##  1st Qu.: 1.079  
##  Median : 2.309  
##  Mean   : 2.961  
##  3rd Qu.: 4.972  
##  Max.   :10.686  
##  NA's   :250
summary(subset(dt, c == 1))
##        a               b                  c           d        
##  Min.   :  1.0   Min.   :-2.98049   Min.   :1   Min.   :1.000  
##  1st Qu.:250.5   1st Qu.:-0.64906   1st Qu.:1   1st Qu.:2.000  
##  Median :500.0   Median : 0.00023   Median :1   Median :3.000  
##  Mean   :500.0   Mean   : 0.04000   Mean   :1   Mean   :3.034  
##  3rd Qu.:749.5   3rd Qu.: 0.72110   3rd Qu.:1   3rd Qu.:4.000  
##  Max.   :999.0   Max.   : 2.83409   Max.   :1   Max.   :5.000  
##                  NA's   :80                     NA's   :60     
##        e          
##  Min.   :-1.9577  
##  1st Qu.: 0.3112  
##  Median : 1.1240  
##  Mean   : 1.0163  
##  3rd Qu.: 1.6743  
##  Max.   : 3.5409  
##  NA's   :120
summary(subset(dt, c == 2))
##        a                b                  c           d        
##  Min.   :   2.0   Min.   :-3.06067   Min.   :2   Min.   :1.000  
##  1st Qu.: 251.5   1st Qu.:-0.75782   1st Qu.:2   1st Qu.:2.000  
##  Median : 501.0   Median :-0.00997   Median :2   Median :3.000  
##  Mean   : 501.0   Mean   :-0.02663   Mean   :2   Mean   :3.032  
##  3rd Qu.: 750.5   3rd Qu.: 0.65524   3rd Qu.:2   3rd Qu.:4.000  
##  Max.   :1000.0   Max.   : 2.81631   Max.   :2   Max.   :5.000  
##                   NA's   :70                     NA's   :60     
##        e          
##  Min.   : 0.5551  
##  1st Qu.: 3.6914  
##  Median : 5.0033  
##  Mean   : 4.9592  
##  3rd Qu.: 6.1634  
##  Max.   :10.6858  
##  NA's   :130
  • stargazer package
library(stargazer)
stargazer(dt, type="html", digits = 1)
Statistic N Mean St. Dev. Min Max
a 1,000 500.5 288.8 1 1,000
b 850 0.01 1.0 -3.1 2.8
c 1,000 1.5 0.5 1 2
d 880 3.0 1.4 1 5
e 750 3.0 2.4 -2.0 10.7

Plots

  • Univariate plots
library(ggplot2)
ggplot(dt, aes(x = b)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 150 rows containing non-finite values (stat_bin).

ggplot(dt, aes(x = b)) + geom_density()
## Warning: Removed 150 rows containing non-finite values (stat_density).

ggplot(dt, aes(x = b)) + geom_density() +
 stat_function(fun = dnorm, args = list(mean = 0, sd = 1), col="blue")
## Warning: Removed 150 rows containing non-finite values (stat_density).

ggplot(dt, aes(x = d)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 120 rows containing non-finite values (stat_bin).

ggplot(dt, aes(x = d)) + geom_histogram(binwidth=1)
## Warning: Removed 120 rows containing non-finite values (stat_bin).

ggplot(dt, aes(x = d)) + geom_bar()
## Warning: Removed 120 rows containing non-finite values (stat_count).

  • Bivariate plots
ggplot(dt, aes(x = d, y = b)) + geom_point()
## Warning: Removed 250 rows containing missing values (geom_point).

ggplot(dt, aes(x = d, y = b)) + geom_point(size=5)
## Warning: Removed 250 rows containing missing values (geom_point).

ggplot(dt, aes(x = d, y = b)) + geom_point(size=5, alpha=0.1)
## Warning: Removed 250 rows containing missing values (geom_point).

# If overplotted...
ggplot(dt, aes(x = d, y = b)) + geom_jitter(size=5, alpha=0.5, width=0.1)
## Warning: Removed 250 rows containing missing values (geom_point).

ggplot(dt, aes(x = d, y = b)) + geom_jitter(size=5, alpha=0.5, width=0.5)
## Warning: Removed 250 rows containing missing values (geom_point).

# Bivariate plots
ggplot(dt, aes(x = b, y = e)) + geom_point()
## Warning: Removed 250 rows containing missing values (geom_point).

ggplot(dt, aes(x = b, y = e)) + geom_point() + 
    geom_smooth(formula = y ~ x)
## Warning: Removed 250 rows containing non-finite values (stat_smooth).

## Warning: Removed 250 rows containing missing values (geom_point).

ggplot(dt, aes(x = b, y = e)) + geom_point() + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")
## Warning: Removed 250 rows containing non-finite values (stat_smooth).

## Warning: Removed 250 rows containing missing values (geom_point).

ggplot(dt, aes(x = b, y = e)) + geom_point() + facet_wrap(~ c) + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")
## Warning: Removed 250 rows containing non-finite values (stat_smooth).

## Warning: Removed 250 rows containing missing values (geom_point).

ggplot(dt, aes(x = b, y = e)) + geom_point() + facet_wrap(c ~ d, nrow = 2) + 
    geom_smooth(formula = y ~ x, se = FALSE, method = "lm")
## Warning: Removed 250 rows containing non-finite values (stat_smooth).

## Warning: Removed 250 rows containing missing values (geom_point).

Correlations

# Correlations - variables 2-4
cor(dt[, 2:5])
##    b  c  d  e
## b  1 NA NA NA
## c NA  1 NA NA
## d NA NA  1 NA
## e NA NA NA  1
cor(dt[, 2:5], use = "complete.obs")
##              b             c             d          e
## b  1.000000000 -0.0196713614 -0.0013721072 0.09930274
## c -0.019671361  1.0000000000 -0.0006829834 0.81374125
## d -0.001372107 -0.0006829834  1.0000000000 0.24386125
## e  0.099302738  0.8137412515  0.2438612518 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs")
##              b            c            d          e
## b  1.000000000 -0.033009139 -0.001372107 0.09930274
## c -0.033009139  1.000000000 -0.000836488 0.81374125
## d -0.001372107 -0.000836488  1.000000000 0.24386125
## e  0.099302738  0.813741251  0.243861252 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs", method = "kendall")
##               b            c             d          e
## b  1.0000000000 -0.020636621 -0.0008006355 0.04514108
## c -0.0206366208  1.000000000 -0.0014063116 0.67985924
## d -0.0008006355 -0.001406312  1.0000000000 0.14503453
## e  0.0451410770  0.679859240  0.1450345299 1.00000000
cor(dt[, 2:5], use = "pairwise.complete.obs", method = "spearman")
##               b            c             d         e
## b  1.0000000000 -0.025259741 -0.0008800671 0.0660115
## c -0.0252597411  1.000000000 -0.0015714113 0.8320996
## d -0.0008800671 -0.001571411  1.0000000000 0.1807443
## e  0.0660114951  0.832099569  0.1807443053 1.0000000

Plot all data all together

library(GGally)
ggpairs(dt)

Basic tests

  • Chi-square test for categorical variables
tab <- xtabs(~ c + d, data = dt)
tab
##    d
## c     1   2   3   4   5
##   1  78  90  92  99  81
##   2  73  92 105  88  82
chisq.test(tab)
## 
##  Pearson's Chi-squared test
## 
## data:  tab
## X-squared = 1.6986, df = 4, p-value = 0.791
  • t-test for continuous variables
t.test(dt$b, mu = 0)
## 
##  One Sample t-test
## 
## data:  dt$b
## t = 0.18167, df = 849, p-value = 0.8559
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  -0.06168931  0.07427348
## sample estimates:
##   mean of x 
## 0.006292083
t.test(dt$b, dt$e)
## 
##  Welch Two Sample t-test
## 
## data:  dt$b and dt$e
## t = -31.09, df = 975.73, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.141676 -2.768618
## sample estimates:
##   mean of x   mean of y 
## 0.006292083 2.961439245
t.test(dt$b, dt$e, var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  dt$b and dt$e
## t = -32.491, df = 1598, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -3.133547 -2.776747
## sample estimates:
##   mean of x   mean of y 
## 0.006292083 2.961439245
with(dt, t.test(b[c == 1], b[c == 2]))
## 
##  Welch Two Sample t-test
## 
## data:  b[c == 1] and b[c == 2]
## t = 0.96173, df = 847.42, p-value = 0.3365
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.06935272  0.20261293
## sample estimates:
##   mean of x   mean of y 
##  0.03999908 -0.02663103

Data visualization

Make a dataset

library(ggplot2)
library(ggthemes)
DT <- data.frame(country = rep(c("A", "B", "C"), each=20), date = rep(1991:2010, times=3),
    gdp = c(1 + cumsum(rnorm(20, 0.01, 0.02)), 1.1 + cumsum(rnorm(20, 0.01, 0.04)), 
    1.2 + cumsum(rnorm(20, 0.04, 0.02))))

ggplot graphics

c1 <- ggplot(DT, aes(x=date, y=gdp, color=country)) + geom_line()
c1

# Thick lines
c1 <- ggplot(DT, aes(x=date, y=gdp, color=country)) + geom_line(size=1.25)
c1

# Change legend title
c1 <- c1 + guides(color = guide_legend(title = "Country"))
c1

# Remove legend text
c1 <- c1 +  guides(color = guide_legend(title = NULL))
c1

# Add titles
c1 <- c1 + labs(title="GDP level, 1990-2010", x="", y="GDP (Million USD)")
c1

# Bigger title
c1 <- c1 + theme(plot.title = element_text(size = rel(2))) 
c1

# Use LibreOffice colors
c1 + scale_color_calc()

# HC theme
c1 + theme_hc()

# HC theme + axis characteristics
c1 + theme_hc() + theme(axis.line = element_line(colour = "black", size=0.5), 
                        axis.text=element_text(colour="black")) + geom_line(size=1.25)

# Classic theme
c1 + theme_classic()

# Minimal  theme
c1 + theme_minimal()

# Economist theme
c1 + theme_economist()

# GoogleDocs theme
c1 + theme_gdocs()

My theme

theme_et <- function (base_size = 12, base_family = "ubuntu") {
  theme(rect = element_rect(fill = "#FFFFFF", linetype = 0, colour = NA), 
        text = element_text(size = base_size, family = base_family), 
        plot.title = element_text( size = rel(1.1), hjust = 0.5),
        axis.title.x = element_text(hjust = 0.5, size = rel(0.85)), 
        axis.title.y = element_text(hjust = 0.5, size = rel(0.85)), 
        axis.text = element_text(colour = "black"), 
        axis.line = element_line(colour = "black", size = rel(0.3)), 
        legend.text = element_text(colour = "black", size= rel(.85)), 
        panel.grid.major.y = element_line(color = "gray", size = rel(0.3)), 
        panel.grid.major.x = element_blank(), 
        panel.grid.minor.y = element_blank(), panel.grid.minor.x = element_blank(), 
        panel.border = element_blank(), panel.background = element_blank(), 
        legend.position = "bottom", legend.key = element_rect(fill = "#FFFFFF00"))
}

c1 + theme_et() 

Modifying themes

c1 + theme(plot.title = element_text(size = rel(2), colour = "blue"))

c1 + theme(axis.text = element_text(colour = "blue", size=rel(1.5)))

c1 + theme(legend.position = "bottom")

c1 + theme(legend.position = c(0.5, 0.5))

c1 + theme(legend.position = "none")

c1 + theme(legend.text = element_text(size = 20, colour = "red", angle = 45))

# Add text
c1 + geom_text(data = DT, aes(x=date, y=gdp+.03, label = country))

# Add formula
c1 + annotate("text", x=1995, y=1.5, parse=TRUE, label="frac(1, sqrt(2 * pi)) * e ^ {-x^2 / 2}")