Create a dataset

  • Load libraries to be used
library(data.table)
library(ggplot2)
library(mice)
## Loading required package: Rcpp
## mice 2.25 2015-11-09
library(plm)
## Loading required package: Formula
## 
## Attaching package: 'plm'
## The following object is masked from 'package:data.table':
## 
##     between
library(stargazer)
## 
## Please cite as:
##  Hlavac, Marek (2015). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2. http://CRAN.R-project.org/package=stargazer
  • Create a firm level dataset
nf <- 1000
nt <- 20
ns <- 10
nobs <- nf*nt

kelas <- 0.1
lelas <- 0.2
melas <- 0.75

id <- rep(c(1:nf), each = nt)
year <- rep(c(1:nt), times = nf)
sect <- rep(c(1:ns), each = nobs / ns)
kg <- rnorm(nobs, 0.01, 0.02)
lg <- (kg/10) + rnorm(nobs, 0.01, 0.01)
mg <- (lg/10) + rnorm(nobs, 0.005, 0.01)
tc <- rnorm(nobs, 0.01, 0.01)
fs <- max(0.1, rep(rnorm(nf, 2, 0.5), each = nt))
re <- max(0.1, rep(rnorm(nf, 0, 0.5), each = nt))
ee <- rnorm(nobs, 0, 0.1)

fdata <- data.table(id, year, sect, kg, lg, mg, tc, fs, re, ee)
fdata[, lk := fs*(1 + cumsum(kg)), by = id]
fdata[, ll := fs*(1 + cumsum(lg)), by = id]
fdata[, lm := fs*(1 + cumsum(mg)), by = id]
fdata[, lA := fs*(1 + cumsum(tc)), by = id]
fdata[, fe := max(0.1, rep(rnorm(nf, 0, 0.5)) + mean(lk)/5 + mean(ll)/5), by = id]
fdata[, lq := (lA + kelas*lk + lelas*ll + melas*lm) + ee]
fdata[, lqre := re + (lA + kelas*lk + lelas*ll + melas*lm) + ee]
fdata[, lqfe := fe + (lA + kelas*lk + lelas*ll + melas*lm) + ee]
  • Add some missing values
  • Generate purely random missing values, 2.5%
fdata[0.025 > runif(10000), lk := NA]
fdata[0.025 > runif(10000), ll := NA]
fdata[0.025 > runif(10000), lm := NA]
  • Generate missing values for all 2.5% of firms
fdata[id %in% sample(1:nf, floor(0.025*nf)), lk := NA]
fdata[id %in% sample(1:nf, floor(0.025*nf)), ll := NA]
fdata[id %in% sample(1:nf, floor(0.025*nf)), lm := NA]

Checking the dataset

head(fdata)
##    id year sect           kg           lg           mg           tc
## 1:  1    1    1 -0.016145729 -0.004581795  0.015579593  0.016280215
## 2:  1    2    1  0.030589417 -0.005651539  0.009099845 -0.011030608
## 3:  1    3    1 -0.010849058  0.009445855 -0.003626348  0.017538325
## 4:  1    4    1  0.043610043  0.001641193  0.008538700  0.006780934
## 5:  1    5    1 -0.005236426  0.006642762 -0.009212304  0.011906120
## 6:  1    6    1  0.054899888  0.013334368  0.017186476  0.011854136
##          fs       re          ee       lk       ll       lm       lA
## 1: 3.516949 1.451088  0.15119627 3.460166 3.500836 3.571742 3.574206
## 2: 3.516949 1.451088 -0.12269481 3.567747 3.480959 3.603746 3.535412
## 3: 3.516949 1.451088  0.12296523 3.529592 3.514180 3.590992 3.597093
## 4: 3.516949 1.451088  0.04716347 3.682966 3.519952 3.621022 3.620942
## 5: 3.516949 1.451088  0.16137831 3.664550 3.543314 3.588623 3.662815
## 6: 3.516949 1.451088  0.07311895 3.857630 3.590211 3.649067 3.704505
##          fe       lq     lqre     lqfe
## 1: 3.034242 7.450393 8.901481 10.48463
## 2: 3.034242 7.168493 8.619581 10.20273
## 3: 3.034242 7.469098 8.920186 10.50334
## 4: 3.034242 7.456159 8.907247 10.49040
## 5: 3.034242 7.590778 9.041866 10.62502
## 6: 3.034242 7.618230 9.069318 10.65247
str(fdata)
## Classes 'data.table' and 'data.frame':   20000 obs. of  18 variables:
##  $ id  : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ year: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ sect: int  1 1 1 1 1 1 1 1 1 1 ...
##  $ kg  : num  -0.01615 0.03059 -0.01085 0.04361 -0.00524 ...
##  $ lg  : num  -0.00458 -0.00565 0.00945 0.00164 0.00664 ...
##  $ mg  : num  0.01558 0.0091 -0.00363 0.00854 -0.00921 ...
##  $ tc  : num  0.01628 -0.01103 0.01754 0.00678 0.01191 ...
##  $ fs  : num  3.52 3.52 3.52 3.52 3.52 ...
##  $ re  : num  1.45 1.45 1.45 1.45 1.45 ...
##  $ ee  : num  0.1512 -0.1227 0.123 0.0472 0.1614 ...
##  $ lk  : num  3.46 3.57 3.53 3.68 3.66 ...
##  $ ll  : num  3.5 3.48 3.51 3.52 3.54 ...
##  $ lm  : num  3.57 3.6 3.59 3.62 3.59 ...
##  $ lA  : num  3.57 3.54 3.6 3.62 3.66 ...
##  $ fe  : num  3.03 3.03 3.03 3.03 3.03 ...
##  $ lq  : num  7.45 7.17 7.47 7.46 7.59 ...
##  $ lqre: num  8.9 8.62 8.92 8.91 9.04 ...
##  $ lqfe: num  10.5 10.2 10.5 10.5 10.6 ...
##  - attr(*, ".internal.selfref")=<externalptr> 
##  - attr(*, "index")= atomic  
##   ..- attr(*, "__id")= int
summary(fdata)
##        id              year            sect            kg           
##  Min.   :   1.0   Min.   : 1.00   Min.   : 1.0   Min.   :-0.065235  
##  1st Qu.: 250.8   1st Qu.: 5.75   1st Qu.: 3.0   1st Qu.:-0.004033  
##  Median : 500.5   Median :10.50   Median : 5.5   Median : 0.009834  
##  Mean   : 500.5   Mean   :10.50   Mean   : 5.5   Mean   : 0.009818  
##  3rd Qu.: 750.2   3rd Qu.:15.25   3rd Qu.: 8.0   3rd Qu.: 0.023203  
##  Max.   :1000.0   Max.   :20.00   Max.   :10.0   Max.   : 0.086575  
##                                                                     
##        lg                  mg                   tc           
##  Min.   :-0.028632   Min.   :-0.0295643   Min.   :-0.031440  
##  1st Qu.: 0.004244   1st Qu.:-0.0006132   1st Qu.: 0.003253  
##  Median : 0.011106   Median : 0.0062823   Median : 0.009982  
##  Mean   : 0.011121   Mean   : 0.0062048   Mean   : 0.009995  
##  3rd Qu.: 0.017959   3rd Qu.: 0.0129267   3rd Qu.: 0.016765  
##  Max.   : 0.050489   Max.   : 0.0440762   Max.   : 0.052860  
##                                                              
##        fs              re              ee                   lk       
##  Min.   :3.517   Min.   :1.451   Min.   :-0.4158939   Min.   :3.128  
##  1st Qu.:3.517   1st Qu.:1.451   1st Qu.:-0.0678918   1st Qu.:3.647  
##  Median :3.517   Median :1.451   Median : 0.0004533   Median :3.844  
##  Mean   :3.517   Mean   :1.451   Mean   : 0.0002693   Mean   :3.883  
##  3rd Qu.:3.517   3rd Qu.:1.451   3rd Qu.: 0.0678528   3rd Qu.:4.083  
##  Max.   :3.517   Max.   :1.451   Max.   : 0.3998554   Max.   :5.243  
##                                                       NA's   :966    
##        ll              lm              lA              fe       
##  Min.   :3.430   Min.   :3.347   Min.   :3.395   Min.   :2.734  
##  1st Qu.:3.715   1st Qu.:3.611   1st Qu.:3.694   1st Qu.:3.050  
##  Median :3.913   Median :3.728   Median :3.870   Median :3.151  
##  Mean   :3.929   Mean   :3.747   Mean   :3.888   Mean   :3.170  
##  3rd Qu.:4.120   3rd Qu.:3.859   3rd Qu.:4.059   3rd Qu.:3.269  
##  Max.   :4.848   Max.   :4.506   Max.   :4.634   Max.   :4.242  
##  NA's   :996     NA's   :909                                    
##        lq             lqre             lqfe       
##  Min.   :6.950   Min.   : 8.401   Min.   : 9.942  
##  1st Qu.:7.542   1st Qu.: 8.993   1st Qu.:10.701  
##  Median :7.853   Median : 9.304   Median :11.024  
##  Mean   :7.872   Mean   : 9.323   Mean   :11.042  
##  3rd Qu.:8.180   3rd Qu.: 9.631   3rd Qu.:11.363  
##  Max.   :9.269   Max.   :10.720   Max.   :12.886  
## 
table(fdata$year, fdata$sect)
##     
##        1   2   3   4   5   6   7   8   9  10
##   1  100 100 100 100 100 100 100 100 100 100
##   2  100 100 100 100 100 100 100 100 100 100
##   3  100 100 100 100 100 100 100 100 100 100
##   4  100 100 100 100 100 100 100 100 100 100
##   5  100 100 100 100 100 100 100 100 100 100
##   6  100 100 100 100 100 100 100 100 100 100
##   7  100 100 100 100 100 100 100 100 100 100
##   8  100 100 100 100 100 100 100 100 100 100
##   9  100 100 100 100 100 100 100 100 100 100
##   10 100 100 100 100 100 100 100 100 100 100
##   11 100 100 100 100 100 100 100 100 100 100
##   12 100 100 100 100 100 100 100 100 100 100
##   13 100 100 100 100 100 100 100 100 100 100
##   14 100 100 100 100 100 100 100 100 100 100
##   15 100 100 100 100 100 100 100 100 100 100
##   16 100 100 100 100 100 100 100 100 100 100
##   17 100 100 100 100 100 100 100 100 100 100
##   18 100 100 100 100 100 100 100 100 100 100
##   19 100 100 100 100 100 100 100 100 100 100
##   20 100 100 100 100 100 100 100 100 100 100
table(fdata$year, fdata$sect)
##     
##        1   2   3   4   5   6   7   8   9  10
##   1  100 100 100 100 100 100 100 100 100 100
##   2  100 100 100 100 100 100 100 100 100 100
##   3  100 100 100 100 100 100 100 100 100 100
##   4  100 100 100 100 100 100 100 100 100 100
##   5  100 100 100 100 100 100 100 100 100 100
##   6  100 100 100 100 100 100 100 100 100 100
##   7  100 100 100 100 100 100 100 100 100 100
##   8  100 100 100 100 100 100 100 100 100 100
##   9  100 100 100 100 100 100 100 100 100 100
##   10 100 100 100 100 100 100 100 100 100 100
##   11 100 100 100 100 100 100 100 100 100 100
##   12 100 100 100 100 100 100 100 100 100 100
##   13 100 100 100 100 100 100 100 100 100 100
##   14 100 100 100 100 100 100 100 100 100 100
##   15 100 100 100 100 100 100 100 100 100 100
##   16 100 100 100 100 100 100 100 100 100 100
##   17 100 100 100 100 100 100 100 100 100 100
##   18 100 100 100 100 100 100 100 100 100 100
##   19 100 100 100 100 100 100 100 100 100 100
##   20 100 100 100 100 100 100 100 100 100 100
# Check the pattern of missing data
md.pattern(fdata)
##       id year sect kg lg mg tc fs re ee lA fe lq lqre lqfe  lm  lk  ll
## 17292  1    1    1  1  1  1  1  1  1  1  1  1  1    1    1   1   1   1
##   838  1    1    1  1  1  1  1  1  1  1  1  1  1    1    1   1   0   1
##   911  1    1    1  1  1  1  1  1  1  1  1  1  1    1    1   1   1   0
##   798  1    1    1  1  1  1  1  1  1  1  1  1  1    1    1   0   1   1
##    50  1    1    1  1  1  1  1  1  1  1  1  1  1    1    1   1   0   0
##    76  1    1    1  1  1  1  1  1  1  1  1  1  1    1    1   0   0   1
##    33  1    1    1  1  1  1  1  1  1  1  1  1  1    1    1   0   1   0
##     2  1    1    1  1  1  1  1  1  1  1  1  1  1    1    1   0   0   0
##        0    0    0  0  0  0  0  0  0  0  0  0  0    0    0 909 966 996
##           
## 17292    0
##   838    1
##   911    1
##   798    1
##    50    2
##    76    2
##    33    2
##     2    3
##       2871
# Use a dataset with only non-missing data
cdata <- na.omit(fdata)
dim(fdata)
## [1] 20000    18
dim(cdata)
## [1] 17292    18
# List all tables in the dataset
tables()
##      NAME    NROW NCOL MB
## [1,] cdata 17,292   18  3
## [2,] fdata 20,000   18  3
##      COLS                                                          KEY
## [1,] id,year,sect,kg,lg,mg,tc,fs,re,ee,lk,ll,lm,lA,fe,lq,lqre,lqfe    
## [2,] id,year,sect,kg,lg,mg,tc,fs,re,ee,lk,ll,lm,lA,fe,lq,lqre,lqfe    
## Total: 6MB

Datatable operations

  • DT[ i, j, by ]
    • Take DT
    • subset rows using i,
    • then calculate j
    • grouped by by

Deleting variables

# One variable
fdata[, kg := NULL]
# Many variables
fdata[, c("lg", "mg", "tc", "fs", "fe", "ee") := NULL]

Subsetting rows

  • Observations are in rows
# Rows 3-5
fdata[3:5,]
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1    3    1 1.451088 3.529592 3.514180 3.590992 3.597093 7.469098
## 2:  1    4    1 1.451088 3.682966 3.519952 3.621022 3.620942 7.456159
## 3:  1    5    1 1.451088 3.664550 3.543314 3.588623 3.662815 7.590778
##        lqre     lqfe
## 1: 8.920186 10.50334
## 2: 8.907247 10.49040
## 3: 9.041866 10.62502
fdata[3:5]
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1    3    1 1.451088 3.529592 3.514180 3.590992 3.597093 7.469098
## 2:  1    4    1 1.451088 3.682966 3.519952 3.621022 3.620942 7.456159
## 3:  1    5    1 1.451088 3.664550 3.543314 3.588623 3.662815 7.590778
##        lqre     lqfe
## 1: 8.920186 10.50334
## 2: 8.907247 10.49040
## 3: 9.041866 10.62502
# All rows where id == 1
fdata[id == 1]
##     id year sect       re       lk       ll       lm       lA       lq
##  1:  1    1    1 1.451088 3.460166 3.500836 3.571742 3.574206 7.450393
##  2:  1    2    1 1.451088 3.567747 3.480959 3.603746 3.535412 7.168493
##  3:  1    3    1 1.451088 3.529592 3.514180 3.590992 3.597093 7.469098
##  4:  1    4    1 1.451088 3.682966 3.519952 3.621022 3.620942 7.456159
##  5:  1    5    1 1.451088 3.664550 3.543314 3.588623 3.662815 7.590778
##  6:  1    6    1 1.451088 3.857630 3.590211 3.649067 3.704505 7.618230
##  7:  1    7    1 1.451088 3.910919 3.618489 3.639170 3.756205 7.786042
##  8:  1    8    1 1.451088 3.868801 3.609093 3.649490 3.768084 7.555434
##  9:  1    9    1 1.451088 3.770436 3.606317 3.685651 3.773644 7.607890
## 10:  1   10    1 1.451088 3.771010 3.617458 3.659897 3.780566 7.561238
## 11:  1   11    1 1.451088 3.887096 3.692892 3.674701 3.859623 7.739880
## 12:  1   12    1 1.451088 3.866649 3.667201 3.720489 3.868465 7.866521
## 13:  1   13    1 1.451088 3.740189 3.651298 3.783396 3.934781 7.979833
## 14:  1   14    1 1.451088 3.780198 3.724378 3.749956 3.965957 8.060119
## 15:  1   15    1 1.451088 3.754797 3.728538 3.781083 4.001969 8.084231
## 16:  1   16    1 1.451088 3.730195 3.819053 3.900343 4.027229 8.057397
## 17:  1   17    1 1.451088 3.726540       NA 3.977675 4.005608 7.974660
## 18:  1   18    1 1.451088 3.678192 3.902402 3.997159 4.044663 8.272886
## 19:  1   19    1 1.451088 3.648364 4.023976 4.060166 4.154387 8.453051
## 20:  1   20    1 1.451088 3.752109 4.091652 4.064794 4.138883 8.211524
##         lqre     lqfe
##  1: 8.901481 10.48463
##  2: 8.619581 10.20273
##  3: 8.920186 10.50334
##  4: 8.907247 10.49040
##  5: 9.041866 10.62502
##  6: 9.069318 10.65247
##  7: 9.237130 10.82028
##  8: 9.006522 10.58968
##  9: 9.058978 10.64213
## 10: 9.012326 10.59548
## 11: 9.190969 10.77412
## 12: 9.317609 10.90076
## 13: 9.430921 11.01407
## 14: 9.511207 11.09436
## 15: 9.535319 11.11847
## 16: 9.508485 11.09164
## 17: 9.425748 11.00890
## 18: 9.723974 11.30713
## 19: 9.904139 11.48729
## 20: 9.662612 11.24577
setkey(fdata, id, year)
fdata[.(1)]
##     id year sect       re       lk       ll       lm       lA       lq
##  1:  1    1    1 1.451088 3.460166 3.500836 3.571742 3.574206 7.450393
##  2:  1    2    1 1.451088 3.567747 3.480959 3.603746 3.535412 7.168493
##  3:  1    3    1 1.451088 3.529592 3.514180 3.590992 3.597093 7.469098
##  4:  1    4    1 1.451088 3.682966 3.519952 3.621022 3.620942 7.456159
##  5:  1    5    1 1.451088 3.664550 3.543314 3.588623 3.662815 7.590778
##  6:  1    6    1 1.451088 3.857630 3.590211 3.649067 3.704505 7.618230
##  7:  1    7    1 1.451088 3.910919 3.618489 3.639170 3.756205 7.786042
##  8:  1    8    1 1.451088 3.868801 3.609093 3.649490 3.768084 7.555434
##  9:  1    9    1 1.451088 3.770436 3.606317 3.685651 3.773644 7.607890
## 10:  1   10    1 1.451088 3.771010 3.617458 3.659897 3.780566 7.561238
## 11:  1   11    1 1.451088 3.887096 3.692892 3.674701 3.859623 7.739880
## 12:  1   12    1 1.451088 3.866649 3.667201 3.720489 3.868465 7.866521
## 13:  1   13    1 1.451088 3.740189 3.651298 3.783396 3.934781 7.979833
## 14:  1   14    1 1.451088 3.780198 3.724378 3.749956 3.965957 8.060119
## 15:  1   15    1 1.451088 3.754797 3.728538 3.781083 4.001969 8.084231
## 16:  1   16    1 1.451088 3.730195 3.819053 3.900343 4.027229 8.057397
## 17:  1   17    1 1.451088 3.726540       NA 3.977675 4.005608 7.974660
## 18:  1   18    1 1.451088 3.678192 3.902402 3.997159 4.044663 8.272886
## 19:  1   19    1 1.451088 3.648364 4.023976 4.060166 4.154387 8.453051
## 20:  1   20    1 1.451088 3.752109 4.091652 4.064794 4.138883 8.211524
##         lqre     lqfe
##  1: 8.901481 10.48463
##  2: 8.619581 10.20273
##  3: 8.920186 10.50334
##  4: 8.907247 10.49040
##  5: 9.041866 10.62502
##  6: 9.069318 10.65247
##  7: 9.237130 10.82028
##  8: 9.006522 10.58968
##  9: 9.058978 10.64213
## 10: 9.012326 10.59548
## 11: 9.190969 10.77412
## 12: 9.317609 10.90076
## 13: 9.430921 11.01407
## 14: 9.511207 11.09436
## 15: 9.535319 11.11847
## 16: 9.508485 11.09164
## 17: 9.425748 11.00890
## 18: 9.723974 11.30713
## 19: 9.904139 11.48729
## 20: 9.662612 11.24577
fdata[.(c(1:2)), .(id, year, sect, lq)]
##     id year sect       lq
##  1:  1    1    1 7.450393
##  2:  1    2    1 7.168493
##  3:  1    3    1 7.469098
##  4:  1    4    1 7.456159
##  5:  1    5    1 7.590778
##  6:  1    6    1 7.618230
##  7:  1    7    1 7.786042
##  8:  1    8    1 7.555434
##  9:  1    9    1 7.607890
## 10:  1   10    1 7.561238
## 11:  1   11    1 7.739880
## 12:  1   12    1 7.866521
## 13:  1   13    1 7.979833
## 14:  1   14    1 8.060119
## 15:  1   15    1 8.084231
## 16:  1   16    1 8.057397
## 17:  1   17    1 7.974660
## 18:  1   18    1 8.272886
## 19:  1   19    1 8.453051
## 20:  1   20    1 8.211524
## 21:  2    1    1 7.403333
## 22:  2    2    1 7.306158
## 23:  2    3    1 7.405809
## 24:  2    4    1 7.296536
## 25:  2    5    1 7.282988
## 26:  2    6    1 7.471193
## 27:  2    7    1 7.696668
## 28:  2    8    1 7.780785
## 29:  2    9    1 7.673840
## 30:  2   10    1 7.818132
## 31:  2   11    1 7.718537
## 32:  2   12    1 7.927354
## 33:  2   13    1 8.009635
## 34:  2   14    1 8.503631
## 35:  2   15    1 8.091959
## 36:  2   16    1 8.185478
## 37:  2   17    1 8.444078
## 38:  2   18    1 8.298519
## 39:  2   19    1 8.360964
## 40:  2   20    1 8.531302
##     id year sect       lq
fdata[.(c(1:2), c(10,20))]
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1   10    1 1.451088 3.771010 3.617458 3.659897 3.780566 7.561238
## 2:  2   20    1 1.451088 3.752543 4.330262 3.925079 4.288035 8.531302
##        lqre     lqfe
## 1: 9.012326 10.59548
## 2: 9.982390 11.57269
fdata[(id %in% c(1,2)) & (year %in% c(10, 20))]
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1   10    1 1.451088 3.771010 3.617458 3.659897 3.780566 7.561238
## 2:  1   20    1 1.451088 3.752109 4.091652 4.064794 4.138883 8.211524
## 3:  2   10    1 1.451088 3.794768 4.051088 3.732258 3.858232 7.818132
## 4:  2   20    1 1.451088 3.752543 4.330262 3.925079 4.288035 8.531302
##        lqre     lqfe
## 1: 9.012326 10.59548
## 2: 9.662612 11.24577
## 3: 9.269220 10.85952
## 4: 9.982390 11.57269
fdata[.(c(1:2))][year %in% c(10, 20)]
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1   10    1 1.451088 3.771010 3.617458 3.659897 3.780566 7.561238
## 2:  1   20    1 1.451088 3.752109 4.091652 4.064794 4.138883 8.211524
## 3:  2   10    1 1.451088 3.794768 4.051088 3.732258 3.858232 7.818132
## 4:  2   20    1 1.451088 3.752543 4.330262 3.925079 4.288035 8.531302
##        lqre     lqfe
## 1: 9.012326 10.59548
## 2: 9.662612 11.24577
## 3: 9.269220 10.85952
## 4: 9.982390 11.57269
sM <- data.table(id=c(1,1,2,2), year=c(10,20,10,20))
sM
##    id year
## 1:  1   10
## 2:  1   20
## 3:  2   10
## 4:  2   20
fdata[sM]
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1   10    1 1.451088 3.771010 3.617458 3.659897 3.780566 7.561238
## 2:  1   20    1 1.451088 3.752109 4.091652 4.064794 4.138883 8.211524
## 3:  2   10    1 1.451088 3.794768 4.051088 3.732258 3.858232 7.818132
## 4:  2   20    1 1.451088 3.752543 4.330262 3.925079 4.288035 8.531302
##        lqre     lqfe
## 1: 9.012326 10.59548
## 2: 9.662612 11.24577
## 3: 9.269220 10.85952
## 4: 9.982390 11.57269

Selecting columns

  • Variables are in columns
# Rows 3-5 of column id
fdata[3:5, id]
## [1] 1 1 1
# Rows 3-5 of columns id, year and lq
fdata[3:5, .(id, year, lq)]
##    id year       lq
## 1:  1    3 7.469098
## 2:  1    4 7.456159
## 3:  1    5 7.590778
# NOTE: .() means list()

# First 10 rows of id, year, lq and lk for missing lk's
fdata[is.na(lk), .(id, year, lq, lk)][1:10]
##     id year       lq lk
##  1:  7    2 7.438563 NA
##  2:  9   19 8.613044 NA
##  3: 13   14 8.247240 NA
##  4: 14    1 7.266216 NA
##  5: 14    2 7.054240 NA
##  6: 14    3 7.295258 NA
##  7: 14    4 7.442786 NA
##  8: 14    5 7.389027 NA
##  9: 14    6 7.754037 NA
## 10: 14    7 7.479533 NA
# Mean of lq
fdata[, mean(lq)]
## [1] 7.871772
# Mean and standard deviation of lq
fdata[, .(mean(lq), sd(lq))]
##          V1        V2
## 1: 7.871772 0.4045916
# Mean and standard deviation of lk
fdata[, .(mean(lk), sd(lk))]
##    V1 V2
## 1: NA NA
# Assigning names to computed columns
fdata[, .(mean.lq = mean(lq), sd.lq = sd(lq))]
##     mean.lq     sd.lq
## 1: 7.871772 0.4045916
# Even functions can be written
fdata[, {plot(lq)
        plot(ll)}]

## NULL

Rows and columns

# Doing __j__ by group
fdata[, .(mean.lq = mean(lq), sd.lq = sd(lq)), by=sect]
##     sect  mean.lq     sd.lq
##  1:    1 7.897290 0.4166096
##  2:    2 7.893605 0.4143390
##  3:    3 7.848226 0.4034730
##  4:    4 7.875375 0.4131847
##  5:    5 7.873004 0.4059001
##  6:    6 7.857760 0.3985663
##  7:    7 7.863143 0.3983187
##  8:    8 7.864263 0.3904034
##  9:    9 7.867744 0.3936005
## 10:   10 7.877315 0.4089648
# Doing __j__ by several groups
sectq <- fdata[, .(mean.lq = mean(lq), sd.lq = sd(lq)), by=.(sect, year)]
sectq
##      sect year  mean.lq     sd.lq
##   1:    1    1 7.270624 0.1158688
##   2:    1    2 7.335145 0.1041169
##   3:    1    3 7.435708 0.1111992
##   4:    1    4 7.490405 0.1361855
##   5:    1    5 7.537824 0.1488705
##  ---                             
## 196:   10   16 8.228432 0.2232385
## 197:   10   17 8.293724 0.2247541
## 198:   10   18 8.330793 0.2418411
## 199:   10   19 8.391150 0.2382373
## 200:   10   20 8.477096 0.2588875
tables()
##      NAME    NROW NCOL MB
## [1,] cdata 17,292   18  3
## [2,] fdata 20,000   11  2
## [3,] sectq    200    4  1
## [4,] sM         4    2  1
##      COLS                                                          KEY    
## [1,] id,year,sect,kg,lg,mg,tc,fs,re,ee,lk,ll,lm,lA,fe,lq,lqre,lqfe        
## [2,] id,year,sect,re,lk,ll,lm,lA,lq,lqre,lqfe                      id,year
## [3,] sect,year,mean.lq,sd.lq                                              
## [4,] id,year                                                              
## Total: 7MB
# Call functions in by
fdata[, .(mean.lq = mean(lq), sd.lq = sd(lq)), by=.(sect, year>5)]
##     sect  year  mean.lq     sd.lq
##  1:    1 FALSE 7.413941 0.1582613
##  2:    1  TRUE 8.058406 0.3452765
##  3:    2 FALSE 7.409087 0.1449751
##  4:    2  TRUE 8.055111 0.3428375
##  5:    3 FALSE 7.384280 0.1538802
##  6:    3  TRUE 8.002875 0.3368661
##  7:    4 FALSE 7.400697 0.1515523
##  8:    4  TRUE 8.033600 0.3461393
##  9:    5 FALSE 7.401239 0.1542755
## 10:    5  TRUE 8.030259 0.3358633
## 11:    6 FALSE 7.391411 0.1610338
## 12:    6  TRUE 8.013210 0.3263230
## 13:    7 FALSE 7.392609 0.1506302
## 14:    7  TRUE 8.019988 0.3248988
## 15:    8 FALSE 7.404136 0.1552749
## 16:    8  TRUE 8.017638 0.3179128
## 17:    9 FALSE 7.405400 0.1583768
## 18:    9  TRUE 8.021859 0.3212127
## 19:   10 FALSE 7.405281 0.1602341
## 20:   10  TRUE 8.034659 0.3397027
fdata[, .(mean.lq = mean(lq), sd.lq = sd(lq), nobs = length(lq)), by=is.na(ll)]
##    is.na  mean.lq     sd.lq  nobs
## 1: FALSE 7.871753 0.4050232 19004
## 2:  TRUE 7.872142 0.3964655   996
# Grouping only on a subset
fdata[1:5, .(mean.lq = mean(lq), sd.lq = sd(lq)), by=.(year)]
##    year  mean.lq sd.lq
## 1:    1 7.450393    NA
## 2:    2 7.168493    NA
## 3:    3 7.469098    NA
## 4:    4 7.456159    NA
## 5:    5 7.590778    NA
# Using .N to get the total number of observations
fdata[is.na(lk), .N, by = sect]
##     sect   N
##  1:    1  89
##  2:    2 103
##  3:    3  75
##  4:    4  88
##  5:    5 134
##  6:    6 109
##  7:    7  86
##  8:    8 113
##  9:    9  73
## 10:   10  96

Creating/updating columns

head(fdata)
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1    1    1 1.451088 3.460166 3.500836 3.571742 3.574206 7.450393
## 2:  1    2    1 1.451088 3.567747 3.480959 3.603746 3.535412 7.168493
## 3:  1    3    1 1.451088 3.529592 3.514180 3.590992 3.597093 7.469098
## 4:  1    4    1 1.451088 3.682966 3.519952 3.621022 3.620942 7.456159
## 5:  1    5    1 1.451088 3.664550 3.543314 3.588623 3.662815 7.590778
## 6:  1    6    1 1.451088 3.857630 3.590211 3.649067 3.704505 7.618230
##        lqre     lqfe
## 1: 8.901481 10.48463
## 2: 8.619581 10.20273
## 3: 8.920186 10.50334
## 4: 8.907247 10.49040
## 5: 9.041866 10.62502
## 6: 9.069318 10.65247
fdata[, Q1 := exp(lq)]
head(fdata)
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1    1    1 1.451088 3.460166 3.500836 3.571742 3.574206 7.450393
## 2:  1    2    1 1.451088 3.567747 3.480959 3.603746 3.535412 7.168493
## 3:  1    3    1 1.451088 3.529592 3.514180 3.590992 3.597093 7.469098
## 4:  1    4    1 1.451088 3.682966 3.519952 3.621022 3.620942 7.456159
## 5:  1    5    1 1.451088 3.664550 3.543314 3.588623 3.662815 7.590778
## 6:  1    6    1 1.451088 3.857630 3.590211 3.649067 3.704505 7.618230
##        lqre     lqfe       Q1
## 1: 8.901481 10.48463 1720.539
## 2: 8.619581 10.20273 1297.887
## 3: 8.920186 10.50334 1753.025
## 4: 8.907247 10.49040 1730.488
## 5: 9.041866 10.62502 1979.854
## 6: 9.069318 10.65247 2034.956
# Be careful: DT <- DT[...] is redundant. The variables are added into DT
fdata[, c("K", "L", "M") := list(exp(lk), exp(ll), exp(lm))]
head(fdata)
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1    1    1 1.451088 3.460166 3.500836 3.571742 3.574206 7.450393
## 2:  1    2    1 1.451088 3.567747 3.480959 3.603746 3.535412 7.168493
## 3:  1    3    1 1.451088 3.529592 3.514180 3.590992 3.597093 7.469098
## 4:  1    4    1 1.451088 3.682966 3.519952 3.621022 3.620942 7.456159
## 5:  1    5    1 1.451088 3.664550 3.543314 3.588623 3.662815 7.590778
## 6:  1    6    1 1.451088 3.857630 3.590211 3.649067 3.704505 7.618230
##        lqre     lqfe       Q1        K        L        M
## 1: 8.901481 10.48463 1720.539 31.82225 33.14313 35.57852
## 2: 8.619581 10.20273 1297.887 35.43667 32.49088 36.73558
## 3: 8.920186 10.50334 1753.025 34.11003 33.58837 36.27004
## 4: 8.907247 10.49040 1730.488 39.76416 33.78280 37.37576
## 5: 9.041866 10.62502 1979.854 39.03855 34.58134 36.18422
## 6: 9.069318 10.65247 2034.956 47.35298 36.24170 38.43879

Indexing and keys

setkey(fdata, id, year)
fdata[.(1), .(id, year, Q1)]
##     id year       Q1
##  1:  1    1 1720.539
##  2:  1    2 1297.887
##  3:  1    3 1753.025
##  4:  1    4 1730.488
##  5:  1    5 1979.854
##  6:  1    6 2034.956
##  7:  1    7 2406.772
##  8:  1    8 1911.099
##  9:  1    9 2014.024
## 10:  1   10 1922.224
## 11:  1   11 2298.198
## 12:  1   12 2608.475
## 13:  1   13 2921.443
## 14:  1   14 3165.666
## 15:  1   15 3242.925
## 16:  1   16 3157.061
## 17:  1   17 2906.368
## 18:  1   18 3916.234
## 19:  1   19 4689.359
## 20:  1   20 3683.151
fdata[.(c(1:10)), mult = "first", .(id, year, Q1)]
##     id year       Q1
##  1:  1    1 1720.539
##  2:  2    1 1641.447
##  3:  3    1 1208.197
##  4:  4    1 1416.273
##  5:  5    1 1495.350
##  6:  6    1 1364.348
##  7:  7    1 1733.457
##  8:  8    1 1442.326
##  9:  9    1 1231.892
## 10: 10    1 1344.686
fdata[.(c(1:10)), mult = "last", .(id, year, Q1)]
##     id year       Q1
##  1:  1   20 3683.151
##  2:  2   20 5071.042
##  3:  3   20 5442.939
##  4:  4   20 6074.515
##  5:  5   20 3495.429
##  6:  6   20 3756.415
##  7:  7   20 7441.535
##  8:  8   20 9415.008
##  9:  9   20 4610.867
## 10: 10   20 4858.761
fdata[.(1,20)]
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1   20    1 1.451088 3.752109 4.091652 4.064794 4.138883 8.211524
##        lqre     lqfe       Q1        K        L        M
## 1: 9.662612 11.24577 3683.151 42.61084 59.83865 58.25291
fdata[.(c(1:10)), sum(Q1), by = .EACHI]
##     id       V1
##  1:  1 51359.75
##  2:  2 56529.36
##  3:  3 61336.67
##  4:  4 66528.10
##  5:  5 53819.73
##  6:  6 50331.37
##  7:  7 66448.85
##  8:  8 83715.62
##  9:  9 58769.70
## 10: 10 53525.65

Changing variable names

setnames(fdata, "Q1", "Q")
# Changing variable names
fdata[, list(sum(Q), round(sum(L, na.rm = TRUE), 0)), by = year]
##     year      V1    V2
##  1:    1 1447810 33741
##  2:    2 1552832 34741
##  3:    3 1641569 35704
##  4:    4 1762303 37418
##  5:    5 1881921 38930
##  6:    6 1996175 40590
##  7:    7 2127278 42666
##  8:    8 2286357 43927
##  9:    9 2430265 45927
## 10:   10 2597294 48066
## 11:   11 2747768 49957
## 12:   12 2933000 50753
## 13:   13 3127524 53384
## 14:   14 3345488 55929
## 15:   15 3551326 58260
## 16:   16 3782430 60660
## 17:   17 4041872 62738
## 18:   18 4286426 65331
## 19:   19 4567648 68675
## 20:   20 4892994 71342
# Sorting
fdata[order(-year)]
##          id year sect       re       lk       ll       lm       lA
##     1:    1   20    1 1.451088 3.752109 4.091652 4.064794 4.138883
##     2:    2   20    1 1.451088 3.752543 4.330262 3.925079 4.288035
##     3:    3   20    1 1.451088 4.476468       NA 3.929152 4.249692
##     4:    4   20    1 1.451088 4.006754 4.480976 3.952056 4.348375
##     5:    5   20    1 1.451088 4.159681 4.427926 3.823097 3.975474
##    ---                                                            
## 19996:  996    1   10 1.451088 3.436875 3.556975 3.545940 3.618952
## 19997:  997    1   10 1.451088 3.592625 3.545064 3.541389 3.581739
## 19998:  998    1   10 1.451088 3.585301 3.551032 3.570112 3.550223
## 19999:  999    1   10 1.451088 3.539822 3.566888 3.560680 3.575617
## 20000: 1000    1   10 1.451088 3.493055 3.533276 3.554256 3.538039
##              lq      lqre     lqfe        Q        K        L        M
##     1: 8.211524  9.662612 11.24577 3683.151 42.61084 59.83865 58.25291
##     2: 8.531302  9.982390 11.57269 5071.042 42.62934 75.96415 50.65708
##     3: 8.602074 10.053162 11.86603 5442.939 87.92359       NA 50.86383
##     4: 8.711857 10.162946 12.08799 6074.515 54.96813 88.32083 52.04223
##     5: 8.159211  9.610299 11.22789 3495.429 64.05110 83.75752 45.74567
##    ---                                                                
## 19996: 7.470873  8.921961 10.61985 1756.140 31.08964 35.05698 34.67227
## 19997: 7.124611  8.575699 10.35491 1242.164 36.32931 34.64189 34.51481
## 19998: 7.188222  8.639310 10.60989 1323.748 36.06420 34.84928 35.52058
## 19999: 7.426827  8.877915 10.39506 1680.466 34.46078 35.40623 35.18713
## 20000: 7.384582  8.835670 10.56906 1610.955 32.88626 34.23593 34.96180

Chaining

fdata[, list(sum(Q), round(sum(L, na.rm = TRUE), 0)), by = year][order(-year)]
##     year      V1    V2
##  1:   20 4892994 71342
##  2:   19 4567648 68675
##  3:   18 4286426 65331
##  4:   17 4041872 62738
##  5:   16 3782430 60660
##  6:   15 3551326 58260
##  7:   14 3345488 55929
##  8:   13 3127524 53384
##  9:   12 2933000 50753
## 10:   11 2747768 49957
## 11:   10 2597294 48066
## 12:    9 2430265 45927
## 13:    8 2286357 43927
## 14:    7 2127278 42666
## 15:    6 1996175 40590
## 16:    5 1881921 38930
## 17:    4 1762303 37418
## 18:    3 1641569 35704
## 19:    2 1552832 34741
## 20:    1 1447810 33741

Melting and casting

  • Melting: Transforming the dataset into long format
  • Casting: Transforming the dataset into wide format
# Sectoral prices
sPrice <- data.table(
    year = c(1:nt),
    p1 = 1 + cumsum(rnorm(nt, 0.02, 0.04)),
    p2 = 1 + cumsum(rnorm(nt, 0.03, 0.05)),
    p3 = 1 + cumsum(rnorm(nt, 0.04, 0.02)),
    p4 = 1 + cumsum(rnorm(nt, 0.05, 0.03)),
    p5 = 1 + cumsum(rnorm(nt, 0.02, 0.04)),
    p6 = 1 + cumsum(rnorm(nt, 0.03, 0.06)),
    p7 = 1 + cumsum(rnorm(nt, 0.04, 0.02)),
    p8 = 1 + cumsum(rnorm(nt, 0.05, 0.07)),
    p9 = 1 + cumsum(rnorm(nt, 0.05, 0.04)),
    p10 = 1 + cumsum(rnorm(nt, 0.05, 0.06)))

head(sPrice)
##    year       p1       p2       p3       p4        p5        p6       p7
## 1:    1 1.049122 1.066042 1.023709 1.036918 0.9922281 0.9590573 1.028753
## 2:    2 1.080498 1.115911 1.063487 1.067135 0.9646375 0.9843723 1.086069
## 3:    3 1.060964 1.241893 1.126497 1.109609 1.0141422 1.0506699 1.130542
## 4:    4 1.150692 1.266971 1.125099 1.117109 1.0274516 1.0626343 1.139303
## 5:    5 1.183603 1.296739 1.142110 1.205988 1.0276317 0.9551767 1.165413
## 6:    6 1.134129 1.289638 1.175852 1.282831 0.9631355 1.1276886 1.163754
##          p8       p9      p10
## 1: 1.077405 1.071320 1.063231
## 2: 1.122361 1.167985 1.092872
## 3: 1.157053 1.203949 1.245890
## 4: 1.220952 1.247592 1.329994
## 5: 1.319415 1.265053 1.354957
## 6: 1.395719 1.322034 1.439796
sPriceM <- melt(sPrice, id.vars=1, variable.name = "sectName", value.name = "price")
head(sPriceM)
##    year sectName    price
## 1:    1       p1 1.049122
## 2:    2       p1 1.080498
## 3:    3       p1 1.060964
## 4:    4       p1 1.150692
## 5:    5       p1 1.183603
## 6:    6       p1 1.134129
sPriceM$sect <- as.numeric(unlist(gsub("[^0-9]", "", unlist(sPriceM$sectName)), ""))
sPriceM[, sectName := NULL]
head(sPriceM)
##    year    price sect
## 1:    1 1.049122    1
## 2:    2 1.080498    1
## 3:    3 1.060964    1
## 4:    4 1.150692    1
## 5:    5 1.183603    1
## 6:    6 1.134129    1
ggplot(sPriceM, aes(x=year, y=price, group=sect, color=as.factor(sect))) + geom_line() +
    ggtitle("Price indices")

setkey(sPriceM, sect, year)
setkey(fdata, sect, year)
fdataM <- fdata[sPriceM]
head(fdataM)
##    id year sect       re       lk       ll       lm       lA       lq
## 1:  1    1    1 1.451088 3.460166 3.500836 3.571742 3.574206 7.450393
## 2:  2    1    1 1.451088 3.460921       NA 3.548402 3.538912 7.403333
## 3:  3    1    1 1.451088 3.524785       NA 3.547351 3.566118 7.096884
## 4:  4    1    1 1.451088 3.628487 3.517694 3.542273 3.592309 7.255784
## 5:  5    1    1 1.451088 3.667655 3.579819 3.528693 3.524091 7.310116
## 6:  6    1    1 1.451088 3.496021 3.582952 3.575014 3.533474 7.218432
##        lqre     lqfe        Q        K        L        M    price
## 1: 8.901481 10.48463 1720.539 31.82225 33.14313 35.57852 1.049122
## 2: 8.854422 10.44473 1641.447 31.84629       NA 34.75773 1.049122
## 3: 8.547972 10.36084 1208.197 33.94646       NA 34.72122 1.049122
## 4: 8.706872 10.63192 1416.273 37.65580 33.70662 34.54534 1.049122
## 5: 8.761204 10.37879 1495.350 39.15997 35.86706 34.07939 1.049122
## 6: 8.669520 10.19897 1364.348 32.98395 35.97959 35.69512 1.049122

Merging datasets

  • Create sector level variables
sdata <- fdata[, .(sQ = sum(Q, na.rm=T), sL = sum(L, na.rm=T), sM = sum(M, na.rm=T)),
    by = .(sect, year)]
summary(sdata)
##       sect           year             sQ               sL      
##  Min.   : 1.0   Min.   : 1.00   Min.   :139528   Min.   :3218  
##  1st Qu.: 3.0   1st Qu.: 5.75   1st Qu.:194502   1st Qu.:4005  
##  Median : 5.5   Median :10.50   Median :265816   Median :4862  
##  Mean   : 5.5   Mean   :10.50   Mean   :285001   Mean   :4994  
##  3rd Qu.: 8.0   3rd Qu.:15.25   3rd Qu.:368845   3rd Qu.:5941  
##  Max.   :10.0   Max.   :20.00   Max.   :510215   Max.   :7468  
##        sM      
##  Min.   :3187  
##  1st Qu.:3627  
##  Median :4066  
##  Mean   :4105  
##  3rd Qu.:4520  
##  Max.   :5275
dim(sdata)
## [1] 200   5
# Set keys for datasets
setkey(fdata, sect, year)
setkey(sdata, sect, year)
# Merge it with the firm level data
hdata <- fdata[sdata]
# That's all...
dim(hdata)
## [1] 20000    18
dim(fdata)
## [1] 20000    15
identical(hdata$ll, fdata$ll)
## [1] TRUE
# Shorter way
fdata[, c("sQ", "sL", "sM") := list(sum(Q, na.rm=T), sum(L, na.rm=T), sum(M, na.rm=T)),
    by = .(sect, year)]
identical(hdata, fdata)
## [1] TRUE
# Find deviations from sectoral geometric averages
fdata[, lqdev := lq - mean(lq, na.rm=T), by = .(sect)]
fdata[, lkdev := lk - mean(lk, na.rm=T), by = .(sect)]
fdata[, lldev := ll - mean(ll, na.rm=T), by = .(sect)]
fdata[, lmdev := lm - mean(lm, na.rm=T), by = .(sect)]

Regression analysis

Estimation

model1 <- lm(lq ~ lk + ll + lm + year, data = fdata)
model2 <- lm(lqdev ~ lkdev + lldev + lmdev + year, data = fdata)
model3 <- lm(lqre ~ lk + ll + lm + year, data = fdata)
model4 <- lm(lqfe ~ lk + ll + lm + year, data = fdata)
model5 <- plm(lq ~ lk + ll + lm + year, data = fdata, model="random", effect="individual")

series re is constant and has been removed

model6 <- plm(lq ~ lk + ll + lm + year, data = fdata, model="within", effect="individual")

series re is constant and has been removed

stargazer(model1, model2, model3, model4, model5, model6, type="html",
    model.numbers=FALSE, 
    dep.var.labels.include = FALSE, dep.var.caption  = "Production function estimates", 
    column.labels = c("OLS", "OLS+dev", "OLS RE", "OLS FE", "Random effects", "Fixed
        effects"))
Production function estimates
OLS OLS OLS OLS panel
linear
effects
lk 0.092*** 0.092*** 0.229*** 0.096*** 0.097***
(0.005) (0.005) (0.008) (0.007) (0.007)
ll 0.260*** 0.260*** 0.399*** 0.224*** 0.214***
(0.010) (0.010) (0.015) (0.013) (0.014)
lm 0.749*** 0.749*** 0.704*** 0.755*** 0.757***
(0.010) (0.010) (0.016) (0.013) (0.015)
lkdev 0.091***
(0.005)
lldev 0.263***
(0.010)
lmdev 0.747***
(0.010)
year 0.033*** 0.033*** 0.033*** 0.024***
(0.0005) (0.0005) (0.0005) (0.001)
year2 0.044*** 0.044***
(0.006) (0.006)
year3 0.071*** 0.072***
(0.006) (0.006)
year4 0.111*** 0.112***
(0.006) (0.006)
year5 0.145*** 0.147***
(0.006) (0.006)
year6 0.177*** 0.178***
(0.007) (0.007)
year7 0.209*** 0.211***
(0.007) (0.007)
year8 0.253*** 0.255***
(0.007) (0.007)
year9 0.281*** 0.283***
(0.007) (0.008)
year10 0.317*** 0.320***
(0.008) (0.008)
year11 0.343*** 0.346***
(0.008) (0.009)
year12 0.380*** 0.384***
(0.009) (0.009)
year13 0.413*** 0.416***
(0.009) (0.010)
year14 0.452*** 0.456***
(0.009) (0.010)
year15 0.483*** 0.487***
(0.010) (0.010)
year16 0.518*** 0.523***
(0.010) (0.011)
year17 0.550*** 0.555***
(0.011) (0.012)
year18 0.583*** 0.588***
(0.011) (0.012)
year19 0.618*** 0.623***
(0.012) (0.013)
year20 0.656*** 0.662***
(0.012) (0.013)
Constant 3.342*** -0.346*** 4.793*** 5.699*** 3.459***
(0.049) (0.005) (0.049) (0.073) (0.064)
Observations 17,292 17,292 17,292 17,292 17,292 17,292
R2 0.856 0.857 0.856 0.734 0.904 0.906
Adjusted R2 0.856 0.857 0.856 0.734 0.903 0.856
Residual Std. Error (df = 17287) 0.153 0.153 0.153 0.228
F Statistic 25,739.050*** (df = 4; 17287) 25,904.810*** (df = 4; 17287) 25,739.050*** (df = 4; 17287) 11,926.170*** (df = 4; 17287) 7,412.629*** (df = 22; 17269) 7,173.585*** (df = 22; 16342)
Note: p<0.1; p<0.05; p<0.01

Regression output

attributes(model1)
## $names
##  [1] "coefficients"  "residuals"     "effects"       "rank"         
##  [5] "fitted.values" "assign"        "qr"            "df.residual"  
##  [9] "na.action"     "xlevels"       "call"          "terms"        
## [13] "model"        
## 
## $class
## [1] "lm"
attributes(model5)
## $names
## [1] "coefficients" "vcov"         "residuals"    "df.residual" 
## [5] "formula"      "model"        "ercomp"       "args"        
## [9] "call"        
## 
## $class
## [1] "plm"        "panelmodel"
class(model1$residuals)
## [1] "numeric"
hist(model1$residuals, plot = TRUE)

Monte Carlo analysis

  • Test the relationship between sample size and estimation error
ntest <- 1000
coeffA <- matrix(NA, nrow=ntest, ncol=5)
coeffB <- matrix(NA, nrow=ntest, ncol=5)
for (j in c(1:5)) {
    for (i in c(1:ntest)) {
        ns <- floor(10*exp(j))
        x1 <- rnorm(ns, 0, 1)
        x2 <- rnorm(ns, 0, 2)
        y <- 1 + 0.5*x1 + 2*x2 + rnorm(ns, 0, 1)
        mod <- lm(y ~ x1 + x2)
        coeffA[i,j] <- mod$coefficients[2]
        coeffB[i,j] <- mod$coefficients[3]
    }
}
coeffA <- as.data.frame(coeffA)
coeffB <- as.data.frame(coeffB)
names(coeffB) <- names(coeffA) <- as.character(floor(10*exp(c(1:5))))
head(coeffA)
##          27        73       200       545      1484
## 1 0.4211819 0.4458851 0.4191215 0.5652003 0.5215624
## 2 0.3619817 0.4798847 0.4378743 0.5226709 0.4945236
## 3 0.4115284 0.3940921 0.4949028 0.4445838 0.4837831
## 4 0.6026856 0.5614531 0.6064617 0.5558239 0.5110329
## 5 0.2935289 0.7281571 0.5584724 0.5051947 0.4949447
## 6 0.6342117 0.4158434 0.4564914 0.4722227 0.5465058
coeffAA <- melt(coeffA, id.vars=0, variable.name = "N_obs", value.name = "Estimate")
head(coeffAA)
##   N_obs  Estimate
## 1    27 0.4211819
## 2    27 0.3619817
## 3    27 0.4115284
## 4    27 0.6026856
## 5    27 0.2935289
## 6    27 0.6342117
coeffBB <- melt(coeffB, id.vars=0, variable.name = "N_obs", value.name = "Estimate")

ggplot(coeffAA, aes(x=Estimate, group=N_obs, color=N_obs)) + geom_density() + 
    ggtitle("Coefficient A")

ggplot(coeffBB, aes(x=Estimate, group=N_obs, color=N_obs)) + geom_density() + 
    ggtitle("Coefficient B")