9 Week 5 Demo
9.1 Setup
First, we’ll load the packages we’ll be using in this week’s brief demo.
devtools::install_github("conjugateprior/austin")
library(austin)
library(quanteda)
library(quanteda.textstats)
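Note that install_github() comes from the devtools package; if devtools is not already installed on your machine, you may need to install it first:
install.packages("devtools")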
9.2 Wordscores
We can inspect the function for the wordscores model by Laver, Benoit, and Garry (2003) in the following way:
classic.wordscores
## function (wfm, scores)
## {
##     if (!is.wfm(wfm))
##         stop("Function not applicable to this object")
##     if (length(scores) != length(docs(wfm)))
##         stop("There are not the same number of documents as scores")
##     if (any(is.na(scores)))
##         stop("One of the reference document scores is NA\nFit the model with known scores and use 'predict' to get virgin score estimates")
##     thecall <- match.call()
##     C.all <- as.worddoc(wfm)
##     C <- C.all[rowSums(C.all) > 0, ]
##     F <- scale(C, center = FALSE, scale = colSums(C))
##     ws <- apply(F, 1, function(x) {
##         sum(scores * x)
##     })/rowSums(F)
##     pi <- matrix(ws, nrow = length(ws))
##     rownames(pi) <- rownames(C)
##     colnames(pi) <- c("Score")
##     val <- list(pi = pi, theta = scores, data = wfm, call = thecall)
##     class(val) <- c("classic.wordscores", "wordscores", class(val))
##     return(val)
## }
## <bytecode: 0x2d4acbca8>
## <environment: namespace:austin>
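In other words, the function first converts each column of the word–document matrix into relative frequencies and then scores each word as a frequency-weighted average of the reference scores. Restating the code above, roughly in the notation of Laver, Benoit, and Garry (2003):
\[
F_{wd} = \frac{C_{wd}}{\sum_{w} C_{wd}}, \qquad
\pi_{w} = \frac{\sum_{d} A_{d} F_{wd}}{\sum_{d} F_{wd}},
\]
where \(C_{wd}\) is the count of word \(w\) in reference document \(d\) and \(A_{d}\) is the reference score assigned to document \(d\).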
We can then take some example data included in the austin package. Here, our reference documents are all those documents marked with an “R” for reference, i.e., columns one to five.
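The ref object printed below is built from the lbg example data bundled with austin. A minimal way to construct it, using the getdocs() helper that also appears later for the virgin text (the numeric column indexing here is an assumption), is:
data(lbg)
ref <- getdocs(lbg, 1:5)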
ref
## docs
## words R1 R2 R3 R4 R5
## A 2 0 0 0 0
## B 3 0 0 0 0
## C 10 0 0 0 0
## D 22 0 0 0 0
## E 45 0 0 0 0
## F 78 2 0 0 0
## G 115 3 0 0 0
## H 146 10 0 0 0
## I 158 22 0 0 0
## J 146 45 0 0 0
## K 115 78 2 0 0
## L 78 115 3 0 0
## M 45 146 10 0 0
## N 22 158 22 0 0
## O 10 146 45 0 0
## P 3 115 78 2 0
## Q 2 78 115 3 0
## R 0 45 146 10 0
## S 0 22 158 22 0
## T 0 10 146 45 0
## U 0 3 115 78 2
## V 0 2 78 115 3
## W 0 0 45 146 10
## X 0 0 22 158 22
## Y 0 0 10 146 45
## Z 0 0 3 115 78
## ZA 0 0 2 78 115
## ZB 0 0 0 45 146
## ZC 0 0 0 22 158
## ZD 0 0 0 10 146
## ZE 0 0 0 3 115
## ZF 0 0 0 2 78
## ZG 0 0 0 0 45
## ZH 0 0 0 0 22
## ZI 0 0 0 0 10
## ZJ 0 0 0 0 3
## ZK 0 0 0 0 2
This matrix simply records, for a set of words (here: letters A to ZK), how often each appears in each of the five reference texts.
We can then look at the wordscores for each of the words, calculated using the reference scores assigned to each of the reference documents.
ws <- classic.wordscores(ref, scores=seq(-1.5,1.5,by=0.75))
ws
## $pi
## Score
## A -1.5000000
## B -1.5000000
## C -1.5000000
## D -1.5000000
## E -1.5000000
## F -1.4812500
## G -1.4809322
## H -1.4519231
## I -1.4083333
## J -1.3232984
## K -1.1846154
## L -1.0369898
## M -0.8805970
## N -0.7500000
## O -0.6194030
## P -0.4507576
## Q -0.2992424
## R -0.1305970
## S 0.0000000
## T 0.1305970
## U 0.2992424
## V 0.4507576
## W 0.6194030
## X 0.7500000
## Y 0.8805970
## Z 1.0369898
## ZA 1.1846154
## ZB 1.3232984
## ZC 1.4083333
## ZD 1.4519231
## ZE 1.4809322
## ZF 1.4812500
## ZG 1.5000000
## ZH 1.5000000
## ZI 1.5000000
## ZJ 1.5000000
## ZK 1.5000000
##
## $theta
## [1] -1.50 -0.75 0.00 0.75 1.50
##
## $data
## docs
## words R1 R2 R3 R4 R5
## A 2 0 0 0 0
## B 3 0 0 0 0
## C 10 0 0 0 0
## D 22 0 0 0 0
## E 45 0 0 0 0
## F 78 2 0 0 0
## G 115 3 0 0 0
## H 146 10 0 0 0
## I 158 22 0 0 0
## J 146 45 0 0 0
## K 115 78 2 0 0
## L 78 115 3 0 0
## M 45 146 10 0 0
## N 22 158 22 0 0
## O 10 146 45 0 0
## P 3 115 78 2 0
## Q 2 78 115 3 0
## R 0 45 146 10 0
## S 0 22 158 22 0
## T 0 10 146 45 0
## U 0 3 115 78 2
## V 0 2 78 115 3
## W 0 0 45 146 10
## X 0 0 22 158 22
## Y 0 0 10 146 45
## Z 0 0 3 115 78
## ZA 0 0 2 78 115
## ZB 0 0 0 45 146
## ZC 0 0 0 22 158
## ZD 0 0 0 10 146
## ZE 0 0 0 3 115
## ZF 0 0 0 2 78
## ZG 0 0 0 0 45
## ZH 0 0 0 0 22
## ZI 0 0 0 0 10
## ZJ 0 0 0 0 3
## ZK 0 0 0 0 2
##
## $call
## classic.wordscores(wfm = ref, scores = seq(-1.5, 1.5, by = 0.75))
##
## attr(,"class")
## [1] "classic.wordscores" "wordscores"
## [3] "list"
Here we see the thetas contained in this wordscores object, i.e., the reference scores for each of the reference documents, and the pis, i.e., the estimated wordscores for each word.
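As a quick sanity check, we can reproduce one of these wordscores by hand. The sketch below recomputes the score for word “S” (which appears 22, 158, and 22 times in R2, R3, and R4), assuming ref behaves like a base R matrix:
A <- seq(-1.5, 1.5, by = 0.75)      # reference scores for R1-R5
Fwd <- ref["S", ] / colSums(ref)    # relative frequency of "S" in each reference text
sum(A * Fwd / sum(Fwd))             # 0, matching the pi reported for "S" above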
We can now use these to score the so-called “virgin” texts as follows.
#get "virgin" documents
vir <- getdocs(lbg, 'V1')
vir
## docs
## words V1
## A 0
## B 0
## C 0
## D 0
## E 0
## F 0
## G 0
## H 2
## I 3
## J 10
## K 22
## L 45
## M 78
## N 115
## O 146
## P 158
## Q 146
## R 115
## S 78
## T 45
## U 22
## V 10
## W 3
## X 2
## Y 0
## Z 0
## ZA 0
## ZB 0
## ZC 0
## ZD 0
## ZE 0
## ZF 0
## ZG 0
## ZH 0
## ZI 0
## ZJ 0
## ZK 0
# predict textscores for the virgin documents
predict(ws, newdata=vir)
## 37 of 37 words (100%) are scorable
##
## Score Std. Err. Rescaled Lower Upper
## V1 -0.448 0.0119 -0.448 -0.459 -0.437
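Behind this prediction, each virgin text receives the frequency-weighted average of the wordscores of the scorable words it contains; as a sketch of the calculation:
\[
\hat{\theta}_{v} = \sum_{w} F_{wv}\, \pi_{w},
\]
where \(F_{wv}\) is the relative frequency of word \(w\) among the scorable words of virgin text \(v\) and \(\pi_{w}\) is its estimated wordscore.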
9.3 Wordfish
If we wish, we can inspect the function for the wordfish model by Slapin and Proksch (2008) in the following way. This is a much more complex algorithm, which is not printed here, but which you can inspect on your own.
wordfish
We can then simulate some data, formatted appropriately for wordfish estimation, in the following way:
dd <- sim.wordfish()
dd
## $Y
## docs
## words D01 D02 D03 D04 D05 D06 D07 D08 D09 D10
## W01 17 19 22 13 17 11 16 12 6 3
## W02 18 21 18 16 12 19 11 7 10 4
## W03 22 21 22 19 11 14 11 3 6 1
## W04 22 19 18 15 16 13 18 6 2 8
## W05 28 21 12 10 13 10 5 14 1 3
## W06 5 7 12 13 15 8 12 13 23 19
## W07 13 9 5 16 11 17 15 11 35 30
## W08 8 7 7 10 9 15 18 23 21 23
## W09 4 12 8 10 9 13 18 25 15 19
## W10 5 3 7 11 19 16 13 18 17 18
## W11 66 55 49 48 38 37 27 24 21 6
## W12 53 56 47 39 49 28 22 15 12 14
## W13 63 55 47 49 48 31 24 16 17 16
## W14 57 64 48 51 27 36 24 27 11 12
## W15 58 48 57 44 36 39 29 27 16 5
## W16 17 13 24 28 24 32 41 56 67 61
## W17 9 19 16 36 30 34 53 34 58 57
## W18 11 19 34 27 42 38 48 58 49 66
## W19 10 18 27 22 37 52 59 60 60 69
## W20 14 14 20 23 37 37 36 51 53 66
##
## $theta
## [1] -1.4863011 -1.1560120 -0.8257228 -0.4954337 -0.1651446
## [6] 0.1651446 0.4954337 0.8257228 1.1560120 1.4863011
##
## $doclen
## D01 D02 D03 D04 D05 D06 D07 D08 D09 D10
## 500 500 500 500 500 500 500 500 500 500
##
## $psi
## [1] 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1
##
## $beta
## [1] 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 1 1 1 1 1
##
## attr(,"class")
## [1] "wordfish.simdata" "list"
Here we can see the document lengths and the word-level parameters (psi and beta), as well as the specified range of thetas, i.e., the true document positions to be estimated.
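For reference, the model these quantities come from is the Poisson scaling model of Slapin and Proksch (2008):
\[
y_{ij} \sim \mathrm{Poisson}(\lambda_{ij}), \qquad
\log \lambda_{ij} = \alpha_{i} + \psi_{j} + \beta_{j}\,\theta_{i},
\]
where \(\alpha_{i}\) is a document fixed effect, \(\psi_{j}\) a word fixed effect, \(\beta_{j}\) a word-specific weight capturing how strongly word \(j\) discriminates between positions, and \(\theta_{i}\) the position of document \(i\), which is what we want to estimate.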
Estimating the document positions is then simply a matter of applying this algorithm to the simulated counts, as sketched below.
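The Call: line in the output below shows the call that produced it; a minimal version (assuming, as in austin, that summary() prints the estimated document positions) is:
wf <- wordfish(dd$Y)
summary(wf)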
## Call:
## wordfish(wfm = dd$Y)
##
## Document Positions:
## Estimate Std. Error Lower Upper
## D01 -1.4243 0.10560 -1.6313 -1.21736
## D02 -1.1483 0.09747 -1.3394 -0.95727
## D03 -0.7701 0.08954 -0.9456 -0.59455
## D04 -0.4878 0.08591 -0.6562 -0.31942
## D05 -0.1977 0.08414 -0.3626 -0.03279
## D06 0.0313 0.08411 -0.1336 0.19616
## D07 0.4346 0.08704 0.2640 0.60517
## D08 0.7163 0.09140 0.5372 0.89546
## D09 1.2277 0.10447 1.0229 1.43243
## D10 1.6166 0.11933 1.3827 1.85046
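Because we simulated the data, we can also check how well the estimates recover the true positions. A quick sketch, assuming the fitted object stores the estimated positions as wf$theta:
cor(wf$theta, dd$theta)    # assumes wf$theta exists; should be close to 1 if positions are well recovered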